| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 | // SPDX-License-Identifier: GPL-2.0 struct io_timeout_data { struct io_kiocb *req; struct hrtimer timer; struct timespec64 ts; enum hrtimer_mode mode; u32 flags; }; struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, struct io_kiocb *link); static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) { struct io_kiocb *link = req->link; if (link && link->opcode == IORING_OP_LINK_TIMEOUT) return __io_disarm_linked_timeout(req, link); return NULL; } __cold void io_flush_timeouts(struct io_ring_ctx *ctx); struct io_cancel_data; int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd); __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all); void io_queue_linked_timeout(struct io_kiocb *req); void io_disarm_next(struct io_kiocb *req); int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_link_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_timeout(struct io_kiocb *req, unsigned int issue_flags); int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags); |
| 1 23 22 13 12 1 11 23 24 24 23 4 3 1 23 23 23 23 23 4 21 21 20 21 19 5 21 23 126 127 126 126 125 7 126 1 38 38 1 37 37 36 4 36 5 5 36 8 38 8 8 7 1 6 7 1 6 1 28 14 14 10 9 13 2 24 5 21 35 23 22 23 4 3 3 3 5 3 4 3 8 8 8 3 8 8 8 2 2 2 2 8 4 6 1 6 4 4 4 2 1 8 8 9 2 9 1 1 2 2 9 7 7 7 7 7 7 1 1 1 1 1 7 1 7 7 1 7 7 1 7 1 1 1 1 1 7 7 1 28 29 28 27 2 24 24 5 5 2 1 1 1 4 4 4 1 29 3 4 1 4 4 4 4 3 2 2 5 4 4 4 3 3 3 3 3 3 3 2 2 2 3 2 2 2 2 2 1 4 4 1 1 1 2 1 1 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 | // SPDX-License-Identifier: GPL-2.0-or-later /* * common UDP/RAW code * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/interrupt.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in6.h> #include <linux/ipv6.h> #include <linux/route.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/icmp.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/tcp_states.h> #include <net/dsfield.h> #include <net/sock_reuseport.h> #include <linux/errqueue.h> #include <linux/uaccess.h> static bool ipv6_mapped_addr_any(const struct in6_addr *a) { return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0); } static void ip6_datagram_flow_key_init(struct flowi6 *fl6, const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); const struct ipv6_pinfo *np = inet6_sk(sk); int oif = sk->sk_bound_dev_if; memset(fl6, 0, sizeof(*fl6)); fl6->flowi6_proto = sk->sk_protocol; fl6->daddr = sk->sk_v6_daddr; fl6->saddr = np->saddr; fl6->flowi6_mark = sk->sk_mark; fl6->fl6_dport = inet->inet_dport; fl6->fl6_sport = inet->inet_sport; fl6->flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); fl6->flowi6_uid = sk->sk_uid; if (!oif) oif = np->sticky_pktinfo.ipi6_ifindex; if (!oif) { if (ipv6_addr_is_multicast(&fl6->daddr)) oif = READ_ONCE(np->mcast_oif); else oif = READ_ONCE(np->ucast_oif); } fl6->flowi6_oif = oif; security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); } int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr) { struct ip6_flowlabel *flowlabel = NULL; struct in6_addr *final_p, final; struct ipv6_txoptions *opt; struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct flowi6 fl6; int err = 0; if (inet6_test_bit(SNDFLOW, sk) && (np->flow_label & IPV6_FLOWLABEL_MASK)) { flowlabel = fl6_sock_lookup(sk, np->flow_label); if (IS_ERR(flowlabel)) return -EINVAL; } ip6_datagram_flow_key_init(&fl6, sk); rcu_read_lock(); opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt); final_p = fl6_update_dst(&fl6, opt, &final); rcu_read_unlock(); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; } if (fix_sk_saddr) { if (ipv6_addr_any(&np->saddr)) np->saddr = fl6.saddr; if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { sk->sk_v6_rcv_saddr = fl6.saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); } } ip6_sk_dst_store_flow(sk, dst, &fl6); out: fl6_sock_release(flowlabel); return err; } void ip6_datagram_release_cb(struct sock *sk) { struct dst_entry *dst; if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) return; rcu_read_lock(); dst = __sk_dst_get(sk); if (!dst || !dst->obsolete || dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) { rcu_read_unlock(); return; } rcu_read_unlock(); ip6_datagram_dst_update(sk, false); } EXPORT_SYMBOL_GPL(ip6_datagram_release_cb); int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *daddr, old_daddr; __be32 fl6_flowlabel = 0; __be32 old_fl6_flowlabel; __be16 old_dport; int addr_type; int err; if (usin->sin6_family == AF_INET) { if (ipv6_only_sock(sk)) return -EAFNOSUPPORT; err = __ip4_datagram_connect(sk, uaddr, addr_len); goto ipv4_connected; } if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (usin->sin6_family != AF_INET6) return -EAFNOSUPPORT; if (inet6_test_bit(SNDFLOW, sk)) fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; if (ipv6_addr_any(&usin->sin6_addr)) { /* * connect to self */ if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), &usin->sin6_addr); else usin->sin6_addr = in6addr_loopback; } addr_type = ipv6_addr_type(&usin->sin6_addr); daddr = &usin->sin6_addr; if (addr_type & IPV6_ADDR_MAPPED) { struct sockaddr_in sin; if (ipv6_only_sock(sk)) { err = -ENETUNREACH; goto out; } sin.sin_family = AF_INET; sin.sin_addr.s_addr = daddr->s6_addr32[3]; sin.sin_port = usin->sin6_port; err = __ip4_datagram_connect(sk, (struct sockaddr *) &sin, sizeof(sin)); ipv4_connected: if (err) goto out; ipv6_addr_set_v4mapped(inet->inet_daddr, &sk->sk_v6_daddr); if (ipv6_addr_any(&np->saddr) || ipv6_mapped_addr_any(&np->saddr)) ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr); if (ipv6_addr_any(&sk->sk_v6_rcv_saddr) || ipv6_mapped_addr_any(&sk->sk_v6_rcv_saddr)) { ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, &sk->sk_v6_rcv_saddr); if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); } goto out; } if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && usin->sin6_scope_id) { if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id)) { err = -EINVAL; goto out; } WRITE_ONCE(sk->sk_bound_dev_if, usin->sin6_scope_id); } if (!sk->sk_bound_dev_if && (addr_type & IPV6_ADDR_MULTICAST)) WRITE_ONCE(sk->sk_bound_dev_if, READ_ONCE(np->mcast_oif)); /* Connect to link-local address requires an interface */ if (!sk->sk_bound_dev_if) { err = -EINVAL; goto out; } } /* save the current peer information before updating it */ old_daddr = sk->sk_v6_daddr; old_fl6_flowlabel = np->flow_label; old_dport = inet->inet_dport; sk->sk_v6_daddr = *daddr; np->flow_label = fl6_flowlabel; inet->inet_dport = usin->sin6_port; /* * Check for a route to destination an obtain the * destination cache for it. */ err = ip6_datagram_dst_update(sk, true); if (err) { /* Restore the socket peer info, to keep it consistent with * the old socket state */ sk->sk_v6_daddr = old_daddr; np->flow_label = old_fl6_flowlabel; inet->inet_dport = old_dport; goto out; } reuseport_has_conns_set(sk); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); out: return err; } EXPORT_SYMBOL_GPL(__ip6_datagram_connect); int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { int res; lock_sock(sk); res = __ip6_datagram_connect(sk, uaddr, addr_len); release_sock(sk); return res; } EXPORT_SYMBOL_GPL(ip6_datagram_connect); int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr, int addr_len) { DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, uaddr); if (sin6->sin6_family != AF_INET6) return -EAFNOSUPPORT; return ip6_datagram_connect(sk, uaddr, addr_len); } EXPORT_SYMBOL_GPL(ip6_datagram_connect_v6_only); static void ipv6_icmp_error_rfc4884(const struct sk_buff *skb, struct sock_ee_data_rfc4884 *out) { switch (icmp6_hdr(skb)->icmp6_type) { case ICMPV6_TIME_EXCEED: case ICMPV6_DEST_UNREACH: ip_icmp_error_rfc4884(skb, out, sizeof(struct icmp6hdr), icmp6_hdr(skb)->icmp6_datagram_len * 8); } } void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { struct icmp6hdr *icmph = icmp6_hdr(skb); struct sock_exterr_skb *serr; if (!inet6_test_bit(RECVERR6, sk)) return; skb = skb_clone(skb, GFP_ATOMIC); if (!skb) return; skb->protocol = htons(ETH_P_IPV6); serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_ICMP6; serr->ee.ee_type = icmph->icmp6_type; serr->ee.ee_code = icmph->icmp6_code; serr->ee.ee_pad = 0; serr->ee.ee_info = info; serr->ee.ee_data = 0; serr->addr_offset = (u8 *)&(((struct ipv6hdr *)(icmph + 1))->daddr) - skb_network_header(skb); serr->port = port; __skb_pull(skb, payload - skb->data); if (inet6_test_bit(RECVERR6_RFC4884, sk)) ipv6_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884); skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb)) kfree_skb(skb); } EXPORT_SYMBOL_GPL(ipv6_icmp_error); void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info) { struct sock_exterr_skb *serr; struct ipv6hdr *iph; struct sk_buff *skb; if (!inet6_test_bit(RECVERR6, sk)) return; skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); if (!skb) return; skb->protocol = htons(ETH_P_IPV6); skb_put(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); iph = ipv6_hdr(skb); iph->daddr = fl6->daddr; ip6_flow_hdr(iph, 0, 0); serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_type = 0; serr->ee.ee_code = 0; serr->ee.ee_pad = 0; serr->ee.ee_info = info; serr->ee.ee_data = 0; serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb); serr->port = fl6->fl6_dport; __skb_pull(skb, skb_tail_pointer(skb) - skb->data); skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb)) kfree_skb(skb); } void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6hdr *iph; struct sk_buff *skb; struct ip6_mtuinfo *mtu_info; if (!np->rxopt.bits.rxpmtu) return; skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); if (!skb) return; skb_put(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); iph = ipv6_hdr(skb); iph->daddr = fl6->daddr; mtu_info = IP6CBMTU(skb); mtu_info->ip6m_mtu = mtu; mtu_info->ip6m_addr.sin6_family = AF_INET6; mtu_info->ip6m_addr.sin6_port = 0; mtu_info->ip6m_addr.sin6_flowinfo = 0; mtu_info->ip6m_addr.sin6_scope_id = fl6->flowi6_oif; mtu_info->ip6m_addr.sin6_addr = ipv6_hdr(skb)->daddr; __skb_pull(skb, skb_tail_pointer(skb) - skb->data); skb_reset_transport_header(skb); skb = xchg(&np->rxpmtu, skb); kfree_skb(skb); } /* For some errors we have valid addr_offset even with zero payload and * zero port. Also, addr_offset should be supported if port is set. */ static inline bool ipv6_datagram_support_addr(struct sock_exterr_skb *serr) { return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6 || serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port; } /* IPv6 supports cmsg on all origins aside from SO_EE_ORIGIN_LOCAL. * * At one point, excluding local errors was a quick test to identify icmp/icmp6 * errors. This is no longer true, but the test remained, so the v6 stack, * unlike v4, also honors cmsg requests on all wifi and timestamp errors. */ static bool ip6_datagram_support_cmsg(struct sk_buff *skb, struct sock_exterr_skb *serr) { if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6) return true; if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL) return false; if (!IP6CB(skb)->iif) return false; return true; } /* * Handle MSG_ERRQUEUE */ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); struct sock_exterr_skb *serr; struct sk_buff *skb; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name); struct { struct sock_extended_err ee; struct sockaddr_in6 offender; } errhdr; int err; int copied; err = -EAGAIN; skb = sock_dequeue_err_skb(sk); if (!skb) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (unlikely(err)) { kfree_skb(skb); return err; } sock_recv_timestamp(msg, sk, skb); serr = SKB_EXT_ERR(skb); if (sin && ipv6_datagram_support_addr(serr)) { const unsigned char *nh = skb_network_header(skb); sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_port = serr->port; if (skb->protocol == htons(ETH_P_IPV6)) { const struct ipv6hdr *ip6h = container_of((struct in6_addr *)(nh + serr->addr_offset), struct ipv6hdr, daddr); sin->sin6_addr = ip6h->daddr; if (inet6_test_bit(SNDFLOW, sk)) sin->sin6_flowinfo = ip6_flowinfo(ip6h); sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, IP6CB(skb)->iif); } else { ipv6_addr_set_v4mapped(*(__be32 *)(nh + serr->addr_offset), &sin->sin6_addr); sin->sin6_scope_id = 0; } *addr_len = sizeof(*sin); } memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); sin = &errhdr.offender; memset(sin, 0, sizeof(*sin)); if (ip6_datagram_support_cmsg(skb, serr)) { sin->sin6_family = AF_INET6; if (np->rxopt.all) ip6_datagram_recv_common_ctl(sk, msg, skb); if (skb->protocol == htons(ETH_P_IPV6)) { sin->sin6_addr = ipv6_hdr(skb)->saddr; if (np->rxopt.all) ip6_datagram_recv_specific_ctl(sk, msg, skb); sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, IP6CB(skb)->iif); } else { ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &sin->sin6_addr); if (inet_cmsg_flags(inet_sk(sk))) ip_cmsg_recv(msg, skb); } } put_cmsg(msg, SOL_IPV6, IPV6_RECVERR, sizeof(errhdr), &errhdr); /* Now we could try to dump offended packet options */ msg->msg_flags |= MSG_ERRQUEUE; err = copied; consume_skb(skb); out: return err; } EXPORT_SYMBOL_GPL(ipv6_recv_error); /* * Handle IPV6_RECVPATHMTU */ int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff *skb; struct ip6_mtuinfo mtu_info; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name); int err; int copied; err = -EAGAIN; skb = xchg(&np->rxpmtu, NULL); if (!skb) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; sock_recv_timestamp(msg, sk, skb); memcpy(&mtu_info, IP6CBMTU(skb), sizeof(mtu_info)); if (sin) { sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_port = 0; sin->sin6_scope_id = mtu_info.ip6m_addr.sin6_scope_id; sin->sin6_addr = mtu_info.ip6m_addr.sin6_addr; *addr_len = sizeof(*sin); } put_cmsg(msg, SOL_IPV6, IPV6_PATHMTU, sizeof(mtu_info), &mtu_info); err = copied; out_free_skb: kfree_skb(skb); out: return err; } void ip6_datagram_recv_common_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); bool is_ipv6 = skb->protocol == htons(ETH_P_IPV6); if (np->rxopt.bits.rxinfo) { struct in6_pktinfo src_info; if (is_ipv6) { src_info.ipi6_ifindex = IP6CB(skb)->iif; src_info.ipi6_addr = ipv6_hdr(skb)->daddr; } else { src_info.ipi6_ifindex = PKTINFO_SKB_CB(skb)->ipi_ifindex; ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr, &src_info.ipi6_addr); } if (src_info.ipi6_ifindex >= 0) put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } } void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); struct inet6_skb_parm *opt = IP6CB(skb); unsigned char *nh = skb_network_header(skb); if (np->rxopt.bits.rxhlim) { int hlim = ipv6_hdr(skb)->hop_limit; put_cmsg(msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxtclass) { int tclass = ipv6_get_dsfield(ipv6_hdr(skb)); put_cmsg(msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); } if (np->rxopt.bits.rxflow) { __be32 flowinfo = ip6_flowinfo((struct ipv6hdr *)nh); if (flowinfo) put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); } /* HbH is allowed only once */ if (np->rxopt.bits.hopopts && (opt->flags & IP6SKB_HOPBYHOP)) { u8 *ptr = nh + sizeof(struct ipv6hdr); put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr); } if (opt->lastopt && (np->rxopt.bits.dstopts || np->rxopt.bits.srcrt)) { /* * Silly enough, but we need to reparse in order to * report extension headers (except for HbH) * in order. * * Also note that IPV6_RECVRTHDRDSTOPTS is NOT * (and WILL NOT be) defined because * IPV6_RECVDSTOPTS is more generic. --yoshfuji */ unsigned int off = sizeof(struct ipv6hdr); u8 nexthdr = ipv6_hdr(skb)->nexthdr; while (off <= opt->lastopt) { unsigned int len; u8 *ptr = nh + off; switch (nexthdr) { case IPPROTO_DSTOPTS: nexthdr = ptr[0]; len = (ptr[1] + 1) << 3; if (np->rxopt.bits.dstopts) put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, len, ptr); break; case IPPROTO_ROUTING: nexthdr = ptr[0]; len = (ptr[1] + 1) << 3; if (np->rxopt.bits.srcrt) put_cmsg(msg, SOL_IPV6, IPV6_RTHDR, len, ptr); break; case IPPROTO_AH: nexthdr = ptr[0]; len = (ptr[1] + 2) << 2; break; default: nexthdr = ptr[0]; len = (ptr[1] + 1) << 3; break; } off += len; } } /* socket options in old style */ if (np->rxopt.bits.rxoinfo) { struct in6_pktinfo src_info; src_info.ipi6_ifindex = opt->iif; src_info.ipi6_addr = ipv6_hdr(skb)->daddr; put_cmsg(msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { int hlim = ipv6_hdr(skb)->hop_limit; put_cmsg(msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.ohopopts && (opt->flags & IP6SKB_HOPBYHOP)) { u8 *ptr = nh + sizeof(struct ipv6hdr); put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr); } if (np->rxopt.bits.odstopts && opt->dst0) { u8 *ptr = nh + opt->dst0; put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); } if (np->rxopt.bits.osrcrt && opt->srcrt) { struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(nh + opt->srcrt); put_cmsg(msg, SOL_IPV6, IPV6_2292RTHDR, (rthdr->hdrlen+1) << 3, rthdr); } if (np->rxopt.bits.odstopts && opt->dst1) { u8 *ptr = nh + opt->dst1; put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); } if (np->rxopt.bits.rxorigdstaddr) { struct sockaddr_in6 sin6; __be16 _ports[2], *ports; ports = skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_ports), &_ports); if (ports) { /* All current transport protocols have the port numbers in the * first four bytes of the transport header and this function is * written with this assumption in mind. */ sin6.sin6_family = AF_INET6; sin6.sin6_addr = ipv6_hdr(skb)->daddr; sin6.sin6_port = ports[1]; sin6.sin6_flowinfo = 0; sin6.sin6_scope_id = ipv6_iface_scope_id(&ipv6_hdr(skb)->daddr, opt->iif); put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6); } } if (np->rxopt.bits.recvfragsize && opt->frag_max_size) { int val = opt->frag_max_size; put_cmsg(msg, SOL_IPV6, IPV6_RECVFRAGSIZE, sizeof(val), &val); } } void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { ip6_datagram_recv_common_ctl(sk, msg, skb); ip6_datagram_recv_specific_ctl(sk, msg, skb); } EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl); int ip6_datagram_send_ctl(struct net *net, struct sock *sk, struct msghdr *msg, struct flowi6 *fl6, struct ipcm6_cookie *ipc6) { struct in6_pktinfo *src_info; struct cmsghdr *cmsg; struct ipv6_rt_hdr *rthdr; struct ipv6_opt_hdr *hdr; struct ipv6_txoptions *opt = ipc6->opt; int len; int err = 0; for_each_cmsghdr(cmsg, msg) { int addr_type; if (!CMSG_OK(msg, cmsg)) { err = -EINVAL; goto exit_f; } if (cmsg->cmsg_level == SOL_SOCKET) { err = __sock_cmsg_send(sk, cmsg, &ipc6->sockc); if (err) return err; continue; } if (cmsg->cmsg_level != SOL_IPV6) continue; switch (cmsg->cmsg_type) { case IPV6_PKTINFO: case IPV6_2292PKTINFO: { struct net_device *dev = NULL; int src_idx; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) { err = -EINVAL; goto exit_f; } src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); src_idx = src_info->ipi6_ifindex; if (src_idx) { if (fl6->flowi6_oif && src_idx != fl6->flowi6_oif && (READ_ONCE(sk->sk_bound_dev_if) != fl6->flowi6_oif || !sk_dev_equal_l3scope(sk, src_idx))) return -EINVAL; fl6->flowi6_oif = src_idx; } addr_type = __ipv6_addr_type(&src_info->ipi6_addr); rcu_read_lock(); if (fl6->flowi6_oif) { dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); if (!dev) { rcu_read_unlock(); return -ENODEV; } } else if (addr_type & IPV6_ADDR_LINKLOCAL) { rcu_read_unlock(); return -EINVAL; } if (addr_type != IPV6_ADDR_ANY) { int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL; if (!ipv6_can_nonlocal_bind(net, inet_sk(sk)) && !ipv6_chk_addr_and_flags(net, &src_info->ipi6_addr, dev, !strict, 0, IFA_F_TENTATIVE) && !ipv6_chk_acast_addr_src(net, dev, &src_info->ipi6_addr)) err = -EINVAL; else fl6->saddr = src_info->ipi6_addr; } rcu_read_unlock(); if (err) goto exit_f; break; } case IPV6_FLOWINFO: if (cmsg->cmsg_len < CMSG_LEN(4)) { err = -EINVAL; goto exit_f; } if (fl6->flowlabel&IPV6_FLOWINFO_MASK) { if ((fl6->flowlabel^*(__be32 *)CMSG_DATA(cmsg))&~IPV6_FLOWINFO_MASK) { err = -EINVAL; goto exit_f; } } fl6->flowlabel = IPV6_FLOWINFO_MASK & *(__be32 *)CMSG_DATA(cmsg); break; case IPV6_2292HOPOPTS: case IPV6_HOPOPTS: if (opt->hopopt || cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { err = -EINVAL; goto exit_f; } hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); len = ((hdr->hdrlen + 1) << 3); if (cmsg->cmsg_len < CMSG_LEN(len)) { err = -EINVAL; goto exit_f; } if (!ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; goto exit_f; } opt->opt_nflen += len; opt->hopopt = hdr; break; case IPV6_2292DSTOPTS: if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { err = -EINVAL; goto exit_f; } hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); len = ((hdr->hdrlen + 1) << 3); if (cmsg->cmsg_len < CMSG_LEN(len)) { err = -EINVAL; goto exit_f; } if (!ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; goto exit_f; } if (opt->dst1opt) { err = -EINVAL; goto exit_f; } opt->opt_flen += len; opt->dst1opt = hdr; break; case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { err = -EINVAL; goto exit_f; } hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); len = ((hdr->hdrlen + 1) << 3); if (cmsg->cmsg_len < CMSG_LEN(len)) { err = -EINVAL; goto exit_f; } if (!ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; goto exit_f; } if (cmsg->cmsg_type == IPV6_DSTOPTS) { opt->opt_flen += len; opt->dst1opt = hdr; } else { opt->opt_nflen += len; opt->dst0opt = hdr; } break; case IPV6_2292RTHDR: case IPV6_RTHDR: if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_rt_hdr))) { err = -EINVAL; goto exit_f; } rthdr = (struct ipv6_rt_hdr *)CMSG_DATA(cmsg); switch (rthdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (rthdr->hdrlen != 2 || rthdr->segments_left != 1) { err = -EINVAL; goto exit_f; } break; #endif default: err = -EINVAL; goto exit_f; } len = ((rthdr->hdrlen + 1) << 3); if (cmsg->cmsg_len < CMSG_LEN(len)) { err = -EINVAL; goto exit_f; } /* segments left must also match */ if ((rthdr->hdrlen >> 1) != rthdr->segments_left) { err = -EINVAL; goto exit_f; } opt->opt_nflen += len; opt->srcrt = rthdr; if (cmsg->cmsg_type == IPV6_2292RTHDR && opt->dst1opt) { int dsthdrlen = ((opt->dst1opt->hdrlen+1)<<3); opt->opt_nflen += dsthdrlen; opt->dst0opt = opt->dst1opt; opt->dst1opt = NULL; opt->opt_flen -= dsthdrlen; } break; case IPV6_2292HOPLIMIT: case IPV6_HOPLIMIT: if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) { err = -EINVAL; goto exit_f; } ipc6->hlimit = *(int *)CMSG_DATA(cmsg); if (ipc6->hlimit < -1 || ipc6->hlimit > 0xff) { err = -EINVAL; goto exit_f; } break; case IPV6_TCLASS: { int tc; err = -EINVAL; if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) goto exit_f; tc = *(int *)CMSG_DATA(cmsg); if (tc < -1 || tc > 0xff) goto exit_f; err = 0; ipc6->tclass = tc; break; } case IPV6_DONTFRAG: { int df; err = -EINVAL; if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) goto exit_f; df = *(int *)CMSG_DATA(cmsg); if (df < 0 || df > 1) goto exit_f; err = 0; ipc6->dontfrag = df; break; } default: net_dbg_ratelimited("invalid cmsg type: %d\n", cmsg->cmsg_type); err = -EINVAL; goto exit_f; } } exit_f: return err; } EXPORT_SYMBOL_GPL(ip6_datagram_send_ctl); void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, __u16 srcp, __u16 destp, int rqueue, int bucket) { const struct in6_addr *dest, *src; dest = &sp->sk_v6_daddr; src = &sp->sk_v6_rcv_saddr; seq_printf(seq, "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u\n", bucket, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, sp->sk_state, sk_wmem_alloc_get(sp), rqueue, 0, 0L, 0, from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); } |
| 25 9 17 25 25 25 25 25 25 24 25 20 20 4 20 20 20 15 14 14 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 | // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/types.h> #include <linux/ipv6.h> #include <linux/in6.h> #include <linux/netfilter.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/icmp.h> #include <linux/rcupdate.h> #include <linux/sysctl.h> #include <net/ipv6_frag.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_bridge.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> #endif #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> static DEFINE_MUTEX(defrag6_mutex); static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, struct sk_buff *skb) { u16 zone_id = NF_CT_DEFAULT_ZONE_ID; #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (skb_nfct(skb)) { enum ip_conntrack_info ctinfo; const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo)); } #endif if (nf_bridge_in_prerouting(skb)) return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id; if (hooknum == NF_INET_PRE_ROUTING) return IP6_DEFRAG_CONNTRACK_IN + zone_id; else return IP6_DEFRAG_CONNTRACK_OUT + zone_id; } static unsigned int ipv6_defrag(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { int err; #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* Previously seen (loopback)? */ if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb))) return NF_ACCEPT; if (skb->_nfct == IP_CT_UNTRACKED) return NF_ACCEPT; #endif err = nf_ct_frag6_gather(state->net, skb, nf_ct6_defrag_user(state->hook, skb)); /* queued */ if (err == -EINPROGRESS) return NF_STOLEN; return err == 0 ? NF_ACCEPT : NF_DROP; } static const struct nf_hook_ops ipv6_defrag_ops[] = { { .hook = ipv6_defrag, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, }, { .hook = ipv6_defrag, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, }, }; static void __net_exit defrag6_net_exit(struct net *net) { if (net->nf.defrag_ipv6_users) { nf_unregister_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); net->nf.defrag_ipv6_users = 0; } } static const struct nf_defrag_hook defrag_hook = { .owner = THIS_MODULE, .enable = nf_defrag_ipv6_enable, .disable = nf_defrag_ipv6_disable, }; static struct pernet_operations defrag6_net_ops = { .exit = defrag6_net_exit, }; static int __init nf_defrag_init(void) { int ret = 0; ret = nf_ct_frag6_init(); if (ret < 0) { pr_err("nf_defrag_ipv6: can't initialize frag6.\n"); return ret; } ret = register_pernet_subsys(&defrag6_net_ops); if (ret < 0) { pr_err("nf_defrag_ipv6: can't register pernet ops\n"); goto cleanup_frag6; } rcu_assign_pointer(nf_defrag_v6_hook, &defrag_hook); return ret; cleanup_frag6: nf_ct_frag6_cleanup(); return ret; } static void __exit nf_defrag_fini(void) { rcu_assign_pointer(nf_defrag_v6_hook, NULL); unregister_pernet_subsys(&defrag6_net_ops); nf_ct_frag6_cleanup(); } int nf_defrag_ipv6_enable(struct net *net) { int err = 0; mutex_lock(&defrag6_mutex); if (net->nf.defrag_ipv6_users == UINT_MAX) { err = -EOVERFLOW; goto out_unlock; } if (net->nf.defrag_ipv6_users) { net->nf.defrag_ipv6_users++; goto out_unlock; } err = nf_register_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); if (err == 0) net->nf.defrag_ipv6_users = 1; out_unlock: mutex_unlock(&defrag6_mutex); return err; } EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable); void nf_defrag_ipv6_disable(struct net *net) { mutex_lock(&defrag6_mutex); if (net->nf.defrag_ipv6_users) { net->nf.defrag_ipv6_users--; if (net->nf.defrag_ipv6_users == 0) nf_unregister_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); } mutex_unlock(&defrag6_mutex); } EXPORT_SYMBOL_GPL(nf_defrag_ipv6_disable); module_init(nf_defrag_init); module_exit(nf_defrag_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IPv6 defragmentation support"); |
| 86 51 86 1 2 151 9 220 98 74 97 37 97 1 1 149 98 134 98 150 53 53 38 27 8 8 24 149 3 73 46 46 170 35 41 46 35 21 46 2 2 337 1 1 1 68 68 123 237 237 4 4 177 39 200 43 12 98 111 13 1 9 1 1 4 4 1 15 15 15 15 13 55 15 15 15 16 15 14 71 238 34 29 29 1 1 1 1 1 1 1 7 8 7 2 9 4 4 29 2 1 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_SCHED_GENERIC_H #define __NET_SCHED_GENERIC_H #include <linux/netdevice.h> #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/pkt_sched.h> #include <linux/pkt_cls.h> #include <linux/percpu.h> #include <linux/dynamic_queue_limits.h> #include <linux/list.h> #include <linux/refcount.h> #include <linux/workqueue.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/atomic.h> #include <linux/hashtable.h> #include <net/gen_stats.h> #include <net/rtnetlink.h> #include <net/flow_offload.h> #include <linux/xarray.h> struct Qdisc_ops; struct qdisc_walker; struct tcf_walker; struct module; struct bpf_flow_keys; struct qdisc_rate_table { struct tc_ratespec rate; u32 data[256]; struct qdisc_rate_table *next; int refcnt; }; enum qdisc_state_t { __QDISC_STATE_SCHED, __QDISC_STATE_DEACTIVATED, __QDISC_STATE_MISSED, __QDISC_STATE_DRAINING, }; enum qdisc_state2_t { /* Only for !TCQ_F_NOLOCK qdisc. Never access it directly. * Use qdisc_run_begin/end() or qdisc_is_running() instead. */ __QDISC_STATE2_RUNNING, }; #define QDISC_STATE_MISSED BIT(__QDISC_STATE_MISSED) #define QDISC_STATE_DRAINING BIT(__QDISC_STATE_DRAINING) #define QDISC_STATE_NON_EMPTY (QDISC_STATE_MISSED | \ QDISC_STATE_DRAINING) struct qdisc_size_table { struct rcu_head rcu; struct list_head list; struct tc_sizespec szopts; int refcnt; u16 data[]; }; /* similar to sk_buff_head, but skb->prev pointer is undefined. */ struct qdisc_skb_head { struct sk_buff *head; struct sk_buff *tail; __u32 qlen; spinlock_t lock; }; struct Qdisc { int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free); struct sk_buff * (*dequeue)(struct Qdisc *sch); unsigned int flags; #define TCQ_F_BUILTIN 1 #define TCQ_F_INGRESS 2 #define TCQ_F_CAN_BYPASS 4 #define TCQ_F_MQROOT 8 #define TCQ_F_ONETXQUEUE 0x10 /* dequeue_skb() can assume all skbs are for * q->dev_queue : It can test * netif_xmit_frozen_or_stopped() before * dequeueing next packet. * Its true for MQ/MQPRIO slaves, or non * multiqueue device. */ #define TCQ_F_WARN_NONWC (1 << 16) #define TCQ_F_CPUSTATS 0x20 /* run using percpu statistics */ #define TCQ_F_NOPARENT 0x40 /* root of its hierarchy : * qdisc_tree_decrease_qlen() should stop. */ #define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */ #define TCQ_F_NOLOCK 0x100 /* qdisc does not require locking */ #define TCQ_F_OFFLOADED 0x200 /* qdisc is offloaded to HW */ u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; struct hlist_node hash; u32 handle; u32 parent; struct netdev_queue *dev_queue; struct net_rate_estimator __rcu *rate_est; struct gnet_stats_basic_sync __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; int pad; refcount_t refcnt; /* * For performance sake on SMP, we put highly modified fields at the end */ struct sk_buff_head gso_skb ____cacheline_aligned_in_smp; struct qdisc_skb_head q; struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; int owner; unsigned long state; unsigned long state2; /* must be written under qdisc spinlock */ struct Qdisc *next_sched; struct sk_buff_head skb_bad_txq; spinlock_t busylock ____cacheline_aligned_in_smp; spinlock_t seqlock; struct rcu_head rcu; netdevice_tracker dev_tracker; struct lock_class_key root_lock_key; /* private data */ long privdata[] ____cacheline_aligned; }; static inline void qdisc_refcount_inc(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return; refcount_inc(&qdisc->refcnt); } static inline bool qdisc_refcount_dec_if_one(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return true; return refcount_dec_if_one(&qdisc->refcnt); } /* Intended to be used by unlocked users, when concurrent qdisc release is * possible. */ static inline struct Qdisc *qdisc_refcount_inc_nz(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return qdisc; if (refcount_inc_not_zero(&qdisc->refcnt)) return qdisc; return NULL; } /* For !TCQ_F_NOLOCK qdisc: callers must either call this within a qdisc * root_lock section, or provide their own memory barriers -- ordering * against qdisc_run_begin/end() atomic bit operations. */ static inline bool qdisc_is_running(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) return spin_is_locked(&qdisc->seqlock); return test_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); } static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc) { return !(READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY); } static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) { return q->flags & TCQ_F_CPUSTATS; } static inline bool qdisc_is_empty(const struct Qdisc *qdisc) { if (qdisc_is_percpu_stats(qdisc)) return nolock_qdisc_is_empty(qdisc); return !READ_ONCE(qdisc->q.qlen); } /* For !TCQ_F_NOLOCK qdisc, qdisc_run_begin/end() must be invoked with * the qdisc root lock acquired. */ static inline bool qdisc_run_begin(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) { if (spin_trylock(&qdisc->seqlock)) return true; /* No need to insist if the MISSED flag was already set. * Note that test_and_set_bit() also gives us memory ordering * guarantees wrt potential earlier enqueue() and below * spin_trylock(), both of which are necessary to prevent races */ if (test_and_set_bit(__QDISC_STATE_MISSED, &qdisc->state)) return false; /* Try to take the lock again to make sure that we will either * grab it or the CPU that still has it will see MISSED set * when testing it in qdisc_run_end() */ return spin_trylock(&qdisc->seqlock); } return !__test_and_set_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); } static inline void qdisc_run_end(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) { spin_unlock(&qdisc->seqlock); /* spin_unlock() only has store-release semantic. The unlock * and test_bit() ordering is a store-load ordering, so a full * memory barrier is needed here. */ smp_mb(); if (unlikely(test_bit(__QDISC_STATE_MISSED, &qdisc->state))) __netif_schedule(qdisc); } else { __clear_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); } } static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) { return qdisc->flags & TCQ_F_ONETXQUEUE; } static inline int qdisc_avail_bulklimit(const struct netdev_queue *txq) { return netdev_queue_dql_avail(txq); } struct Qdisc_class_ops { unsigned int flags; /* Child qdisc manipulation */ struct netdev_queue * (*select_queue)(struct Qdisc *, struct tcmsg *); int (*graft)(struct Qdisc *, unsigned long cl, struct Qdisc *, struct Qdisc **, struct netlink_ext_ack *extack); struct Qdisc * (*leaf)(struct Qdisc *, unsigned long cl); void (*qlen_notify)(struct Qdisc *, unsigned long); /* Class manipulation routines */ unsigned long (*find)(struct Qdisc *, u32 classid); int (*change)(struct Qdisc *, u32, u32, struct nlattr **, unsigned long *, struct netlink_ext_ack *); int (*delete)(struct Qdisc *, unsigned long, struct netlink_ext_ack *); void (*walk)(struct Qdisc *, struct qdisc_walker * arg); /* Filter manipulation */ struct tcf_block * (*tcf_block)(struct Qdisc *sch, unsigned long arg, struct netlink_ext_ack *extack); unsigned long (*bind_tcf)(struct Qdisc *, unsigned long, u32 classid); void (*unbind_tcf)(struct Qdisc *, unsigned long); /* rtnetlink specific */ int (*dump)(struct Qdisc *, unsigned long, struct sk_buff *skb, struct tcmsg*); int (*dump_stats)(struct Qdisc *, unsigned long, struct gnet_dump *); }; /* Qdisc_class_ops flag values */ /* Implements API that doesn't require rtnl lock */ enum qdisc_class_ops_flags { QDISC_CLASS_OPS_DOIT_UNLOCKED = 1, }; struct Qdisc_ops { struct Qdisc_ops *next; const struct Qdisc_class_ops *cl_ops; char id[IFNAMSIZ]; int priv_size; unsigned int static_flags; int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free); struct sk_buff * (*dequeue)(struct Qdisc *); struct sk_buff * (*peek)(struct Qdisc *); int (*init)(struct Qdisc *sch, struct nlattr *arg, struct netlink_ext_ack *extack); void (*reset)(struct Qdisc *); void (*destroy)(struct Qdisc *); int (*change)(struct Qdisc *sch, struct nlattr *arg, struct netlink_ext_ack *extack); void (*attach)(struct Qdisc *sch); int (*change_tx_queue_len)(struct Qdisc *, unsigned int); void (*change_real_num_tx)(struct Qdisc *sch, unsigned int new_real_tx); int (*dump)(struct Qdisc *, struct sk_buff *); int (*dump_stats)(struct Qdisc *, struct gnet_dump *); void (*ingress_block_set)(struct Qdisc *sch, u32 block_index); void (*egress_block_set)(struct Qdisc *sch, u32 block_index); u32 (*ingress_block_get)(struct Qdisc *sch); u32 (*egress_block_get)(struct Qdisc *sch); struct module *owner; }; struct tcf_result { union { struct { unsigned long class; u32 classid; }; const struct tcf_proto *goto_tp; }; }; struct tcf_chain; struct tcf_proto_ops { struct list_head head; char kind[IFNAMSIZ]; int (*classify)(struct sk_buff *, const struct tcf_proto *, struct tcf_result *); int (*init)(struct tcf_proto*); void (*destroy)(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack); void* (*get)(struct tcf_proto*, u32 handle); void (*put)(struct tcf_proto *tp, void *f); int (*change)(struct net *net, struct sk_buff *, struct tcf_proto*, unsigned long, u32 handle, struct nlattr **, void **, u32, struct netlink_ext_ack *); int (*delete)(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, struct netlink_ext_ack *); bool (*delete_empty)(struct tcf_proto *tp); void (*walk)(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held); int (*reoffload)(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack); void (*hw_add)(struct tcf_proto *tp, void *type_data); void (*hw_del)(struct tcf_proto *tp, void *type_data); void (*bind_class)(void *, u32, unsigned long, void *, unsigned long); void * (*tmplt_create)(struct net *net, struct tcf_chain *chain, struct nlattr **tca, struct netlink_ext_ack *extack); void (*tmplt_destroy)(void *tmplt_priv); void (*tmplt_reoffload)(struct tcf_chain *chain, bool add, flow_setup_cb_t *cb, void *cb_priv); struct tcf_exts * (*get_exts)(const struct tcf_proto *tp, u32 handle); /* rtnetlink specific */ int (*dump)(struct net*, struct tcf_proto*, void *, struct sk_buff *skb, struct tcmsg*, bool); int (*terse_dump)(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t, bool rtnl_held); int (*tmplt_dump)(struct sk_buff *skb, struct net *net, void *tmplt_priv); struct module *owner; int flags; }; /* Classifiers setting TCF_PROTO_OPS_DOIT_UNLOCKED in tcf_proto_ops->flags * are expected to implement tcf_proto_ops->delete_empty(), otherwise race * conditions can occur when filters are inserted/deleted simultaneously. */ enum tcf_proto_ops_flags { TCF_PROTO_OPS_DOIT_UNLOCKED = 1, }; struct tcf_proto { /* Fast access part */ struct tcf_proto __rcu *next; void __rcu *root; /* called under RCU BH lock*/ int (*classify)(struct sk_buff *, const struct tcf_proto *, struct tcf_result *); __be16 protocol; /* All the rest */ u32 prio; void *data; const struct tcf_proto_ops *ops; struct tcf_chain *chain; /* Lock protects tcf_proto shared state and can be used by unlocked * classifiers to protect their private data. */ spinlock_t lock; bool deleting; bool counted; refcount_t refcnt; struct rcu_head rcu; struct hlist_node destroy_ht_node; }; struct qdisc_skb_cb { struct { unsigned int pkt_len; u16 slave_dev_queue_mapping; u16 tc_classid; }; #define QDISC_CB_PRIV_LEN 20 unsigned char data[QDISC_CB_PRIV_LEN]; }; typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv); struct tcf_chain { /* Protects filter_chain. */ struct mutex filter_chain_lock; struct tcf_proto __rcu *filter_chain; struct list_head list; struct tcf_block *block; u32 index; /* chain index */ unsigned int refcnt; unsigned int action_refcnt; bool explicitly_created; bool flushing; const struct tcf_proto_ops *tmplt_ops; void *tmplt_priv; struct rcu_head rcu; }; struct tcf_block { struct xarray ports; /* datapath accessible */ /* Lock protects tcf_block and lifetime-management data of chains * attached to the block (refcnt, action_refcnt, explicitly_created). */ struct mutex lock; struct list_head chain_list; u32 index; /* block index for shared blocks */ u32 classid; /* which class this block belongs to */ refcount_t refcnt; struct net *net; struct Qdisc *q; struct rw_semaphore cb_lock; /* protects cb_list and offload counters */ struct flow_block flow_block; struct list_head owner_list; bool keep_dst; bool bypass_wanted; atomic_t filtercnt; /* Number of filters */ atomic_t skipswcnt; /* Number of skip_sw filters */ atomic_t offloadcnt; /* Number of oddloaded filters */ unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */ unsigned int lockeddevcnt; /* Number of devs that require rtnl lock. */ struct { struct tcf_chain *chain; struct list_head filter_chain_list; } chain0; struct rcu_head rcu; DECLARE_HASHTABLE(proto_destroy_ht, 7); struct mutex proto_destroy_lock; /* Lock for proto_destroy hashtable. */ }; struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index); static inline bool lockdep_tcf_chain_is_locked(struct tcf_chain *chain) { return lockdep_is_held(&chain->filter_chain_lock); } static inline bool lockdep_tcf_proto_is_locked(struct tcf_proto *tp) { return lockdep_is_held(&tp->lock); } #define tcf_chain_dereference(p, chain) \ rcu_dereference_protected(p, lockdep_tcf_chain_is_locked(chain)) #define tcf_proto_dereference(p, tp) \ rcu_dereference_protected(p, lockdep_tcf_proto_is_locked(tp)) static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) { struct qdisc_skb_cb *qcb; BUILD_BUG_ON(sizeof(skb->cb) < sizeof(*qcb)); BUILD_BUG_ON(sizeof(qcb->data) < sz); } static inline int qdisc_qlen(const struct Qdisc *q) { return q->q.qlen; } static inline int qdisc_qlen_sum(const struct Qdisc *q) { __u32 qlen = q->qstats.qlen; int i; if (qdisc_is_percpu_stats(q)) { for_each_possible_cpu(i) qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen; } else { qlen += q->q.qlen; } return qlen; } static inline struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb) { return (struct qdisc_skb_cb *)skb->cb; } static inline spinlock_t *qdisc_lock(struct Qdisc *qdisc) { return &qdisc->q.lock; } static inline struct Qdisc *qdisc_root(const struct Qdisc *qdisc) { struct Qdisc *q = rcu_dereference_rtnl(qdisc->dev_queue->qdisc); return q; } static inline struct Qdisc *qdisc_root_bh(const struct Qdisc *qdisc) { return rcu_dereference_bh(qdisc->dev_queue->qdisc); } static inline struct Qdisc *qdisc_root_sleeping(const struct Qdisc *qdisc) { return rcu_dereference_rtnl(qdisc->dev_queue->qdisc_sleeping); } static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc) { struct Qdisc *root = qdisc_root_sleeping(qdisc); ASSERT_RTNL(); return qdisc_lock(root); } static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc) { return qdisc->dev_queue->dev; } static inline void sch_tree_lock(struct Qdisc *q) { if (q->flags & TCQ_F_MQROOT) spin_lock_bh(qdisc_lock(q)); else spin_lock_bh(qdisc_root_sleeping_lock(q)); } static inline void sch_tree_unlock(struct Qdisc *q) { if (q->flags & TCQ_F_MQROOT) spin_unlock_bh(qdisc_lock(q)); else spin_unlock_bh(qdisc_root_sleeping_lock(q)); } extern struct Qdisc noop_qdisc; extern struct Qdisc_ops noop_qdisc_ops; extern struct Qdisc_ops pfifo_fast_ops; extern const u8 sch_default_prio2band[TC_PRIO_MAX + 1]; extern struct Qdisc_ops mq_qdisc_ops; extern struct Qdisc_ops noqueue_qdisc_ops; extern const struct Qdisc_ops *default_qdisc_ops; static inline const struct Qdisc_ops * get_default_qdisc_ops(const struct net_device *dev, int ntx) { return ntx < dev->real_num_tx_queues ? default_qdisc_ops : &pfifo_fast_ops; } struct Qdisc_class_common { u32 classid; unsigned int filter_cnt; struct hlist_node hnode; }; struct Qdisc_class_hash { struct hlist_head *hash; unsigned int hashsize; unsigned int hashmask; unsigned int hashelems; }; static inline unsigned int qdisc_class_hash(u32 id, u32 mask) { id ^= id >> 8; id ^= id >> 4; return id & mask; } static inline struct Qdisc_class_common * qdisc_class_find(const struct Qdisc_class_hash *hash, u32 id) { struct Qdisc_class_common *cl; unsigned int h; if (!id) return NULL; h = qdisc_class_hash(id, hash->hashmask); hlist_for_each_entry(cl, &hash->hash[h], hnode) { if (cl->classid == id) return cl; } return NULL; } static inline bool qdisc_class_in_use(const struct Qdisc_class_common *cl) { return cl->filter_cnt > 0; } static inline void qdisc_class_get(struct Qdisc_class_common *cl) { unsigned int res; if (check_add_overflow(cl->filter_cnt, 1, &res)) WARN(1, "Qdisc class overflow"); cl->filter_cnt = res; } static inline void qdisc_class_put(struct Qdisc_class_common *cl) { unsigned int res; if (check_sub_overflow(cl->filter_cnt, 1, &res)) WARN(1, "Qdisc class underflow"); cl->filter_cnt = res; } static inline int tc_classid_to_hwtc(struct net_device *dev, u32 classid) { u32 hwtc = TC_H_MIN(classid) - TC_H_MIN_PRIORITY; return (hwtc < netdev_get_num_tc(dev)) ? hwtc : -EINVAL; } int qdisc_class_hash_init(struct Qdisc_class_hash *); void qdisc_class_hash_insert(struct Qdisc_class_hash *, struct Qdisc_class_common *); void qdisc_class_hash_remove(struct Qdisc_class_hash *, struct Qdisc_class_common *); void qdisc_class_hash_grow(struct Qdisc *, struct Qdisc_class_hash *); void qdisc_class_hash_destroy(struct Qdisc_class_hash *); int dev_qdisc_change_tx_queue_len(struct net_device *dev); void dev_qdisc_change_real_num_tx(struct net_device *dev, unsigned int new_real_tx); void dev_init_scheduler(struct net_device *dev); void dev_shutdown(struct net_device *dev); void dev_activate(struct net_device *dev); void dev_deactivate(struct net_device *dev); void dev_deactivate_many(struct list_head *head); struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, struct Qdisc *qdisc); void qdisc_reset(struct Qdisc *qdisc); void qdisc_destroy(struct Qdisc *qdisc); void qdisc_put(struct Qdisc *qdisc); void qdisc_put_unlocked(struct Qdisc *qdisc); void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, int n, int len); #ifdef CONFIG_NET_SCHED int qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type, void *type_data); void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch, struct Qdisc *new, struct Qdisc *old, enum tc_setup_type type, void *type_data, struct netlink_ext_ack *extack); #else static inline int qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type, void *type_data) { q->flags &= ~TCQ_F_OFFLOADED; return 0; } static inline void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch, struct Qdisc *new, struct Qdisc *old, enum tc_setup_type type, void *type_data, struct netlink_ext_ack *extack) { } #endif void qdisc_offload_query_caps(struct net_device *dev, enum tc_setup_type type, void *caps, size_t caps_len); struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, struct netlink_ext_ack *extack); void qdisc_free(struct Qdisc *qdisc); struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, u32 parentid, struct netlink_ext_ack *extack); void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab); int skb_do_redirect(struct sk_buff *); static inline bool skb_at_tc_ingress(const struct sk_buff *skb) { #ifdef CONFIG_NET_XGRESS return skb->tc_at_ingress; #else return false; #endif } static inline bool skb_skip_tc_classify(struct sk_buff *skb) { #ifdef CONFIG_NET_CLS_ACT if (skb->tc_skip_classify) { skb->tc_skip_classify = 0; return true; } #endif return false; } /* Reset all TX qdiscs greater than index of a device. */ static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i) { struct Qdisc *qdisc; for (; i < dev->num_tx_queues; i++) { qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc); if (qdisc) { spin_lock_bh(qdisc_lock(qdisc)); qdisc_reset(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); } } } /* Are all TX queues of the device empty? */ static inline bool qdisc_all_tx_empty(const struct net_device *dev) { unsigned int i; rcu_read_lock(); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); const struct Qdisc *q = rcu_dereference(txq->qdisc); if (!qdisc_is_empty(q)) { rcu_read_unlock(); return false; } } rcu_read_unlock(); return true; } /* Are any of the TX qdiscs changing? */ static inline bool qdisc_tx_changing(const struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); if (rcu_access_pointer(txq->qdisc) != rcu_access_pointer(txq->qdisc_sleeping)) return true; } return false; } /* Is the device using the noop qdisc on all queues? */ static inline bool qdisc_tx_is_noop(const struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); if (rcu_access_pointer(txq->qdisc) != &noop_qdisc) return false; } return true; } static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb) { return qdisc_skb_cb(skb)->pkt_len; } /* additional qdisc xmit flags (NET_XMIT_MASK in linux/netdevice.h) */ enum net_xmit_qdisc_t { __NET_XMIT_STOLEN = 0x00010000, __NET_XMIT_BYPASS = 0x00020000, }; #ifdef CONFIG_NET_CLS_ACT #define net_xmit_drop_count(e) ((e) & __NET_XMIT_STOLEN ? 0 : 1) #else #define net_xmit_drop_count(e) (1) #endif static inline void qdisc_calculate_pkt_len(struct sk_buff *skb, const struct Qdisc *sch) { #ifdef CONFIG_NET_SCHED struct qdisc_size_table *stab = rcu_dereference_bh(sch->stab); if (stab) __qdisc_calculate_pkt_len(skb, stab); #endif } static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { qdisc_calculate_pkt_len(skb, sch); return sch->enqueue(skb, sch, to_free); } static inline void _bstats_update(struct gnet_stats_basic_sync *bstats, __u64 bytes, __u32 packets) { u64_stats_update_begin(&bstats->syncp); u64_stats_add(&bstats->bytes, bytes); u64_stats_add(&bstats->packets, packets); u64_stats_update_end(&bstats->syncp); } static inline void bstats_update(struct gnet_stats_basic_sync *bstats, const struct sk_buff *skb) { _bstats_update(bstats, qdisc_pkt_len(skb), skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1); } static inline void qdisc_bstats_cpu_update(struct Qdisc *sch, const struct sk_buff *skb) { bstats_update(this_cpu_ptr(sch->cpu_bstats), skb); } static inline void qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff *skb) { bstats_update(&sch->bstats, skb); } static inline void qdisc_qstats_backlog_dec(struct Qdisc *sch, const struct sk_buff *skb) { sch->qstats.backlog -= qdisc_pkt_len(skb); } static inline void qdisc_qstats_cpu_backlog_dec(struct Qdisc *sch, const struct sk_buff *skb) { this_cpu_sub(sch->cpu_qstats->backlog, qdisc_pkt_len(skb)); } static inline void qdisc_qstats_backlog_inc(struct Qdisc *sch, const struct sk_buff *skb) { sch->qstats.backlog += qdisc_pkt_len(skb); } static inline void qdisc_qstats_cpu_backlog_inc(struct Qdisc *sch, const struct sk_buff *skb) { this_cpu_add(sch->cpu_qstats->backlog, qdisc_pkt_len(skb)); } static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch) { this_cpu_inc(sch->cpu_qstats->qlen); } static inline void qdisc_qstats_cpu_qlen_dec(struct Qdisc *sch) { this_cpu_dec(sch->cpu_qstats->qlen); } static inline void qdisc_qstats_cpu_requeues_inc(struct Qdisc *sch) { this_cpu_inc(sch->cpu_qstats->requeues); } static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count) { sch->qstats.drops += count; } static inline void qstats_drop_inc(struct gnet_stats_queue *qstats) { qstats->drops++; } static inline void qstats_overlimit_inc(struct gnet_stats_queue *qstats) { qstats->overlimits++; } static inline void qdisc_qstats_drop(struct Qdisc *sch) { qstats_drop_inc(&sch->qstats); } static inline void qdisc_qstats_cpu_drop(struct Qdisc *sch) { this_cpu_inc(sch->cpu_qstats->drops); } static inline void qdisc_qstats_overlimit(struct Qdisc *sch) { sch->qstats.overlimits++; } static inline int qdisc_qstats_copy(struct gnet_dump *d, struct Qdisc *sch) { __u32 qlen = qdisc_qlen_sum(sch); return gnet_stats_copy_queue(d, sch->cpu_qstats, &sch->qstats, qlen); } static inline void qdisc_qstats_qlen_backlog(struct Qdisc *sch, __u32 *qlen, __u32 *backlog) { struct gnet_stats_queue qstats = { 0 }; gnet_stats_add_queue(&qstats, sch->cpu_qstats, &sch->qstats); *qlen = qstats.qlen + qdisc_qlen(sch); *backlog = qstats.backlog; } static inline void qdisc_tree_flush_backlog(struct Qdisc *sch) { __u32 qlen, backlog; qdisc_qstats_qlen_backlog(sch, &qlen, &backlog); qdisc_tree_reduce_backlog(sch, qlen, backlog); } static inline void qdisc_purge_queue(struct Qdisc *sch) { __u32 qlen, backlog; qdisc_qstats_qlen_backlog(sch, &qlen, &backlog); qdisc_reset(sch); qdisc_tree_reduce_backlog(sch, qlen, backlog); } static inline void __qdisc_enqueue_tail(struct sk_buff *skb, struct qdisc_skb_head *qh) { struct sk_buff *last = qh->tail; if (last) { skb->next = NULL; last->next = skb; qh->tail = skb; } else { qh->tail = skb; qh->head = skb; } qh->qlen++; } static inline int qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch) { __qdisc_enqueue_tail(skb, &sch->q); qdisc_qstats_backlog_inc(sch, skb); return NET_XMIT_SUCCESS; } static inline void __qdisc_enqueue_head(struct sk_buff *skb, struct qdisc_skb_head *qh) { skb->next = qh->head; if (!qh->head) qh->tail = skb; qh->head = skb; qh->qlen++; } static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh) { struct sk_buff *skb = qh->head; if (likely(skb != NULL)) { qh->head = skb->next; qh->qlen--; if (qh->head == NULL) qh->tail = NULL; skb->next = NULL; } return skb; } static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) { struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); if (likely(skb != NULL)) { qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); } return skb; } struct tc_skb_cb { struct qdisc_skb_cb qdisc_cb; u32 drop_reason; u16 zone; /* Only valid if post_ct = true */ u16 mru; u8 post_ct:1; u8 post_ct_snat:1; u8 post_ct_dnat:1; }; static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) { struct tc_skb_cb *cb = (struct tc_skb_cb *)skb->cb; BUILD_BUG_ON(sizeof(*cb) > sizeof_field(struct sk_buff, cb)); return cb; } static inline enum skb_drop_reason tcf_get_drop_reason(const struct sk_buff *skb) { return tc_skb_cb(skb)->drop_reason; } static inline void tcf_set_drop_reason(const struct sk_buff *skb, enum skb_drop_reason reason) { tc_skb_cb(skb)->drop_reason = reason; } /* Instead of calling kfree_skb() while root qdisc lock is held, * queue the skb for future freeing at end of __dev_xmit_skb() */ static inline void __qdisc_drop(struct sk_buff *skb, struct sk_buff **to_free) { skb->next = *to_free; *to_free = skb; } static inline void __qdisc_drop_all(struct sk_buff *skb, struct sk_buff **to_free) { if (skb->prev) skb->prev->next = *to_free; else skb->next = *to_free; *to_free = skb; } static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch, struct qdisc_skb_head *qh, struct sk_buff **to_free) { struct sk_buff *skb = __qdisc_dequeue_head(qh); if (likely(skb != NULL)) { unsigned int len = qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); __qdisc_drop(skb, to_free); return len; } return 0; } static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch) { const struct qdisc_skb_head *qh = &sch->q; return qh->head; } /* generic pseudo peek method for non-work-conserving qdisc */ static inline struct sk_buff *qdisc_peek_dequeued(struct Qdisc *sch) { struct sk_buff *skb = skb_peek(&sch->gso_skb); /* we can reuse ->gso_skb because peek isn't called for root qdiscs */ if (!skb) { skb = sch->dequeue(sch); if (skb) { __skb_queue_head(&sch->gso_skb, skb); /* it's still part of the queue */ qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; } } return skb; } static inline void qdisc_update_stats_at_dequeue(struct Qdisc *sch, struct sk_buff *skb) { if (qdisc_is_percpu_stats(sch)) { qdisc_qstats_cpu_backlog_dec(sch, skb); qdisc_bstats_cpu_update(sch, skb); qdisc_qstats_cpu_qlen_dec(sch); } else { qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); sch->q.qlen--; } } static inline void qdisc_update_stats_at_enqueue(struct Qdisc *sch, unsigned int pkt_len) { if (qdisc_is_percpu_stats(sch)) { qdisc_qstats_cpu_qlen_inc(sch); this_cpu_add(sch->cpu_qstats->backlog, pkt_len); } else { sch->qstats.backlog += pkt_len; sch->q.qlen++; } } /* use instead of qdisc->dequeue() for all qdiscs queried with ->peek() */ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch) { struct sk_buff *skb = skb_peek(&sch->gso_skb); if (skb) { skb = __skb_dequeue(&sch->gso_skb); if (qdisc_is_percpu_stats(sch)) { qdisc_qstats_cpu_backlog_dec(sch, skb); qdisc_qstats_cpu_qlen_dec(sch); } else { qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; } } else { skb = sch->dequeue(sch); } return skb; } static inline void __qdisc_reset_queue(struct qdisc_skb_head *qh) { /* * We do not know the backlog in bytes of this list, it * is up to the caller to correct it */ ASSERT_RTNL(); if (qh->qlen) { rtnl_kfree_skbs(qh->head, qh->tail); qh->head = NULL; qh->tail = NULL; qh->qlen = 0; } } static inline void qdisc_reset_queue(struct Qdisc *sch) { __qdisc_reset_queue(&sch->q); } static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new, struct Qdisc **pold) { struct Qdisc *old; sch_tree_lock(sch); old = *pold; *pold = new; if (old != NULL) qdisc_purge_queue(old); sch_tree_unlock(sch); return old; } static inline void rtnl_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch) { rtnl_kfree_skbs(skb, skb); qdisc_qstats_drop(sch); } static inline int qdisc_drop_cpu(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { __qdisc_drop(skb, to_free); qdisc_qstats_cpu_drop(sch); return NET_XMIT_DROP; } static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { __qdisc_drop(skb, to_free); qdisc_qstats_drop(sch); return NET_XMIT_DROP; } static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { __qdisc_drop_all(skb, to_free); qdisc_qstats_drop(sch); return NET_XMIT_DROP; } struct psched_ratecfg { u64 rate_bytes_ps; /* bytes per second */ u32 mult; u16 overhead; u16 mpu; u8 linklayer; u8 shift; }; static inline u64 psched_l2t_ns(const struct psched_ratecfg *r, unsigned int len) { len += r->overhead; if (len < r->mpu) len = r->mpu; if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) return ((u64)(DIV_ROUND_UP(len,48)*53) * r->mult) >> r->shift; return ((u64)len * r->mult) >> r->shift; } void psched_ratecfg_precompute(struct psched_ratecfg *r, const struct tc_ratespec *conf, u64 rate64); static inline void psched_ratecfg_getrate(struct tc_ratespec *res, const struct psched_ratecfg *r) { memset(res, 0, sizeof(*res)); /* legacy struct tc_ratespec has a 32bit @rate field * Qdisc using 64bit rate should add new attributes * in order to maintain compatibility. */ res->rate = min_t(u64, r->rate_bytes_ps, ~0U); res->overhead = r->overhead; res->mpu = r->mpu; res->linklayer = (r->linklayer & TC_LINKLAYER_MASK); } struct psched_pktrate { u64 rate_pkts_ps; /* packets per second */ u32 mult; u8 shift; }; static inline u64 psched_pkt2t_ns(const struct psched_pktrate *r, unsigned int pkt_num) { return ((u64)pkt_num * r->mult) >> r->shift; } void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64); /* Mini Qdisc serves for specific needs of ingress/clsact Qdisc. * The fast path only needs to access filter list and to update stats */ struct mini_Qdisc { struct tcf_proto *filter_list; struct tcf_block *block; struct gnet_stats_basic_sync __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; unsigned long rcu_state; }; static inline void mini_qdisc_bstats_cpu_update(struct mini_Qdisc *miniq, const struct sk_buff *skb) { bstats_update(this_cpu_ptr(miniq->cpu_bstats), skb); } static inline void mini_qdisc_qstats_cpu_drop(struct mini_Qdisc *miniq) { this_cpu_inc(miniq->cpu_qstats->drops); } struct mini_Qdisc_pair { struct mini_Qdisc miniq1; struct mini_Qdisc miniq2; struct mini_Qdisc __rcu **p_miniq; }; void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, struct tcf_proto *tp_head); void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, struct mini_Qdisc __rcu **p_miniq); void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp, struct tcf_block *block); void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx); int sch_frag_xmit_hook(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb)); /* Make sure qdisc is no longer in SCHED state. */ static inline void qdisc_synchronize(const struct Qdisc *q) { while (test_bit(__QDISC_STATE_SCHED, &q->state)) msleep(1); } #endif |
| 11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 | // SPDX-License-Identifier: GPL-2.0-or-later /* * CRC32C *@Article{castagnoli-crc, * author = { Guy Castagnoli and Stefan Braeuer and Martin Herrman}, * title = {{Optimization of Cyclic Redundancy-Check Codes with 24 * and 32 Parity Bits}}, * journal = IEEE Transactions on Communication, * year = {1993}, * volume = {41}, * number = {6}, * pages = {}, * month = {June}, *} * Used by the iSCSI driver, possibly others, and derived from * the iscsi-crc.c module of the linux-iscsi driver at * http://linux-iscsi.sourceforge.net. * * Following the example of lib/crc32, this function is intended to be * flexible and useful for all users. Modules that currently have their * own crc32c, but hopefully may be able to use this one are: * net/sctp (please add all your doco to here if you change to * use this one!) * <endoflist> * * Copyright (c) 2004 Cisco Systems, Inc. */ #include <crypto/hash.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/crc32c.h> static struct crypto_shash *tfm; u32 crc32c(u32 crc, const void *address, unsigned int length) { SHASH_DESC_ON_STACK(shash, tfm); u32 ret, *ctx = (u32 *)shash_desc_ctx(shash); int err; shash->tfm = tfm; *ctx = crc; err = crypto_shash_update(shash, address, length); BUG_ON(err); ret = *ctx; barrier_data(ctx); return ret; } EXPORT_SYMBOL(crc32c); static int __init libcrc32c_mod_init(void) { tfm = crypto_alloc_shash("crc32c", 0, 0); return PTR_ERR_OR_ZERO(tfm); } static void __exit libcrc32c_mod_fini(void) { crypto_free_shash(tfm); } module_init(libcrc32c_mod_init); module_exit(libcrc32c_mod_fini); MODULE_AUTHOR("Clay Haapala <chaapala@cisco.com>"); MODULE_DESCRIPTION("CRC32c (Castagnoli) calculations"); MODULE_LICENSE("GPL"); MODULE_SOFTDEP("pre: crc32c"); |
| 111 113 113 113 113 112 2797 711 2801 2288 2805 1239 681 1244 1236 1236 1239 2479 2479 2229 2478 510 510 506 512 506 127 127 409 507 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 | // SPDX-License-Identifier: GPL-2.0-or-later #define pr_fmt(fmt) "ref_tracker: " fmt #include <linux/export.h> #include <linux/list_sort.h> #include <linux/ref_tracker.h> #include <linux/slab.h> #include <linux/stacktrace.h> #include <linux/stackdepot.h> #define REF_TRACKER_STACK_ENTRIES 16 #define STACK_BUF_SIZE 1024 struct ref_tracker { struct list_head head; /* anchor into dir->list or dir->quarantine */ bool dead; depot_stack_handle_t alloc_stack_handle; depot_stack_handle_t free_stack_handle; }; struct ref_tracker_dir_stats { int total; int count; struct { depot_stack_handle_t stack_handle; unsigned int count; } stacks[]; }; static struct ref_tracker_dir_stats * ref_tracker_get_stats(struct ref_tracker_dir *dir, unsigned int limit) { struct ref_tracker_dir_stats *stats; struct ref_tracker *tracker; stats = kmalloc(struct_size(stats, stacks, limit), GFP_NOWAIT | __GFP_NOWARN); if (!stats) return ERR_PTR(-ENOMEM); stats->total = 0; stats->count = 0; list_for_each_entry(tracker, &dir->list, head) { depot_stack_handle_t stack = tracker->alloc_stack_handle; int i; ++stats->total; for (i = 0; i < stats->count; ++i) if (stats->stacks[i].stack_handle == stack) break; if (i >= limit) continue; if (i >= stats->count) { stats->stacks[i].stack_handle = stack; stats->stacks[i].count = 0; ++stats->count; } ++stats->stacks[i].count; } return stats; } struct ostream { char *buf; int size, used; }; #define pr_ostream(stream, fmt, args...) \ ({ \ struct ostream *_s = (stream); \ \ if (!_s->buf) { \ pr_err(fmt, ##args); \ } else { \ int ret, len = _s->size - _s->used; \ ret = snprintf(_s->buf + _s->used, len, pr_fmt(fmt), ##args); \ _s->used += min(ret, len); \ } \ }) static void __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir, unsigned int display_limit, struct ostream *s) { struct ref_tracker_dir_stats *stats; unsigned int i = 0, skipped; depot_stack_handle_t stack; char *sbuf; lockdep_assert_held(&dir->lock); if (list_empty(&dir->list)) return; stats = ref_tracker_get_stats(dir, display_limit); if (IS_ERR(stats)) { pr_ostream(s, "%s@%pK: couldn't get stats, error %pe\n", dir->name, dir, stats); return; } sbuf = kmalloc(STACK_BUF_SIZE, GFP_NOWAIT | __GFP_NOWARN); for (i = 0, skipped = stats->total; i < stats->count; ++i) { stack = stats->stacks[i].stack_handle; if (sbuf && !stack_depot_snprint(stack, sbuf, STACK_BUF_SIZE, 4)) sbuf[0] = 0; pr_ostream(s, "%s@%pK has %d/%d users at\n%s\n", dir->name, dir, stats->stacks[i].count, stats->total, sbuf); skipped -= stats->stacks[i].count; } if (skipped) pr_ostream(s, "%s@%pK skipped reports about %d/%d users.\n", dir->name, dir, skipped, stats->total); kfree(sbuf); kfree(stats); } void ref_tracker_dir_print_locked(struct ref_tracker_dir *dir, unsigned int display_limit) { struct ostream os = {}; __ref_tracker_dir_pr_ostream(dir, display_limit, &os); } EXPORT_SYMBOL(ref_tracker_dir_print_locked); void ref_tracker_dir_print(struct ref_tracker_dir *dir, unsigned int display_limit) { unsigned long flags; spin_lock_irqsave(&dir->lock, flags); ref_tracker_dir_print_locked(dir, display_limit); spin_unlock_irqrestore(&dir->lock, flags); } EXPORT_SYMBOL(ref_tracker_dir_print); int ref_tracker_dir_snprint(struct ref_tracker_dir *dir, char *buf, size_t size) { struct ostream os = { .buf = buf, .size = size }; unsigned long flags; spin_lock_irqsave(&dir->lock, flags); __ref_tracker_dir_pr_ostream(dir, 16, &os); spin_unlock_irqrestore(&dir->lock, flags); return os.used; } EXPORT_SYMBOL(ref_tracker_dir_snprint); void ref_tracker_dir_exit(struct ref_tracker_dir *dir) { struct ref_tracker *tracker, *n; unsigned long flags; bool leak = false; dir->dead = true; spin_lock_irqsave(&dir->lock, flags); list_for_each_entry_safe(tracker, n, &dir->quarantine, head) { list_del(&tracker->head); kfree(tracker); dir->quarantine_avail++; } if (!list_empty(&dir->list)) { ref_tracker_dir_print_locked(dir, 16); leak = true; list_for_each_entry_safe(tracker, n, &dir->list, head) { list_del(&tracker->head); kfree(tracker); } } spin_unlock_irqrestore(&dir->lock, flags); WARN_ON_ONCE(leak); WARN_ON_ONCE(refcount_read(&dir->untracked) != 1); WARN_ON_ONCE(refcount_read(&dir->no_tracker) != 1); } EXPORT_SYMBOL(ref_tracker_dir_exit); int ref_tracker_alloc(struct ref_tracker_dir *dir, struct ref_tracker **trackerp, gfp_t gfp) { unsigned long entries[REF_TRACKER_STACK_ENTRIES]; struct ref_tracker *tracker; unsigned int nr_entries; gfp_t gfp_mask = gfp | __GFP_NOWARN; unsigned long flags; WARN_ON_ONCE(dir->dead); if (!trackerp) { refcount_inc(&dir->no_tracker); return 0; } if (gfp & __GFP_DIRECT_RECLAIM) gfp_mask |= __GFP_NOFAIL; *trackerp = tracker = kzalloc(sizeof(*tracker), gfp_mask); if (unlikely(!tracker)) { pr_err_once("memory allocation failure, unreliable refcount tracker.\n"); refcount_inc(&dir->untracked); return -ENOMEM; } nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); tracker->alloc_stack_handle = stack_depot_save(entries, nr_entries, gfp); spin_lock_irqsave(&dir->lock, flags); list_add(&tracker->head, &dir->list); spin_unlock_irqrestore(&dir->lock, flags); return 0; } EXPORT_SYMBOL_GPL(ref_tracker_alloc); int ref_tracker_free(struct ref_tracker_dir *dir, struct ref_tracker **trackerp) { unsigned long entries[REF_TRACKER_STACK_ENTRIES]; depot_stack_handle_t stack_handle; struct ref_tracker *tracker; unsigned int nr_entries; unsigned long flags; WARN_ON_ONCE(dir->dead); if (!trackerp) { refcount_dec(&dir->no_tracker); return 0; } tracker = *trackerp; if (!tracker) { refcount_dec(&dir->untracked); return -EEXIST; } nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); stack_handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT | __GFP_NOWARN); spin_lock_irqsave(&dir->lock, flags); if (tracker->dead) { pr_err("reference already released.\n"); if (tracker->alloc_stack_handle) { pr_err("allocated in:\n"); stack_depot_print(tracker->alloc_stack_handle); } if (tracker->free_stack_handle) { pr_err("freed in:\n"); stack_depot_print(tracker->free_stack_handle); } spin_unlock_irqrestore(&dir->lock, flags); WARN_ON_ONCE(1); return -EINVAL; } tracker->dead = true; tracker->free_stack_handle = stack_handle; list_move_tail(&tracker->head, &dir->quarantine); if (!dir->quarantine_avail) { tracker = list_first_entry(&dir->quarantine, struct ref_tracker, head); list_del(&tracker->head); } else { dir->quarantine_avail--; tracker = NULL; } spin_unlock_irqrestore(&dir->lock, flags); kfree(tracker); return 0; } EXPORT_SYMBOL_GPL(ref_tracker_free); |
| 3 2 1 1 3 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> */ #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_SYNPROXY.h> #include <net/netfilter/nf_synproxy.h> static unsigned int synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_synproxy_info *info = par->targinfo; struct net *net = xt_net(par); struct synproxy_net *snet = synproxy_pernet(net); struct synproxy_options opts = {}; struct tcphdr *th, _th; if (nf_ip6_checksum(skb, xt_hooknum(par), par->thoff, IPPROTO_TCP)) return NF_DROP; th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th); if (th == NULL) return NF_DROP; if (!synproxy_parse_options(skb, par->thoff, th, &opts)) return NF_DROP; if (th->syn && !(th->ack || th->fin || th->rst)) { /* Initial SYN from client */ this_cpu_inc(snet->stats->syn_received); if (th->ece && th->cwr) opts.options |= XT_SYNPROXY_OPT_ECN; opts.options &= info->options; opts.mss_encode = opts.mss_option; opts.mss_option = info->mss; if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy_init_timestamp_cookie(info, &opts); else opts.options &= ~(XT_SYNPROXY_OPT_WSCALE | XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); synproxy_send_client_synack_ipv6(net, skb, th, &opts); consume_skb(skb); return NF_STOLEN; } else if (th->ack && !(th->fin || th->rst || th->syn)) { /* ACK from client */ if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts, ntohl(th->seq))) { consume_skb(skb); return NF_STOLEN; } else { return NF_DROP; } } return XT_CONTINUE; } static int synproxy_tg6_check(const struct xt_tgchk_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); const struct ip6t_entry *e = par->entryinfo; int err; if (!(e->ipv6.flags & IP6T_F_PROTO) || e->ipv6.proto != IPPROTO_TCP || e->ipv6.invflags & XT_INV_PROTO) return -EINVAL; err = nf_ct_netns_get(par->net, par->family); if (err) return err; err = nf_synproxy_ipv6_init(snet, par->net); if (err) { nf_ct_netns_put(par->net, par->family); return err; } return err; } static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); nf_synproxy_ipv6_fini(snet, par->net); nf_ct_netns_put(par->net, par->family); } static struct xt_target synproxy_tg6_reg __read_mostly = { .name = "SYNPROXY", .family = NFPROTO_IPV6, .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD), .target = synproxy_tg6, .targetsize = sizeof(struct xt_synproxy_info), .checkentry = synproxy_tg6_check, .destroy = synproxy_tg6_destroy, .me = THIS_MODULE, }; static int __init synproxy_tg6_init(void) { return xt_register_target(&synproxy_tg6_reg); } static void __exit synproxy_tg6_exit(void) { xt_unregister_target(&synproxy_tg6_reg); } module_init(synproxy_tg6_init); module_exit(synproxy_tg6_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Intercept IPv6 TCP connections and establish them using syncookies"); |
| 36 35 35 35 4 3 36 5 4 36 36 36 36 36 36 35 228 36 36 36 36 36 35 228 35 3 35 1 1 62 35 35 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/dsa/user.c - user device handling * Copyright (c) 2008-2009 Marvell Semiconductor */ #include <linux/list.h> #include <linux/etherdevice.h> #include <linux/netdevice.h> #include <linux/phy.h> #include <linux/phy_fixed.h> #include <linux/phylink.h> #include <linux/of_net.h> #include <linux/of_mdio.h> #include <linux/mdio.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/selftests.h> #include <net/tc_act/tc_mirred.h> #include <linux/if_bridge.h> #include <linux/if_hsr.h> #include <net/dcbnl.h> #include <linux/netpoll.h> #include <linux/string.h> #include "conduit.h" #include "dsa.h" #include "netlink.h" #include "port.h" #include "switch.h" #include "tag.h" #include "user.h" struct dsa_switchdev_event_work { struct net_device *dev; struct net_device *orig_dev; struct work_struct work; unsigned long event; /* Specific for SWITCHDEV_FDB_ADD_TO_DEVICE and * SWITCHDEV_FDB_DEL_TO_DEVICE */ unsigned char addr[ETH_ALEN]; u16 vid; bool host_addr; }; enum dsa_standalone_event { DSA_UC_ADD, DSA_UC_DEL, DSA_MC_ADD, DSA_MC_DEL, }; struct dsa_standalone_event_work { struct work_struct work; struct net_device *dev; enum dsa_standalone_event event; unsigned char addr[ETH_ALEN]; u16 vid; }; struct dsa_host_vlan_rx_filtering_ctx { struct net_device *dev; const unsigned char *addr; enum dsa_standalone_event event; }; static bool dsa_switch_supports_uc_filtering(struct dsa_switch *ds) { return ds->ops->port_fdb_add && ds->ops->port_fdb_del && ds->fdb_isolation && !ds->vlan_filtering_is_global && !ds->needs_standalone_vlan_filtering; } static bool dsa_switch_supports_mc_filtering(struct dsa_switch *ds) { return ds->ops->port_mdb_add && ds->ops->port_mdb_del && ds->fdb_isolation && !ds->vlan_filtering_is_global && !ds->needs_standalone_vlan_filtering; } static void dsa_user_standalone_event_work(struct work_struct *work) { struct dsa_standalone_event_work *standalone_work = container_of(work, struct dsa_standalone_event_work, work); const unsigned char *addr = standalone_work->addr; struct net_device *dev = standalone_work->dev; struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_mdb mdb; struct dsa_switch *ds = dp->ds; u16 vid = standalone_work->vid; int err; switch (standalone_work->event) { case DSA_UC_ADD: err = dsa_port_standalone_host_fdb_add(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to add %pM vid %d to fdb: %d\n", dp->index, addr, vid, err); break; } break; case DSA_UC_DEL: err = dsa_port_standalone_host_fdb_del(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to delete %pM vid %d from fdb: %d\n", dp->index, addr, vid, err); } break; case DSA_MC_ADD: ether_addr_copy(mdb.addr, addr); mdb.vid = vid; err = dsa_port_standalone_host_mdb_add(dp, &mdb); if (err) { dev_err(ds->dev, "port %d failed to add %pM vid %d to mdb: %d\n", dp->index, addr, vid, err); break; } break; case DSA_MC_DEL: ether_addr_copy(mdb.addr, addr); mdb.vid = vid; err = dsa_port_standalone_host_mdb_del(dp, &mdb); if (err) { dev_err(ds->dev, "port %d failed to delete %pM vid %d from mdb: %d\n", dp->index, addr, vid, err); } break; } kfree(standalone_work); } static int dsa_user_schedule_standalone_work(struct net_device *dev, enum dsa_standalone_event event, const unsigned char *addr, u16 vid) { struct dsa_standalone_event_work *standalone_work; standalone_work = kzalloc(sizeof(*standalone_work), GFP_ATOMIC); if (!standalone_work) return -ENOMEM; INIT_WORK(&standalone_work->work, dsa_user_standalone_event_work); standalone_work->event = event; standalone_work->dev = dev; ether_addr_copy(standalone_work->addr, addr); standalone_work->vid = vid; dsa_schedule_work(&standalone_work->work); return 0; } static int dsa_user_host_vlan_rx_filtering(void *arg, int vid) { struct dsa_host_vlan_rx_filtering_ctx *ctx = arg; return dsa_user_schedule_standalone_work(ctx->dev, ctx->event, ctx->addr, vid); } static int dsa_user_vlan_for_each(struct net_device *dev, int (*cb)(void *arg, int vid), void *arg) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_vlan *v; int err; lockdep_assert_held(&dev->addr_list_lock); err = cb(arg, 0); if (err) return err; list_for_each_entry(v, &dp->user_vlans, list) { err = cb(arg, v->vid); if (err) return err; } return 0; } static int dsa_user_sync_uc(struct net_device *dev, const unsigned char *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_UC_ADD, }; dev_uc_add(conduit, addr); if (!dsa_switch_supports_uc_filtering(dp->ds)) return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, &ctx); } static int dsa_user_unsync_uc(struct net_device *dev, const unsigned char *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_UC_DEL, }; dev_uc_del(conduit, addr); if (!dsa_switch_supports_uc_filtering(dp->ds)) return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, &ctx); } static int dsa_user_sync_mc(struct net_device *dev, const unsigned char *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_MC_ADD, }; dev_mc_add(conduit, addr); if (!dsa_switch_supports_mc_filtering(dp->ds)) return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, &ctx); } static int dsa_user_unsync_mc(struct net_device *dev, const unsigned char *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_MC_DEL, }; dev_mc_del(conduit, addr); if (!dsa_switch_supports_mc_filtering(dp->ds)) return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, &ctx); } void dsa_user_sync_ha(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; netif_addr_lock_bh(dev); netdev_for_each_synced_mc_addr(ha, dev) dsa_user_sync_mc(dev, ha->addr); netdev_for_each_synced_uc_addr(ha, dev) dsa_user_sync_uc(dev, ha->addr); netif_addr_unlock_bh(dev); if (dsa_switch_supports_uc_filtering(ds) || dsa_switch_supports_mc_filtering(ds)) dsa_flush_workqueue(); } void dsa_user_unsync_ha(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; netif_addr_lock_bh(dev); netdev_for_each_synced_uc_addr(ha, dev) dsa_user_unsync_uc(dev, ha->addr); netdev_for_each_synced_mc_addr(ha, dev) dsa_user_unsync_mc(dev, ha->addr); netif_addr_unlock_bh(dev); if (dsa_switch_supports_uc_filtering(ds) || dsa_switch_supports_mc_filtering(ds)) dsa_flush_workqueue(); } /* user mii_bus handling ***************************************************/ static int dsa_user_phy_read(struct mii_bus *bus, int addr, int reg) { struct dsa_switch *ds = bus->priv; if (ds->phys_mii_mask & (1 << addr)) return ds->ops->phy_read(ds, addr, reg); return 0xffff; } static int dsa_user_phy_write(struct mii_bus *bus, int addr, int reg, u16 val) { struct dsa_switch *ds = bus->priv; if (ds->phys_mii_mask & (1 << addr)) return ds->ops->phy_write(ds, addr, reg, val); return 0; } void dsa_user_mii_bus_init(struct dsa_switch *ds) { ds->user_mii_bus->priv = (void *)ds; ds->user_mii_bus->name = "dsa user smi"; ds->user_mii_bus->read = dsa_user_phy_read; ds->user_mii_bus->write = dsa_user_phy_write; snprintf(ds->user_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d.%d", ds->dst->index, ds->index); ds->user_mii_bus->parent = ds->dev; ds->user_mii_bus->phy_mask = ~ds->phys_mii_mask; } /* user device handling ****************************************************/ static int dsa_user_get_iflink(const struct net_device *dev) { return READ_ONCE(dsa_user_to_conduit(dev)->ifindex); } static int dsa_user_open(struct net_device *dev) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int err; err = dev_open(conduit, NULL); if (err < 0) { netdev_err(dev, "failed to open conduit %s\n", conduit->name); goto out; } if (dsa_switch_supports_uc_filtering(ds)) { err = dsa_port_standalone_host_fdb_add(dp, dev->dev_addr, 0); if (err) goto out; } if (!ether_addr_equal(dev->dev_addr, conduit->dev_addr)) { err = dev_uc_add(conduit, dev->dev_addr); if (err < 0) goto del_host_addr; } err = dsa_port_enable_rt(dp, dev->phydev); if (err) goto del_unicast; return 0; del_unicast: if (!ether_addr_equal(dev->dev_addr, conduit->dev_addr)) dev_uc_del(conduit, dev->dev_addr); del_host_addr: if (dsa_switch_supports_uc_filtering(ds)) dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0); out: return err; } static int dsa_user_close(struct net_device *dev) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; dsa_port_disable_rt(dp); if (!ether_addr_equal(dev->dev_addr, conduit->dev_addr)) dev_uc_del(conduit, dev->dev_addr); if (dsa_switch_supports_uc_filtering(ds)) dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0); return 0; } static void dsa_user_manage_host_flood(struct net_device *dev) { bool mc = dev->flags & (IFF_PROMISC | IFF_ALLMULTI); struct dsa_port *dp = dsa_user_to_port(dev); bool uc = dev->flags & IFF_PROMISC; dsa_port_set_host_flood(dp, uc, mc); } static void dsa_user_change_rx_flags(struct net_device *dev, int change) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (change & IFF_ALLMULTI) dev_set_allmulti(conduit, dev->flags & IFF_ALLMULTI ? 1 : -1); if (change & IFF_PROMISC) dev_set_promiscuity(conduit, dev->flags & IFF_PROMISC ? 1 : -1); if (dsa_switch_supports_uc_filtering(ds) && dsa_switch_supports_mc_filtering(ds)) dsa_user_manage_host_flood(dev); } static void dsa_user_set_rx_mode(struct net_device *dev) { __dev_mc_sync(dev, dsa_user_sync_mc, dsa_user_unsync_mc); __dev_uc_sync(dev, dsa_user_sync_uc, dsa_user_unsync_uc); } static int dsa_user_set_mac_address(struct net_device *dev, void *a) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct sockaddr *addr = a; int err; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; if (ds->ops->port_set_mac_address) { err = ds->ops->port_set_mac_address(ds, dp->index, addr->sa_data); if (err) return err; } /* If the port is down, the address isn't synced yet to hardware or * to the DSA conduit, so there is nothing to change. */ if (!(dev->flags & IFF_UP)) goto out_change_dev_addr; if (dsa_switch_supports_uc_filtering(ds)) { err = dsa_port_standalone_host_fdb_add(dp, addr->sa_data, 0); if (err) return err; } if (!ether_addr_equal(addr->sa_data, conduit->dev_addr)) { err = dev_uc_add(conduit, addr->sa_data); if (err < 0) goto del_unicast; } if (!ether_addr_equal(dev->dev_addr, conduit->dev_addr)) dev_uc_del(conduit, dev->dev_addr); if (dsa_switch_supports_uc_filtering(ds)) dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0); out_change_dev_addr: eth_hw_addr_set(dev, addr->sa_data); return 0; del_unicast: if (dsa_switch_supports_uc_filtering(ds)) dsa_port_standalone_host_fdb_del(dp, addr->sa_data, 0); return err; } struct dsa_user_dump_ctx { struct net_device *dev; struct sk_buff *skb; struct netlink_callback *cb; int idx; }; static int dsa_user_port_fdb_do_dump(const unsigned char *addr, u16 vid, bool is_static, void *data) { struct dsa_user_dump_ctx *dump = data; u32 portid = NETLINK_CB(dump->cb->skb).portid; u32 seq = dump->cb->nlh->nlmsg_seq; struct nlmsghdr *nlh; struct ndmsg *ndm; if (dump->idx < dump->cb->args[2]) goto skip; nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH, sizeof(*ndm), NLM_F_MULTI); if (!nlh) return -EMSGSIZE; ndm = nlmsg_data(nlh); ndm->ndm_family = AF_BRIDGE; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = NTF_SELF; ndm->ndm_type = 0; ndm->ndm_ifindex = dump->dev->ifindex; ndm->ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE; if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, addr)) goto nla_put_failure; if (vid && nla_put_u16(dump->skb, NDA_VLAN, vid)) goto nla_put_failure; nlmsg_end(dump->skb, nlh); skip: dump->idx++; return 0; nla_put_failure: nlmsg_cancel(dump->skb, nlh); return -EMSGSIZE; } static int dsa_user_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_user_dump_ctx dump = { .dev = dev, .skb = skb, .cb = cb, .idx = *idx, }; int err; err = dsa_port_fdb_dump(dp, dsa_user_port_fdb_do_dump, &dump); *idx = dump.idx; return err; } static int dsa_user_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct dsa_user_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->dp->ds; int port = p->dp->index; /* Pass through to switch driver if it supports timestamping */ switch (cmd) { case SIOCGHWTSTAMP: if (ds->ops->port_hwtstamp_get) return ds->ops->port_hwtstamp_get(ds, port, ifr); break; case SIOCSHWTSTAMP: if (ds->ops->port_hwtstamp_set) return ds->ops->port_hwtstamp_set(ds, port, ifr); break; } return phylink_mii_ioctl(p->dp->pl, ifr, cmd); } static int dsa_user_port_attr_set(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); int ret; if (ctx && ctx != dp) return 0; switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_STP_STATE: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_set_state(dp, attr->u.stp_state, true); break; case SWITCHDEV_ATTR_ID_PORT_MST_STATE: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_set_mst_state(dp, &attr->u.mst_state, extack); break; case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering, extack); break; case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_ageing_time(dp, attr->u.ageing_time); break; case SWITCHDEV_ATTR_ID_BRIDGE_MST: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_mst_enable(dp, attr->u.mst, extack); break; case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_pre_bridge_flags(dp, attr->u.brport_flags, extack); break; case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_bridge_flags(dp, attr->u.brport_flags, extack); break; case SWITCHDEV_ATTR_ID_VLAN_MSTI: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_vlan_msti(dp, &attr->u.vlan_msti); break; default: ret = -EOPNOTSUPP; break; } return ret; } /* Must be called under rcu_read_lock() */ static int dsa_user_vlan_check_for_8021q_uppers(struct net_device *user, const struct switchdev_obj_port_vlan *vlan) { struct net_device *upper_dev; struct list_head *iter; netdev_for_each_upper_dev_rcu(user, upper_dev, iter) { u16 vid; if (!is_vlan_dev(upper_dev)) continue; vid = vlan_dev_vlan_id(upper_dev); if (vid == vlan->vid) return -EBUSY; } return 0; } static int dsa_user_vlan_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan *vlan; int err; if (dsa_port_skip_vlan_configuration(dp)) { NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN"); return 0; } vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); /* Deny adding a bridge VLAN when there is already an 802.1Q upper with * the same VID. */ if (br_vlan_enabled(dsa_port_bridge_dev_get(dp))) { rcu_read_lock(); err = dsa_user_vlan_check_for_8021q_uppers(dev, vlan); rcu_read_unlock(); if (err) { NL_SET_ERR_MSG_MOD(extack, "Port already has a VLAN upper with this VID"); return err; } } return dsa_port_vlan_add(dp, vlan, extack); } /* Offload a VLAN installed on the bridge or on a foreign interface by * installing it as a VLAN towards the CPU port. */ static int dsa_user_host_vlan_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan vlan; /* Do nothing if this is a software bridge */ if (!dp->bridge) return -EOPNOTSUPP; if (dsa_port_skip_vlan_configuration(dp)) { NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN"); return 0; } vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj); /* Even though drivers often handle CPU membership in special ways, * it doesn't make sense to program a PVID, so clear this flag. */ vlan.flags &= ~BRIDGE_VLAN_INFO_PVID; return dsa_port_host_vlan_add(dp, &vlan, extack); } static int dsa_user_port_obj_add(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); int err; if (ctx && ctx != dp) return 0; switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_HOST_MDB: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_bridge_host_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: if (dsa_port_offloads_bridge_port(dp, obj->orig_dev)) err = dsa_user_vlan_add(dev, obj, extack); else err = dsa_user_host_vlan_add(dev, obj, extack); break; case SWITCHDEV_OBJ_ID_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_add(dp, SWITCHDEV_OBJ_MRP(obj)); break; case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_add_ring_role(dp, SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); break; default: err = -EOPNOTSUPP; break; } return err; } static int dsa_user_vlan_del(struct net_device *dev, const struct switchdev_obj *obj) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan *vlan; if (dsa_port_skip_vlan_configuration(dp)) return 0; vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); return dsa_port_vlan_del(dp, vlan); } static int dsa_user_host_vlan_del(struct net_device *dev, const struct switchdev_obj *obj) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan *vlan; /* Do nothing if this is a software bridge */ if (!dp->bridge) return -EOPNOTSUPP; if (dsa_port_skip_vlan_configuration(dp)) return 0; vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); return dsa_port_host_vlan_del(dp, vlan); } static int dsa_user_port_obj_del(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj) { struct dsa_port *dp = dsa_user_to_port(dev); int err; if (ctx && ctx != dp) return 0; switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_HOST_MDB: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_bridge_host_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: if (dsa_port_offloads_bridge_port(dp, obj->orig_dev)) err = dsa_user_vlan_del(dev, obj); else err = dsa_user_host_vlan_del(dev, obj); break; case SWITCHDEV_OBJ_ID_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_del(dp, SWITCHDEV_OBJ_MRP(obj)); break; case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_del_ring_role(dp, SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); break; default: err = -EOPNOTSUPP; break; } return err; } static netdev_tx_t dsa_user_netpoll_send_skb(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER struct dsa_user_priv *p = netdev_priv(dev); return netpoll_send_skb(p->netpoll, skb); #else BUG(); return NETDEV_TX_OK; #endif } static void dsa_skb_tx_timestamp(struct dsa_user_priv *p, struct sk_buff *skb) { struct dsa_switch *ds = p->dp->ds; if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) return; if (!ds->ops->port_txtstamp) return; ds->ops->port_txtstamp(ds, p->dp->index, skb); } netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev) { /* SKB for netpoll still need to be mangled with the protocol-specific * tag to be successfully transmitted */ if (unlikely(netpoll_tx_running(dev))) return dsa_user_netpoll_send_skb(dev, skb); /* Queue the SKB for transmission on the parent interface, but * do not modify its EtherType */ skb->dev = dsa_user_to_conduit(dev); dev_queue_xmit(skb); return NETDEV_TX_OK; } EXPORT_SYMBOL_GPL(dsa_enqueue_skb); static netdev_tx_t dsa_user_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_user_priv *p = netdev_priv(dev); struct sk_buff *nskb; dev_sw_netstats_tx_add(dev, 1, skb->len); memset(skb->cb, 0, sizeof(skb->cb)); /* Handle tx timestamp if any */ dsa_skb_tx_timestamp(p, skb); if (skb_ensure_writable_head_tail(skb, dev)) { dev_kfree_skb_any(skb); return NETDEV_TX_OK; } /* needed_tailroom should still be 'warm' in the cache line from * skb_ensure_writable_head_tail(), which has also ensured that * padding is safe. */ if (dev->needed_tailroom) eth_skb_pad(skb); /* Transmit function may have to reallocate the original SKB, * in which case it must have freed it. Only free it here on error. */ nskb = p->xmit(skb, dev); if (!nskb) { kfree_skb(skb); return NETDEV_TX_OK; } return dsa_enqueue_skb(nskb, dev); } /* ethtool operations *******************************************************/ static void dsa_user_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver)); strscpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version)); strscpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info)); } static int dsa_user_get_regs_len(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_regs_len) return ds->ops->get_regs_len(ds, dp->index); return -EOPNOTSUPP; } static void dsa_user_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_regs) ds->ops->get_regs(ds, dp->index, regs, _p); } static int dsa_user_nway_reset(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); return phylink_ethtool_nway_reset(dp->pl); } static int dsa_user_get_eeprom_len(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->cd && ds->cd->eeprom_len) return ds->cd->eeprom_len; if (ds->ops->get_eeprom_len) return ds->ops->get_eeprom_len(ds); return 0; } static int dsa_user_get_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, u8 *data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eeprom) return ds->ops->get_eeprom(ds, eeprom, data); return -EOPNOTSUPP; } static int dsa_user_set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, u8 *data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->set_eeprom) return ds->ops->set_eeprom(ds, eeprom, data); return -EOPNOTSUPP; } static void dsa_user_get_strings(struct net_device *dev, uint32_t stringset, uint8_t *data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (stringset == ETH_SS_STATS) { int len = ETH_GSTRING_LEN; strscpy_pad(data, "tx_packets", len); strscpy_pad(data + len, "tx_bytes", len); strscpy_pad(data + 2 * len, "rx_packets", len); strscpy_pad(data + 3 * len, "rx_bytes", len); if (ds->ops->get_strings) ds->ops->get_strings(ds, dp->index, stringset, data + 4 * len); } else if (stringset == ETH_SS_TEST) { net_selftest_get_strings(data); } } static void dsa_user_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, uint64_t *data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct pcpu_sw_netstats *s; unsigned int start; int i; for_each_possible_cpu(i) { u64 tx_packets, tx_bytes, rx_packets, rx_bytes; s = per_cpu_ptr(dev->tstats, i); do { start = u64_stats_fetch_begin(&s->syncp); tx_packets = u64_stats_read(&s->tx_packets); tx_bytes = u64_stats_read(&s->tx_bytes); rx_packets = u64_stats_read(&s->rx_packets); rx_bytes = u64_stats_read(&s->rx_bytes); } while (u64_stats_fetch_retry(&s->syncp, start)); data[0] += tx_packets; data[1] += tx_bytes; data[2] += rx_packets; data[3] += rx_bytes; } if (ds->ops->get_ethtool_stats) ds->ops->get_ethtool_stats(ds, dp->index, data + 4); } static int dsa_user_get_sset_count(struct net_device *dev, int sset) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (sset == ETH_SS_STATS) { int count = 0; if (ds->ops->get_sset_count) { count = ds->ops->get_sset_count(ds, dp->index, sset); if (count < 0) return count; } return count + 4; } else if (sset == ETH_SS_TEST) { return net_selftest_get_count(); } return -EOPNOTSUPP; } static void dsa_user_get_eth_phy_stats(struct net_device *dev, struct ethtool_eth_phy_stats *phy_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eth_phy_stats) ds->ops->get_eth_phy_stats(ds, dp->index, phy_stats); } static void dsa_user_get_eth_mac_stats(struct net_device *dev, struct ethtool_eth_mac_stats *mac_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eth_mac_stats) ds->ops->get_eth_mac_stats(ds, dp->index, mac_stats); } static void dsa_user_get_eth_ctrl_stats(struct net_device *dev, struct ethtool_eth_ctrl_stats *ctrl_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eth_ctrl_stats) ds->ops->get_eth_ctrl_stats(ds, dp->index, ctrl_stats); } static void dsa_user_get_rmon_stats(struct net_device *dev, struct ethtool_rmon_stats *rmon_stats, const struct ethtool_rmon_hist_range **ranges) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_rmon_stats) ds->ops->get_rmon_stats(ds, dp->index, rmon_stats, ranges); } static void dsa_user_net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf) { struct dsa_port *dp = dsa_user_to_port(ndev); struct dsa_switch *ds = dp->ds; if (ds->ops->self_test) { ds->ops->self_test(ds, dp->index, etest, buf); return; } net_selftest(ndev, etest, buf); } static int dsa_user_get_mm(struct net_device *dev, struct ethtool_mm_state *state) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->get_mm) return -EOPNOTSUPP; return ds->ops->get_mm(ds, dp->index, state); } static int dsa_user_set_mm(struct net_device *dev, struct ethtool_mm_cfg *cfg, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->set_mm) return -EOPNOTSUPP; return ds->ops->set_mm(ds, dp->index, cfg, extack); } static void dsa_user_get_mm_stats(struct net_device *dev, struct ethtool_mm_stats *stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_mm_stats) ds->ops->get_mm_stats(ds, dp->index, stats); } static void dsa_user_get_wol(struct net_device *dev, struct ethtool_wolinfo *w) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; phylink_ethtool_get_wol(dp->pl, w); if (ds->ops->get_wol) ds->ops->get_wol(ds, dp->index, w); } static int dsa_user_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int ret = -EOPNOTSUPP; phylink_ethtool_set_wol(dp->pl, w); if (ds->ops->set_wol) ret = ds->ops->set_wol(ds, dp->index, w); return ret; } static int dsa_user_set_eee(struct net_device *dev, struct ethtool_keee *e) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int ret; /* Port's PHY and MAC both need to be EEE capable */ if (!dev->phydev || !dp->pl) return -ENODEV; if (!ds->ops->set_mac_eee) return -EOPNOTSUPP; ret = ds->ops->set_mac_eee(ds, dp->index, e); if (ret) return ret; return phylink_ethtool_set_eee(dp->pl, e); } static int dsa_user_get_eee(struct net_device *dev, struct ethtool_keee *e) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int ret; /* Port's PHY and MAC both need to be EEE capable */ if (!dev->phydev || !dp->pl) return -ENODEV; if (!ds->ops->get_mac_eee) return -EOPNOTSUPP; ret = ds->ops->get_mac_eee(ds, dp->index, e); if (ret) return ret; return phylink_ethtool_get_eee(dp->pl, e); } static int dsa_user_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct dsa_port *dp = dsa_user_to_port(dev); return phylink_ethtool_ksettings_get(dp->pl, cmd); } static int dsa_user_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd) { struct dsa_port *dp = dsa_user_to_port(dev); return phylink_ethtool_ksettings_set(dp->pl, cmd); } static void dsa_user_get_pause_stats(struct net_device *dev, struct ethtool_pause_stats *pause_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_pause_stats) ds->ops->get_pause_stats(ds, dp->index, pause_stats); } static void dsa_user_get_pauseparam(struct net_device *dev, struct ethtool_pauseparam *pause) { struct dsa_port *dp = dsa_user_to_port(dev); phylink_ethtool_get_pauseparam(dp->pl, pause); } static int dsa_user_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam *pause) { struct dsa_port *dp = dsa_user_to_port(dev); return phylink_ethtool_set_pauseparam(dp->pl, pause); } #ifdef CONFIG_NET_POLL_CONTROLLER static int dsa_user_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_user_priv *p = netdev_priv(dev); struct netpoll *netpoll; int err = 0; netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); if (!netpoll) return -ENOMEM; err = __netpoll_setup(netpoll, conduit); if (err) { kfree(netpoll); goto out; } p->netpoll = netpoll; out: return err; } static void dsa_user_netpoll_cleanup(struct net_device *dev) { struct dsa_user_priv *p = netdev_priv(dev); struct netpoll *netpoll = p->netpoll; if (!netpoll) return; p->netpoll = NULL; __netpoll_free(netpoll); } static void dsa_user_poll_controller(struct net_device *dev) { } #endif static struct dsa_mall_tc_entry * dsa_user_mall_tc_entry_find(struct net_device *dev, unsigned long cookie) { struct dsa_user_priv *p = netdev_priv(dev); struct dsa_mall_tc_entry *mall_tc_entry; list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) if (mall_tc_entry->cookie == cookie) return mall_tc_entry; return NULL; } static int dsa_user_add_cls_matchall_mirred(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { struct netlink_ext_ack *extack = cls->common.extack; struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_user_priv *p = netdev_priv(dev); struct dsa_mall_mirror_tc_entry *mirror; struct dsa_mall_tc_entry *mall_tc_entry; struct dsa_switch *ds = dp->ds; struct flow_action_entry *act; struct dsa_port *to_dp; int err; if (!ds->ops->port_mirror_add) return -EOPNOTSUPP; if (!flow_action_basic_hw_stats_check(&cls->rule->action, cls->common.extack)) return -EOPNOTSUPP; act = &cls->rule->action.entries[0]; if (!act->dev) return -EINVAL; if (!dsa_user_dev_check(act->dev)) return -EOPNOTSUPP; mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); if (!mall_tc_entry) return -ENOMEM; mall_tc_entry->cookie = cls->cookie; mall_tc_entry->type = DSA_PORT_MALL_MIRROR; mirror = &mall_tc_entry->mirror; to_dp = dsa_user_to_port(act->dev); mirror->to_local_port = to_dp->index; mirror->ingress = ingress; err = ds->ops->port_mirror_add(ds, dp->index, mirror, ingress, extack); if (err) { kfree(mall_tc_entry); return err; } list_add_tail(&mall_tc_entry->list, &p->mall_tc_list); return err; } static int dsa_user_add_cls_matchall_police(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { struct netlink_ext_ack *extack = cls->common.extack; struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_user_priv *p = netdev_priv(dev); struct dsa_mall_policer_tc_entry *policer; struct dsa_mall_tc_entry *mall_tc_entry; struct dsa_switch *ds = dp->ds; struct flow_action_entry *act; int err; if (!ds->ops->port_policer_add) { NL_SET_ERR_MSG_MOD(extack, "Policing offload not implemented"); return -EOPNOTSUPP; } if (!ingress) { NL_SET_ERR_MSG_MOD(extack, "Only supported on ingress qdisc"); return -EOPNOTSUPP; } if (!flow_action_basic_hw_stats_check(&cls->rule->action, cls->common.extack)) return -EOPNOTSUPP; list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) { if (mall_tc_entry->type == DSA_PORT_MALL_POLICER) { NL_SET_ERR_MSG_MOD(extack, "Only one port policer allowed"); return -EEXIST; } } act = &cls->rule->action.entries[0]; mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); if (!mall_tc_entry) return -ENOMEM; mall_tc_entry->cookie = cls->cookie; mall_tc_entry->type = DSA_PORT_MALL_POLICER; policer = &mall_tc_entry->policer; policer->rate_bytes_per_sec = act->police.rate_bytes_ps; policer->burst = act->police.burst; err = ds->ops->port_policer_add(ds, dp->index, policer); if (err) { kfree(mall_tc_entry); return err; } list_add_tail(&mall_tc_entry->list, &p->mall_tc_list); return err; } static int dsa_user_add_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { int err = -EOPNOTSUPP; if (cls->common.protocol == htons(ETH_P_ALL) && flow_offload_has_one_action(&cls->rule->action) && cls->rule->action.entries[0].id == FLOW_ACTION_MIRRED) err = dsa_user_add_cls_matchall_mirred(dev, cls, ingress); else if (flow_offload_has_one_action(&cls->rule->action) && cls->rule->action.entries[0].id == FLOW_ACTION_POLICE) err = dsa_user_add_cls_matchall_police(dev, cls, ingress); return err; } static void dsa_user_del_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_mall_tc_entry *mall_tc_entry; struct dsa_switch *ds = dp->ds; mall_tc_entry = dsa_user_mall_tc_entry_find(dev, cls->cookie); if (!mall_tc_entry) return; list_del(&mall_tc_entry->list); switch (mall_tc_entry->type) { case DSA_PORT_MALL_MIRROR: if (ds->ops->port_mirror_del) ds->ops->port_mirror_del(ds, dp->index, &mall_tc_entry->mirror); break; case DSA_PORT_MALL_POLICER: if (ds->ops->port_policer_del) ds->ops->port_policer_del(ds, dp->index); break; default: WARN_ON(1); } kfree(mall_tc_entry); } static int dsa_user_setup_tc_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { if (cls->common.chain_index) return -EOPNOTSUPP; switch (cls->command) { case TC_CLSMATCHALL_REPLACE: return dsa_user_add_cls_matchall(dev, cls, ingress); case TC_CLSMATCHALL_DESTROY: dsa_user_del_cls_matchall(dev, cls); return 0; default: return -EOPNOTSUPP; } } static int dsa_user_add_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->cls_flower_add) return -EOPNOTSUPP; return ds->ops->cls_flower_add(ds, port, cls, ingress); } static int dsa_user_del_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->cls_flower_del) return -EOPNOTSUPP; return ds->ops->cls_flower_del(ds, port, cls, ingress); } static int dsa_user_stats_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->cls_flower_stats) return -EOPNOTSUPP; return ds->ops->cls_flower_stats(ds, port, cls, ingress); } static int dsa_user_setup_tc_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { switch (cls->command) { case FLOW_CLS_REPLACE: return dsa_user_add_cls_flower(dev, cls, ingress); case FLOW_CLS_DESTROY: return dsa_user_del_cls_flower(dev, cls, ingress); case FLOW_CLS_STATS: return dsa_user_stats_cls_flower(dev, cls, ingress); default: return -EOPNOTSUPP; } } static int dsa_user_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv, bool ingress) { struct net_device *dev = cb_priv; if (!tc_can_offload(dev)) return -EOPNOTSUPP; switch (type) { case TC_SETUP_CLSMATCHALL: return dsa_user_setup_tc_cls_matchall(dev, type_data, ingress); case TC_SETUP_CLSFLOWER: return dsa_user_setup_tc_cls_flower(dev, type_data, ingress); default: return -EOPNOTSUPP; } } static int dsa_user_setup_tc_block_cb_ig(enum tc_setup_type type, void *type_data, void *cb_priv) { return dsa_user_setup_tc_block_cb(type, type_data, cb_priv, true); } static int dsa_user_setup_tc_block_cb_eg(enum tc_setup_type type, void *type_data, void *cb_priv) { return dsa_user_setup_tc_block_cb(type, type_data, cb_priv, false); } static LIST_HEAD(dsa_user_block_cb_list); static int dsa_user_setup_tc_block(struct net_device *dev, struct flow_block_offload *f) { struct flow_block_cb *block_cb; flow_setup_cb_t *cb; if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) cb = dsa_user_setup_tc_block_cb_ig; else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) cb = dsa_user_setup_tc_block_cb_eg; else return -EOPNOTSUPP; f->driver_block_list = &dsa_user_block_cb_list; switch (f->command) { case FLOW_BLOCK_BIND: if (flow_block_cb_is_busy(cb, dev, &dsa_user_block_cb_list)) return -EBUSY; block_cb = flow_block_cb_alloc(cb, dev, dev, NULL); if (IS_ERR(block_cb)) return PTR_ERR(block_cb); flow_block_cb_add(block_cb, f); list_add_tail(&block_cb->driver_list, &dsa_user_block_cb_list); return 0; case FLOW_BLOCK_UNBIND: block_cb = flow_block_cb_lookup(f->block, cb, dev); if (!block_cb) return -ENOENT; flow_block_cb_remove(block_cb, f); list_del(&block_cb->driver_list); return 0; default: return -EOPNOTSUPP; } } static int dsa_user_setup_ft_block(struct dsa_switch *ds, int port, void *type_data) { struct net_device *conduit = dsa_port_to_conduit(dsa_to_port(ds, port)); if (!conduit->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; return conduit->netdev_ops->ndo_setup_tc(conduit, TC_SETUP_FT, type_data); } static int dsa_user_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; switch (type) { case TC_SETUP_BLOCK: return dsa_user_setup_tc_block(dev, type_data); case TC_SETUP_FT: return dsa_user_setup_ft_block(ds, dp->index, type_data); default: break; } if (!ds->ops->port_setup_tc) return -EOPNOTSUPP; return ds->ops->port_setup_tc(ds, dp->index, type, type_data); } static int dsa_user_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *nfc, u32 *rule_locs) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->get_rxnfc) return -EOPNOTSUPP; return ds->ops->get_rxnfc(ds, dp->index, nfc, rule_locs); } static int dsa_user_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *nfc) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->set_rxnfc) return -EOPNOTSUPP; return ds->ops->set_rxnfc(ds, dp->index, nfc); } static int dsa_user_get_ts_info(struct net_device *dev, struct ethtool_ts_info *ts) { struct dsa_user_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->dp->ds; if (!ds->ops->get_ts_info) return -EOPNOTSUPP; return ds->ops->get_ts_info(ds, p->dp->index, ts); } static int dsa_user_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan vlan = { .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, .vid = vid, /* This API only allows programming tagged, non-PVID VIDs */ .flags = 0, }; struct netlink_ext_ack extack = {0}; struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; struct dsa_vlan *v; int ret; /* User port... */ ret = dsa_port_vlan_add(dp, &vlan, &extack); if (ret) { if (extack._msg) netdev_err(dev, "%s\n", extack._msg); return ret; } /* And CPU port... */ ret = dsa_port_host_vlan_add(dp, &vlan, &extack); if (ret) { if (extack._msg) netdev_err(dev, "CPU port %d: %s\n", dp->cpu_dp->index, extack._msg); return ret; } if (!dsa_switch_supports_uc_filtering(ds) && !dsa_switch_supports_mc_filtering(ds)) return 0; v = kzalloc(sizeof(*v), GFP_KERNEL); if (!v) { ret = -ENOMEM; goto rollback; } netif_addr_lock_bh(dev); v->vid = vid; list_add_tail(&v->list, &dp->user_vlans); if (dsa_switch_supports_mc_filtering(ds)) { netdev_for_each_synced_mc_addr(ha, dev) { dsa_user_schedule_standalone_work(dev, DSA_MC_ADD, ha->addr, vid); } } if (dsa_switch_supports_uc_filtering(ds)) { netdev_for_each_synced_uc_addr(ha, dev) { dsa_user_schedule_standalone_work(dev, DSA_UC_ADD, ha->addr, vid); } } netif_addr_unlock_bh(dev); dsa_flush_workqueue(); return 0; rollback: dsa_port_host_vlan_del(dp, &vlan); dsa_port_vlan_del(dp, &vlan); return ret; } static int dsa_user_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan vlan = { .vid = vid, /* This API only allows programming tagged, non-PVID VIDs */ .flags = 0, }; struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; struct dsa_vlan *v; int err; err = dsa_port_vlan_del(dp, &vlan); if (err) return err; err = dsa_port_host_vlan_del(dp, &vlan); if (err) return err; if (!dsa_switch_supports_uc_filtering(ds) && !dsa_switch_supports_mc_filtering(ds)) return 0; netif_addr_lock_bh(dev); v = dsa_vlan_find(&dp->user_vlans, &vlan); if (!v) { netif_addr_unlock_bh(dev); return -ENOENT; } list_del(&v->list); kfree(v); if (dsa_switch_supports_mc_filtering(ds)) { netdev_for_each_synced_mc_addr(ha, dev) { dsa_user_schedule_standalone_work(dev, DSA_MC_DEL, ha->addr, vid); } } if (dsa_switch_supports_uc_filtering(ds)) { netdev_for_each_synced_uc_addr(ha, dev) { dsa_user_schedule_standalone_work(dev, DSA_UC_DEL, ha->addr, vid); } } netif_addr_unlock_bh(dev); dsa_flush_workqueue(); return 0; } static int dsa_user_restore_vlan(struct net_device *vdev, int vid, void *arg) { __be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q); return dsa_user_vlan_rx_add_vid(arg, proto, vid); } static int dsa_user_clear_vlan(struct net_device *vdev, int vid, void *arg) { __be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q); return dsa_user_vlan_rx_kill_vid(arg, proto, vid); } /* Keep the VLAN RX filtering list in sync with the hardware only if VLAN * filtering is enabled. The baseline is that only ports that offload a * VLAN-aware bridge are VLAN-aware, and standalone ports are VLAN-unaware, * but there are exceptions for quirky hardware. * * If ds->vlan_filtering_is_global = true, then standalone ports which share * the same switch with other ports that offload a VLAN-aware bridge are also * inevitably VLAN-aware. * * To summarize, a DSA switch port offloads: * * - If standalone (this includes software bridge, software LAG): * - if ds->needs_standalone_vlan_filtering = true, OR if * (ds->vlan_filtering_is_global = true AND there are bridges spanning * this switch chip which have vlan_filtering=1) * - the 8021q upper VLANs * - else (standalone VLAN filtering is not needed, VLAN filtering is not * global, or it is, but no port is under a VLAN-aware bridge): * - no VLAN (any 8021q upper is a software VLAN) * * - If under a vlan_filtering=0 bridge which it offload: * - if ds->configure_vlan_while_not_filtering = true (default): * - the bridge VLANs. These VLANs are committed to hardware but inactive. * - else (deprecated): * - no VLAN. The bridge VLANs are not restored when VLAN awareness is * enabled, so this behavior is broken and discouraged. * * - If under a vlan_filtering=1 bridge which it offload: * - the bridge VLANs * - the 8021q upper VLANs */ int dsa_user_manage_vlan_filtering(struct net_device *user, bool vlan_filtering) { int err; if (vlan_filtering) { user->features |= NETIF_F_HW_VLAN_CTAG_FILTER; err = vlan_for_each(user, dsa_user_restore_vlan, user); if (err) { vlan_for_each(user, dsa_user_clear_vlan, user); user->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; return err; } } else { err = vlan_for_each(user, dsa_user_clear_vlan, user); if (err) return err; user->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; } return 0; } struct dsa_hw_port { struct list_head list; struct net_device *dev; int old_mtu; }; static int dsa_hw_port_list_set_mtu(struct list_head *hw_port_list, int mtu) { const struct dsa_hw_port *p; int err; list_for_each_entry(p, hw_port_list, list) { if (p->dev->mtu == mtu) continue; err = dev_set_mtu(p->dev, mtu); if (err) goto rollback; } return 0; rollback: list_for_each_entry_continue_reverse(p, hw_port_list, list) { if (p->dev->mtu == p->old_mtu) continue; if (dev_set_mtu(p->dev, p->old_mtu)) netdev_err(p->dev, "Failed to restore MTU\n"); } return err; } static void dsa_hw_port_list_free(struct list_head *hw_port_list) { struct dsa_hw_port *p, *n; list_for_each_entry_safe(p, n, hw_port_list, list) kfree(p); } /* Make the hardware datapath to/from @dev limited to a common MTU */ static void dsa_bridge_mtu_normalization(struct dsa_port *dp) { struct list_head hw_port_list; struct dsa_switch_tree *dst; int min_mtu = ETH_MAX_MTU; struct dsa_port *other_dp; int err; if (!dp->ds->mtu_enforcement_ingress) return; if (!dp->bridge) return; INIT_LIST_HEAD(&hw_port_list); /* Populate the list of ports that are part of the same bridge * as the newly added/modified port */ list_for_each_entry(dst, &dsa_tree_list, list) { list_for_each_entry(other_dp, &dst->ports, list) { struct dsa_hw_port *hw_port; struct net_device *user; if (other_dp->type != DSA_PORT_TYPE_USER) continue; if (!dsa_port_bridge_same(dp, other_dp)) continue; if (!other_dp->ds->mtu_enforcement_ingress) continue; user = other_dp->user; if (min_mtu > user->mtu) min_mtu = user->mtu; hw_port = kzalloc(sizeof(*hw_port), GFP_KERNEL); if (!hw_port) goto out; hw_port->dev = user; hw_port->old_mtu = user->mtu; list_add(&hw_port->list, &hw_port_list); } } /* Attempt to configure the entire hardware bridge to the newly added * interface's MTU first, regardless of whether the intention of the * user was to raise or lower it. */ err = dsa_hw_port_list_set_mtu(&hw_port_list, dp->user->mtu); if (!err) goto out; /* Clearly that didn't work out so well, so just set the minimum MTU on * all hardware bridge ports now. If this fails too, then all ports will * still have their old MTU rolled back anyway. */ dsa_hw_port_list_set_mtu(&hw_port_list, min_mtu); out: dsa_hw_port_list_free(&hw_port_list); } int dsa_user_change_mtu(struct net_device *dev, int new_mtu) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_port *cpu_dp = dp->cpu_dp; struct dsa_switch *ds = dp->ds; struct dsa_port *other_dp; int largest_mtu = 0; int new_conduit_mtu; int old_conduit_mtu; int mtu_limit; int overhead; int cpu_mtu; int err; if (!ds->ops->port_change_mtu) return -EOPNOTSUPP; dsa_tree_for_each_user_port(other_dp, ds->dst) { int user_mtu; /* During probe, this function will be called for each user * device, while not all of them have been allocated. That's * ok, it doesn't change what the maximum is, so ignore it. */ if (!other_dp->user) continue; /* Pretend that we already applied the setting, which we * actually haven't (still haven't done all integrity checks) */ if (dp == other_dp) user_mtu = new_mtu; else user_mtu = other_dp->user->mtu; if (largest_mtu < user_mtu) largest_mtu = user_mtu; } overhead = dsa_tag_protocol_overhead(cpu_dp->tag_ops); mtu_limit = min_t(int, conduit->max_mtu, dev->max_mtu + overhead); old_conduit_mtu = conduit->mtu; new_conduit_mtu = largest_mtu + overhead; if (new_conduit_mtu > mtu_limit) return -ERANGE; /* If the conduit MTU isn't over limit, there's no need to check the CPU * MTU, since that surely isn't either. */ cpu_mtu = largest_mtu; /* Start applying stuff */ if (new_conduit_mtu != old_conduit_mtu) { err = dev_set_mtu(conduit, new_conduit_mtu); if (err < 0) goto out_conduit_failed; /* We only need to propagate the MTU of the CPU port to * upstream switches, so emit a notifier which updates them. */ err = dsa_port_mtu_change(cpu_dp, cpu_mtu); if (err) goto out_cpu_failed; } err = ds->ops->port_change_mtu(ds, dp->index, new_mtu); if (err) goto out_port_failed; WRITE_ONCE(dev->mtu, new_mtu); dsa_bridge_mtu_normalization(dp); return 0; out_port_failed: if (new_conduit_mtu != old_conduit_mtu) dsa_port_mtu_change(cpu_dp, old_conduit_mtu - overhead); out_cpu_failed: if (new_conduit_mtu != old_conduit_mtu) dev_set_mtu(conduit, old_conduit_mtu); out_conduit_failed: return err; } static int __maybe_unused dsa_user_dcbnl_set_apptrust(struct net_device *dev, u8 *sel, int nsel) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->port_set_apptrust) return -EOPNOTSUPP; return ds->ops->port_set_apptrust(ds, port, sel, nsel); } static int __maybe_unused dsa_user_dcbnl_get_apptrust(struct net_device *dev, u8 *sel, int *nsel) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->port_get_apptrust) return -EOPNOTSUPP; return ds->ops->port_get_apptrust(ds, port, sel, nsel); } static int __maybe_unused dsa_user_dcbnl_set_default_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; unsigned long mask, new_prio; int err, port = dp->index; if (!ds->ops->port_set_default_prio) return -EOPNOTSUPP; err = dcb_ieee_setapp(dev, app); if (err) return err; mask = dcb_ieee_getapp_mask(dev, app); new_prio = __fls(mask); err = ds->ops->port_set_default_prio(ds, port, new_prio); if (err) { dcb_ieee_delapp(dev, app); return err; } return 0; } /* Update the DSCP prio entries on all user ports of the switch in case * the switch supports global DSCP prio instead of per port DSCP prios. */ static int dsa_user_dcbnl_ieee_global_dscp_setdel(struct net_device *dev, struct dcb_app *app, bool del) { int (*setdel)(struct net_device *dev, struct dcb_app *app); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct dsa_port *other_dp; int err, restore_err; if (del) setdel = dcb_ieee_delapp; else setdel = dcb_ieee_setapp; dsa_switch_for_each_user_port(other_dp, ds) { struct net_device *user = other_dp->user; if (!user || user == dev) continue; err = setdel(user, app); if (err) goto err_try_to_restore; } return 0; err_try_to_restore: /* Revert logic to restore previous state of app entries */ if (!del) setdel = dcb_ieee_delapp; else setdel = dcb_ieee_setapp; dsa_switch_for_each_user_port_continue_reverse(other_dp, ds) { struct net_device *user = other_dp->user; if (!user || user == dev) continue; restore_err = setdel(user, app); if (restore_err) netdev_err(user, "Failed to restore DSCP prio entry configuration\n"); } return err; } static int __maybe_unused dsa_user_dcbnl_add_dscp_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; unsigned long mask, new_prio; int err, port = dp->index; u8 dscp = app->protocol; if (!ds->ops->port_add_dscp_prio) return -EOPNOTSUPP; if (dscp >= 64) { netdev_err(dev, "DSCP APP entry with protocol value %u is invalid\n", dscp); return -EINVAL; } err = dcb_ieee_setapp(dev, app); if (err) return err; mask = dcb_ieee_getapp_mask(dev, app); new_prio = __fls(mask); err = ds->ops->port_add_dscp_prio(ds, port, dscp, new_prio); if (err) { dcb_ieee_delapp(dev, app); return err; } if (!ds->dscp_prio_mapping_is_global) return 0; err = dsa_user_dcbnl_ieee_global_dscp_setdel(dev, app, false); if (err) { if (ds->ops->port_del_dscp_prio) ds->ops->port_del_dscp_prio(ds, port, dscp, new_prio); dcb_ieee_delapp(dev, app); return err; } return 0; } static int __maybe_unused dsa_user_dcbnl_ieee_setapp(struct net_device *dev, struct dcb_app *app) { switch (app->selector) { case IEEE_8021QAZ_APP_SEL_ETHERTYPE: switch (app->protocol) { case 0: return dsa_user_dcbnl_set_default_prio(dev, app); default: return -EOPNOTSUPP; } break; case IEEE_8021QAZ_APP_SEL_DSCP: return dsa_user_dcbnl_add_dscp_prio(dev, app); default: return -EOPNOTSUPP; } } static int __maybe_unused dsa_user_dcbnl_del_default_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; unsigned long mask, new_prio; int err, port = dp->index; if (!ds->ops->port_set_default_prio) return -EOPNOTSUPP; err = dcb_ieee_delapp(dev, app); if (err) return err; mask = dcb_ieee_getapp_mask(dev, app); new_prio = mask ? __fls(mask) : 0; err = ds->ops->port_set_default_prio(ds, port, new_prio); if (err) { dcb_ieee_setapp(dev, app); return err; } return 0; } static int __maybe_unused dsa_user_dcbnl_del_dscp_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int err, port = dp->index; u8 dscp = app->protocol; if (!ds->ops->port_del_dscp_prio) return -EOPNOTSUPP; err = dcb_ieee_delapp(dev, app); if (err) return err; err = ds->ops->port_del_dscp_prio(ds, port, dscp, app->priority); if (err) { dcb_ieee_setapp(dev, app); return err; } if (!ds->dscp_prio_mapping_is_global) return 0; err = dsa_user_dcbnl_ieee_global_dscp_setdel(dev, app, true); if (err) { if (ds->ops->port_add_dscp_prio) ds->ops->port_add_dscp_prio(ds, port, dscp, app->priority); dcb_ieee_setapp(dev, app); return err; } return 0; } static int __maybe_unused dsa_user_dcbnl_ieee_delapp(struct net_device *dev, struct dcb_app *app) { switch (app->selector) { case IEEE_8021QAZ_APP_SEL_ETHERTYPE: switch (app->protocol) { case 0: return dsa_user_dcbnl_del_default_prio(dev, app); default: return -EOPNOTSUPP; } break; case IEEE_8021QAZ_APP_SEL_DSCP: return dsa_user_dcbnl_del_dscp_prio(dev, app); default: return -EOPNOTSUPP; } } /* Pre-populate the DCB application priority table with the priorities * configured during switch setup, which we read from hardware here. */ static int dsa_user_dcbnl_init(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; int err; if (ds->ops->port_get_default_prio) { int prio = ds->ops->port_get_default_prio(ds, port); struct dcb_app app = { .selector = IEEE_8021QAZ_APP_SEL_ETHERTYPE, .protocol = 0, .priority = prio, }; if (prio < 0) return prio; err = dcb_ieee_setapp(dev, &app); if (err) return err; } if (ds->ops->port_get_dscp_prio) { int protocol; for (protocol = 0; protocol < 64; protocol++) { struct dcb_app app = { .selector = IEEE_8021QAZ_APP_SEL_DSCP, .protocol = protocol, }; int prio; prio = ds->ops->port_get_dscp_prio(ds, port, protocol); if (prio == -EOPNOTSUPP) continue; if (prio < 0) return prio; app.priority = prio; err = dcb_ieee_setapp(dev, &app); if (err) return err; } } return 0; } static const struct ethtool_ops dsa_user_ethtool_ops = { .get_drvinfo = dsa_user_get_drvinfo, .get_regs_len = dsa_user_get_regs_len, .get_regs = dsa_user_get_regs, .nway_reset = dsa_user_nway_reset, .get_link = ethtool_op_get_link, .get_eeprom_len = dsa_user_get_eeprom_len, .get_eeprom = dsa_user_get_eeprom, .set_eeprom = dsa_user_set_eeprom, .get_strings = dsa_user_get_strings, .get_ethtool_stats = dsa_user_get_ethtool_stats, .get_sset_count = dsa_user_get_sset_count, .get_eth_phy_stats = dsa_user_get_eth_phy_stats, .get_eth_mac_stats = dsa_user_get_eth_mac_stats, .get_eth_ctrl_stats = dsa_user_get_eth_ctrl_stats, .get_rmon_stats = dsa_user_get_rmon_stats, .set_wol = dsa_user_set_wol, .get_wol = dsa_user_get_wol, .set_eee = dsa_user_set_eee, .get_eee = dsa_user_get_eee, .get_link_ksettings = dsa_user_get_link_ksettings, .set_link_ksettings = dsa_user_set_link_ksettings, .get_pause_stats = dsa_user_get_pause_stats, .get_pauseparam = dsa_user_get_pauseparam, .set_pauseparam = dsa_user_set_pauseparam, .get_rxnfc = dsa_user_get_rxnfc, .set_rxnfc = dsa_user_set_rxnfc, .get_ts_info = dsa_user_get_ts_info, .self_test = dsa_user_net_selftest, .get_mm = dsa_user_get_mm, .set_mm = dsa_user_set_mm, .get_mm_stats = dsa_user_get_mm_stats, }; static const struct dcbnl_rtnl_ops __maybe_unused dsa_user_dcbnl_ops = { .ieee_setapp = dsa_user_dcbnl_ieee_setapp, .ieee_delapp = dsa_user_dcbnl_ieee_delapp, .dcbnl_setapptrust = dsa_user_dcbnl_set_apptrust, .dcbnl_getapptrust = dsa_user_dcbnl_get_apptrust, }; static void dsa_user_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *s) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_stats64) ds->ops->get_stats64(ds, dp->index, s); else dev_get_tstats64(dev, s); } static int dsa_user_fill_forward_path(struct net_device_path_ctx *ctx, struct net_device_path *path) { struct dsa_port *dp = dsa_user_to_port(ctx->dev); struct net_device *conduit = dsa_port_to_conduit(dp); struct dsa_port *cpu_dp = dp->cpu_dp; path->dev = ctx->dev; path->type = DEV_PATH_DSA; path->dsa.proto = cpu_dp->tag_ops->proto; path->dsa.port = dp->index; ctx->dev = conduit; return 0; } static const struct net_device_ops dsa_user_netdev_ops = { .ndo_open = dsa_user_open, .ndo_stop = dsa_user_close, .ndo_start_xmit = dsa_user_xmit, .ndo_change_rx_flags = dsa_user_change_rx_flags, .ndo_set_rx_mode = dsa_user_set_rx_mode, .ndo_set_mac_address = dsa_user_set_mac_address, .ndo_fdb_dump = dsa_user_fdb_dump, .ndo_eth_ioctl = dsa_user_ioctl, .ndo_get_iflink = dsa_user_get_iflink, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_netpoll_setup = dsa_user_netpoll_setup, .ndo_netpoll_cleanup = dsa_user_netpoll_cleanup, .ndo_poll_controller = dsa_user_poll_controller, #endif .ndo_setup_tc = dsa_user_setup_tc, .ndo_get_stats64 = dsa_user_get_stats64, .ndo_vlan_rx_add_vid = dsa_user_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = dsa_user_vlan_rx_kill_vid, .ndo_change_mtu = dsa_user_change_mtu, .ndo_fill_forward_path = dsa_user_fill_forward_path, }; static const struct device_type dsa_type = { .name = "dsa", }; void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up) { const struct dsa_port *dp = dsa_to_port(ds, port); if (dp->pl) phylink_mac_change(dp->pl, up); } EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change); static void dsa_user_phylink_fixed_state(struct phylink_config *config, struct phylink_link_state *state) { struct dsa_port *dp = dsa_phylink_to_port(config); struct dsa_switch *ds = dp->ds; /* No need to check that this operation is valid, the callback would * not be called if it was not. */ ds->ops->phylink_fixed_state(ds, dp->index, state); } /* user device setup *******************************************************/ static int dsa_user_phy_connect(struct net_device *user_dev, int addr, u32 flags) { struct dsa_port *dp = dsa_user_to_port(user_dev); struct dsa_switch *ds = dp->ds; user_dev->phydev = mdiobus_get_phy(ds->user_mii_bus, addr); if (!user_dev->phydev) { netdev_err(user_dev, "no phy at %d\n", addr); return -ENODEV; } user_dev->phydev->dev_flags |= flags; return phylink_connect_phy(dp->pl, user_dev->phydev); } static int dsa_user_phy_setup(struct net_device *user_dev) { struct dsa_port *dp = dsa_user_to_port(user_dev); struct device_node *port_dn = dp->dn; struct dsa_switch *ds = dp->ds; u32 phy_flags = 0; int ret; dp->pl_config.dev = &user_dev->dev; dp->pl_config.type = PHYLINK_NETDEV; /* The get_fixed_state callback takes precedence over polling the * link GPIO in PHYLINK (see phylink_get_fixed_state). Only set * this if the switch provides such a callback. */ if (ds->ops->phylink_fixed_state) { dp->pl_config.get_fixed_state = dsa_user_phylink_fixed_state; dp->pl_config.poll_fixed_state = true; } ret = dsa_port_phylink_create(dp); if (ret) return ret; if (ds->ops->get_phy_flags) phy_flags = ds->ops->get_phy_flags(ds, dp->index); ret = phylink_of_phy_connect(dp->pl, port_dn, phy_flags); if (ret == -ENODEV && ds->user_mii_bus) { /* We could not connect to a designated PHY or SFP, so try to * use the switch internal MDIO bus instead */ ret = dsa_user_phy_connect(user_dev, dp->index, phy_flags); } if (ret) { netdev_err(user_dev, "failed to connect to PHY: %pe\n", ERR_PTR(ret)); dsa_port_phylink_destroy(dp); } return ret; } void dsa_user_setup_tagger(struct net_device *user) { struct dsa_port *dp = dsa_user_to_port(user); struct net_device *conduit = dsa_port_to_conduit(dp); struct dsa_user_priv *p = netdev_priv(user); const struct dsa_port *cpu_dp = dp->cpu_dp; const struct dsa_switch *ds = dp->ds; user->needed_headroom = cpu_dp->tag_ops->needed_headroom; user->needed_tailroom = cpu_dp->tag_ops->needed_tailroom; /* Try to save one extra realloc later in the TX path (in the conduit) * by also inheriting the conduit's needed headroom and tailroom. * The 8021q driver also does this. */ user->needed_headroom += conduit->needed_headroom; user->needed_tailroom += conduit->needed_tailroom; p->xmit = cpu_dp->tag_ops->xmit; user->features = conduit->vlan_features | NETIF_F_HW_TC; user->hw_features |= NETIF_F_HW_TC; user->features |= NETIF_F_LLTX; if (user->needed_tailroom) user->features &= ~(NETIF_F_SG | NETIF_F_FRAGLIST); if (ds->needs_standalone_vlan_filtering) user->features |= NETIF_F_HW_VLAN_CTAG_FILTER; } int dsa_user_suspend(struct net_device *user_dev) { struct dsa_port *dp = dsa_user_to_port(user_dev); if (!netif_running(user_dev)) return 0; netif_device_detach(user_dev); rtnl_lock(); phylink_stop(dp->pl); rtnl_unlock(); return 0; } int dsa_user_resume(struct net_device *user_dev) { struct dsa_port *dp = dsa_user_to_port(user_dev); if (!netif_running(user_dev)) return 0; netif_device_attach(user_dev); rtnl_lock(); phylink_start(dp->pl); rtnl_unlock(); return 0; } int dsa_user_create(struct dsa_port *port) { struct net_device *conduit = dsa_port_to_conduit(port); struct dsa_switch *ds = port->ds; struct net_device *user_dev; struct dsa_user_priv *p; const char *name; int assign_type; int ret; if (!ds->num_tx_queues) ds->num_tx_queues = 1; if (port->name) { name = port->name; assign_type = NET_NAME_PREDICTABLE; } else { name = "eth%d"; assign_type = NET_NAME_ENUM; } user_dev = alloc_netdev_mqs(sizeof(struct dsa_user_priv), name, assign_type, ether_setup, ds->num_tx_queues, 1); if (user_dev == NULL) return -ENOMEM; user_dev->rtnl_link_ops = &dsa_link_ops; user_dev->ethtool_ops = &dsa_user_ethtool_ops; #if IS_ENABLED(CONFIG_DCB) user_dev->dcbnl_ops = &dsa_user_dcbnl_ops; #endif if (!is_zero_ether_addr(port->mac)) eth_hw_addr_set(user_dev, port->mac); else eth_hw_addr_inherit(user_dev, conduit); user_dev->priv_flags |= IFF_NO_QUEUE; if (dsa_switch_supports_uc_filtering(ds)) user_dev->priv_flags |= IFF_UNICAST_FLT; user_dev->netdev_ops = &dsa_user_netdev_ops; if (ds->ops->port_max_mtu) user_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index); SET_NETDEV_DEVTYPE(user_dev, &dsa_type); SET_NETDEV_DEV(user_dev, port->ds->dev); SET_NETDEV_DEVLINK_PORT(user_dev, &port->devlink_port); user_dev->dev.of_node = port->dn; user_dev->vlan_features = conduit->vlan_features; p = netdev_priv(user_dev); user_dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; ret = gro_cells_init(&p->gcells, user_dev); if (ret) goto out_free; p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); port->user = user_dev; dsa_user_setup_tagger(user_dev); netif_carrier_off(user_dev); ret = dsa_user_phy_setup(user_dev); if (ret) { netdev_err(user_dev, "error %d setting up PHY for tree %d, switch %d, port %d\n", ret, ds->dst->index, ds->index, port->index); goto out_gcells; } rtnl_lock(); ret = dsa_user_change_mtu(user_dev, ETH_DATA_LEN); if (ret && ret != -EOPNOTSUPP) dev_warn(ds->dev, "nonfatal error %d setting MTU to %d on port %d\n", ret, ETH_DATA_LEN, port->index); ret = register_netdevice(user_dev); if (ret) { netdev_err(conduit, "error %d registering interface %s\n", ret, user_dev->name); rtnl_unlock(); goto out_phy; } if (IS_ENABLED(CONFIG_DCB)) { ret = dsa_user_dcbnl_init(user_dev); if (ret) { netdev_err(user_dev, "failed to initialize DCB: %pe\n", ERR_PTR(ret)); rtnl_unlock(); goto out_unregister; } } ret = netdev_upper_dev_link(conduit, user_dev, NULL); rtnl_unlock(); if (ret) goto out_unregister; return 0; out_unregister: unregister_netdev(user_dev); out_phy: rtnl_lock(); phylink_disconnect_phy(p->dp->pl); rtnl_unlock(); dsa_port_phylink_destroy(p->dp); out_gcells: gro_cells_destroy(&p->gcells); out_free: free_netdev(user_dev); port->user = NULL; return ret; } void dsa_user_destroy(struct net_device *user_dev) { struct net_device *conduit = dsa_user_to_conduit(user_dev); struct dsa_port *dp = dsa_user_to_port(user_dev); struct dsa_user_priv *p = netdev_priv(user_dev); netif_carrier_off(user_dev); rtnl_lock(); netdev_upper_dev_unlink(conduit, user_dev); unregister_netdevice(user_dev); phylink_disconnect_phy(dp->pl); rtnl_unlock(); dsa_port_phylink_destroy(dp); gro_cells_destroy(&p->gcells); free_netdev(user_dev); } int dsa_user_change_conduit(struct net_device *dev, struct net_device *conduit, struct netlink_ext_ack *extack) { struct net_device *old_conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct net_device *upper; struct list_head *iter; int err; if (conduit == old_conduit) return 0; if (!ds->ops->port_change_conduit) { NL_SET_ERR_MSG_MOD(extack, "Driver does not support changing DSA conduit"); return -EOPNOTSUPP; } if (!netdev_uses_dsa(conduit)) { NL_SET_ERR_MSG_MOD(extack, "Interface not eligible as DSA conduit"); return -EOPNOTSUPP; } netdev_for_each_upper_dev_rcu(conduit, upper, iter) { if (dsa_user_dev_check(upper)) continue; if (netif_is_bridge_master(upper)) continue; NL_SET_ERR_MSG_MOD(extack, "Cannot join conduit with unknown uppers"); return -EOPNOTSUPP; } /* Since we allow live-changing the DSA conduit, plus we auto-open the * DSA conduit when the user port opens => we need to ensure that the * new DSA conduit is open too. */ if (dev->flags & IFF_UP) { err = dev_open(conduit, extack); if (err) return err; } netdev_upper_dev_unlink(old_conduit, dev); err = netdev_upper_dev_link(conduit, dev, extack); if (err) goto out_revert_old_conduit_unlink; err = dsa_port_change_conduit(dp, conduit, extack); if (err) goto out_revert_conduit_link; /* Update the MTU of the new CPU port through cross-chip notifiers */ err = dsa_user_change_mtu(dev, dev->mtu); if (err && err != -EOPNOTSUPP) { netdev_warn(dev, "nonfatal error updating MTU with new conduit: %pe\n", ERR_PTR(err)); } /* If the port doesn't have its own MAC address and relies on the DSA * conduit's one, inherit it again from the new DSA conduit. */ if (is_zero_ether_addr(dp->mac)) eth_hw_addr_inherit(dev, conduit); return 0; out_revert_conduit_link: netdev_upper_dev_unlink(conduit, dev); out_revert_old_conduit_unlink: netdev_upper_dev_link(old_conduit, dev, NULL); return err; } bool dsa_user_dev_check(const struct net_device *dev) { return dev->netdev_ops == &dsa_user_netdev_ops; } EXPORT_SYMBOL_GPL(dsa_user_dev_check); static int dsa_user_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack; int err = NOTIFY_DONE; struct dsa_port *dp; if (!dsa_user_dev_check(dev)) return err; dp = dsa_user_to_port(dev); extack = netdev_notifier_info_to_extack(&info->info); if (netif_is_bridge_master(info->upper_dev)) { if (info->linking) { err = dsa_port_bridge_join(dp, info->upper_dev, extack); if (!err) dsa_bridge_mtu_normalization(dp); if (err == -EOPNOTSUPP) { NL_SET_ERR_MSG_WEAK_MOD(extack, "Offloading not supported"); err = 0; } err = notifier_from_errno(err); } else { dsa_port_bridge_leave(dp, info->upper_dev); err = NOTIFY_OK; } } else if (netif_is_lag_master(info->upper_dev)) { if (info->linking) { err = dsa_port_lag_join(dp, info->upper_dev, info->upper_info, extack); if (err == -EOPNOTSUPP) { NL_SET_ERR_MSG_WEAK_MOD(extack, "Offloading not supported"); err = 0; } err = notifier_from_errno(err); } else { dsa_port_lag_leave(dp, info->upper_dev); err = NOTIFY_OK; } } else if (is_hsr_master(info->upper_dev)) { if (info->linking) { err = dsa_port_hsr_join(dp, info->upper_dev, extack); if (err == -EOPNOTSUPP) { NL_SET_ERR_MSG_WEAK_MOD(extack, "Offloading not supported"); err = 0; } err = notifier_from_errno(err); } else { dsa_port_hsr_leave(dp, info->upper_dev); err = NOTIFY_OK; } } return err; } static int dsa_user_prechangeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct dsa_port *dp; if (!dsa_user_dev_check(dev)) return NOTIFY_DONE; dp = dsa_user_to_port(dev); if (netif_is_bridge_master(info->upper_dev) && !info->linking) dsa_port_pre_bridge_leave(dp, info->upper_dev); else if (netif_is_lag_master(info->upper_dev) && !info->linking) dsa_port_pre_lag_leave(dp, info->upper_dev); /* dsa_port_pre_hsr_leave is not yet necessary since hsr devices cannot * meaningfully placed under a bridge yet */ return NOTIFY_DONE; } static int dsa_user_lag_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct net_device *lower; struct list_head *iter; int err = NOTIFY_DONE; struct dsa_port *dp; if (!netif_is_lag_master(dev)) return err; netdev_for_each_lower_dev(dev, lower, iter) { if (!dsa_user_dev_check(lower)) continue; dp = dsa_user_to_port(lower); if (!dp->lag) /* Software LAG */ continue; err = dsa_user_changeupper(lower, info); if (notifier_to_errno(err)) break; } return err; } /* Same as dsa_user_lag_changeupper() except that it calls * dsa_user_prechangeupper() */ static int dsa_user_lag_prechangeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct net_device *lower; struct list_head *iter; int err = NOTIFY_DONE; struct dsa_port *dp; if (!netif_is_lag_master(dev)) return err; netdev_for_each_lower_dev(dev, lower, iter) { if (!dsa_user_dev_check(lower)) continue; dp = dsa_user_to_port(lower); if (!dp->lag) /* Software LAG */ continue; err = dsa_user_prechangeupper(lower, info); if (notifier_to_errno(err)) break; } return err; } static int dsa_prevent_bridging_8021q_upper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *ext_ack; struct net_device *user, *br; struct dsa_port *dp; ext_ack = netdev_notifier_info_to_extack(&info->info); if (!is_vlan_dev(dev)) return NOTIFY_DONE; user = vlan_dev_real_dev(dev); if (!dsa_user_dev_check(user)) return NOTIFY_DONE; dp = dsa_user_to_port(user); br = dsa_port_bridge_dev_get(dp); if (!br) return NOTIFY_DONE; /* Deny enslaving a VLAN device into a VLAN-aware bridge */ if (br_vlan_enabled(br) && netif_is_bridge_master(info->upper_dev) && info->linking) { NL_SET_ERR_MSG_MOD(ext_ack, "Cannot make VLAN device join VLAN-aware bridge"); return notifier_from_errno(-EINVAL); } return NOTIFY_DONE; } static int dsa_user_check_8021q_upper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct dsa_port *dp = dsa_user_to_port(dev); struct net_device *br = dsa_port_bridge_dev_get(dp); struct bridge_vlan_info br_info; struct netlink_ext_ack *extack; int err = NOTIFY_DONE; u16 vid; if (!br || !br_vlan_enabled(br)) return NOTIFY_DONE; extack = netdev_notifier_info_to_extack(&info->info); vid = vlan_dev_vlan_id(info->upper_dev); /* br_vlan_get_info() returns -EINVAL or -ENOENT if the * device, respectively the VID is not found, returning * 0 means success, which is a failure for us here. */ err = br_vlan_get_info(br, vid, &br_info); if (err == 0) { NL_SET_ERR_MSG_MOD(extack, "This VLAN is already configured by the bridge"); return notifier_from_errno(-EBUSY); } return NOTIFY_DONE; } static int dsa_user_prechangeupper_sanity_check(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct dsa_switch *ds; struct dsa_port *dp; int err; if (!dsa_user_dev_check(dev)) return dsa_prevent_bridging_8021q_upper(dev, info); dp = dsa_user_to_port(dev); ds = dp->ds; if (ds->ops->port_prechangeupper) { err = ds->ops->port_prechangeupper(ds, dp->index, info); if (err) return notifier_from_errno(err); } if (is_vlan_dev(info->upper_dev)) return dsa_user_check_8021q_upper(dev, info); return NOTIFY_DONE; } /* To be eligible as a DSA conduit, a LAG must have all lower interfaces be * eligible DSA conduits. Additionally, all LAG slaves must be DSA conduits of * switches in the same switch tree. */ static int dsa_lag_conduit_validate(struct net_device *lag_dev, struct netlink_ext_ack *extack) { struct net_device *lower1, *lower2; struct list_head *iter1, *iter2; netdev_for_each_lower_dev(lag_dev, lower1, iter1) { netdev_for_each_lower_dev(lag_dev, lower2, iter2) { if (!netdev_uses_dsa(lower1) || !netdev_uses_dsa(lower2)) { NL_SET_ERR_MSG_MOD(extack, "All LAG ports must be eligible as DSA conduits"); return notifier_from_errno(-EINVAL); } if (lower1 == lower2) continue; if (!dsa_port_tree_same(lower1->dsa_ptr, lower2->dsa_ptr)) { NL_SET_ERR_MSG_MOD(extack, "LAG contains DSA conduits of disjoint switch trees"); return notifier_from_errno(-EINVAL); } } } return NOTIFY_DONE; } static int dsa_conduit_prechangeupper_sanity_check(struct net_device *conduit, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info); if (!netdev_uses_dsa(conduit)) return NOTIFY_DONE; if (!info->linking) return NOTIFY_DONE; /* Allow DSA switch uppers */ if (dsa_user_dev_check(info->upper_dev)) return NOTIFY_DONE; /* Allow bridge uppers of DSA conduits, subject to further * restrictions in dsa_bridge_prechangelower_sanity_check() */ if (netif_is_bridge_master(info->upper_dev)) return NOTIFY_DONE; /* Allow LAG uppers, subject to further restrictions in * dsa_lag_conduit_prechangelower_sanity_check() */ if (netif_is_lag_master(info->upper_dev)) return dsa_lag_conduit_validate(info->upper_dev, extack); NL_SET_ERR_MSG_MOD(extack, "DSA conduit cannot join unknown upper interfaces"); return notifier_from_errno(-EBUSY); } static int dsa_lag_conduit_prechangelower_sanity_check(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info); struct net_device *lag_dev = info->upper_dev; struct net_device *lower; struct list_head *iter; if (!netdev_uses_dsa(lag_dev) || !netif_is_lag_master(lag_dev)) return NOTIFY_DONE; if (!info->linking) return NOTIFY_DONE; if (!netdev_uses_dsa(dev)) { NL_SET_ERR_MSG(extack, "Only DSA conduits can join a LAG DSA conduit"); return notifier_from_errno(-EINVAL); } netdev_for_each_lower_dev(lag_dev, lower, iter) { if (!dsa_port_tree_same(dev->dsa_ptr, lower->dsa_ptr)) { NL_SET_ERR_MSG(extack, "Interface is DSA conduit for a different switch tree than this LAG"); return notifier_from_errno(-EINVAL); } break; } return NOTIFY_DONE; } /* Don't allow bridging of DSA conduits, since the bridge layer rx_handler * prevents the DSA fake ethertype handler to be invoked, so we don't get the * chance to strip off and parse the DSA switch tag protocol header (the bridge * layer just returns RX_HANDLER_CONSUMED, stopping RX processing for these * frames). * The only case where that would not be an issue is when bridging can already * be offloaded, such as when the DSA conduit is itself a DSA or plain switchdev * port, and is bridged only with other ports from the same hardware device. */ static int dsa_bridge_prechangelower_sanity_check(struct net_device *new_lower, struct netdev_notifier_changeupper_info *info) { struct net_device *br = info->upper_dev; struct netlink_ext_ack *extack; struct net_device *lower; struct list_head *iter; if (!netif_is_bridge_master(br)) return NOTIFY_DONE; if (!info->linking) return NOTIFY_DONE; extack = netdev_notifier_info_to_extack(&info->info); netdev_for_each_lower_dev(br, lower, iter) { if (!netdev_uses_dsa(new_lower) && !netdev_uses_dsa(lower)) continue; if (!netdev_port_same_parent_id(lower, new_lower)) { NL_SET_ERR_MSG(extack, "Cannot do software bridging with a DSA conduit"); return notifier_from_errno(-EINVAL); } } return NOTIFY_DONE; } static void dsa_tree_migrate_ports_from_lag_conduit(struct dsa_switch_tree *dst, struct net_device *lag_dev) { struct net_device *new_conduit = dsa_tree_find_first_conduit(dst); struct dsa_port *dp; int err; dsa_tree_for_each_user_port(dp, dst) { if (dsa_port_to_conduit(dp) != lag_dev) continue; err = dsa_user_change_conduit(dp->user, new_conduit, NULL); if (err) { netdev_err(dp->user, "failed to restore conduit to %s: %pe\n", new_conduit->name, ERR_PTR(err)); } } } static int dsa_conduit_lag_join(struct net_device *conduit, struct net_device *lag_dev, struct netdev_lag_upper_info *uinfo, struct netlink_ext_ack *extack) { struct dsa_port *cpu_dp = conduit->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->dst; struct dsa_port *dp; int err; err = dsa_conduit_lag_setup(lag_dev, cpu_dp, uinfo, extack); if (err) return err; dsa_tree_for_each_user_port(dp, dst) { if (dsa_port_to_conduit(dp) != conduit) continue; err = dsa_user_change_conduit(dp->user, lag_dev, extack); if (err) goto restore; } return 0; restore: dsa_tree_for_each_user_port_continue_reverse(dp, dst) { if (dsa_port_to_conduit(dp) != lag_dev) continue; err = dsa_user_change_conduit(dp->user, conduit, NULL); if (err) { netdev_err(dp->user, "failed to restore conduit to %s: %pe\n", conduit->name, ERR_PTR(err)); } } dsa_conduit_lag_teardown(lag_dev, conduit->dsa_ptr); return err; } static void dsa_conduit_lag_leave(struct net_device *conduit, struct net_device *lag_dev) { struct dsa_port *dp, *cpu_dp = lag_dev->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->dst; struct dsa_port *new_cpu_dp = NULL; struct net_device *lower; struct list_head *iter; netdev_for_each_lower_dev(lag_dev, lower, iter) { if (netdev_uses_dsa(lower)) { new_cpu_dp = lower->dsa_ptr; break; } } if (new_cpu_dp) { /* Update the CPU port of the user ports still under the LAG * so that dsa_port_to_conduit() continues to work properly */ dsa_tree_for_each_user_port(dp, dst) if (dsa_port_to_conduit(dp) == lag_dev) dp->cpu_dp = new_cpu_dp; /* Update the index of the virtual CPU port to match the lowest * physical CPU port */ lag_dev->dsa_ptr = new_cpu_dp; wmb(); } else { /* If the LAG DSA conduit has no ports left, migrate back all * user ports to the first physical CPU port */ dsa_tree_migrate_ports_from_lag_conduit(dst, lag_dev); } /* This DSA conduit has left its LAG in any case, so let * the CPU port leave the hardware LAG as well */ dsa_conduit_lag_teardown(lag_dev, conduit->dsa_ptr); } static int dsa_conduit_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack; int err = NOTIFY_DONE; if (!netdev_uses_dsa(dev)) return err; extack = netdev_notifier_info_to_extack(&info->info); if (netif_is_lag_master(info->upper_dev)) { if (info->linking) { err = dsa_conduit_lag_join(dev, info->upper_dev, info->upper_info, extack); err = notifier_from_errno(err); } else { dsa_conduit_lag_leave(dev, info->upper_dev); err = NOTIFY_OK; } } return err; } static int dsa_user_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_PRECHANGEUPPER: { struct netdev_notifier_changeupper_info *info = ptr; int err; err = dsa_user_prechangeupper_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_conduit_prechangeupper_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_lag_conduit_prechangelower_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_bridge_prechangelower_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_user_prechangeupper(dev, ptr); if (notifier_to_errno(err)) return err; err = dsa_user_lag_prechangeupper(dev, ptr); if (notifier_to_errno(err)) return err; break; } case NETDEV_CHANGEUPPER: { int err; err = dsa_user_changeupper(dev, ptr); if (notifier_to_errno(err)) return err; err = dsa_user_lag_changeupper(dev, ptr); if (notifier_to_errno(err)) return err; err = dsa_conduit_changeupper(dev, ptr); if (notifier_to_errno(err)) return err; break; } case NETDEV_CHANGELOWERSTATE: { struct netdev_notifier_changelowerstate_info *info = ptr; struct dsa_port *dp; int err = 0; if (dsa_user_dev_check(dev)) { dp = dsa_user_to_port(dev); err = dsa_port_lag_change(dp, info->lower_state_info); } /* Mirror LAG port events on DSA conduits that are in * a LAG towards their respective switch CPU ports */ if (netdev_uses_dsa(dev)) { dp = dev->dsa_ptr; err = dsa_port_lag_change(dp, info->lower_state_info); } return notifier_from_errno(err); } case NETDEV_CHANGE: case NETDEV_UP: { /* Track state of conduit port. * DSA driver may require the conduit port (and indirectly * the tagger) to be available for some special operation. */ if (netdev_uses_dsa(dev)) { struct dsa_port *cpu_dp = dev->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->ds->dst; /* Track when the conduit port is UP */ dsa_tree_conduit_oper_state_change(dst, dev, netif_oper_up(dev)); /* Track when the conduit port is ready and can accept * packet. * NETDEV_UP event is not enough to flag a port as ready. * We also have to wait for linkwatch_do_dev to dev_activate * and emit a NETDEV_CHANGE event. * We check if a conduit port is ready by checking if the dev * have a qdisc assigned and is not noop. */ dsa_tree_conduit_admin_state_change(dst, dev, !qdisc_tx_is_noop(dev)); return NOTIFY_OK; } return NOTIFY_DONE; } case NETDEV_GOING_DOWN: { struct dsa_port *dp, *cpu_dp; struct dsa_switch_tree *dst; LIST_HEAD(close_list); if (!netdev_uses_dsa(dev)) return NOTIFY_DONE; cpu_dp = dev->dsa_ptr; dst = cpu_dp->ds->dst; dsa_tree_conduit_admin_state_change(dst, dev, false); list_for_each_entry(dp, &dst->ports, list) { if (!dsa_port_is_user(dp)) continue; if (dp->cpu_dp != cpu_dp) continue; list_add(&dp->user->close_list, &close_list); } dev_close_many(&close_list, true); return NOTIFY_OK; } default: break; } return NOTIFY_DONE; } static void dsa_fdb_offload_notify(struct dsa_switchdev_event_work *switchdev_work) { struct switchdev_notifier_fdb_info info = {}; info.addr = switchdev_work->addr; info.vid = switchdev_work->vid; info.offloaded = true; call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED, switchdev_work->orig_dev, &info.info, NULL); } static void dsa_user_switchdev_event_work(struct work_struct *work) { struct dsa_switchdev_event_work *switchdev_work = container_of(work, struct dsa_switchdev_event_work, work); const unsigned char *addr = switchdev_work->addr; struct net_device *dev = switchdev_work->dev; u16 vid = switchdev_work->vid; struct dsa_switch *ds; struct dsa_port *dp; int err; dp = dsa_user_to_port(dev); ds = dp->ds; switch (switchdev_work->event) { case SWITCHDEV_FDB_ADD_TO_DEVICE: if (switchdev_work->host_addr) err = dsa_port_bridge_host_fdb_add(dp, addr, vid); else if (dp->lag) err = dsa_port_lag_fdb_add(dp, addr, vid); else err = dsa_port_fdb_add(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to add %pM vid %d to fdb: %d\n", dp->index, addr, vid, err); break; } dsa_fdb_offload_notify(switchdev_work); break; case SWITCHDEV_FDB_DEL_TO_DEVICE: if (switchdev_work->host_addr) err = dsa_port_bridge_host_fdb_del(dp, addr, vid); else if (dp->lag) err = dsa_port_lag_fdb_del(dp, addr, vid); else err = dsa_port_fdb_del(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to delete %pM vid %d from fdb: %d\n", dp->index, addr, vid, err); } break; } kfree(switchdev_work); } static bool dsa_foreign_dev_check(const struct net_device *dev, const struct net_device *foreign_dev) { const struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch_tree *dst = dp->ds->dst; if (netif_is_bridge_master(foreign_dev)) return !dsa_tree_offloads_bridge_dev(dst, foreign_dev); if (netif_is_bridge_port(foreign_dev)) return !dsa_tree_offloads_bridge_port(dst, foreign_dev); /* Everything else is foreign */ return true; } static int dsa_user_fdb_event(struct net_device *dev, struct net_device *orig_dev, unsigned long event, const void *ctx, const struct switchdev_notifier_fdb_info *fdb_info) { struct dsa_switchdev_event_work *switchdev_work; struct dsa_port *dp = dsa_user_to_port(dev); bool host_addr = fdb_info->is_local; struct dsa_switch *ds = dp->ds; if (ctx && ctx != dp) return 0; if (!dp->bridge) return 0; if (switchdev_fdb_is_dynamically_learned(fdb_info)) { if (dsa_port_offloads_bridge_port(dp, orig_dev)) return 0; /* FDB entries learned by the software bridge or by foreign * bridge ports should be installed as host addresses only if * the driver requests assisted learning. */ if (!ds->assisted_learning_on_cpu_port) return 0; } /* Also treat FDB entries on foreign interfaces bridged with us as host * addresses. */ if (dsa_foreign_dev_check(dev, orig_dev)) host_addr = true; /* Check early that we're not doing work in vain. * Host addresses on LAG ports still require regular FDB ops, * since the CPU port isn't in a LAG. */ if (dp->lag && !host_addr) { if (!ds->ops->lag_fdb_add || !ds->ops->lag_fdb_del) return -EOPNOTSUPP; } else { if (!ds->ops->port_fdb_add || !ds->ops->port_fdb_del) return -EOPNOTSUPP; } switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC); if (!switchdev_work) return -ENOMEM; netdev_dbg(dev, "%s FDB entry towards %s, addr %pM vid %d%s\n", event == SWITCHDEV_FDB_ADD_TO_DEVICE ? "Adding" : "Deleting", orig_dev->name, fdb_info->addr, fdb_info->vid, host_addr ? " as host address" : ""); INIT_WORK(&switchdev_work->work, dsa_user_switchdev_event_work); switchdev_work->event = event; switchdev_work->dev = dev; switchdev_work->orig_dev = orig_dev; ether_addr_copy(switchdev_work->addr, fdb_info->addr); switchdev_work->vid = fdb_info->vid; switchdev_work->host_addr = host_addr; dsa_schedule_work(&switchdev_work->work); return 0; } /* Called under rcu_read_lock() */ static int dsa_user_switchdev_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); int err; switch (event) { case SWITCHDEV_PORT_ATTR_SET: err = switchdev_handle_port_attr_set(dev, ptr, dsa_user_dev_check, dsa_user_port_attr_set); return notifier_from_errno(err); case SWITCHDEV_FDB_ADD_TO_DEVICE: case SWITCHDEV_FDB_DEL_TO_DEVICE: err = switchdev_handle_fdb_event_to_device(dev, event, ptr, dsa_user_dev_check, dsa_foreign_dev_check, dsa_user_fdb_event); return notifier_from_errno(err); default: return NOTIFY_DONE; } return NOTIFY_OK; } static int dsa_user_switchdev_blocking_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); int err; switch (event) { case SWITCHDEV_PORT_OBJ_ADD: err = switchdev_handle_port_obj_add_foreign(dev, ptr, dsa_user_dev_check, dsa_foreign_dev_check, dsa_user_port_obj_add); return notifier_from_errno(err); case SWITCHDEV_PORT_OBJ_DEL: err = switchdev_handle_port_obj_del_foreign(dev, ptr, dsa_user_dev_check, dsa_foreign_dev_check, dsa_user_port_obj_del); return notifier_from_errno(err); case SWITCHDEV_PORT_ATTR_SET: err = switchdev_handle_port_attr_set(dev, ptr, dsa_user_dev_check, dsa_user_port_attr_set); return notifier_from_errno(err); } return NOTIFY_DONE; } static struct notifier_block dsa_user_nb __read_mostly = { .notifier_call = dsa_user_netdevice_event, }; struct notifier_block dsa_user_switchdev_notifier = { .notifier_call = dsa_user_switchdev_event, }; struct notifier_block dsa_user_switchdev_blocking_notifier = { .notifier_call = dsa_user_switchdev_blocking_event, }; int dsa_user_register_notifier(void) { struct notifier_block *nb; int err; err = register_netdevice_notifier(&dsa_user_nb); if (err) return err; err = register_switchdev_notifier(&dsa_user_switchdev_notifier); if (err) goto err_switchdev_nb; nb = &dsa_user_switchdev_blocking_notifier; err = register_switchdev_blocking_notifier(nb); if (err) goto err_switchdev_blocking_nb; return 0; err_switchdev_blocking_nb: unregister_switchdev_notifier(&dsa_user_switchdev_notifier); err_switchdev_nb: unregister_netdevice_notifier(&dsa_user_nb); return err; } void dsa_user_unregister_notifier(void) { struct notifier_block *nb; int err; nb = &dsa_user_switchdev_blocking_notifier; err = unregister_switchdev_blocking_notifier(nb); if (err) pr_err("DSA: failed to unregister switchdev blocking notifier (%d)\n", err); err = unregister_switchdev_notifier(&dsa_user_switchdev_notifier); if (err) pr_err("DSA: failed to unregister switchdev notifier (%d)\n", err); err = unregister_netdevice_notifier(&dsa_user_nb); if (err) pr_err("DSA: failed to unregister user notifier (%d)\n", err); } |
| 5 5 5 2 2 2 2 1 1 2 2 1 2 3 2 2 1 3 3 3 3 3 3 3 3 3 4 4 3 1 1 2 3 3 3 3 3 3 3 3 3 4 3 1 3 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2018 Facebook */ #include <linux/bpf.h> #include <linux/err.h> #include <linux/sock_diag.h> #include <net/sock_reuseport.h> #include <linux/btf_ids.h> struct reuseport_array { struct bpf_map map; struct sock __rcu *ptrs[]; }; static struct reuseport_array *reuseport_array(struct bpf_map *map) { return (struct reuseport_array *)map; } /* The caller must hold the reuseport_lock */ void bpf_sk_reuseport_detach(struct sock *sk) { struct sock __rcu **socks; write_lock_bh(&sk->sk_callback_lock); socks = __locked_read_sk_user_data_with_flags(sk, SK_USER_DATA_BPF); if (socks) { WRITE_ONCE(sk->sk_user_data, NULL); /* * Do not move this NULL assignment outside of * sk->sk_callback_lock because there is * a race with reuseport_array_free() * which does not hold the reuseport_lock. */ RCU_INIT_POINTER(*socks, NULL); } write_unlock_bh(&sk->sk_callback_lock); } static int reuseport_array_alloc_check(union bpf_attr *attr) { if (attr->value_size != sizeof(u32) && attr->value_size != sizeof(u64)) return -EINVAL; return array_map_alloc_check(attr); } static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key) { struct reuseport_array *array = reuseport_array(map); u32 index = *(u32 *)key; if (unlikely(index >= array->map.max_entries)) return NULL; return rcu_dereference(array->ptrs[index]); } /* Called from syscall only */ static long reuseport_array_delete_elem(struct bpf_map *map, void *key) { struct reuseport_array *array = reuseport_array(map); u32 index = *(u32 *)key; struct sock *sk; int err; if (index >= map->max_entries) return -E2BIG; if (!rcu_access_pointer(array->ptrs[index])) return -ENOENT; spin_lock_bh(&reuseport_lock); sk = rcu_dereference_protected(array->ptrs[index], lockdep_is_held(&reuseport_lock)); if (sk) { write_lock_bh(&sk->sk_callback_lock); WRITE_ONCE(sk->sk_user_data, NULL); RCU_INIT_POINTER(array->ptrs[index], NULL); write_unlock_bh(&sk->sk_callback_lock); err = 0; } else { err = -ENOENT; } spin_unlock_bh(&reuseport_lock); return err; } static void reuseport_array_free(struct bpf_map *map) { struct reuseport_array *array = reuseport_array(map); struct sock *sk; u32 i; /* * ops->map_*_elem() will not be able to access this * array now. Hence, this function only races with * bpf_sk_reuseport_detach() which was triggered by * close() or disconnect(). * * This function and bpf_sk_reuseport_detach() are * both removing sk from "array". Who removes it * first does not matter. * * The only concern here is bpf_sk_reuseport_detach() * may access "array" which is being freed here. * bpf_sk_reuseport_detach() access this "array" * through sk->sk_user_data _and_ with sk->sk_callback_lock * held which is enough because this "array" is not freed * until all sk->sk_user_data has stopped referencing this "array". * * Hence, due to the above, taking "reuseport_lock" is not * needed here. */ /* * Since reuseport_lock is not taken, sk is accessed under * rcu_read_lock() */ rcu_read_lock(); for (i = 0; i < map->max_entries; i++) { sk = rcu_dereference(array->ptrs[i]); if (sk) { write_lock_bh(&sk->sk_callback_lock); /* * No need for WRITE_ONCE(). At this point, * no one is reading it without taking the * sk->sk_callback_lock. */ sk->sk_user_data = NULL; write_unlock_bh(&sk->sk_callback_lock); RCU_INIT_POINTER(array->ptrs[i], NULL); } } rcu_read_unlock(); /* * Once reaching here, all sk->sk_user_data is not * referencing this "array". "array" can be freed now. */ bpf_map_area_free(array); } static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node); if (!array) return ERR_PTR(-ENOMEM); /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); return &array->map; } int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, void *value) { struct sock *sk; int err; if (map->value_size != sizeof(u64)) return -ENOSPC; rcu_read_lock(); sk = reuseport_array_lookup_elem(map, key); if (sk) { *(u64 *)value = __sock_gen_cookie(sk); err = 0; } else { err = -ENOENT; } rcu_read_unlock(); return err; } static int reuseport_array_update_check(const struct reuseport_array *array, const struct sock *nsk, const struct sock *osk, const struct sock_reuseport *nsk_reuse, u32 map_flags) { if (osk && map_flags == BPF_NOEXIST) return -EEXIST; if (!osk && map_flags == BPF_EXIST) return -ENOENT; if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP) return -ENOTSUPP; if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6) return -ENOTSUPP; if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM) return -ENOTSUPP; /* * sk must be hashed (i.e. listening in the TCP case or binded * in the UDP case) and * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL). * * Also, sk will be used in bpf helper that is protected by * rcu_read_lock(). */ if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse) return -EINVAL; /* READ_ONCE because the sk->sk_callback_lock may not be held here */ if (READ_ONCE(nsk->sk_user_data)) return -EBUSY; return 0; } /* * Called from syscall only. * The "nsk" in the fd refcnt. * The "osk" and "reuse" are protected by reuseport_lock. */ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct reuseport_array *array = reuseport_array(map); struct sock *free_osk = NULL, *osk, *nsk; struct sock_reuseport *reuse; u32 index = *(u32 *)key; uintptr_t sk_user_data; struct socket *socket; int err, fd; if (map_flags > BPF_EXIST) return -EINVAL; if (index >= map->max_entries) return -E2BIG; if (map->value_size == sizeof(u64)) { u64 fd64 = *(u64 *)value; if (fd64 > S32_MAX) return -EINVAL; fd = fd64; } else { fd = *(int *)value; } socket = sockfd_lookup(fd, &err); if (!socket) return err; nsk = socket->sk; if (!nsk) { err = -EINVAL; goto put_file; } /* Quick checks before taking reuseport_lock */ err = reuseport_array_update_check(array, nsk, rcu_access_pointer(array->ptrs[index]), rcu_access_pointer(nsk->sk_reuseport_cb), map_flags); if (err) goto put_file; spin_lock_bh(&reuseport_lock); /* * Some of the checks only need reuseport_lock * but it is done under sk_callback_lock also * for simplicity reason. */ write_lock_bh(&nsk->sk_callback_lock); osk = rcu_dereference_protected(array->ptrs[index], lockdep_is_held(&reuseport_lock)); reuse = rcu_dereference_protected(nsk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags); if (err) goto put_file_unlock; sk_user_data = (uintptr_t)&array->ptrs[index] | SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF; WRITE_ONCE(nsk->sk_user_data, (void *)sk_user_data); rcu_assign_pointer(array->ptrs[index], nsk); free_osk = osk; err = 0; put_file_unlock: write_unlock_bh(&nsk->sk_callback_lock); if (free_osk) { write_lock_bh(&free_osk->sk_callback_lock); WRITE_ONCE(free_osk->sk_user_data, NULL); write_unlock_bh(&free_osk->sk_callback_lock); } spin_unlock_bh(&reuseport_lock); put_file: fput(socket->file); return err; } /* Called from syscall */ static int reuseport_array_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct reuseport_array *array = reuseport_array(map); u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = (u32 *)next_key; if (index >= array->map.max_entries) { *next = 0; return 0; } if (index == array->map.max_entries - 1) return -ENOENT; *next = index + 1; return 0; } static u64 reuseport_array_mem_usage(const struct bpf_map *map) { struct reuseport_array *array; return struct_size(array, ptrs, map->max_entries); } BTF_ID_LIST_SINGLE(reuseport_array_map_btf_ids, struct, reuseport_array) const struct bpf_map_ops reuseport_array_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = reuseport_array_alloc_check, .map_alloc = reuseport_array_alloc, .map_free = reuseport_array_free, .map_lookup_elem = reuseport_array_lookup_elem, .map_get_next_key = reuseport_array_get_next_key, .map_delete_elem = reuseport_array_delete_elem, .map_mem_usage = reuseport_array_mem_usage, .map_btf_id = &reuseport_array_map_btf_ids[0], }; |
| 6 6 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 | /* * llc_c_ev.c - Connection component state transition event qualifiers * * A 'state' consists of a number of possible event matching functions, * the actions associated with each being executed when that event is * matched; a 'state machine' accepts events in a serial fashion from an * event queue. Each event is passed to each successive event matching * function until a match is made (the event matching function returns * success, or '0') or the list of event matching functions is exhausted. * If a match is made, the actions associated with the event are executed * and the state is changed to that event's transition state. Before some * events are recognized, even after a match has been made, a certain * number of 'event qualifier' functions must also be executed. If these * all execute successfully, then the event is finally executed. * * These event functions must return 0 for success, to show a matched * event, of 1 if the event does not match. Event qualifier functions * must return a 0 for success or a non-zero for failure. Each function * is simply responsible for verifying one single thing and returning * either a success or failure. * * All of followed event functions are described in 802.2 LLC Protocol * standard document except two functions that we added that will explain * in their comments, at below. * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/netdevice.h> #include <net/llc_conn.h> #include <net/llc_sap.h> #include <net/sock.h> #include <net/llc_c_ac.h> #include <net/llc_c_ev.h> #include <net/llc_pdu.h> #if 1 #define dprintk(args...) printk(KERN_DEBUG args) #else #define dprintk(args...) #endif /** * llc_util_ns_inside_rx_window - check if sequence number is in rx window * @ns: sequence number of received pdu. * @vr: sequence number which receiver expects to receive. * @rw: receive window size of receiver. * * Checks if sequence number of received PDU is in range of receive * window. Returns 0 for success, 1 otherwise */ static u16 llc_util_ns_inside_rx_window(u8 ns, u8 vr, u8 rw) { return !llc_circular_between(vr, ns, (vr + rw - 1) % LLC_2_SEQ_NBR_MODULO); } /** * llc_util_nr_inside_tx_window - check if sequence number is in tx window * @sk: current connection. * @nr: N(R) of received PDU. * * This routine checks if N(R) of received PDU is in range of transmit * window; on the other hand checks if received PDU acknowledges some * outstanding PDUs that are in transmit window. Returns 0 for success, 1 * otherwise. */ static u16 llc_util_nr_inside_tx_window(struct sock *sk, u8 nr) { u8 nr1, nr2; struct sk_buff *skb; struct llc_pdu_sn *pdu; struct llc_sock *llc = llc_sk(sk); int rc = 0; if (llc->dev->flags & IFF_LOOPBACK) goto out; rc = 1; if (skb_queue_empty(&llc->pdu_unack_q)) goto out; skb = skb_peek(&llc->pdu_unack_q); pdu = llc_pdu_sn_hdr(skb); nr1 = LLC_I_GET_NS(pdu); skb = skb_peek_tail(&llc->pdu_unack_q); pdu = llc_pdu_sn_hdr(skb); nr2 = LLC_I_GET_NS(pdu); rc = !llc_circular_between(nr1, nr, (nr2 + 1) % LLC_2_SEQ_NBR_MODULO); out: return rc; } int llc_conn_ev_conn_req(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->prim == LLC_CONN_PRIM && ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; } int llc_conn_ev_data_req(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->prim == LLC_DATA_PRIM && ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; } int llc_conn_ev_disc_req(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->prim == LLC_DISC_PRIM && ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; } int llc_conn_ev_rst_req(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->prim == LLC_RESET_PRIM && ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; } int llc_conn_ev_local_busy_detected(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type == LLC_CONN_EV_TYPE_SIMPLE && ev->prim_type == LLC_CONN_EV_LOCAL_BUSY_DETECTED ? 0 : 1; } int llc_conn_ev_local_busy_cleared(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type == LLC_CONN_EV_TYPE_SIMPLE && ev->prim_type == LLC_CONN_EV_LOCAL_BUSY_CLEARED ? 0 : 1; } int llc_conn_ev_rx_bad_pdu(struct sock *sk, struct sk_buff *skb) { return 1; } int llc_conn_ev_rx_disc_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_DISC ? 0 : 1; } int llc_conn_ev_rx_dm_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_DM ? 0 : 1; } int llc_conn_ev_rx_frmr_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_FRMR ? 0 : 1; } int llc_conn_ev_rx_i_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_0(pdu) && LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; } int llc_conn_ev_rx_i_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_1(pdu) && LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; } int llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_0(pdu) && ns != vr && !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; } int llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_1(pdu) && ns != vr && !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; } int llc_conn_ev_rx_i_cmd_pbit_set_x_inval_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn * pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); const u16 rc = LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && ns != vr && llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; if (!rc) dprintk("%s: matched, state=%d, ns=%d, vr=%d\n", __func__, llc_sk(sk)->state, ns, vr); return rc; } int llc_conn_ev_rx_i_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_0(pdu) && LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_1(pdu) && LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_0(pdu) && ns != vr && !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_1(pdu) && ns != vr && !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_x_unexpd_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && ns != vr && !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_x_inval_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); const u16 rc = LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && ns != vr && llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; if (!rc) dprintk("%s: matched, state=%d, ns=%d, vr=%d\n", __func__, llc_sk(sk)->state, ns, vr); return rc; } int llc_conn_ev_rx_rej_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_REJ ? 0 : 1; } int llc_conn_ev_rx_rej_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_REJ ? 0 : 1; } int llc_conn_ev_rx_rej_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ ? 0 : 1; } int llc_conn_ev_rx_rej_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ ? 0 : 1; } int llc_conn_ev_rx_rej_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ ? 0 : 1; } int llc_conn_ev_rx_rnr_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RNR ? 0 : 1; } int llc_conn_ev_rx_rnr_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RNR ? 0 : 1; } int llc_conn_ev_rx_rnr_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RNR ? 0 : 1; } int llc_conn_ev_rx_rnr_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RNR ? 0 : 1; } int llc_conn_ev_rx_rr_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RR ? 0 : 1; } int llc_conn_ev_rx_rr_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RR ? 0 : 1; } int llc_conn_ev_rx_rr_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RR ? 0 : 1; } int llc_conn_ev_rx_rr_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RR ? 0 : 1; } int llc_conn_ev_rx_sabme_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_SABME ? 0 : 1; } int llc_conn_ev_rx_ua_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_UA ? 0 : 1; } int llc_conn_ev_rx_xxx_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) { u16 rc = 1; const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); if (LLC_PDU_IS_CMD(pdu)) { if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) { if (LLC_I_PF_IS_1(pdu)) rc = 0; } else if (LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PF_IS_1(pdu)) rc = 0; } return rc; } int llc_conn_ev_rx_xxx_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb) { u16 rc = 1; const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); if (LLC_PDU_IS_CMD(pdu)) { if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) rc = 0; else if (LLC_PDU_TYPE_IS_U(pdu)) switch (LLC_U_PDU_CMD(pdu)) { case LLC_2_PDU_CMD_SABME: case LLC_2_PDU_CMD_DISC: rc = 0; break; } } return rc; } int llc_conn_ev_rx_xxx_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { u16 rc = 1; const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); if (LLC_PDU_IS_RSP(pdu)) { if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) rc = 0; else if (LLC_PDU_TYPE_IS_U(pdu)) switch (LLC_U_PDU_RSP(pdu)) { case LLC_2_PDU_RSP_UA: case LLC_2_PDU_RSP_DM: case LLC_2_PDU_RSP_FRMR: rc = 0; break; } } return rc; } int llc_conn_ev_rx_zzz_cmd_pbit_set_x_inval_nr(struct sock *sk, struct sk_buff *skb) { u16 rc = 1; const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vs = llc_sk(sk)->vS; const u8 nr = LLC_I_GET_NR(pdu); if (LLC_PDU_IS_CMD(pdu) && (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) && nr != vs && llc_util_nr_inside_tx_window(sk, nr)) { dprintk("%s: matched, state=%d, vs=%d, nr=%d\n", __func__, llc_sk(sk)->state, vs, nr); rc = 0; } return rc; } int llc_conn_ev_rx_zzz_rsp_fbit_set_x_inval_nr(struct sock *sk, struct sk_buff *skb) { u16 rc = 1; const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vs = llc_sk(sk)->vS; const u8 nr = LLC_I_GET_NR(pdu); if (LLC_PDU_IS_RSP(pdu) && (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) && nr != vs && llc_util_nr_inside_tx_window(sk, nr)) { rc = 0; dprintk("%s: matched, state=%d, vs=%d, nr=%d\n", __func__, llc_sk(sk)->state, vs, nr); } return rc; } int llc_conn_ev_rx_any_frame(struct sock *sk, struct sk_buff *skb) { return 0; } int llc_conn_ev_p_tmr_exp(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type != LLC_CONN_EV_TYPE_P_TMR; } int llc_conn_ev_ack_tmr_exp(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type != LLC_CONN_EV_TYPE_ACK_TMR; } int llc_conn_ev_rej_tmr_exp(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type != LLC_CONN_EV_TYPE_REJ_TMR; } int llc_conn_ev_busy_tmr_exp(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type != LLC_CONN_EV_TYPE_BUSY_TMR; } int llc_conn_ev_init_p_f_cycle(struct sock *sk, struct sk_buff *skb) { return 1; } int llc_conn_ev_tx_buffer_full(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type == LLC_CONN_EV_TYPE_SIMPLE && ev->prim_type == LLC_CONN_EV_TX_BUFF_FULL ? 0 : 1; } /* Event qualifier functions * * these functions simply verify the value of a state flag associated with * the connection and return either a 0 for success or a non-zero value * for not-success; verify the event is the type we expect */ int llc_conn_ev_qlfy_data_flag_eq_1(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->data_flag != 1; } int llc_conn_ev_qlfy_data_flag_eq_0(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->data_flag; } int llc_conn_ev_qlfy_data_flag_eq_2(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->data_flag != 2; } int llc_conn_ev_qlfy_p_flag_eq_1(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->p_flag != 1; } /** * llc_conn_ev_qlfy_last_frame_eq_1 - checks if frame is last in tx window * @sk: current connection structure. * @skb: current event. * * This function determines when frame which is sent, is last frame of * transmit window, if it is then this function return zero else return * one. This function is used for sending last frame of transmit window * as I-format command with p-bit set to one. Returns 0 if frame is last * frame, 1 otherwise. */ int llc_conn_ev_qlfy_last_frame_eq_1(struct sock *sk, struct sk_buff *skb) { return !(skb_queue_len(&llc_sk(sk)->pdu_unack_q) + 1 == llc_sk(sk)->k); } /** * llc_conn_ev_qlfy_last_frame_eq_0 - checks if frame isn't last in tx window * @sk: current connection structure. * @skb: current event. * * This function determines when frame which is sent, isn't last frame of * transmit window, if it isn't then this function return zero else return * one. Returns 0 if frame isn't last frame, 1 otherwise. */ int llc_conn_ev_qlfy_last_frame_eq_0(struct sock *sk, struct sk_buff *skb) { return skb_queue_len(&llc_sk(sk)->pdu_unack_q) + 1 == llc_sk(sk)->k; } int llc_conn_ev_qlfy_p_flag_eq_0(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->p_flag; } int llc_conn_ev_qlfy_p_flag_eq_f(struct sock *sk, struct sk_buff *skb) { u8 f_bit; llc_pdu_decode_pf_bit(skb, &f_bit); return llc_sk(sk)->p_flag == f_bit ? 0 : 1; } int llc_conn_ev_qlfy_remote_busy_eq_0(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->remote_busy_flag; } int llc_conn_ev_qlfy_remote_busy_eq_1(struct sock *sk, struct sk_buff *skb) { return !llc_sk(sk)->remote_busy_flag; } int llc_conn_ev_qlfy_retry_cnt_lt_n2(struct sock *sk, struct sk_buff *skb) { return !(llc_sk(sk)->retry_count < llc_sk(sk)->n2); } int llc_conn_ev_qlfy_retry_cnt_gte_n2(struct sock *sk, struct sk_buff *skb) { return !(llc_sk(sk)->retry_count >= llc_sk(sk)->n2); } int llc_conn_ev_qlfy_s_flag_eq_1(struct sock *sk, struct sk_buff *skb) { return !llc_sk(sk)->s_flag; } int llc_conn_ev_qlfy_s_flag_eq_0(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->s_flag; } int llc_conn_ev_qlfy_cause_flag_eq_1(struct sock *sk, struct sk_buff *skb) { return !llc_sk(sk)->cause_flag; } int llc_conn_ev_qlfy_cause_flag_eq_0(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->cause_flag; } int llc_conn_ev_qlfy_set_status_conn(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_CONN; return 0; } int llc_conn_ev_qlfy_set_status_disc(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_DISC; return 0; } int llc_conn_ev_qlfy_set_status_failed(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_FAILED; return 0; } int llc_conn_ev_qlfy_set_status_remote_busy(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_REMOTE_BUSY; return 0; } int llc_conn_ev_qlfy_set_status_refuse(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_REFUSE; return 0; } int llc_conn_ev_qlfy_set_status_conflict(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_CONFLICT; return 0; } int llc_conn_ev_qlfy_set_status_rst_done(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_RESET_DONE; return 0; } |
| 37 38 38 38 37 38 38 38 38 37 37 38 37 38 38 16 16 7 38 38 38 38 38 38 38 38 38 38 38 38 38 11 11 7 7 7 7 7 7 7 7 7 7 1 1 7 1 6 7 7 7 7 6 11 11 10 16 16 16 16 16 13 13 13 13 13 11 10 13 6 4 6 16 16 16 16 13 13 2 10 9 7 7 7 6 1 6 6 5 9 16 2 2 2 2 1 1 1 1 1 2 2 2 2 1 1 3 1 1 1 1 1 1 1 1 18 18 18 17 17 17 17 16 1 1 1 17 1 1 1 19 19 19 18 18 19 19 7 7 2 3 3 2 1 2 1 1 1 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 | // SPDX-License-Identifier: GPL-2.0-only /* * Packet matching code for ARP packets. * * Based heavily, if not almost entirely, upon ip_tables.c framework. * * Some ARP specific bits are: * * Copyright (C) 2002 David S. Miller (davem@redhat.com) * Copyright (C) 2006-2009 Patrick McHardy <kaber@trash.net> * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/capability.h> #include <linux/if_arp.h> #include <linux/kmod.h> #include <linux/vmalloc.h> #include <linux/proc_fs.h> #include <linux/module.h> #include <linux/init.h> #include <linux/mutex.h> #include <linux/err.h> #include <net/compat.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_arp/arp_tables.h> #include "../../netfilter/xt_repldata.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("David S. Miller <davem@redhat.com>"); MODULE_DESCRIPTION("arptables core"); void *arpt_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(arpt, ARPT); } EXPORT_SYMBOL_GPL(arpt_alloc_initial_table); static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, const char *hdr_addr, int len) { int i, ret; if (len > ARPT_DEV_ADDR_LEN_MAX) len = ARPT_DEV_ADDR_LEN_MAX; ret = 0; for (i = 0; i < len; i++) ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i]; return ret != 0; } /* * Unfortunately, _b and _mask are not aligned to an int (or long int) * Some arches dont care, unrolling the loop is a win on them. * For other arches, we only have a 16bit alignement. */ static unsigned long ifname_compare(const char *_a, const char *_b, const char *_mask) { #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS unsigned long ret = ifname_compare_aligned(_a, _b, _mask); #else unsigned long ret = 0; const u16 *a = (const u16 *)_a; const u16 *b = (const u16 *)_b; const u16 *mask = (const u16 *)_mask; int i; for (i = 0; i < IFNAMSIZ/sizeof(u16); i++) ret |= (a[i] ^ b[i]) & mask[i]; #endif return ret; } /* Returns whether packet matches rule or not. */ static inline int arp_packet_match(const struct arphdr *arphdr, struct net_device *dev, const char *indev, const char *outdev, const struct arpt_arp *arpinfo) { const char *arpptr = (char *)(arphdr + 1); const char *src_devaddr, *tgt_devaddr; __be32 src_ipaddr, tgt_ipaddr; long ret; if (NF_INVF(arpinfo, ARPT_INV_ARPOP, (arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop)) return 0; if (NF_INVF(arpinfo, ARPT_INV_ARPHRD, (arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd)) return 0; if (NF_INVF(arpinfo, ARPT_INV_ARPPRO, (arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro)) return 0; if (NF_INVF(arpinfo, ARPT_INV_ARPHLN, (arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln)) return 0; src_devaddr = arpptr; arpptr += dev->addr_len; memcpy(&src_ipaddr, arpptr, sizeof(u32)); arpptr += sizeof(u32); tgt_devaddr = arpptr; arpptr += dev->addr_len; memcpy(&tgt_ipaddr, arpptr, sizeof(u32)); if (NF_INVF(arpinfo, ARPT_INV_SRCDEVADDR, arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len)) || NF_INVF(arpinfo, ARPT_INV_TGTDEVADDR, arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len))) return 0; if (NF_INVF(arpinfo, ARPT_INV_SRCIP, (src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr) || NF_INVF(arpinfo, ARPT_INV_TGTIP, (tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr)) return 0; /* Look for ifname matches. */ ret = ifname_compare(indev, arpinfo->iniface, arpinfo->iniface_mask); if (NF_INVF(arpinfo, ARPT_INV_VIA_IN, ret != 0)) return 0; ret = ifname_compare(outdev, arpinfo->outiface, arpinfo->outiface_mask); if (NF_INVF(arpinfo, ARPT_INV_VIA_OUT, ret != 0)) return 0; return 1; } static inline int arp_checkentry(const struct arpt_arp *arp) { if (arp->flags & ~ARPT_F_MASK) return 0; if (arp->invflags & ~ARPT_INV_MASK) return 0; return 1; } static unsigned int arpt_error(struct sk_buff *skb, const struct xt_action_param *par) { net_err_ratelimited("arp_tables: error: '%s'\n", (const char *)par->targinfo); return NF_DROP; } static inline const struct xt_entry_target * arpt_get_target_c(const struct arpt_entry *e) { return arpt_get_target((struct arpt_entry *)e); } static inline struct arpt_entry * get_entry(const void *base, unsigned int offset) { return (struct arpt_entry *)(base + offset); } static inline struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry) { return (void *)entry + entry->next_offset; } unsigned int arpt_do_table(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { const struct xt_table *table = priv; unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); unsigned int verdict = NF_DROP; const struct arphdr *arp; struct arpt_entry *e, **jumpstack; const char *indev, *outdev; const void *table_base; unsigned int cpu, stackidx = 0; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) return NF_DROP; indev = state->in ? state->in->name : nulldevname; outdev = state->out ? state->out->name : nulldevname; local_bh_disable(); addend = xt_write_recseq_begin(); private = READ_ONCE(table->private); /* Address dependency. */ cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct arpt_entry **)private->jumpstack[cpu]; /* No TEE support for arptables, so no need to switch to alternate * stack. All targets that reenter must return absolute verdicts. */ e = get_entry(table_base, private->hook_entry[hook]); acpar.state = state; acpar.hotdrop = false; arp = arp_hdr(skb); do { const struct xt_entry_target *t; struct xt_counters *counter; if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { e = arpt_next_entry(e); continue; } counter = xt_get_this_cpu_counter(&e->counters); ADD_COUNTER(*counter, arp_hdr_len(skb->dev), 1); t = arpt_get_target_c(e); /* Standard target? */ if (!t->u.kernel.target->target) { int v; v = ((struct xt_standard_target *)t)->verdict; if (v < 0) { /* Pop from stack? */ if (v != XT_RETURN) { verdict = (unsigned int)(-v) - 1; break; } if (stackidx == 0) { e = get_entry(table_base, private->underflow[hook]); } else { e = jumpstack[--stackidx]; e = arpt_next_entry(e); } continue; } if (table_base + v != arpt_next_entry(e)) { if (unlikely(stackidx >= private->stacksize)) { verdict = NF_DROP; break; } jumpstack[stackidx++] = e; } e = get_entry(table_base, v); continue; } acpar.target = t->u.kernel.target; acpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, &acpar); if (verdict == XT_CONTINUE) { /* Target might have changed stuff. */ arp = arp_hdr(skb); e = arpt_next_entry(e); } else { /* Verdict */ break; } } while (!acpar.hotdrop); xt_write_recseq_end(addend); local_bh_enable(); if (acpar.hotdrop) return NF_DROP; else return verdict; } /* All zeroes == unconditional rule. */ static inline bool unconditional(const struct arpt_entry *e) { static const struct arpt_arp uncond; return e->target_offset == sizeof(struct arpt_entry) && memcmp(&e->arp, &uncond, sizeof(uncond)) == 0; } /* Figures out from what hook each rule can be called: returns 0 if * there are loops. Puts hook bitmask in comefrom. */ static int mark_source_chains(const struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0, unsigned int *offsets) { unsigned int hook; /* No recursion; use packet counter to save back ptrs (reset * to 0 as we leave), and comefrom to save source hook bitmask. */ for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct arpt_entry *e = entry0 + pos; if (!(valid_hooks & (1 << hook))) continue; /* Set initial back pointer. */ e->counters.pcnt = pos; for (;;) { const struct xt_standard_target *t = (void *)arpt_get_target_c(e); int visited = e->comefrom & (1 << hook); if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) return 0; e->comefrom |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS)); /* Unconditional return/END. */ if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && t->verdict < 0) || visited) { unsigned int oldpos, size; /* Return: backtrack through the last * big jump. */ do { e->comefrom ^= (1<<NF_ARP_NUMHOOKS); oldpos = pos; pos = e->counters.pcnt; e->counters.pcnt = 0; /* We're at the start. */ if (pos == oldpos) goto next; e = entry0 + pos; } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = entry0 + pos + size; if (pos + size >= newinfo->size) return 0; e->counters.pcnt = pos; pos += size; } else { int newpos = t->verdict; if (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. */ if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; if (newpos >= newinfo->size) return 0; } e = entry0 + newpos; e->counters.pcnt = pos; pos = newpos; } } next: ; } return 1; } static int check_target(struct arpt_entry *e, struct net *net, const char *name) { struct xt_entry_target *t = arpt_get_target(e); struct xt_tgchk_param par = { .net = net, .table = name, .entryinfo = e, .target = t->u.kernel.target, .targinfo = t->data, .hook_mask = e->comefrom, .family = NFPROTO_ARP, }; return xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); } static int find_check_entry(struct arpt_entry *e, struct net *net, const char *name, unsigned int size, struct xt_percpu_counter_alloc_state *alloc_state) { struct xt_entry_target *t; struct xt_target *target; int ret; if (!xt_percpu_counter_alloc(alloc_state, &e->counters)) return -ENOMEM; t = arpt_get_target(e); target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { ret = PTR_ERR(target); goto out; } t->u.kernel.target = target; ret = check_target(e, net, name); if (ret) goto err; return 0; err: module_put(t->u.kernel.target->me); out: xt_percpu_counter_free(&e->counters); return ret; } static bool check_underflow(const struct arpt_entry *e) { const struct xt_entry_target *t; unsigned int verdict; if (!unconditional(e)) return false; t = arpt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) return false; verdict = ((struct xt_standard_target *)t)->verdict; verdict = -verdict - 1; return verdict == NF_DROP || verdict == NF_ACCEPT; } static inline int check_entry_size_and_hooks(struct arpt_entry *e, struct xt_table_info *newinfo, const unsigned char *base, const unsigned char *limit, const unsigned int *hook_entries, const unsigned int *underflows, unsigned int valid_hooks) { unsigned int h; int err; if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 || (unsigned char *)e + sizeof(struct arpt_entry) >= limit || (unsigned char *)e + e->next_offset > limit) return -EINVAL; if (e->next_offset < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) return -EINVAL; if (!arp_checkentry(&e->arp)) return -EINVAL; err = xt_check_entry_offsets(e, e->elems, e->target_offset, e->next_offset); if (err) return err; /* Check hooks & underflows */ for (h = 0; h < NF_ARP_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) continue; if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) return -EINVAL; newinfo->underflow[h] = underflows[h]; } } /* Clear counters and comefrom */ e->counters = ((struct xt_counters) { 0, 0 }); e->comefrom = 0; return 0; } static void cleanup_entry(struct arpt_entry *e, struct net *net) { struct xt_tgdtor_param par; struct xt_entry_target *t; t = arpt_get_target(e); par.net = net; par.target = t->u.kernel.target; par.targinfo = t->data; par.family = NFPROTO_ARP; if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); xt_percpu_counter_free(&e->counters); } /* Checks and translates the user-supplied table segment (held in * newinfo). */ static int translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, const struct arpt_replace *repl) { struct xt_percpu_counter_alloc_state alloc_state = { 0 }; struct arpt_entry *iter; unsigned int *offsets; unsigned int i; int ret = 0; newinfo->size = repl->size; newinfo->number = repl->num_entries; /* Init all hooks to impossible value. */ for (i = 0; i < NF_ARP_NUMHOOKS; i++) { newinfo->hook_entry[i] = 0xFFFFFFFF; newinfo->underflow[i] = 0xFFFFFFFF; } offsets = xt_alloc_entry_offsets(newinfo->number); if (!offsets) return -ENOMEM; i = 0; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter, entry0, newinfo->size) { ret = check_entry_size_and_hooks(iter, newinfo, entry0, entry0 + repl->size, repl->hook_entry, repl->underflow, repl->valid_hooks); if (ret != 0) goto out_free; if (i < repl->num_entries) offsets[i] = (void *)iter - entry0; ++i; if (strcmp(arpt_get_target(iter)->u.user.name, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } ret = -EINVAL; if (i != repl->num_entries) goto out_free; ret = xt_check_table_hooks(newinfo, repl->valid_hooks); if (ret) goto out_free; if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { ret = -ELOOP; goto out_free; } kvfree(offsets); /* Finally, each sanity check must pass */ i = 0; xt_entry_foreach(iter, entry0, newinfo->size) { ret = find_check_entry(iter, net, repl->name, repl->size, &alloc_state); if (ret != 0) break; ++i; } if (ret != 0) { xt_entry_foreach(iter, entry0, newinfo->size) { if (i-- == 0) break; cleanup_entry(iter, net); } return ret; } return ret; out_free: kvfree(offsets); return ret; } static void get_counters(const struct xt_table_info *t, struct xt_counters counters[]) { struct arpt_entry *iter; unsigned int cpu; unsigned int i; for_each_possible_cpu(cpu) { seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; xt_entry_foreach(iter, t->entries, t->size) { struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; tmp = xt_get_per_cpu_counter(&iter->counters, cpu); do { start = read_seqcount_begin(s); bcnt = tmp->bcnt; pcnt = tmp->pcnt; } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); ++i; cond_resched(); } } } static void get_old_counters(const struct xt_table_info *t, struct xt_counters counters[]) { struct arpt_entry *iter; unsigned int cpu, i; for_each_possible_cpu(cpu) { i = 0; xt_entry_foreach(iter, t->entries, t->size) { struct xt_counters *tmp; tmp = xt_get_per_cpu_counter(&iter->counters, cpu); ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt); ++i; } cond_resched(); } } static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; const struct xt_table_info *private = table->private; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care * about). */ countersize = sizeof(struct xt_counters) * private->number; counters = vzalloc(countersize); if (counters == NULL) return ERR_PTR(-ENOMEM); get_counters(private, counters); return counters; } static int copy_entries_to_user(unsigned int total_size, const struct xt_table *table, void __user *userptr) { unsigned int off, num; const struct arpt_entry *e; struct xt_counters *counters; struct xt_table_info *private = table->private; int ret = 0; void *loc_cpu_entry; counters = alloc_counters(table); if (IS_ERR(counters)) return PTR_ERR(counters); loc_cpu_entry = private->entries; /* FIXME: use iterator macros --RR */ /* ... then go back and fix counters and names */ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ const struct xt_entry_target *t; e = loc_cpu_entry + off; if (copy_to_user(userptr + off, e, sizeof(*e))) { ret = -EFAULT; goto free_counters; } if (copy_to_user(userptr + off + offsetof(struct arpt_entry, counters), &counters[num], sizeof(counters[num])) != 0) { ret = -EFAULT; goto free_counters; } t = arpt_get_target_c(e); if (xt_target_to_user(t, userptr + off + e->target_offset)) { ret = -EFAULT; goto free_counters; } } free_counters: vfree(counters); return ret; } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT static void compat_standard_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; if (v > 0) v += xt_compat_calc_jump(NFPROTO_ARP, v); memcpy(dst, &v, sizeof(v)); } static int compat_standard_to_user(void __user *dst, const void *src) { compat_int_t cv = *(int *)src; if (cv > 0) cv -= xt_compat_calc_jump(NFPROTO_ARP, cv); return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; } static int compat_calc_entry(const struct arpt_entry *e, const struct xt_table_info *info, const void *base, struct xt_table_info *newinfo) { const struct xt_entry_target *t; unsigned int entry_offset; int off, i, ret; off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); entry_offset = (void *)e - base; t = arpt_get_target_c(e); off += xt_compat_target_offset(t->u.kernel.target); newinfo->size -= off; ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off); if (ret) return ret; for (i = 0; i < NF_ARP_NUMHOOKS; i++) { if (info->hook_entry[i] && (e < (struct arpt_entry *)(base + info->hook_entry[i]))) newinfo->hook_entry[i] -= off; if (info->underflow[i] && (e < (struct arpt_entry *)(base + info->underflow[i]))) newinfo->underflow[i] -= off; } return 0; } static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { struct arpt_entry *iter; const void *loc_cpu_entry; int ret; if (!newinfo || !info) return -EINVAL; /* we dont care about newinfo->entries */ memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries; ret = xt_compat_init_offsets(NFPROTO_ARP, info->number); if (ret) return ret; xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) return ret; } return 0; } #endif static int get_info(struct net *net, void __user *user, const int *len) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; int ret; if (*len != sizeof(struct arpt_getinfo)) return -EINVAL; if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_lock(NFPROTO_ARP); #endif t = xt_request_find_table_lock(net, NFPROTO_ARP, name); if (!IS_ERR(t)) { struct arpt_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct xt_table_info tmp; if (in_compat_syscall()) { ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(NFPROTO_ARP); private = &tmp; } #endif memset(&info, 0, sizeof(info)); info.valid_hooks = t->valid_hooks; memcpy(info.hook_entry, private->hook_entry, sizeof(info.hook_entry)); memcpy(info.underflow, private->underflow, sizeof(info.underflow)); info.num_entries = private->number; info.size = private->size; strcpy(info.name, name); if (copy_to_user(user, &info, *len) != 0) ret = -EFAULT; else ret = 0; xt_table_unlock(t); module_put(t->me); } else ret = PTR_ERR(t); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_unlock(NFPROTO_ARP); #endif return ret; } static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, const int *len) { int ret; struct arpt_get_entries get; struct xt_table *t; if (*len < sizeof(get)) return -EINVAL; if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; if (*len != sizeof(struct arpt_get_entries) + get.size) return -EINVAL; get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); else ret = -EAGAIN; module_put(t->me); xt_table_unlock(t); } else ret = PTR_ERR(t); return ret; } static int __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_table_info *newinfo, unsigned int num_counters, void __user *counters_ptr) { int ret; struct xt_table *t; struct xt_table_info *oldinfo; struct xt_counters *counters; void *loc_cpu_old_entry; struct arpt_entry *iter; ret = 0; counters = xt_counters_alloc(num_counters); if (!counters) { ret = -ENOMEM; goto out; } t = xt_request_find_table_lock(net, NFPROTO_ARP, name); if (IS_ERR(t)) { ret = PTR_ERR(t); goto free_newinfo_counters_untrans; } /* You lied! */ if (valid_hooks != t->valid_hooks) { ret = -EINVAL; goto put_module; } oldinfo = xt_replace_table(t, num_counters, newinfo, &ret); if (!oldinfo) goto put_module; /* Update module usage count based on number of rules */ if ((oldinfo->number > oldinfo->initial_entries) || (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); if ((oldinfo->number > oldinfo->initial_entries) && (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); xt_table_unlock(t); get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries; xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) cleanup_entry(iter, net); xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, sizeof(struct xt_counters) * num_counters) != 0) { /* Silent error, can't fail, new table is already in place */ net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n"); } vfree(counters); return ret; put_module: module_put(t->me); xt_table_unlock(t); free_newinfo_counters_untrans: vfree(counters); out: return ret; } static int do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct arpt_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; struct arpt_entry *iter; if (len < sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; if ((u64)len < (u64)tmp.size + sizeof(tmp)) return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } ret = translate_table(net, newinfo, loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, tmp.counters); if (ret) goto free_newinfo_untrans; return 0; free_newinfo_untrans: xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; } static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len) { unsigned int i; struct xt_counters_info tmp; struct xt_counters *paddc; struct xt_table *t; const struct xt_table_info *private; int ret = 0; struct arpt_entry *iter; unsigned int addend; paddc = xt_copy_counters(arg, len, &tmp); if (IS_ERR(paddc)) return PTR_ERR(paddc); t = xt_find_table_lock(net, NFPROTO_ARP, tmp.name); if (IS_ERR(t)) { ret = PTR_ERR(t); goto free; } local_bh_disable(); private = t->private; if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; } i = 0; addend = xt_write_recseq_begin(); xt_entry_foreach(iter, private->entries, private->size) { struct xt_counters *tmp; tmp = xt_get_this_cpu_counter(&iter->counters); ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt); ++i; } xt_write_recseq_end(addend); unlock_up_free: local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: vfree(paddc); return ret; } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_arpt_replace { char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; u32 num_entries; u32 size; u32 hook_entry[NF_ARP_NUMHOOKS]; u32 underflow[NF_ARP_NUMHOOKS]; u32 num_counters; compat_uptr_t counters; struct compat_arpt_entry entries[]; }; static inline void compat_release_entry(struct compat_arpt_entry *e) { struct xt_entry_target *t; t = compat_arpt_get_target(e); module_put(t->u.kernel.target->me); } static int check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, struct xt_table_info *newinfo, unsigned int *size, const unsigned char *base, const unsigned char *limit) { struct xt_entry_target *t; struct xt_target *target; unsigned int entry_offset; int ret, off; if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 || (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit || (unsigned char *)e + e->next_offset > limit) return -EINVAL; if (e->next_offset < sizeof(struct compat_arpt_entry) + sizeof(struct compat_xt_entry_target)) return -EINVAL; if (!arp_checkentry(&e->arp)) return -EINVAL; ret = xt_compat_check_entry_offsets(e, e->elems, e->target_offset, e->next_offset); if (ret) return ret; off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); entry_offset = (void *)e - (void *)base; t = compat_arpt_get_target(e); target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { ret = PTR_ERR(target); goto out; } t->u.kernel.target = target; off += xt_compat_target_offset(target); *size += off; ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off); if (ret) goto release_target; return 0; release_target: module_put(t->u.kernel.target->me); out: return ret; } static void compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr, unsigned int *size, struct xt_table_info *newinfo, unsigned char *base) { struct xt_entry_target *t; struct arpt_entry *de; unsigned int origsize; int h; origsize = *size; de = *dstptr; memcpy(de, e, sizeof(struct arpt_entry)); memcpy(&de->counters, &e->counters, sizeof(e->counters)); *dstptr += sizeof(struct arpt_entry); *size += sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); de->target_offset = e->target_offset - (origsize - *size); t = compat_arpt_get_target(e); xt_compat_target_from_user(t, dstptr, size); de->next_offset = e->next_offset - (origsize - *size); for (h = 0; h < NF_ARP_NUMHOOKS; h++) { if ((unsigned char *)de - base < newinfo->hook_entry[h]) newinfo->hook_entry[h] -= origsize - *size; if ((unsigned char *)de - base < newinfo->underflow[h]) newinfo->underflow[h] -= origsize - *size; } } static int translate_compat_table(struct net *net, struct xt_table_info **pinfo, void **pentry0, const struct compat_arpt_replace *compatr) { unsigned int i, j; struct xt_table_info *newinfo, *info; void *pos, *entry0, *entry1; struct compat_arpt_entry *iter0; struct arpt_replace repl; unsigned int size; int ret; info = *pinfo; entry0 = *pentry0; size = compatr->size; info->number = compatr->num_entries; j = 0; xt_compat_lock(NFPROTO_ARP); ret = xt_compat_init_offsets(NFPROTO_ARP, compatr->num_entries); if (ret) goto out_unlock; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, compatr->size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, entry0, entry0 + compatr->size); if (ret != 0) goto out_unlock; ++j; } ret = -EINVAL; if (j != compatr->num_entries) goto out_unlock; ret = -ENOMEM; newinfo = xt_alloc_table_info(size); if (!newinfo) goto out_unlock; memset(newinfo->entries, 0, size); newinfo->number = compatr->num_entries; for (i = 0; i < NF_ARP_NUMHOOKS; i++) { newinfo->hook_entry[i] = compatr->hook_entry[i]; newinfo->underflow[i] = compatr->underflow[i]; } entry1 = newinfo->entries; pos = entry1; size = compatr->size; xt_entry_foreach(iter0, entry0, compatr->size) compat_copy_entry_from_user(iter0, &pos, &size, newinfo, entry1); /* all module references in entry0 are now gone */ xt_compat_flush_offsets(NFPROTO_ARP); xt_compat_unlock(NFPROTO_ARP); memcpy(&repl, compatr, sizeof(*compatr)); for (i = 0; i < NF_ARP_NUMHOOKS; i++) { repl.hook_entry[i] = newinfo->hook_entry[i]; repl.underflow[i] = newinfo->underflow[i]; } repl.num_counters = 0; repl.counters = NULL; repl.size = newinfo->size; ret = translate_table(net, newinfo, entry1, &repl); if (ret) goto free_newinfo; *pinfo = newinfo; *pentry0 = entry1; xt_free_table_info(info); return 0; free_newinfo: xt_free_table_info(newinfo); return ret; out_unlock: xt_compat_flush_offsets(NFPROTO_ARP); xt_compat_unlock(NFPROTO_ARP); xt_entry_foreach(iter0, entry0, compatr->size) { if (j-- == 0) break; compat_release_entry(iter0); } return ret; } static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct compat_arpt_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; struct arpt_entry *iter; if (len < sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; if ((u64)len < (u64)tmp.size + sizeof(tmp)) return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } ret = translate_compat_table(net, &newinfo, &loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, compat_ptr(tmp.counters)); if (ret) goto free_newinfo_untrans; return 0; free_newinfo_untrans: xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; } static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr, compat_uint_t *size, struct xt_counters *counters, unsigned int i) { struct xt_entry_target *t; struct compat_arpt_entry __user *ce; u_int16_t target_offset, next_offset; compat_uint_t origsize; int ret; origsize = *size; ce = *dstptr; if (copy_to_user(ce, e, sizeof(struct arpt_entry)) != 0 || copy_to_user(&ce->counters, &counters[i], sizeof(counters[i])) != 0) return -EFAULT; *dstptr += sizeof(struct compat_arpt_entry); *size -= sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); target_offset = e->target_offset - (origsize - *size); t = arpt_get_target(e); ret = xt_compat_target_to_user(t, dstptr, size); if (ret) return ret; next_offset = e->next_offset - (origsize - *size); if (put_user(target_offset, &ce->target_offset) != 0 || put_user(next_offset, &ce->next_offset) != 0) return -EFAULT; return 0; } static int compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { struct xt_counters *counters; const struct xt_table_info *private = table->private; void __user *pos; unsigned int size; int ret = 0; unsigned int i = 0; struct arpt_entry *iter; counters = alloc_counters(table); if (IS_ERR(counters)) return PTR_ERR(counters); pos = userptr; size = total_size; xt_entry_foreach(iter, private->entries, total_size) { ret = compat_copy_entry_to_user(iter, &pos, &size, counters, i++); if (ret != 0) break; } vfree(counters); return ret; } struct compat_arpt_get_entries { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; struct compat_arpt_entry entrytable[]; }; static int compat_get_entries(struct net *net, struct compat_arpt_get_entries __user *uptr, int *len) { int ret; struct compat_arpt_get_entries get; struct xt_table *t; if (*len < sizeof(get)) return -EINVAL; if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; if (*len != sizeof(struct compat_arpt_get_entries) + get.size) return -EINVAL; get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(NFPROTO_ARP); t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; ret = compat_table_info(private, &info); if (!ret && get.size == info.size) { ret = compat_copy_entries_to_user(private->size, t, uptr->entrytable); } else if (!ret) ret = -EAGAIN; xt_compat_flush_offsets(NFPROTO_ARP); module_put(t->me); xt_table_unlock(t); } else ret = PTR_ERR(t); xt_compat_unlock(NFPROTO_ARP); return ret; } #endif static int do_arpt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len) { int ret; if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case ARPT_SO_SET_REPLACE: #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_do_replace(sock_net(sk), arg, len); else #endif ret = do_replace(sock_net(sk), arg, len); break; case ARPT_SO_SET_ADD_COUNTERS: ret = do_add_counters(sock_net(sk), arg, len); break; default: ret = -EINVAL; } return ret; } static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { int ret; if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case ARPT_SO_GET_INFO: ret = get_info(sock_net(sk), user, len); break; case ARPT_SO_GET_ENTRIES: #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_get_entries(sock_net(sk), user, len); else #endif ret = get_entries(sock_net(sk), user, len); break; case ARPT_SO_GET_REVISION_TARGET: { struct xt_get_revision rev; if (*len != sizeof(rev)) { ret = -EINVAL; break; } if (copy_from_user(&rev, user, sizeof(rev)) != 0) { ret = -EFAULT; break; } rev.name[sizeof(rev.name)-1] = 0; try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name, rev.revision, 1, &ret), "arpt_%s", rev.name); break; } default: ret = -EINVAL; } return ret; } static void __arpt_unregister_table(struct net *net, struct xt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; struct module *table_owner = table->me; struct arpt_entry *iter; private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) cleanup_entry(iter, net); if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); } int arpt_register_table(struct net *net, const struct xt_table *table, const struct arpt_replace *repl, const struct nf_hook_ops *template_ops) { struct nf_hook_ops *ops; unsigned int num_ops; int ret, i; struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; void *loc_cpu_entry; struct xt_table *new_table; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); if (ret != 0) { xt_free_table_info(newinfo); return ret; } new_table = xt_register_table(net, table, &bootstrap, newinfo); if (IS_ERR(new_table)) { struct arpt_entry *iter; xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); xt_free_table_info(newinfo); return PTR_ERR(new_table); } num_ops = hweight32(table->valid_hooks); if (num_ops == 0) { ret = -EINVAL; goto out_free; } ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL); if (!ops) { ret = -ENOMEM; goto out_free; } for (i = 0; i < num_ops; i++) ops[i].priv = new_table; new_table->ops = ops; ret = nf_register_net_hooks(net, ops, num_ops); if (ret != 0) goto out_free; return ret; out_free: __arpt_unregister_table(net, new_table); return ret; } void arpt_unregister_table_pre_exit(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); if (table) nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } EXPORT_SYMBOL(arpt_unregister_table_pre_exit); void arpt_unregister_table(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); if (table) __arpt_unregister_table(net, table); } /* The built-in targets: standard (NULL) and error. */ static struct xt_target arpt_builtin_tg[] __read_mostly = { { .name = XT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NFPROTO_ARP, #ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(compat_int_t), .compat_from_user = compat_standard_from_user, .compat_to_user = compat_standard_to_user, #endif }, { .name = XT_ERROR_TARGET, .target = arpt_error, .targetsize = XT_FUNCTION_MAXNAMELEN, .family = NFPROTO_ARP, }, }; static struct nf_sockopt_ops arpt_sockopts = { .pf = PF_INET, .set_optmin = ARPT_BASE_CTL, .set_optmax = ARPT_SO_SET_MAX+1, .set = do_arpt_set_ctl, .get_optmin = ARPT_BASE_CTL, .get_optmax = ARPT_SO_GET_MAX+1, .get = do_arpt_get_ctl, .owner = THIS_MODULE, }; static int __net_init arp_tables_net_init(struct net *net) { return xt_proto_init(net, NFPROTO_ARP); } static void __net_exit arp_tables_net_exit(struct net *net) { xt_proto_fini(net, NFPROTO_ARP); } static struct pernet_operations arp_tables_net_ops = { .init = arp_tables_net_init, .exit = arp_tables_net_exit, }; static int __init arp_tables_init(void) { int ret; ret = register_pernet_subsys(&arp_tables_net_ops); if (ret < 0) goto err1; /* No one else will be downing sem now, so we won't sleep */ ret = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg)); if (ret < 0) goto err2; /* Register setsockopt */ ret = nf_register_sockopt(&arpt_sockopts); if (ret < 0) goto err4; return 0; err4: xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg)); err2: unregister_pernet_subsys(&arp_tables_net_ops); err1: return ret; } static void __exit arp_tables_fini(void) { nf_unregister_sockopt(&arpt_sockopts); xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg)); unregister_pernet_subsys(&arp_tables_net_ops); } EXPORT_SYMBOL(arpt_register_table); EXPORT_SYMBOL(arpt_unregister_table); EXPORT_SYMBOL(arpt_do_table); module_init(arp_tables_init); module_exit(arp_tables_fini); |
| 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 | // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/icmpv6.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/spinlock.h> #include <net/ipv6.h> #if IS_ENABLED(CONFIG_IPV6) #if !IS_BUILTIN(CONFIG_IPV6) static ip6_icmp_send_t __rcu *ip6_icmp_send; int inet6_register_icmp_sender(ip6_icmp_send_t *fn) { return (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, NULL, fn) == NULL) ? 0 : -EBUSY; } EXPORT_SYMBOL(inet6_register_icmp_sender); int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn) { int ret; ret = (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, fn, NULL) == fn) ? 0 : -EINVAL; synchronize_net(); return ret; } EXPORT_SYMBOL(inet6_unregister_icmp_sender); void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct inet6_skb_parm *parm) { ip6_icmp_send_t *send; rcu_read_lock(); send = rcu_dereference(ip6_icmp_send); if (send) send(skb, type, code, info, NULL, parm); rcu_read_unlock(); } EXPORT_SYMBOL(__icmpv6_send); #endif #if IS_ENABLED(CONFIG_NF_NAT) #include <net/netfilter/nf_conntrack.h> void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) { struct inet6_skb_parm parm = { 0 }; struct sk_buff *cloned_skb = NULL; enum ip_conntrack_info ctinfo; struct in6_addr orig_ip; struct nf_conn *ct; ct = nf_ct_get(skb_in, &ctinfo); if (!ct || !(ct->status & IPS_SRC_NAT)) { __icmpv6_send(skb_in, type, code, info, &parm); return; } if (skb_shared(skb_in)) skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC); if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head || (skb_network_header(skb_in) + sizeof(struct ipv6hdr)) > skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in, skb_network_offset(skb_in) + sizeof(struct ipv6hdr)))) goto out; orig_ip = ipv6_hdr(skb_in)->saddr; ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6; __icmpv6_send(skb_in, type, code, info, &parm); ipv6_hdr(skb_in)->saddr = orig_ip; out: consume_skb(cloned_skb); } EXPORT_SYMBOL(icmpv6_ndo_send); #endif #endif |
| 4 4 4 3 3 3 3 29 29 29 4 4 4 4 4 4 4 4 4 1 1 1 1 1 1 1 3 3 3 3 3 3 3 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 | // SPDX-License-Identifier: GPL-2.0 #include "cgroup-internal.h" #include <linux/sched/task.h> #include <linux/slab.h> #include <linux/nsproxy.h> #include <linux/proc_ns.h> /* cgroup namespaces */ static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); } static void dec_cgroup_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); } static struct cgroup_namespace *alloc_cgroup_ns(void) { struct cgroup_namespace *new_ns; int ret; new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); ret = ns_alloc_inum(&new_ns->ns); if (ret) { kfree(new_ns); return ERR_PTR(ret); } refcount_set(&new_ns->ns.count, 1); new_ns->ns.ops = &cgroupns_operations; return new_ns; } void free_cgroup_ns(struct cgroup_namespace *ns) { put_css_set(ns->root_cset); dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); } EXPORT_SYMBOL(free_cgroup_ns); struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { struct cgroup_namespace *new_ns; struct ucounts *ucounts; struct css_set *cset; BUG_ON(!old_ns); if (!(flags & CLONE_NEWCGROUP)) { get_cgroup_ns(old_ns); return old_ns; } /* Allow only sysadmin to create cgroup namespace. */ if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); ucounts = inc_cgroup_namespaces(user_ns); if (!ucounts) return ERR_PTR(-ENOSPC); /* It is not safe to take cgroup_mutex here */ spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); spin_unlock_irq(&css_set_lock); new_ns = alloc_cgroup_ns(); if (IS_ERR(new_ns)) { put_css_set(cset); dec_cgroup_namespaces(ucounts); return new_ns; } new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; new_ns->root_cset = cset; return new_ns; } static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) { return container_of(ns, struct cgroup_namespace, ns); } static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN) || !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* Don't need to do anything if we are attaching to our own cgroupns. */ if (cgroup_ns == nsproxy->cgroup_ns) return 0; get_cgroup_ns(cgroup_ns); put_cgroup_ns(nsproxy->cgroup_ns); nsproxy->cgroup_ns = cgroup_ns; return 0; } static struct ns_common *cgroupns_get(struct task_struct *task) { struct cgroup_namespace *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->cgroup_ns; get_cgroup_ns(ns); } task_unlock(task); return ns ? &ns->ns : NULL; } static void cgroupns_put(struct ns_common *ns) { put_cgroup_ns(to_cg_ns(ns)); } static struct user_namespace *cgroupns_owner(struct ns_common *ns) { return to_cg_ns(ns)->user_ns; } const struct proc_ns_operations cgroupns_operations = { .name = "cgroup", .type = CLONE_NEWCGROUP, .get = cgroupns_get, .put = cgroupns_put, .install = cgroupns_install, .owner = cgroupns_owner, }; |
| 311 926 165 838 426 426 61 124 122 107 140 2 4 85 136 223 7 5 5 6 10 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RCULIST_H #define _LINUX_RCULIST_H #ifdef __KERNEL__ /* * RCU-protected list version */ #include <linux/list.h> #include <linux/rcupdate.h> /* * INIT_LIST_HEAD_RCU - Initialize a list_head visible to RCU readers * @list: list to be initialized * * You should instead use INIT_LIST_HEAD() for normal initialization and * cleanup tasks, when readers have no access to the list being initialized. * However, if the list being initialized is visible to readers, you * need to keep the compiler from being too mischievous. */ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) { WRITE_ONCE(list->next, list); WRITE_ONCE(list->prev, list); } /* * return the ->next pointer of a list_head in an rcu safe * way, we must not access it directly */ #define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next))) /** * list_tail_rcu - returns the prev pointer of the head of the list * @head: the head of the list * * Note: This should only be used with the list header, and even then * only if list_del() and similar primitives are not also used on the * list header. */ #define list_tail_rcu(head) (*((struct list_head __rcu **)(&(head)->prev))) /* * Check during list traversal that we are within an RCU reader */ #define check_arg_count_one(dummy) #ifdef CONFIG_PROVE_RCU_LIST #define __list_check_rcu(dummy, cond, extra...) \ ({ \ check_arg_count_one(extra); \ RCU_LOCKDEP_WARN(!(cond) && !rcu_read_lock_any_held(), \ "RCU-list traversed in non-reader section!"); \ }) #define __list_check_srcu(cond) \ ({ \ RCU_LOCKDEP_WARN(!(cond), \ "RCU-list traversed without holding the required lock!");\ }) #else #define __list_check_rcu(dummy, cond, extra...) \ ({ check_arg_count_one(extra); }) #define __list_check_srcu(cond) ({ }) #endif /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_add_rcu(struct list_head *new, struct list_head *prev, struct list_head *next) { if (!__list_add_valid(new, prev, next)) return; new->next = next; new->prev = prev; rcu_assign_pointer(list_next_rcu(prev), new); next->prev = new; } /** * list_add_rcu - add a new entry to rcu-protected list * @new: new entry to be added * @head: list head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as list_add_rcu() * or list_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). */ static inline void list_add_rcu(struct list_head *new, struct list_head *head) { __list_add_rcu(new, head, head->next); } /** * list_add_tail_rcu - add a new entry to rcu-protected list * @new: new entry to be added * @head: list head to add it before * * Insert a new entry before the specified head. * This is useful for implementing queues. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as list_add_tail_rcu() * or list_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). */ static inline void list_add_tail_rcu(struct list_head *new, struct list_head *head) { __list_add_rcu(new, head->prev, head); } /** * list_del_rcu - deletes entry from list without re-initialization * @entry: the element to delete from the list. * * Note: list_empty() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as list_del_rcu() * or list_add_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). * * Note that the caller is not permitted to immediately free * the newly deleted entry. Instead, either synchronize_rcu() * or call_rcu() must be used to defer freeing until an RCU * grace period has elapsed. */ static inline void list_del_rcu(struct list_head *entry) { __list_del_entry(entry); entry->prev = LIST_POISON2; } /** * hlist_del_init_rcu - deletes entry from hash list with re-initialization * @n: the element to delete from the hash list. * * Note: list_unhashed() on the node return true after this. It is * useful for RCU based read lockfree traversal if the writer side * must know if the list entry is still hashed or already unhashed. * * In particular, it means that we can not poison the forward pointers * that may still be used for walking the hash list and we can only * zero the pprev pointer so list_unhashed() will return true after * this. * * The caller must take whatever precautions are necessary (such as * holding appropriate locks) to avoid racing with another * list-mutation primitive, such as hlist_add_head_rcu() or * hlist_del_rcu(), running on this same list. However, it is * perfectly legal to run concurrently with the _rcu list-traversal * primitives, such as hlist_for_each_entry_rcu(). */ static inline void hlist_del_init_rcu(struct hlist_node *n) { if (!hlist_unhashed(n)) { __hlist_del(n); WRITE_ONCE(n->pprev, NULL); } } /** * list_replace_rcu - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * The @old entry will be replaced with the @new entry atomically. * Note: @old should not be empty. */ static inline void list_replace_rcu(struct list_head *old, struct list_head *new) { new->next = old->next; new->prev = old->prev; rcu_assign_pointer(list_next_rcu(new->prev), new); new->next->prev = new; old->prev = LIST_POISON2; } /** * __list_splice_init_rcu - join an RCU-protected list into an existing list. * @list: the RCU-protected list to splice * @prev: points to the last element of the existing list * @next: points to the first element of the existing list * @sync: synchronize_rcu, synchronize_rcu_expedited, ... * * The list pointed to by @prev and @next can be RCU-read traversed * concurrently with this function. * * Note that this function blocks. * * Important note: the caller must take whatever action is necessary to prevent * any other updates to the existing list. In principle, it is possible to * modify the list as soon as sync() begins execution. If this sort of thing * becomes necessary, an alternative version based on call_rcu() could be * created. But only if -really- needed -- there is no shortage of RCU API * members. */ static inline void __list_splice_init_rcu(struct list_head *list, struct list_head *prev, struct list_head *next, void (*sync)(void)) { struct list_head *first = list->next; struct list_head *last = list->prev; /* * "first" and "last" tracking list, so initialize it. RCU readers * have access to this list, so we must use INIT_LIST_HEAD_RCU() * instead of INIT_LIST_HEAD(). */ INIT_LIST_HEAD_RCU(list); /* * At this point, the list body still points to the source list. * Wait for any readers to finish using the list before splicing * the list body into the new list. Any new readers will see * an empty list. */ sync(); ASSERT_EXCLUSIVE_ACCESS(*first); ASSERT_EXCLUSIVE_ACCESS(*last); /* * Readers are finished with the source list, so perform splice. * The order is important if the new list is global and accessible * to concurrent RCU readers. Note that RCU readers are not * permitted to traverse the prev pointers without excluding * this function. */ last->next = next; rcu_assign_pointer(list_next_rcu(prev), first); first->prev = prev; next->prev = last; } /** * list_splice_init_rcu - splice an RCU-protected list into an existing list, * designed for stacks. * @list: the RCU-protected list to splice * @head: the place in the existing list to splice the first list into * @sync: synchronize_rcu, synchronize_rcu_expedited, ... */ static inline void list_splice_init_rcu(struct list_head *list, struct list_head *head, void (*sync)(void)) { if (!list_empty(list)) __list_splice_init_rcu(list, head, head->next, sync); } /** * list_splice_tail_init_rcu - splice an RCU-protected list into an existing * list, designed for queues. * @list: the RCU-protected list to splice * @head: the place in the existing list to splice the first list into * @sync: synchronize_rcu, synchronize_rcu_expedited, ... */ static inline void list_splice_tail_init_rcu(struct list_head *list, struct list_head *head, void (*sync)(void)) { if (!list_empty(list)) __list_splice_init_rcu(list, head->prev, head, sync); } /** * list_entry_rcu - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * This primitive may safely run concurrently with the _rcu list-mutation * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_entry_rcu(ptr, type, member) \ container_of(READ_ONCE(ptr), type, member) /* * Where are list_empty_rcu() and list_first_entry_rcu()? * * They do not exist because they would lead to subtle race conditions: * * if (!list_empty_rcu(mylist)) { * struct foo *bar = list_first_entry_rcu(mylist, struct foo, list_member); * do_something(bar); * } * * The list might be non-empty when list_empty_rcu() checks it, but it * might have become empty by the time that list_first_entry_rcu() rereads * the ->next pointer, which would result in a SEGV. * * When not using RCU, it is OK for list_first_entry() to re-read that * pointer because both functions should be protected by some lock that * blocks writers. * * When using RCU, list_empty() uses READ_ONCE() to fetch the * RCU-protected ->next pointer and then compares it to the address of the * list head. However, it neither dereferences this pointer nor provides * this pointer to its caller. Thus, READ_ONCE() suffices (that is, * rcu_dereference() is not needed), which means that list_empty() can be * used anywhere you would want to use list_empty_rcu(). Just don't * expect anything useful to happen if you do a subsequent lockless * call to list_first_entry_rcu()!!! * * See list_first_or_null_rcu for an alternative. */ /** * list_first_or_null_rcu - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. * * This primitive may safely run concurrently with the _rcu list-mutation * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_first_or_null_rcu(ptr, type, member) \ ({ \ struct list_head *__ptr = (ptr); \ struct list_head *__next = READ_ONCE(__ptr->next); \ likely(__ptr != __next) ? list_entry_rcu(__next, type, member) : NULL; \ }) /** * list_next_or_null_rcu - get the next element from a list * @head: the head for the list. * @ptr: the list head to take the next element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the ptr is at the end of the list, NULL is returned. * * This primitive may safely run concurrently with the _rcu list-mutation * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_next_or_null_rcu(head, ptr, type, member) \ ({ \ struct list_head *__head = (head); \ struct list_head *__ptr = (ptr); \ struct list_head *__next = READ_ONCE(__ptr->next); \ likely(__next != __head) ? list_entry_rcu(__next, type, \ member) : NULL; \ }) /** * list_for_each_entry_rcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ #define list_for_each_entry_rcu(pos, head, member, cond...) \ for (__list_check_rcu(dummy, ## cond, 0), \ pos = list_entry_rcu((head)->next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_srcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * @cond: lockdep expression for the lock required to traverse the list. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() * as long as the traversal is guarded by srcu_read_lock(). * The lockdep expression srcu_read_lock_held() can be passed as the * cond argument from read side. */ #define list_for_each_entry_srcu(pos, head, member, cond) \ for (__list_check_srcu(cond), \ pos = list_entry_rcu((head)->next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** * list_entry_lockless - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * This primitive may safely run concurrently with the _rcu * list-mutation primitives such as list_add_rcu(), but requires some * implicit RCU read-side guarding. One example is running within a special * exception-time environment where preemption is disabled and where lockdep * cannot be invoked. Another example is when items are added to the list, * but never deleted. */ #define list_entry_lockless(ptr, type, member) \ container_of((typeof(ptr))READ_ONCE(ptr), type, member) /** * list_for_each_entry_lockless - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_struct within the struct. * * This primitive may safely run concurrently with the _rcu * list-mutation primitives such as list_add_rcu(), but requires some * implicit RCU read-side guarding. One example is running within a special * exception-time environment where preemption is disabled and where lockdep * cannot be invoked. Another example is when items are added to the list, * but never deleted. */ #define list_for_each_entry_lockless(pos, head, member) \ for (pos = list_entry_lockless((head)->next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry_lockless(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_continue_rcu - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Continue to iterate over list of given type, continuing after * the current position which must have been in the list when the RCU read * lock was taken. * This would typically require either that you obtained the node from a * previous walk of the list in the same RCU read-side critical section, or * that you held some sort of non-RCU reference (such as a reference count) * to keep the node alive *and* in the list. * * This iterator is similar to list_for_each_entry_from_rcu() except * this starts after the given position and that one starts at the given * position. */ #define list_for_each_entry_continue_rcu(pos, head, member) \ for (pos = list_entry_rcu(pos->member.next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_from_rcu - iterate over a list from current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_node within the struct. * * Iterate over the tail of a list starting from a given position, * which must have been in the list when the RCU read lock was taken. * This would typically require either that you obtained the node from a * previous walk of the list in the same RCU read-side critical section, or * that you held some sort of non-RCU reference (such as a reference count) * to keep the node alive *and* in the list. * * This iterator is similar to list_for_each_entry_continue_rcu() except * this starts from the given position and that one starts from the position * after the given position. */ #define list_for_each_entry_from_rcu(pos, head, member) \ for (; &(pos)->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*(pos)), member)) /** * hlist_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. * * Note: list_unhashed() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the hash list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry(). */ static inline void hlist_del_rcu(struct hlist_node *n) { __hlist_del(n); WRITE_ONCE(n->pprev, LIST_POISON2); } /** * hlist_replace_rcu - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * The @old entry will be replaced with the @new entry atomically. */ static inline void hlist_replace_rcu(struct hlist_node *old, struct hlist_node *new) { struct hlist_node *next = old->next; new->next = next; WRITE_ONCE(new->pprev, old->pprev); rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new); if (next) WRITE_ONCE(new->next->pprev, &new->next); WRITE_ONCE(old->pprev, LIST_POISON2); } /** * hlists_swap_heads_rcu - swap the lists the hlist heads point to * @left: The hlist head on the left * @right: The hlist head on the right * * The lists start out as [@left ][node1 ... ] and * [@right ][node2 ... ] * The lists end up as [@left ][node2 ... ] * [@right ][node1 ... ] */ static inline void hlists_swap_heads_rcu(struct hlist_head *left, struct hlist_head *right) { struct hlist_node *node1 = left->first; struct hlist_node *node2 = right->first; rcu_assign_pointer(left->first, node2); rcu_assign_pointer(right->first, node1); WRITE_ONCE(node2->pprev, &left->first); WRITE_ONCE(node1->pprev, &right->first); } /* * return the first or the next element in an RCU protected hlist */ #define hlist_first_rcu(head) (*((struct hlist_node __rcu **)(&(head)->first))) #define hlist_next_rcu(node) (*((struct hlist_node __rcu **)(&(node)->next))) #define hlist_pprev_rcu(node) (*((struct hlist_node __rcu **)((node)->pprev))) /** * hlist_add_head_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_add_head_rcu(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; n->next = first; WRITE_ONCE(n->pprev, &h->first); rcu_assign_pointer(hlist_first_rcu(h), n); if (first) WRITE_ONCE(first->pprev, &n->next); } /** * hlist_add_tail_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_add_tail_rcu(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *i, *last = NULL; /* Note: write side code, so rcu accessors are not needed. */ for (i = h->first; i; i = i->next) last = i; if (last) { n->next = last->next; WRITE_ONCE(n->pprev, &last->next); rcu_assign_pointer(hlist_next_rcu(last), n); } else { hlist_add_head_rcu(n, h); } } /** * hlist_add_before_rcu * @n: the new element to add to the hash list. * @next: the existing element to add the new element before. * * Description: * Adds the specified element to the specified hlist * before the specified node while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. */ static inline void hlist_add_before_rcu(struct hlist_node *n, struct hlist_node *next) { WRITE_ONCE(n->pprev, next->pprev); n->next = next; rcu_assign_pointer(hlist_pprev_rcu(n), n); WRITE_ONCE(next->pprev, &n->next); } /** * hlist_add_behind_rcu * @n: the new element to add to the hash list. * @prev: the existing element to add the new element after. * * Description: * Adds the specified element to the specified hlist * after the specified node while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. */ static inline void hlist_add_behind_rcu(struct hlist_node *n, struct hlist_node *prev) { n->next = prev->next; WRITE_ONCE(n->pprev, &prev->next); rcu_assign_pointer(hlist_next_rcu(prev), n); if (n->next) WRITE_ONCE(n->next->pprev, &n->next); } #define __hlist_for_each_rcu(pos, head) \ for (pos = rcu_dereference(hlist_first_rcu(head)); \ pos; \ pos = rcu_dereference(hlist_next_rcu(pos))) /** * hlist_for_each_entry_rcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ #define hlist_for_each_entry_rcu(pos, head, member, cond...) \ for (__list_check_rcu(dummy, ## cond, 0), \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_srcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * @cond: lockdep expression for the lock required to traverse the list. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by srcu_read_lock(). * The lockdep expression srcu_read_lock_held() can be passed as the * cond argument from read side. */ #define hlist_for_each_entry_srcu(pos, head, member, cond) \ for (__list_check_srcu(cond), \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_rcu_notrace - iterate over rcu list of given type (for tracing) * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). * * This is the same as hlist_for_each_entry_rcu() except that it does * not do any RCU debugging or tracing. */ #define hlist_for_each_entry_rcu_notrace(pos, head, member) \ for (pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ #define hlist_for_each_entry_rcu_bh(pos, head, member) \ for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_continue_rcu - iterate over a hlist continuing after current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue_rcu(pos, member) \ for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_continue_rcu_bh - iterate over a hlist continuing after current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue_rcu_bh(pos, member) \ for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_from_rcu - iterate over a hlist continuing from current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_from_rcu(pos, member) \ for (; pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member)) #endif /* __KERNEL__ */ #endif |
| 2 2 2 2 2 2 2 2 2 2 3 3 2 3 2 3 3 3 3 1 3 3 2 4 4 3 3 3 3 3 2 5 5 1 1 1 1 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 | // SPDX-License-Identifier: GPL-2.0 #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/exportfs.h> #include <linux/fs_struct.h> #include <linux/fsnotify.h> #include <linux/personality.h> #include <linux/uaccess.h> #include <linux/compat.h> #include "internal.h" #include "mount.h" static long do_sys_name_to_handle(const struct path *path, struct file_handle __user *ufh, int __user *mnt_id, int fh_flags) { long retval; struct file_handle f_handle; int handle_dwords, handle_bytes; struct file_handle *handle = NULL; /* * We need to make sure whether the file system support decoding of * the file handle if decodeable file handle was requested. */ if (!exportfs_can_encode_fh(path->dentry->d_sb->s_export_op, fh_flags)) return -EOPNOTSUPP; if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) return -EFAULT; if (f_handle.handle_bytes > MAX_HANDLE_SZ) return -EINVAL; handle = kzalloc(struct_size(handle, f_handle, f_handle.handle_bytes), GFP_KERNEL); if (!handle) return -ENOMEM; /* convert handle size to multiple of sizeof(u32) */ handle_dwords = f_handle.handle_bytes >> 2; /* we ask for a non connectable maybe decodeable file handle */ retval = exportfs_encode_fh(path->dentry, (struct fid *)handle->f_handle, &handle_dwords, fh_flags); handle->handle_type = retval; /* convert handle size to bytes */ handle_bytes = handle_dwords * sizeof(u32); handle->handle_bytes = handle_bytes; if ((handle->handle_bytes > f_handle.handle_bytes) || (retval == FILEID_INVALID) || (retval < 0)) { /* As per old exportfs_encode_fh documentation * we could return ENOSPC to indicate overflow * But file system returned 255 always. So handle * both the values */ if (retval == FILEID_INVALID || retval == -ENOSPC) retval = -EOVERFLOW; /* * set the handle size to zero so we copy only * non variable part of the file_handle */ handle_bytes = 0; } else retval = 0; /* copy the mount id */ if (put_user(real_mount(path->mnt)->mnt_id, mnt_id) || copy_to_user(ufh, handle, struct_size(handle, f_handle, handle_bytes))) retval = -EFAULT; kfree(handle); return retval; } /** * sys_name_to_handle_at: convert name to handle * @dfd: directory relative to which name is interpreted if not absolute * @name: name that should be converted to handle. * @handle: resulting file handle * @mnt_id: mount id of the file system containing the file * @flag: flag value to indicate whether to follow symlink or not * and whether a decodable file handle is required. * * @handle->handle_size indicate the space available to store the * variable part of the file handle in bytes. If there is not * enough space, the field is updated to return the minimum * value required. */ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, struct file_handle __user *, handle, int __user *, mnt_id, int, flag) { struct path path; int lookup_flags; int fh_flags; int err; if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID)) return -EINVAL; lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0; fh_flags = (flag & AT_HANDLE_FID) ? EXPORT_FH_FID : 0; if (flag & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; err = user_path_at(dfd, name, lookup_flags, &path); if (!err) { err = do_sys_name_to_handle(&path, handle, mnt_id, fh_flags); path_put(&path); } return err; } static struct vfsmount *get_vfsmount_from_fd(int fd) { struct vfsmount *mnt; if (fd == AT_FDCWD) { struct fs_struct *fs = current->fs; spin_lock(&fs->lock); mnt = mntget(fs->pwd.mnt); spin_unlock(&fs->lock); } else { struct fd f = fdget(fd); if (!f.file) return ERR_PTR(-EBADF); mnt = mntget(f.file->f_path.mnt); fdput(f); } return mnt; } static int vfs_dentry_acceptable(void *context, struct dentry *dentry) { return 1; } static int do_handle_to_path(int mountdirfd, struct file_handle *handle, struct path *path) { int retval = 0; int handle_dwords; path->mnt = get_vfsmount_from_fd(mountdirfd); if (IS_ERR(path->mnt)) { retval = PTR_ERR(path->mnt); goto out_err; } /* change the handle size to multiple of sizeof(u32) */ handle_dwords = handle->handle_bytes >> 2; path->dentry = exportfs_decode_fh(path->mnt, (struct fid *)handle->f_handle, handle_dwords, handle->handle_type, vfs_dentry_acceptable, NULL); if (IS_ERR(path->dentry)) { retval = PTR_ERR(path->dentry); goto out_mnt; } return 0; out_mnt: mntput(path->mnt); out_err: return retval; } static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, struct path *path) { int retval = 0; struct file_handle f_handle; struct file_handle *handle = NULL; /* * With handle we don't look at the execute bit on the * directory. Ideally we would like CAP_DAC_SEARCH. * But we don't have that */ if (!capable(CAP_DAC_READ_SEARCH)) { retval = -EPERM; goto out_err; } if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { retval = -EFAULT; goto out_err; } if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || (f_handle.handle_bytes == 0)) { retval = -EINVAL; goto out_err; } handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes), GFP_KERNEL); if (!handle) { retval = -ENOMEM; goto out_err; } /* copy the full handle */ *handle = f_handle; if (copy_from_user(&handle->f_handle, &ufh->f_handle, f_handle.handle_bytes)) { retval = -EFAULT; goto out_handle; } retval = do_handle_to_path(mountdirfd, handle, path); out_handle: kfree(handle); out_err: return retval; } static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, int open_flag) { long retval = 0; struct path path; struct file *file; int fd; retval = handle_to_path(mountdirfd, ufh, &path); if (retval) return retval; fd = get_unused_fd_flags(open_flag); if (fd < 0) { path_put(&path); return fd; } file = file_open_root(&path, "", open_flag, 0); if (IS_ERR(file)) { put_unused_fd(fd); retval = PTR_ERR(file); } else { retval = fd; fd_install(fd, file); } path_put(&path); return retval; } /** * sys_open_by_handle_at: Open the file handle * @mountdirfd: directory file descriptor * @handle: file handle to be opened * @flags: open flags. * * @mountdirfd indicate the directory file descriptor * of the mount point. file handle is decoded relative * to the vfsmount pointed by the @mountdirfd. @flags * value is same as the open(2) flags. */ SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, struct file_handle __user *, handle, int, flags) { long ret; if (force_o_largefile()) flags |= O_LARGEFILE; ret = do_handle_open(mountdirfd, handle, flags); return ret; } #ifdef CONFIG_COMPAT /* * Exactly like fs/open.c:sys_open_by_handle_at(), except that it * doesn't set the O_LARGEFILE flag. */ COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, struct file_handle __user *, handle, int, flags) { return do_handle_open(mountdirfd, handle, flags); } #endif |
| 8 8 4 2 2 2 2 4 8 8 8 8 8 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 | // SPDX-License-Identifier: GPL-2.0 /* * HugeTLB Vmemmap Optimization (HVO) * * Copyright (c) 2020, ByteDance. All rights reserved. * * Author: Muchun Song <songmuchun@bytedance.com> * * See Documentation/mm/vmemmap_dedup.rst */ #define pr_fmt(fmt) "HugeTLB: " fmt #include <linux/pgtable.h> #include <linux/moduleparam.h> #include <linux/bootmem_info.h> #include <linux/mmdebug.h> #include <linux/pagewalk.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include "hugetlb_vmemmap.h" /** * struct vmemmap_remap_walk - walk vmemmap page table * * @remap_pte: called for each lowest-level entry (PTE). * @nr_walked: the number of walked pte. * @reuse_page: the page which is reused for the tail vmemmap pages. * @reuse_addr: the virtual address of the @reuse_page page. * @vmemmap_pages: the list head of the vmemmap pages that can be freed * or is mapped from. * @flags: used to modify behavior in vmemmap page table walking * operations. */ struct vmemmap_remap_walk { void (*remap_pte)(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk); unsigned long nr_walked; struct page *reuse_page; unsigned long reuse_addr; struct list_head *vmemmap_pages; /* Skip the TLB flush when we split the PMD */ #define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0) /* Skip the TLB flush when we remap the PTE */ #define VMEMMAP_REMAP_NO_TLB_FLUSH BIT(1) unsigned long flags; }; static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start, struct vmemmap_remap_walk *walk) { pmd_t __pmd; int i; unsigned long addr = start; pte_t *pgtable; pgtable = pte_alloc_one_kernel(&init_mm); if (!pgtable) return -ENOMEM; pmd_populate_kernel(&init_mm, &__pmd, pgtable); for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { pte_t entry, *pte; pgprot_t pgprot = PAGE_KERNEL; entry = mk_pte(head + i, pgprot); pte = pte_offset_kernel(&__pmd, addr); set_pte_at(&init_mm, addr, pte, entry); } spin_lock(&init_mm.page_table_lock); if (likely(pmd_leaf(*pmd))) { /* * Higher order allocations from buddy allocator must be able to * be treated as indepdenent small pages (as they can be freed * individually). */ if (!PageReserved(head)) split_page(head, get_order(PMD_SIZE)); /* Make pte visible before pmd. See comment in pmd_install(). */ smp_wmb(); pmd_populate_kernel(&init_mm, pmd, pgtable); if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH)) flush_tlb_kernel_range(start, start + PMD_SIZE); } else { pte_free_kernel(&init_mm, pgtable); } spin_unlock(&init_mm.page_table_lock); return 0; } static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { int ret = 0; struct page *head; struct vmemmap_remap_walk *vmemmap_walk = walk->private; /* Only splitting, not remapping the vmemmap pages. */ if (!vmemmap_walk->remap_pte) walk->action = ACTION_CONTINUE; spin_lock(&init_mm.page_table_lock); head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL; /* * Due to HugeTLB alignment requirements and the vmemmap * pages being at the start of the hotplugged memory * region in memory_hotplug.memmap_on_memory case. Checking * the vmemmap page associated with the first vmemmap page * if it is self-hosted is sufficient. * * [ hotplugged memory ] * [ section ][...][ section ] * [ vmemmap ][ usable memory ] * ^ | ^ | * +--+ | | * +------------------------+ */ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) { struct page *page = head ? head + pte_index(addr) : pte_page(ptep_get(pte_offset_kernel(pmd, addr))); if (PageVmemmapSelfHosted(page)) ret = -ENOTSUPP; } spin_unlock(&init_mm.page_table_lock); if (!head || ret) return ret; return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk); } static int vmemmap_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct vmemmap_remap_walk *vmemmap_walk = walk->private; /* * The reuse_page is found 'first' in page table walking before * starting remapping. */ if (!vmemmap_walk->reuse_page) vmemmap_walk->reuse_page = pte_page(ptep_get(pte)); else vmemmap_walk->remap_pte(pte, addr, vmemmap_walk); vmemmap_walk->nr_walked++; return 0; } static const struct mm_walk_ops vmemmap_remap_ops = { .pmd_entry = vmemmap_pmd_entry, .pte_entry = vmemmap_pte_entry, }; static int vmemmap_remap_range(unsigned long start, unsigned long end, struct vmemmap_remap_walk *walk) { int ret; VM_BUG_ON(!PAGE_ALIGNED(start | end)); mmap_read_lock(&init_mm); ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops, NULL, walk); mmap_read_unlock(&init_mm); if (ret) return ret; if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH)) flush_tlb_kernel_range(start, end); return 0; } /* * Free a vmemmap page. A vmemmap page can be allocated from the memblock * allocator or buddy allocator. If the PG_reserved flag is set, it means * that it allocated from the memblock allocator, just free it via the * free_bootmem_page(). Otherwise, use __free_page(). */ static inline void free_vmemmap_page(struct page *page) { if (PageReserved(page)) free_bootmem_page(page); else __free_page(page); } /* Free a list of the vmemmap pages */ static void free_vmemmap_page_list(struct list_head *list) { struct page *page, *next; list_for_each_entry_safe(page, next, list, lru) free_vmemmap_page(page); } static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { /* * Remap the tail pages as read-only to catch illegal write operation * to the tail pages. */ pgprot_t pgprot = PAGE_KERNEL_RO; struct page *page = pte_page(ptep_get(pte)); pte_t entry; /* Remapping the head page requires r/w */ if (unlikely(addr == walk->reuse_addr)) { pgprot = PAGE_KERNEL; list_del(&walk->reuse_page->lru); /* * Makes sure that preceding stores to the page contents from * vmemmap_remap_free() become visible before the set_pte_at() * write. */ smp_wmb(); } entry = mk_pte(walk->reuse_page, pgprot); list_add(&page->lru, walk->vmemmap_pages); set_pte_at(&init_mm, addr, pte, entry); } /* * How many struct page structs need to be reset. When we reuse the head * struct page, the special metadata (e.g. page->flags or page->mapping) * cannot copy to the tail struct page structs. The invalid value will be * checked in the free_tail_page_prepare(). In order to avoid the message * of "corrupted mapping in tail page". We need to reset at least 3 (one * head struct page struct and two tail struct page structs) struct page * structs. */ #define NR_RESET_STRUCT_PAGE 3 static inline void reset_struct_pages(struct page *start) { struct page *from = start + NR_RESET_STRUCT_PAGE; BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page)); memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE); } static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { pgprot_t pgprot = PAGE_KERNEL; struct page *page; void *to; BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page); page = list_first_entry(walk->vmemmap_pages, struct page, lru); list_del(&page->lru); to = page_to_virt(page); copy_page(to, (void *)walk->reuse_addr); reset_struct_pages(to); /* * Makes sure that preceding stores to the page contents become visible * before the set_pte_at() write. */ smp_wmb(); set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); } /** * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end) * backing PMDs of the directmap into PTEs * @start: start address of the vmemmap virtual address range that we want * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. * @reuse: reuse address. * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_split(unsigned long start, unsigned long end, unsigned long reuse) { struct vmemmap_remap_walk walk = { .remap_pte = NULL, .flags = VMEMMAP_SPLIT_NO_TLB_FLUSH, }; /* See the comment in the vmemmap_remap_free(). */ BUG_ON(start - reuse != PAGE_SIZE); return vmemmap_remap_range(reuse, end, &walk); } /** * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) * to the page which @reuse is mapped to, then free vmemmap * which the range are mapped to. * @start: start address of the vmemmap virtual address range that we want * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. * @reuse: reuse address. * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is callers * responsibility to free pages. * @flags: modifications to vmemmap_remap_walk flags * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse, struct list_head *vmemmap_pages, unsigned long flags) { int ret; struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_remap_pte, .reuse_addr = reuse, .vmemmap_pages = vmemmap_pages, .flags = flags, }; int nid = page_to_nid((struct page *)reuse); gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; /* * Allocate a new head vmemmap page to avoid breaking a contiguous * block of struct page memory when freeing it back to page allocator * in free_vmemmap_page_list(). This will allow the likely contiguous * struct page backing memory to be kept contiguous and allowing for * more allocations of hugepages. Fallback to the currently * mapped head page in case should it fail to allocate. */ walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0); if (walk.reuse_page) { copy_page(page_to_virt(walk.reuse_page), (void *)walk.reuse_addr); list_add(&walk.reuse_page->lru, vmemmap_pages); } /* * In order to make remapping routine most efficient for the huge pages, * the routine of vmemmap page table walking has the following rules * (see more details from the vmemmap_pte_range()): * * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE) * should be continuous. * - The @reuse address is part of the range [@reuse, @end) that we are * walking which is passed to vmemmap_remap_range(). * - The @reuse address is the first in the complete range. * * So we need to make sure that @start and @reuse meet the above rules. */ BUG_ON(start - reuse != PAGE_SIZE); ret = vmemmap_remap_range(reuse, end, &walk); if (ret && walk.nr_walked) { end = reuse + walk.nr_walked * PAGE_SIZE; /* * vmemmap_pages contains pages from the previous * vmemmap_remap_range call which failed. These * are pages which were removed from the vmemmap. * They will be restored in the following call. */ walk = (struct vmemmap_remap_walk) { .remap_pte = vmemmap_restore_pte, .reuse_addr = reuse, .vmemmap_pages = vmemmap_pages, .flags = 0, }; vmemmap_remap_range(reuse, end, &walk); } return ret; } static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, struct list_head *list) { gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL; unsigned long nr_pages = (end - start) >> PAGE_SHIFT; int nid = page_to_nid((struct page *)start); struct page *page, *next; while (nr_pages--) { page = alloc_pages_node(nid, gfp_mask, 0); if (!page) goto out; list_add(&page->lru, list); } return 0; out: list_for_each_entry_safe(page, next, list, lru) __free_page(page); return -ENOMEM; } /** * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end) * to the page which is from the @vmemmap_pages * respectively. * @start: start address of the vmemmap virtual address range that we want * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. * @reuse: reuse address. * @flags: modifications to vmemmap_remap_walk flags * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_alloc(unsigned long start, unsigned long end, unsigned long reuse, unsigned long flags) { LIST_HEAD(vmemmap_pages); struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_restore_pte, .reuse_addr = reuse, .vmemmap_pages = &vmemmap_pages, .flags = flags, }; /* See the comment in the vmemmap_remap_free(). */ BUG_ON(start - reuse != PAGE_SIZE); if (alloc_vmemmap_page_list(start, end, &vmemmap_pages)) return -ENOMEM; return vmemmap_remap_range(reuse, end, &walk); } DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0); static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio, unsigned long flags) { int ret; unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; unsigned long vmemmap_reuse; VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); if (!folio_test_hugetlb_vmemmap_optimized(folio)) return 0; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_reuse = vmemmap_start; vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* * The pages which the vmemmap virtual address range [@vmemmap_start, * @vmemmap_end) are mapped to are freed to the buddy allocator, and * the range is mapped to the page which @vmemmap_reuse is mapped to. * When a HugeTLB page is freed to the buddy allocator, previously * discarded vmemmap pages must be allocated and remapping. */ ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags); if (!ret) { folio_clear_hugetlb_vmemmap_optimized(folio); static_branch_dec(&hugetlb_optimize_vmemmap_key); } return ret; } /** * hugetlb_vmemmap_restore_folio - restore previously optimized (by * hugetlb_vmemmap_optimize_folio()) vmemmap pages which * will be reallocated and remapped. * @h: struct hstate. * @folio: the folio whose vmemmap pages will be restored. * * Return: %0 if @folio's vmemmap pages have been reallocated and remapped, * negative error code otherwise. */ int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) { return __hugetlb_vmemmap_restore_folio(h, folio, 0); } /** * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list. * @h: hstate. * @folio_list: list of folios. * @non_hvo_folios: Output list of folios for which vmemmap exists. * * Return: number of folios for which vmemmap was restored, or an error code * if an error was encountered restoring vmemmap for a folio. * Folios that have vmemmap are moved to the non_hvo_folios * list. Processing of entries stops when the first error is * encountered. The folio that experienced the error and all * non-processed folios will remain on folio_list. */ long hugetlb_vmemmap_restore_folios(const struct hstate *h, struct list_head *folio_list, struct list_head *non_hvo_folios) { struct folio *folio, *t_folio; long restored = 0; long ret = 0; list_for_each_entry_safe(folio, t_folio, folio_list, lru) { if (folio_test_hugetlb_vmemmap_optimized(folio)) { ret = __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_REMAP_NO_TLB_FLUSH); if (ret) break; restored++; } /* Add non-optimized folios to output list */ list_move(&folio->lru, non_hvo_folios); } if (restored) flush_tlb_all(); if (!ret) ret = restored; return ret; } /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */ static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio) { if (folio_test_hugetlb_vmemmap_optimized(folio)) return false; if (!READ_ONCE(vmemmap_optimize_enabled)) return false; if (!hugetlb_vmemmap_optimizable(h)) return false; return true; } static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio, struct list_head *vmemmap_pages, unsigned long flags) { int ret = 0; unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; unsigned long vmemmap_reuse; VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); if (!vmemmap_should_optimize_folio(h, folio)) return ret; static_branch_inc(&hugetlb_optimize_vmemmap_key); /* * Very Subtle * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed * immediately after remapping. As a result, subsequent accesses * and modifications to struct pages associated with the hugetlb * page could be to the OLD struct pages. Set the vmemmap optimized * flag here so that it is copied to the new head page. This keeps * the old and new struct pages in sync. * If there is an error during optimization, we will immediately FLUSH * the TLB and clear the flag below. */ folio_set_hugetlb_vmemmap_optimized(folio); vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_reuse = vmemmap_start; vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end) * to the page which @vmemmap_reuse is mapped to. Add pages previously * mapping the range to vmemmap_pages list so that they can be freed by * the caller. */ ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, vmemmap_pages, flags); if (ret) { static_branch_dec(&hugetlb_optimize_vmemmap_key); folio_clear_hugetlb_vmemmap_optimized(folio); } return ret; } /** * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages. * @h: struct hstate. * @folio: the folio whose vmemmap pages will be optimized. * * This function only tries to optimize @folio's vmemmap pages and does not * guarantee that the optimization will succeed after it returns. The caller * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's * vmemmap pages have been optimized. */ void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) { LIST_HEAD(vmemmap_pages); __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0); free_vmemmap_page_list(&vmemmap_pages); } static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio) { unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; unsigned long vmemmap_reuse; if (!vmemmap_should_optimize_folio(h, folio)) return 0; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_reuse = vmemmap_start; vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* * Split PMDs on the vmemmap virtual address range [@vmemmap_start, * @vmemmap_end] */ return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse); } void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) { struct folio *folio; LIST_HEAD(vmemmap_pages); list_for_each_entry(folio, folio_list, lru) { int ret = hugetlb_vmemmap_split_folio(h, folio); /* * Spliting the PMD requires allocating a page, thus lets fail * early once we encounter the first OOM. No point in retrying * as it can be dynamically done on remap with the memory * we get back from the vmemmap deduplication. */ if (ret == -ENOMEM) break; } flush_tlb_all(); list_for_each_entry(folio, folio_list, lru) { int ret; ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_REMAP_NO_TLB_FLUSH); /* * Pages to be freed may have been accumulated. If we * encounter an ENOMEM, free what we have and try again. * This can occur in the case that both spliting fails * halfway and head page allocation also failed. In this * case __hugetlb_vmemmap_optimize_folio() would free memory * allowing more vmemmap remaps to occur. */ if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) { flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); INIT_LIST_HEAD(&vmemmap_pages); __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_REMAP_NO_TLB_FLUSH); } } flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); } static struct ctl_table hugetlb_vmemmap_sysctls[] = { { .procname = "hugetlb_optimize_vmemmap", .data = &vmemmap_optimize_enabled, .maxlen = sizeof(vmemmap_optimize_enabled), .mode = 0644, .proc_handler = proc_dobool, }, }; static int __init hugetlb_vmemmap_init(void) { const struct hstate *h; /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */ BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES); for_each_hstate(h) { if (hugetlb_vmemmap_optimizable(h)) { register_sysctl_init("vm", hugetlb_vmemmap_sysctls); break; } } return 0; } late_initcall(hugetlb_vmemmap_init); |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_GENERIC_BITOPS___FFS_H_ #define _ASM_GENERIC_BITOPS___FFS_H_ #include <asm/types.h> /** * generic___ffs - find first bit in word. * @word: The word to search * * Undefined if no bit exists, so code should check against 0 first. */ static __always_inline unsigned int generic___ffs(unsigned long word) { unsigned int num = 0; #if BITS_PER_LONG == 64 if ((word & 0xffffffff) == 0) { num += 32; word >>= 32; } #endif if ((word & 0xffff) == 0) { num += 16; word >>= 16; } if ((word & 0xff) == 0) { num += 8; word >>= 8; } if ((word & 0xf) == 0) { num += 4; word >>= 4; } if ((word & 0x3) == 0) { num += 2; word >>= 2; } if ((word & 0x1) == 0) num += 1; return num; } #ifndef __HAVE_ARCH___FFS #define __ffs(word) generic___ffs(word) #endif #endif /* _ASM_GENERIC_BITOPS___FFS_H_ */ |
| 72 73 73 73 73 73 73 73 73 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 | // SPDX-License-Identifier: GPL-2.0-only /* * Common framework for low-level network console, dump, and debugger code * * Sep 8 2003 Matt Mackall <mpm@selenic.com> * * based on the netconsole code from: * * Copyright (C) 2001 Ingo Molnar <mingo@redhat.com> * Copyright (C) 2002 Red Hat, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/moduleparam.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/string.h> #include <linux/if_arp.h> #include <linux/inetdevice.h> #include <linux/inet.h> #include <linux/interrupt.h> #include <linux/netpoll.h> #include <linux/sched.h> #include <linux/delay.h> #include <linux/rcupdate.h> #include <linux/workqueue.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/if_vlan.h> #include <net/tcp.h> #include <net/udp.h> #include <net/addrconf.h> #include <net/ndisc.h> #include <net/ip6_checksum.h> #include <asm/unaligned.h> #include <trace/events/napi.h> #include <linux/kconfig.h> /* * We maintain a small pool of fully-sized skbs, to make sure the * message gets out even in extreme OOM situations. */ #define MAX_UDP_CHUNK 1460 #define MAX_SKBS 32 static struct sk_buff_head skb_pool; DEFINE_STATIC_SRCU(netpoll_srcu); #define USEC_PER_POLL 50 #define MAX_SKB_SIZE \ (sizeof(struct ethhdr) + \ sizeof(struct iphdr) + \ sizeof(struct udphdr) + \ MAX_UDP_CHUNK) static void zap_completion_queue(void); static unsigned int carrier_timeout = 4; module_param(carrier_timeout, uint, 0644); #define np_info(np, fmt, ...) \ pr_info("%s: " fmt, np->name, ##__VA_ARGS__) #define np_err(np, fmt, ...) \ pr_err("%s: " fmt, np->name, ##__VA_ARGS__) #define np_notice(np, fmt, ...) \ pr_notice("%s: " fmt, np->name, ##__VA_ARGS__) static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { netdev_tx_t status = NETDEV_TX_OK; netdev_features_t features; features = netif_skb_features(skb); if (skb_vlan_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) { skb = __vlan_hwaccel_push_inside(skb); if (unlikely(!skb)) { /* This is actually a packet drop, but we * don't want the code that calls this * function to try and operate on a NULL skb. */ goto out; } } status = netdev_start_xmit(skb, dev, txq, false); out: return status; } static void queue_process(struct work_struct *work) { struct netpoll_info *npinfo = container_of(work, struct netpoll_info, tx_work.work); struct sk_buff *skb; unsigned long flags; while ((skb = skb_dequeue(&npinfo->txq))) { struct net_device *dev = skb->dev; struct netdev_queue *txq; unsigned int q_index; if (!netif_device_present(dev) || !netif_running(dev)) { kfree_skb(skb); continue; } local_irq_save(flags); /* check if skb->queue_mapping is still valid */ q_index = skb_get_queue_mapping(skb); if (unlikely(q_index >= dev->real_num_tx_queues)) { q_index = q_index % dev->real_num_tx_queues; skb_set_queue_mapping(skb, q_index); } txq = netdev_get_tx_queue(dev, q_index); HARD_TX_LOCK(dev, txq, smp_processor_id()); if (netif_xmit_frozen_or_stopped(txq) || !dev_xmit_complete(netpoll_start_xmit(skb, dev, txq))) { skb_queue_head(&npinfo->txq, skb); HARD_TX_UNLOCK(dev, txq); local_irq_restore(flags); schedule_delayed_work(&npinfo->tx_work, HZ/10); return; } HARD_TX_UNLOCK(dev, txq); local_irq_restore(flags); } } static int netif_local_xmit_active(struct net_device *dev) { int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); if (READ_ONCE(txq->xmit_lock_owner) == smp_processor_id()) return 1; } return 0; } static void poll_one_napi(struct napi_struct *napi) { int work; /* If we set this bit but see that it has already been set, * that indicates that napi has been disabled and we need * to abort this operation */ if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state)) return; /* We explicilty pass the polling call a budget of 0 to * indicate that we are clearing the Tx path only. */ work = napi->poll(napi, 0); WARN_ONCE(work, "%pS exceeded budget in poll\n", napi->poll); trace_napi_poll(napi, work, 0); clear_bit(NAPI_STATE_NPSVC, &napi->state); } static void poll_napi(struct net_device *dev) { struct napi_struct *napi; int cpu = smp_processor_id(); list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) { poll_one_napi(napi); smp_store_release(&napi->poll_owner, -1); } } } void netpoll_poll_dev(struct net_device *dev) { struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); const struct net_device_ops *ops; /* Don't do any rx activity if the dev_lock mutex is held * the dev_open/close paths use this to block netpoll activity * while changing device state */ if (!ni || down_trylock(&ni->dev_lock)) return; /* Some drivers will take the same locks in poll and xmit, * we can't poll if local CPU is already in xmit. */ if (!netif_running(dev) || netif_local_xmit_active(dev)) { up(&ni->dev_lock); return; } ops = dev->netdev_ops; if (ops->ndo_poll_controller) ops->ndo_poll_controller(dev); poll_napi(dev); up(&ni->dev_lock); zap_completion_queue(); } EXPORT_SYMBOL(netpoll_poll_dev); void netpoll_poll_disable(struct net_device *dev) { struct netpoll_info *ni; int idx; might_sleep(); idx = srcu_read_lock(&netpoll_srcu); ni = srcu_dereference(dev->npinfo, &netpoll_srcu); if (ni) down(&ni->dev_lock); srcu_read_unlock(&netpoll_srcu, idx); } EXPORT_SYMBOL(netpoll_poll_disable); void netpoll_poll_enable(struct net_device *dev) { struct netpoll_info *ni; rcu_read_lock(); ni = rcu_dereference(dev->npinfo); if (ni) up(&ni->dev_lock); rcu_read_unlock(); } EXPORT_SYMBOL(netpoll_poll_enable); static void refill_skbs(void) { struct sk_buff *skb; unsigned long flags; spin_lock_irqsave(&skb_pool.lock, flags); while (skb_pool.qlen < MAX_SKBS) { skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); if (!skb) break; __skb_queue_tail(&skb_pool, skb); } spin_unlock_irqrestore(&skb_pool.lock, flags); } static void zap_completion_queue(void) { unsigned long flags; struct softnet_data *sd = &get_cpu_var(softnet_data); if (sd->completion_queue) { struct sk_buff *clist; local_irq_save(flags); clist = sd->completion_queue; sd->completion_queue = NULL; local_irq_restore(flags); while (clist != NULL) { struct sk_buff *skb = clist; clist = clist->next; if (!skb_irq_freeable(skb)) { refcount_set(&skb->users, 1); dev_kfree_skb_any(skb); /* put this one back */ } else { __kfree_skb(skb); } } } put_cpu_var(softnet_data); } static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve) { int count = 0; struct sk_buff *skb; zap_completion_queue(); refill_skbs(); repeat: skb = alloc_skb(len, GFP_ATOMIC); if (!skb) skb = skb_dequeue(&skb_pool); if (!skb) { if (++count < 10) { netpoll_poll_dev(np->dev); goto repeat; } return NULL; } refcount_set(&skb->users, 1); skb_reserve(skb, reserve); return skb; } static int netpoll_owner_active(struct net_device *dev) { struct napi_struct *napi; list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { if (READ_ONCE(napi->poll_owner) == smp_processor_id()) return 1; } return 0; } /* call with IRQ disabled */ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { netdev_tx_t status = NETDEV_TX_BUSY; struct net_device *dev; unsigned long tries; /* It is up to the caller to keep npinfo alive. */ struct netpoll_info *npinfo; lockdep_assert_irqs_disabled(); dev = np->dev; npinfo = rcu_dereference_bh(dev->npinfo); if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { dev_kfree_skb_irq(skb); return NET_XMIT_DROP; } /* don't get messages out of order, and no recursion */ if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) { struct netdev_queue *txq; txq = netdev_core_pick_tx(dev, skb, NULL); /* try until next clock tick */ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) { if (HARD_TX_TRYLOCK(dev, txq)) { if (!netif_xmit_stopped(txq)) status = netpoll_start_xmit(skb, dev, txq); HARD_TX_UNLOCK(dev, txq); if (dev_xmit_complete(status)) break; } /* tickle device maybe there is some cleanup */ netpoll_poll_dev(np->dev); udelay(USEC_PER_POLL); } WARN_ONCE(!irqs_disabled(), "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pS)\n", dev->name, dev->netdev_ops->ndo_start_xmit); } if (!dev_xmit_complete(status)) { skb_queue_tail(&npinfo->txq, skb); schedule_delayed_work(&npinfo->tx_work,0); } return NETDEV_TX_OK; } netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { unsigned long flags; netdev_tx_t ret; if (unlikely(!np)) { dev_kfree_skb_irq(skb); ret = NET_XMIT_DROP; } else { local_irq_save(flags); ret = __netpoll_send_skb(np, skb); local_irq_restore(flags); } return ret; } EXPORT_SYMBOL(netpoll_send_skb); void netpoll_send_udp(struct netpoll *np, const char *msg, int len) { int total_len, ip_len, udp_len; struct sk_buff *skb; struct udphdr *udph; struct iphdr *iph; struct ethhdr *eth; static atomic_t ip_ident; struct ipv6hdr *ip6h; if (!IS_ENABLED(CONFIG_PREEMPT_RT)) WARN_ON_ONCE(!irqs_disabled()); udp_len = len + sizeof(*udph); if (np->ipv6) ip_len = udp_len + sizeof(*ip6h); else ip_len = udp_len + sizeof(*iph); total_len = ip_len + LL_RESERVED_SPACE(np->dev); skb = find_skb(np, total_len + np->dev->needed_tailroom, total_len - len); if (!skb) return; skb_copy_to_linear_data(skb, msg, len); skb_put(skb, len); skb_push(skb, sizeof(*udph)); skb_reset_transport_header(skb); udph = udp_hdr(skb); udph->source = htons(np->local_port); udph->dest = htons(np->remote_port); udph->len = htons(udp_len); if (np->ipv6) { udph->check = 0; udph->check = csum_ipv6_magic(&np->local_ip.in6, &np->remote_ip.in6, udp_len, IPPROTO_UDP, csum_partial(udph, udp_len, 0)); if (udph->check == 0) udph->check = CSUM_MANGLED_0; skb_push(skb, sizeof(*ip6h)); skb_reset_network_header(skb); ip6h = ipv6_hdr(skb); /* ip6h->version = 6; ip6h->priority = 0; */ *(unsigned char *)ip6h = 0x60; ip6h->flow_lbl[0] = 0; ip6h->flow_lbl[1] = 0; ip6h->flow_lbl[2] = 0; ip6h->payload_len = htons(sizeof(struct udphdr) + len); ip6h->nexthdr = IPPROTO_UDP; ip6h->hop_limit = 32; ip6h->saddr = np->local_ip.in6; ip6h->daddr = np->remote_ip.in6; eth = skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); skb->protocol = eth->h_proto = htons(ETH_P_IPV6); } else { udph->check = 0; udph->check = csum_tcpudp_magic(np->local_ip.ip, np->remote_ip.ip, udp_len, IPPROTO_UDP, csum_partial(udph, udp_len, 0)); if (udph->check == 0) udph->check = CSUM_MANGLED_0; skb_push(skb, sizeof(*iph)); skb_reset_network_header(skb); iph = ip_hdr(skb); /* iph->version = 4; iph->ihl = 5; */ *(unsigned char *)iph = 0x45; iph->tos = 0; put_unaligned(htons(ip_len), &(iph->tot_len)); iph->id = htons(atomic_inc_return(&ip_ident)); iph->frag_off = 0; iph->ttl = 64; iph->protocol = IPPROTO_UDP; iph->check = 0; put_unaligned(np->local_ip.ip, &(iph->saddr)); put_unaligned(np->remote_ip.ip, &(iph->daddr)); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); eth = skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); skb->protocol = eth->h_proto = htons(ETH_P_IP); } ether_addr_copy(eth->h_source, np->dev->dev_addr); ether_addr_copy(eth->h_dest, np->remote_mac); skb->dev = np->dev; netpoll_send_skb(np, skb); } EXPORT_SYMBOL(netpoll_send_udp); void netpoll_print_options(struct netpoll *np) { np_info(np, "local port %d\n", np->local_port); if (np->ipv6) np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6); else np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip); np_info(np, "interface '%s'\n", np->dev_name); np_info(np, "remote port %d\n", np->remote_port); if (np->ipv6) np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6); else np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip); np_info(np, "remote ethernet address %pM\n", np->remote_mac); } EXPORT_SYMBOL(netpoll_print_options); static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr) { const char *end; if (!strchr(str, ':') && in4_pton(str, -1, (void *)addr, -1, &end) > 0) { if (!*end) return 0; } if (in6_pton(str, -1, addr->in6.s6_addr, -1, &end) > 0) { #if IS_ENABLED(CONFIG_IPV6) if (!*end) return 1; #else return -1; #endif } return -1; } int netpoll_parse_options(struct netpoll *np, char *opt) { char *cur=opt, *delim; int ipv6; bool ipversion_set = false; if (*cur != '@') { if ((delim = strchr(cur, '@')) == NULL) goto parse_failed; *delim = 0; if (kstrtou16(cur, 10, &np->local_port)) goto parse_failed; cur = delim; } cur++; if (*cur != '/') { ipversion_set = true; if ((delim = strchr(cur, '/')) == NULL) goto parse_failed; *delim = 0; ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip); if (ipv6 < 0) goto parse_failed; else np->ipv6 = (bool)ipv6; cur = delim; } cur++; if (*cur != ',') { /* parse out dev name */ if ((delim = strchr(cur, ',')) == NULL) goto parse_failed; *delim = 0; strscpy(np->dev_name, cur, sizeof(np->dev_name)); cur = delim; } cur++; if (*cur != '@') { /* dst port */ if ((delim = strchr(cur, '@')) == NULL) goto parse_failed; *delim = 0; if (*cur == ' ' || *cur == '\t') np_info(np, "warning: whitespace is not allowed\n"); if (kstrtou16(cur, 10, &np->remote_port)) goto parse_failed; cur = delim; } cur++; /* dst ip */ if ((delim = strchr(cur, '/')) == NULL) goto parse_failed; *delim = 0; ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip); if (ipv6 < 0) goto parse_failed; else if (ipversion_set && np->ipv6 != (bool)ipv6) goto parse_failed; else np->ipv6 = (bool)ipv6; cur = delim + 1; if (*cur != 0) { /* MAC address */ if (!mac_pton(cur, np->remote_mac)) goto parse_failed; } netpoll_print_options(np); return 0; parse_failed: np_info(np, "couldn't parse config at '%s'!\n", cur); return -1; } EXPORT_SYMBOL(netpoll_parse_options); int __netpoll_setup(struct netpoll *np, struct net_device *ndev) { struct netpoll_info *npinfo; const struct net_device_ops *ops; int err; np->dev = ndev; strscpy(np->dev_name, ndev->name, IFNAMSIZ); if (ndev->priv_flags & IFF_DISABLE_NETPOLL) { np_err(np, "%s doesn't support polling, aborting\n", np->dev_name); err = -ENOTSUPP; goto out; } if (!ndev->npinfo) { npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); if (!npinfo) { err = -ENOMEM; goto out; } sema_init(&npinfo->dev_lock, 1); skb_queue_head_init(&npinfo->txq); INIT_DELAYED_WORK(&npinfo->tx_work, queue_process); refcount_set(&npinfo->refcnt, 1); ops = np->dev->netdev_ops; if (ops->ndo_netpoll_setup) { err = ops->ndo_netpoll_setup(ndev, npinfo); if (err) goto free_npinfo; } } else { npinfo = rtnl_dereference(ndev->npinfo); refcount_inc(&npinfo->refcnt); } npinfo->netpoll = np; /* last thing to do is link it to the net device structure */ rcu_assign_pointer(ndev->npinfo, npinfo); return 0; free_npinfo: kfree(npinfo); out: return err; } EXPORT_SYMBOL_GPL(__netpoll_setup); int netpoll_setup(struct netpoll *np) { struct net_device *ndev = NULL; struct in_device *in_dev; int err; rtnl_lock(); if (np->dev_name[0]) { struct net *net = current->nsproxy->net_ns; ndev = __dev_get_by_name(net, np->dev_name); } if (!ndev) { np_err(np, "%s doesn't exist, aborting\n", np->dev_name); err = -ENODEV; goto unlock; } netdev_hold(ndev, &np->dev_tracker, GFP_KERNEL); if (netdev_master_upper_dev_get(ndev)) { np_err(np, "%s is a slave device, aborting\n", np->dev_name); err = -EBUSY; goto put; } if (!netif_running(ndev)) { unsigned long atmost; np_info(np, "device %s not up yet, forcing it\n", np->dev_name); err = dev_open(ndev, NULL); if (err) { np_err(np, "failed to open %s\n", ndev->name); goto put; } rtnl_unlock(); atmost = jiffies + carrier_timeout * HZ; while (!netif_carrier_ok(ndev)) { if (time_after(jiffies, atmost)) { np_notice(np, "timeout waiting for carrier\n"); break; } msleep(1); } rtnl_lock(); } if (!np->local_ip.ip) { if (!np->ipv6) { const struct in_ifaddr *ifa; in_dev = __in_dev_get_rtnl(ndev); if (!in_dev) goto put_noaddr; ifa = rtnl_dereference(in_dev->ifa_list); if (!ifa) { put_noaddr: np_err(np, "no IP address for %s, aborting\n", np->dev_name); err = -EDESTADDRREQ; goto put; } np->local_ip.ip = ifa->ifa_local; np_info(np, "local IP %pI4\n", &np->local_ip.ip); } else { #if IS_ENABLED(CONFIG_IPV6) struct inet6_dev *idev; err = -EDESTADDRREQ; idev = __in6_dev_get(ndev); if (idev) { struct inet6_ifaddr *ifp; read_lock_bh(&idev->lock); list_for_each_entry(ifp, &idev->addr_list, if_list) { if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) != !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL)) continue; np->local_ip.in6 = ifp->addr; err = 0; break; } read_unlock_bh(&idev->lock); } if (err) { np_err(np, "no IPv6 address for %s, aborting\n", np->dev_name); goto put; } else np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6); #else np_err(np, "IPv6 is not supported %s, aborting\n", np->dev_name); err = -EINVAL; goto put; #endif } } /* fill up the skb queue */ refill_skbs(); err = __netpoll_setup(np, ndev); if (err) goto put; rtnl_unlock(); return 0; put: netdev_put(ndev, &np->dev_tracker); unlock: rtnl_unlock(); return err; } EXPORT_SYMBOL(netpoll_setup); static int __init netpoll_init(void) { skb_queue_head_init(&skb_pool); return 0; } core_initcall(netpoll_init); static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) { struct netpoll_info *npinfo = container_of(rcu_head, struct netpoll_info, rcu); skb_queue_purge(&npinfo->txq); /* we can't call cancel_delayed_work_sync here, as we are in softirq */ cancel_delayed_work(&npinfo->tx_work); /* clean after last, unfinished work */ __skb_queue_purge(&npinfo->txq); /* now cancel it again */ cancel_delayed_work(&npinfo->tx_work); kfree(npinfo); } void __netpoll_cleanup(struct netpoll *np) { struct netpoll_info *npinfo; npinfo = rtnl_dereference(np->dev->npinfo); if (!npinfo) return; synchronize_srcu(&netpoll_srcu); if (refcount_dec_and_test(&npinfo->refcnt)) { const struct net_device_ops *ops; ops = np->dev->netdev_ops; if (ops->ndo_netpoll_cleanup) ops->ndo_netpoll_cleanup(np->dev); RCU_INIT_POINTER(np->dev->npinfo, NULL); call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info); } else RCU_INIT_POINTER(np->dev->npinfo, NULL); } EXPORT_SYMBOL_GPL(__netpoll_cleanup); void __netpoll_free(struct netpoll *np) { ASSERT_RTNL(); /* Wait for transmitting packets to finish before freeing. */ synchronize_rcu(); __netpoll_cleanup(np); kfree(np); } EXPORT_SYMBOL_GPL(__netpoll_free); void netpoll_cleanup(struct netpoll *np) { rtnl_lock(); if (!np->dev) goto out; __netpoll_cleanup(np); netdev_put(np->dev, &np->dev_tracker); np->dev = NULL; out: rtnl_unlock(); } EXPORT_SYMBOL(netpoll_cleanup); |
| 45 45 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 | #ifndef BLK_THROTTLE_H #define BLK_THROTTLE_H #include "blk-cgroup-rwstat.h" /* * To implement hierarchical throttling, throtl_grps form a tree and bios * are dispatched upwards level by level until they reach the top and get * issued. When dispatching bios from the children and local group at each * level, if the bios are dispatched into a single bio_list, there's a risk * of a local or child group which can queue many bios at once filling up * the list starving others. * * To avoid such starvation, dispatched bios are queued separately * according to where they came from. When they are again dispatched to * the parent, they're popped in round-robin order so that no single source * hogs the dispatch window. * * throtl_qnode is used to keep the queued bios separated by their sources. * Bios are queued to throtl_qnode which in turn is queued to * throtl_service_queue and then dispatched in round-robin order. * * It's also used to track the reference counts on blkg's. A qnode always * belongs to a throtl_grp and gets queued on itself or the parent, so * incrementing the reference of the associated throtl_grp when a qnode is * queued and decrementing when dequeued is enough to keep the whole blkg * tree pinned while bios are in flight. */ struct throtl_qnode { struct list_head node; /* service_queue->queued[] */ struct bio_list bios; /* queued bios */ struct throtl_grp *tg; /* tg this qnode belongs to */ }; struct throtl_service_queue { struct throtl_service_queue *parent_sq; /* the parent service_queue */ /* * Bios queued directly to this service_queue or dispatched from * children throtl_grp's. */ struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */ unsigned int nr_queued[2]; /* number of queued bios */ /* * RB tree of active children throtl_grp's, which are sorted by * their ->disptime. */ struct rb_root_cached pending_tree; /* RB tree of active tgs */ unsigned int nr_pending; /* # queued in the tree */ unsigned long first_pending_disptime; /* disptime of the first tg */ struct timer_list pending_timer; /* fires on first_pending_disptime */ }; enum tg_state_flags { THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */ }; struct throtl_grp { /* must be the first member */ struct blkg_policy_data pd; /* active throtl group service_queue member */ struct rb_node rb_node; /* throtl_data this group belongs to */ struct throtl_data *td; /* this group's service queue */ struct throtl_service_queue service_queue; /* * qnode_on_self is used when bios are directly queued to this * throtl_grp so that local bios compete fairly with bios * dispatched from children. qnode_on_parent is used when bios are * dispatched from this throtl_grp into its parent and will compete * with the sibling qnode_on_parents and the parent's * qnode_on_self. */ struct throtl_qnode qnode_on_self[2]; struct throtl_qnode qnode_on_parent[2]; /* * Dispatch time in jiffies. This is the estimated time when group * will unthrottle and is ready to dispatch more bio. It is used as * key to sort active groups in service tree. */ unsigned long disptime; unsigned int flags; /* are there any throtl rules between this group and td? */ bool has_rules_bps[2]; bool has_rules_iops[2]; /* bytes per second rate limits */ uint64_t bps[2]; /* IOPS limits */ unsigned int iops[2]; /* Number of bytes dispatched in current slice */ uint64_t bytes_disp[2]; /* Number of bio's dispatched in current slice */ unsigned int io_disp[2]; unsigned long last_low_overflow_time[2]; uint64_t last_bytes_disp[2]; unsigned int last_io_disp[2]; /* * The following two fields are updated when new configuration is * submitted while some bios are still throttled, they record how many * bytes/ios are waited already in previous configuration, and they will * be used to calculate wait time under new configuration. */ long long carryover_bytes[2]; int carryover_ios[2]; unsigned long last_check_time; /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; struct blkg_rwstat stat_bytes; struct blkg_rwstat stat_ios; }; extern struct blkcg_policy blkcg_policy_throtl; static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) { return pd ? container_of(pd, struct throtl_grp, pd) : NULL; } static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) { return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl)); } /* * Internal throttling interface */ #ifndef CONFIG_BLK_DEV_THROTTLING static inline void blk_throtl_exit(struct gendisk *disk) { } static inline bool blk_throtl_bio(struct bio *bio) { return false; } static inline void blk_throtl_cancel_bios(struct gendisk *disk) { } #else /* CONFIG_BLK_DEV_THROTTLING */ void blk_throtl_exit(struct gendisk *disk); bool __blk_throtl_bio(struct bio *bio); void blk_throtl_cancel_bios(struct gendisk *disk); static inline bool blk_throtl_activated(struct request_queue *q) { return q->td != NULL; } static inline bool blk_should_throtl(struct bio *bio) { struct throtl_grp *tg; int rw = bio_data_dir(bio); /* * This is called under bio_queue_enter(), and it's synchronized with * the activation of blk-throtl, which is protected by * blk_mq_freeze_queue(). */ if (!blk_throtl_activated(bio->bi_bdev->bd_queue)) return false; tg = blkg_to_tg(bio->bi_blkg); if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { bio_set_flag(bio, BIO_CGROUP_ACCT); blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, bio->bi_iter.bi_size); } blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); } /* iops limit is always counted */ if (tg->has_rules_iops[rw]) return true; if (tg->has_rules_bps[rw] && !bio_flagged(bio, BIO_BPS_THROTTLED)) return true; return false; } static inline bool blk_throtl_bio(struct bio *bio) { if (!blk_should_throtl(bio)) return false; return __blk_throtl_bio(bio); } #endif /* CONFIG_BLK_DEV_THROTTLING */ #endif |
| 1 1 1 1 1 1 7 7 7 7 7 7 7 7 7 7 7 1 1 1 1 1 1 1 1 1 1 8 8 8 8 8 20 20 20 20 20 20 20 10 20 20 20 20 20 20 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 | /* * Mu-Law conversion Plug-In Interface * Copyright (c) 1999 by Jaroslav Kysela <perex@perex.cz> * Uros Bizjak <uros@kss-loka.si> * * Based on reference implementation by Sun Microsystems, Inc. * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Library General Public License as * published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ #include <linux/time.h> #include <sound/core.h> #include <sound/pcm.h> #include "pcm_plugin.h" #define SIGN_BIT (0x80) /* Sign bit for a u-law byte. */ #define QUANT_MASK (0xf) /* Quantization field mask. */ #define NSEGS (8) /* Number of u-law segments. */ #define SEG_SHIFT (4) /* Left shift for segment number. */ #define SEG_MASK (0x70) /* Segment field mask. */ static inline int val_seg(int val) { int r = 0; val >>= 7; if (val & 0xf0) { val >>= 4; r += 4; } if (val & 0x0c) { val >>= 2; r += 2; } if (val & 0x02) r += 1; return r; } #define BIAS (0x84) /* Bias for linear code. */ /* * linear2ulaw() - Convert a linear PCM value to u-law * * In order to simplify the encoding process, the original linear magnitude * is biased by adding 33 which shifts the encoding range from (0 - 8158) to * (33 - 8191). The result can be seen in the following encoding table: * * Biased Linear Input Code Compressed Code * ------------------------ --------------- * 00000001wxyza 000wxyz * 0000001wxyzab 001wxyz * 000001wxyzabc 010wxyz * 00001wxyzabcd 011wxyz * 0001wxyzabcde 100wxyz * 001wxyzabcdef 101wxyz * 01wxyzabcdefg 110wxyz * 1wxyzabcdefgh 111wxyz * * Each biased linear code has a leading 1 which identifies the segment * number. The value of the segment number is equal to 7 minus the number * of leading 0's. The quantization interval is directly available as the * four bits wxyz. * The trailing bits (a - h) are ignored. * * Ordinarily the complement of the resulting code word is used for * transmission, and so the code word is complemented before it is returned. * * For further information see John C. Bellamy's Digital Telephony, 1982, * John Wiley & Sons, pps 98-111 and 472-476. */ static unsigned char linear2ulaw(int pcm_val) /* 2's complement (16-bit range) */ { int mask; int seg; unsigned char uval; /* Get the sign and the magnitude of the value. */ if (pcm_val < 0) { pcm_val = BIAS - pcm_val; mask = 0x7F; } else { pcm_val += BIAS; mask = 0xFF; } if (pcm_val > 0x7FFF) pcm_val = 0x7FFF; /* Convert the scaled magnitude to segment number. */ seg = val_seg(pcm_val); /* * Combine the sign, segment, quantization bits; * and complement the code word. */ uval = (seg << 4) | ((pcm_val >> (seg + 3)) & 0xF); return uval ^ mask; } /* * ulaw2linear() - Convert a u-law value to 16-bit linear PCM * * First, a biased linear code is derived from the code word. An unbiased * output can then be obtained by subtracting 33 from the biased code. * * Note that this function expects to be passed the complement of the * original code word. This is in keeping with ISDN conventions. */ static int ulaw2linear(unsigned char u_val) { int t; /* Complement to obtain normal u-law value. */ u_val = ~u_val; /* * Extract and bias the quantization bits. Then * shift up by the segment number and subtract out the bias. */ t = ((u_val & QUANT_MASK) << 3) + BIAS; t <<= ((unsigned)u_val & SEG_MASK) >> SEG_SHIFT; return ((u_val & SIGN_BIT) ? (BIAS - t) : (t - BIAS)); } /* * Basic Mu-Law plugin */ typedef void (*mulaw_f)(struct snd_pcm_plugin *plugin, const struct snd_pcm_plugin_channel *src_channels, struct snd_pcm_plugin_channel *dst_channels, snd_pcm_uframes_t frames); struct mulaw_priv { mulaw_f func; int cvt_endian; /* need endian conversion? */ unsigned int native_ofs; /* byte offset in native format */ unsigned int copy_ofs; /* byte offset in s16 format */ unsigned int native_bytes; /* byte size of the native format */ unsigned int copy_bytes; /* bytes to copy per conversion */ u16 flip; /* MSB flip for signedness, done after endian conversion */ }; static inline void cvt_s16_to_native(struct mulaw_priv *data, unsigned char *dst, u16 sample) { sample ^= data->flip; if (data->cvt_endian) sample = swab16(sample); if (data->native_bytes > data->copy_bytes) memset(dst, 0, data->native_bytes); memcpy(dst + data->native_ofs, (char *)&sample + data->copy_ofs, data->copy_bytes); } static void mulaw_decode(struct snd_pcm_plugin *plugin, const struct snd_pcm_plugin_channel *src_channels, struct snd_pcm_plugin_channel *dst_channels, snd_pcm_uframes_t frames) { struct mulaw_priv *data = (struct mulaw_priv *)plugin->extra_data; int channel; int nchannels = plugin->src_format.channels; for (channel = 0; channel < nchannels; ++channel) { char *src; char *dst; int src_step, dst_step; snd_pcm_uframes_t frames1; if (!src_channels[channel].enabled) { if (dst_channels[channel].wanted) snd_pcm_area_silence(&dst_channels[channel].area, 0, frames, plugin->dst_format.format); dst_channels[channel].enabled = 0; continue; } dst_channels[channel].enabled = 1; src = src_channels[channel].area.addr + src_channels[channel].area.first / 8; dst = dst_channels[channel].area.addr + dst_channels[channel].area.first / 8; src_step = src_channels[channel].area.step / 8; dst_step = dst_channels[channel].area.step / 8; frames1 = frames; while (frames1-- > 0) { signed short sample = ulaw2linear(*src); cvt_s16_to_native(data, dst, sample); src += src_step; dst += dst_step; } } } static inline signed short cvt_native_to_s16(struct mulaw_priv *data, unsigned char *src) { u16 sample = 0; memcpy((char *)&sample + data->copy_ofs, src + data->native_ofs, data->copy_bytes); if (data->cvt_endian) sample = swab16(sample); sample ^= data->flip; return (signed short)sample; } static void mulaw_encode(struct snd_pcm_plugin *plugin, const struct snd_pcm_plugin_channel *src_channels, struct snd_pcm_plugin_channel *dst_channels, snd_pcm_uframes_t frames) { struct mulaw_priv *data = (struct mulaw_priv *)plugin->extra_data; int channel; int nchannels = plugin->src_format.channels; for (channel = 0; channel < nchannels; ++channel) { char *src; char *dst; int src_step, dst_step; snd_pcm_uframes_t frames1; if (!src_channels[channel].enabled) { if (dst_channels[channel].wanted) snd_pcm_area_silence(&dst_channels[channel].area, 0, frames, plugin->dst_format.format); dst_channels[channel].enabled = 0; continue; } dst_channels[channel].enabled = 1; src = src_channels[channel].area.addr + src_channels[channel].area.first / 8; dst = dst_channels[channel].area.addr + dst_channels[channel].area.first / 8; src_step = src_channels[channel].area.step / 8; dst_step = dst_channels[channel].area.step / 8; frames1 = frames; while (frames1-- > 0) { signed short sample = cvt_native_to_s16(data, src); *dst = linear2ulaw(sample); src += src_step; dst += dst_step; } } } static snd_pcm_sframes_t mulaw_transfer(struct snd_pcm_plugin *plugin, const struct snd_pcm_plugin_channel *src_channels, struct snd_pcm_plugin_channel *dst_channels, snd_pcm_uframes_t frames) { struct mulaw_priv *data; if (snd_BUG_ON(!plugin || !src_channels || !dst_channels)) return -ENXIO; if (frames == 0) return 0; #ifdef CONFIG_SND_DEBUG { unsigned int channel; for (channel = 0; channel < plugin->src_format.channels; channel++) { if (snd_BUG_ON(src_channels[channel].area.first % 8 || src_channels[channel].area.step % 8)) return -ENXIO; if (snd_BUG_ON(dst_channels[channel].area.first % 8 || dst_channels[channel].area.step % 8)) return -ENXIO; } } #endif if (frames > dst_channels[0].frames) frames = dst_channels[0].frames; data = (struct mulaw_priv *)plugin->extra_data; data->func(plugin, src_channels, dst_channels, frames); return frames; } static void init_data(struct mulaw_priv *data, snd_pcm_format_t format) { #ifdef SNDRV_LITTLE_ENDIAN data->cvt_endian = snd_pcm_format_big_endian(format) > 0; #else data->cvt_endian = snd_pcm_format_little_endian(format) > 0; #endif if (!snd_pcm_format_signed(format)) data->flip = 0x8000; data->native_bytes = snd_pcm_format_physical_width(format) / 8; data->copy_bytes = data->native_bytes < 2 ? 1 : 2; if (snd_pcm_format_little_endian(format)) { data->native_ofs = data->native_bytes - data->copy_bytes; data->copy_ofs = 2 - data->copy_bytes; } else { /* S24 in 4bytes need an 1 byte offset */ data->native_ofs = data->native_bytes - snd_pcm_format_width(format) / 8; } } int snd_pcm_plugin_build_mulaw(struct snd_pcm_substream *plug, struct snd_pcm_plugin_format *src_format, struct snd_pcm_plugin_format *dst_format, struct snd_pcm_plugin **r_plugin) { int err; struct mulaw_priv *data; struct snd_pcm_plugin *plugin; struct snd_pcm_plugin_format *format; mulaw_f func; if (snd_BUG_ON(!r_plugin)) return -ENXIO; *r_plugin = NULL; if (snd_BUG_ON(src_format->rate != dst_format->rate)) return -ENXIO; if (snd_BUG_ON(src_format->channels != dst_format->channels)) return -ENXIO; if (dst_format->format == SNDRV_PCM_FORMAT_MU_LAW) { format = src_format; func = mulaw_encode; } else if (src_format->format == SNDRV_PCM_FORMAT_MU_LAW) { format = dst_format; func = mulaw_decode; } else { snd_BUG(); return -EINVAL; } if (!snd_pcm_format_linear(format->format)) return -EINVAL; err = snd_pcm_plugin_build(plug, "Mu-Law<->linear conversion", src_format, dst_format, sizeof(struct mulaw_priv), &plugin); if (err < 0) return err; data = (struct mulaw_priv *)plugin->extra_data; data->func = func; init_data(data, format->format); plugin->transfer = mulaw_transfer; *r_plugin = plugin; return 0; } |
| 1 42 42 36 42 42 42 41 16 42 42 42 42 42 42 37 42 1 1 1 1 1 1 1 1 1 1 45 16 42 42 41 42 42 16 16 16 16 16 16 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 | // SPDX-License-Identifier: GPL-2.0-only /* * Power Management Quality of Service (PM QoS) support base. * * Copyright (C) 2020 Intel Corporation * * Authors: * Mark Gross <mgross@linux.intel.com> * Rafael J. Wysocki <rafael.j.wysocki@intel.com> * * Provided here is an interface for specifying PM QoS dependencies. It allows * entities depending on QoS constraints to register their requests which are * aggregated as appropriate to produce effective constraints (target values) * that can be monitored by entities needing to respect them, either by polling * or through a built-in notification mechanism. * * In addition to the basic functionality, more specific interfaces for managing * global CPU latency QoS requests and frequency QoS requests are provided. */ /*#define DEBUG*/ #include <linux/pm_qos.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/fs.h> #include <linux/device.h> #include <linux/miscdevice.h> #include <linux/string.h> #include <linux/platform_device.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/uaccess.h> #include <linux/export.h> #include <trace/events/power.h> /* * locking rule: all changes to constraints or notifiers lists * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ static DEFINE_SPINLOCK(pm_qos_lock); /** * pm_qos_read_value - Return the current effective constraint value. * @c: List of PM QoS constraint requests. */ s32 pm_qos_read_value(struct pm_qos_constraints *c) { return READ_ONCE(c->target_value); } static int pm_qos_get_value(struct pm_qos_constraints *c) { if (plist_head_empty(&c->list)) return c->no_constraint_value; switch (c->type) { case PM_QOS_MIN: return plist_first(&c->list)->prio; case PM_QOS_MAX: return plist_last(&c->list)->prio; default: WARN(1, "Unknown PM QoS type in %s\n", __func__); return PM_QOS_DEFAULT_VALUE; } } static void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) { WRITE_ONCE(c->target_value, value); } /** * pm_qos_update_target - Update a list of PM QoS constraint requests. * @c: List of PM QoS requests. * @node: Target list entry. * @action: Action to carry out (add, update or remove). * @value: New request value for the target list entry. * * Update the given list of PM QoS constraint requests, @c, by carrying an * @action involving the @node list entry and @value on it. * * The recognized values of @action are PM_QOS_ADD_REQ (store @value in @node * and add it to the list), PM_QOS_UPDATE_REQ (remove @node from the list, store * @value in it and add it to the list again), and PM_QOS_REMOVE_REQ (remove * @node from the list, ignore @value). * * Return: 1 if the aggregate constraint value has changed, 0 otherwise. */ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, enum pm_qos_req_action action, int value) { int prev_value, curr_value, new_value; unsigned long flags; spin_lock_irqsave(&pm_qos_lock, flags); prev_value = pm_qos_get_value(c); if (value == PM_QOS_DEFAULT_VALUE) new_value = c->default_value; else new_value = value; switch (action) { case PM_QOS_REMOVE_REQ: plist_del(node, &c->list); break; case PM_QOS_UPDATE_REQ: /* * To change the list, atomically remove, reinit with new value * and add, then see if the aggregate has changed. */ plist_del(node, &c->list); fallthrough; case PM_QOS_ADD_REQ: plist_node_init(node, new_value); plist_add(node, &c->list); break; default: /* no action */ ; } curr_value = pm_qos_get_value(c); pm_qos_set_value(c, curr_value); spin_unlock_irqrestore(&pm_qos_lock, flags); trace_pm_qos_update_target(action, prev_value, curr_value); if (prev_value == curr_value) return 0; if (c->notifiers) blocking_notifier_call_chain(c->notifiers, curr_value, NULL); return 1; } /** * pm_qos_flags_remove_req - Remove device PM QoS flags request. * @pqf: Device PM QoS flags set to remove the request from. * @req: Request to remove from the set. */ static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf, struct pm_qos_flags_request *req) { s32 val = 0; list_del(&req->node); list_for_each_entry(req, &pqf->list, node) val |= req->flags; pqf->effective_flags = val; } /** * pm_qos_update_flags - Update a set of PM QoS flags. * @pqf: Set of PM QoS flags to update. * @req: Request to add to the set, to modify, or to remove from the set. * @action: Action to take on the set. * @val: Value of the request to add or modify. * * Return: 1 if the aggregate constraint value has changed, 0 otherwise. */ bool pm_qos_update_flags(struct pm_qos_flags *pqf, struct pm_qos_flags_request *req, enum pm_qos_req_action action, s32 val) { unsigned long irqflags; s32 prev_value, curr_value; spin_lock_irqsave(&pm_qos_lock, irqflags); prev_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; switch (action) { case PM_QOS_REMOVE_REQ: pm_qos_flags_remove_req(pqf, req); break; case PM_QOS_UPDATE_REQ: pm_qos_flags_remove_req(pqf, req); fallthrough; case PM_QOS_ADD_REQ: req->flags = val; INIT_LIST_HEAD(&req->node); list_add_tail(&req->node, &pqf->list); pqf->effective_flags |= val; break; default: /* no action */ ; } curr_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; spin_unlock_irqrestore(&pm_qos_lock, irqflags); trace_pm_qos_update_flags(action, prev_value, curr_value); return prev_value != curr_value; } #ifdef CONFIG_CPU_IDLE /* Definitions related to the CPU latency QoS. */ static struct pm_qos_constraints cpu_latency_constraints = { .list = PLIST_HEAD_INIT(cpu_latency_constraints.list), .target_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, .default_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, .no_constraint_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, .type = PM_QOS_MIN, }; static inline bool cpu_latency_qos_value_invalid(s32 value) { return value < 0 && value != PM_QOS_DEFAULT_VALUE; } /** * cpu_latency_qos_limit - Return current system-wide CPU latency QoS limit. */ s32 cpu_latency_qos_limit(void) { return pm_qos_read_value(&cpu_latency_constraints); } /** * cpu_latency_qos_request_active - Check the given PM QoS request. * @req: PM QoS request to check. * * Return: 'true' if @req has been added to the CPU latency QoS list, 'false' * otherwise. */ bool cpu_latency_qos_request_active(struct pm_qos_request *req) { return req->qos == &cpu_latency_constraints; } EXPORT_SYMBOL_GPL(cpu_latency_qos_request_active); static void cpu_latency_qos_apply(struct pm_qos_request *req, enum pm_qos_req_action action, s32 value) { int ret = pm_qos_update_target(req->qos, &req->node, action, value); if (ret > 0) wake_up_all_idle_cpus(); } /** * cpu_latency_qos_add_request - Add new CPU latency QoS request. * @req: Pointer to a preallocated handle. * @value: Requested constraint value. * * Use @value to initialize the request handle pointed to by @req, insert it as * a new entry to the CPU latency QoS list and recompute the effective QoS * constraint for that list. * * Callers need to save the handle for later use in updates and removal of the * QoS request represented by it. */ void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value) { if (!req || cpu_latency_qos_value_invalid(value)) return; if (cpu_latency_qos_request_active(req)) { WARN(1, KERN_ERR "%s called for already added request\n", __func__); return; } trace_pm_qos_add_request(value); req->qos = &cpu_latency_constraints; cpu_latency_qos_apply(req, PM_QOS_ADD_REQ, value); } EXPORT_SYMBOL_GPL(cpu_latency_qos_add_request); /** * cpu_latency_qos_update_request - Modify existing CPU latency QoS request. * @req : QoS request to update. * @new_value: New requested constraint value. * * Use @new_value to update the QoS request represented by @req in the CPU * latency QoS list along with updating the effective constraint value for that * list. */ void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value) { if (!req || cpu_latency_qos_value_invalid(new_value)) return; if (!cpu_latency_qos_request_active(req)) { WARN(1, KERN_ERR "%s called for unknown object\n", __func__); return; } trace_pm_qos_update_request(new_value); if (new_value == req->node.prio) return; cpu_latency_qos_apply(req, PM_QOS_UPDATE_REQ, new_value); } EXPORT_SYMBOL_GPL(cpu_latency_qos_update_request); /** * cpu_latency_qos_remove_request - Remove existing CPU latency QoS request. * @req: QoS request to remove. * * Remove the CPU latency QoS request represented by @req from the CPU latency * QoS list along with updating the effective constraint value for that list. */ void cpu_latency_qos_remove_request(struct pm_qos_request *req) { if (!req) return; if (!cpu_latency_qos_request_active(req)) { WARN(1, KERN_ERR "%s called for unknown object\n", __func__); return; } trace_pm_qos_remove_request(PM_QOS_DEFAULT_VALUE); cpu_latency_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } EXPORT_SYMBOL_GPL(cpu_latency_qos_remove_request); /* User space interface to the CPU latency QoS via misc device. */ static int cpu_latency_qos_open(struct inode *inode, struct file *filp) { struct pm_qos_request *req; req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; cpu_latency_qos_add_request(req, PM_QOS_DEFAULT_VALUE); filp->private_data = req; return 0; } static int cpu_latency_qos_release(struct inode *inode, struct file *filp) { struct pm_qos_request *req = filp->private_data; filp->private_data = NULL; cpu_latency_qos_remove_request(req); kfree(req); return 0; } static ssize_t cpu_latency_qos_read(struct file *filp, char __user *buf, size_t count, loff_t *f_pos) { struct pm_qos_request *req = filp->private_data; unsigned long flags; s32 value; if (!req || !cpu_latency_qos_request_active(req)) return -EINVAL; spin_lock_irqsave(&pm_qos_lock, flags); value = pm_qos_get_value(&cpu_latency_constraints); spin_unlock_irqrestore(&pm_qos_lock, flags); return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); } static ssize_t cpu_latency_qos_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos) { s32 value; if (count == sizeof(s32)) { if (copy_from_user(&value, buf, sizeof(s32))) return -EFAULT; } else { int ret; ret = kstrtos32_from_user(buf, count, 16, &value); if (ret) return ret; } cpu_latency_qos_update_request(filp->private_data, value); return count; } static const struct file_operations cpu_latency_qos_fops = { .write = cpu_latency_qos_write, .read = cpu_latency_qos_read, .open = cpu_latency_qos_open, .release = cpu_latency_qos_release, .llseek = noop_llseek, }; static struct miscdevice cpu_latency_qos_miscdev = { .minor = MISC_DYNAMIC_MINOR, .name = "cpu_dma_latency", .fops = &cpu_latency_qos_fops, }; static int __init cpu_latency_qos_init(void) { int ret; ret = misc_register(&cpu_latency_qos_miscdev); if (ret < 0) pr_err("%s: %s setup failed\n", __func__, cpu_latency_qos_miscdev.name); return ret; } late_initcall(cpu_latency_qos_init); #endif /* CONFIG_CPU_IDLE */ /* Definitions related to the frequency QoS below. */ static inline bool freq_qos_value_invalid(s32 value) { return value < 0 && value != PM_QOS_DEFAULT_VALUE; } /** * freq_constraints_init - Initialize frequency QoS constraints. * @qos: Frequency QoS constraints to initialize. */ void freq_constraints_init(struct freq_constraints *qos) { struct pm_qos_constraints *c; c = &qos->min_freq; plist_head_init(&c->list); c->target_value = FREQ_QOS_MIN_DEFAULT_VALUE; c->default_value = FREQ_QOS_MIN_DEFAULT_VALUE; c->no_constraint_value = FREQ_QOS_MIN_DEFAULT_VALUE; c->type = PM_QOS_MAX; c->notifiers = &qos->min_freq_notifiers; BLOCKING_INIT_NOTIFIER_HEAD(c->notifiers); c = &qos->max_freq; plist_head_init(&c->list); c->target_value = FREQ_QOS_MAX_DEFAULT_VALUE; c->default_value = FREQ_QOS_MAX_DEFAULT_VALUE; c->no_constraint_value = FREQ_QOS_MAX_DEFAULT_VALUE; c->type = PM_QOS_MIN; c->notifiers = &qos->max_freq_notifiers; BLOCKING_INIT_NOTIFIER_HEAD(c->notifiers); } /** * freq_qos_read_value - Get frequency QoS constraint for a given list. * @qos: Constraints to evaluate. * @type: QoS request type. */ s32 freq_qos_read_value(struct freq_constraints *qos, enum freq_qos_req_type type) { s32 ret; switch (type) { case FREQ_QOS_MIN: ret = IS_ERR_OR_NULL(qos) ? FREQ_QOS_MIN_DEFAULT_VALUE : pm_qos_read_value(&qos->min_freq); break; case FREQ_QOS_MAX: ret = IS_ERR_OR_NULL(qos) ? FREQ_QOS_MAX_DEFAULT_VALUE : pm_qos_read_value(&qos->max_freq); break; default: WARN_ON(1); ret = 0; } return ret; } /** * freq_qos_apply - Add/modify/remove frequency QoS request. * @req: Constraint request to apply. * @action: Action to perform (add/update/remove). * @value: Value to assign to the QoS request. * * This is only meant to be called from inside pm_qos, not drivers. */ int freq_qos_apply(struct freq_qos_request *req, enum pm_qos_req_action action, s32 value) { int ret; switch(req->type) { case FREQ_QOS_MIN: ret = pm_qos_update_target(&req->qos->min_freq, &req->pnode, action, value); break; case FREQ_QOS_MAX: ret = pm_qos_update_target(&req->qos->max_freq, &req->pnode, action, value); break; default: ret = -EINVAL; } return ret; } /** * freq_qos_add_request - Insert new frequency QoS request into a given list. * @qos: Constraints to update. * @req: Preallocated request object. * @type: Request type. * @value: Request value. * * Insert a new entry into the @qos list of requests, recompute the effective * QoS constraint value for that list and initialize the @req object. The * caller needs to save that object for later use in updates and removal. * * Return 1 if the effective constraint value has changed, 0 if the effective * constraint value has not changed, or a negative error code on failures. */ int freq_qos_add_request(struct freq_constraints *qos, struct freq_qos_request *req, enum freq_qos_req_type type, s32 value) { int ret; if (IS_ERR_OR_NULL(qos) || !req || freq_qos_value_invalid(value)) return -EINVAL; if (WARN(freq_qos_request_active(req), "%s() called for active request\n", __func__)) return -EINVAL; req->qos = qos; req->type = type; ret = freq_qos_apply(req, PM_QOS_ADD_REQ, value); if (ret < 0) { req->qos = NULL; req->type = 0; } return ret; } EXPORT_SYMBOL_GPL(freq_qos_add_request); /** * freq_qos_update_request - Modify existing frequency QoS request. * @req: Request to modify. * @new_value: New request value. * * Update an existing frequency QoS request along with the effective constraint * value for the list of requests it belongs to. * * Return 1 if the effective constraint value has changed, 0 if the effective * constraint value has not changed, or a negative error code on failures. */ int freq_qos_update_request(struct freq_qos_request *req, s32 new_value) { if (!req || freq_qos_value_invalid(new_value)) return -EINVAL; if (WARN(!freq_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (req->pnode.prio == new_value) return 0; return freq_qos_apply(req, PM_QOS_UPDATE_REQ, new_value); } EXPORT_SYMBOL_GPL(freq_qos_update_request); /** * freq_qos_remove_request - Remove frequency QoS request from its list. * @req: Request to remove. * * Remove the given frequency QoS request from the list of constraints it * belongs to and recompute the effective constraint value for that list. * * Return 1 if the effective constraint value has changed, 0 if the effective * constraint value has not changed, or a negative error code on failures. */ int freq_qos_remove_request(struct freq_qos_request *req) { int ret; if (!req) return -EINVAL; if (WARN(!freq_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; ret = freq_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); req->qos = NULL; req->type = 0; return ret; } EXPORT_SYMBOL_GPL(freq_qos_remove_request); /** * freq_qos_add_notifier - Add frequency QoS change notifier. * @qos: List of requests to add the notifier to. * @type: Request type. * @notifier: Notifier block to add. */ int freq_qos_add_notifier(struct freq_constraints *qos, enum freq_qos_req_type type, struct notifier_block *notifier) { int ret; if (IS_ERR_OR_NULL(qos) || !notifier) return -EINVAL; switch (type) { case FREQ_QOS_MIN: ret = blocking_notifier_chain_register(qos->min_freq.notifiers, notifier); break; case FREQ_QOS_MAX: ret = blocking_notifier_chain_register(qos->max_freq.notifiers, notifier); break; default: WARN_ON(1); ret = -EINVAL; } return ret; } EXPORT_SYMBOL_GPL(freq_qos_add_notifier); /** * freq_qos_remove_notifier - Remove frequency QoS change notifier. * @qos: List of requests to remove the notifier from. * @type: Request type. * @notifier: Notifier block to remove. */ int freq_qos_remove_notifier(struct freq_constraints *qos, enum freq_qos_req_type type, struct notifier_block *notifier) { int ret; if (IS_ERR_OR_NULL(qos) || !notifier) return -EINVAL; switch (type) { case FREQ_QOS_MIN: ret = blocking_notifier_chain_unregister(qos->min_freq.notifiers, notifier); break; case FREQ_QOS_MAX: ret = blocking_notifier_chain_unregister(qos->max_freq.notifiers, notifier); break; default: WARN_ON(1); ret = -EINVAL; } return ret; } EXPORT_SYMBOL_GPL(freq_qos_remove_notifier); |
| 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Public Key Encryption * * Copyright (c) 2015, Intel Corporation * Authors: Tadeusz Struk <tadeusz.struk@intel.com> */ #include <crypto/internal/akcipher.h> #include <linux/cryptouser.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/scatterlist.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/string.h> #include <net/netlink.h> #include "internal.h" #define CRYPTO_ALG_TYPE_AHASH_MASK 0x0000000e static int __maybe_unused crypto_akcipher_report( struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_akcipher rakcipher; memset(&rakcipher, 0, sizeof(rakcipher)); strscpy(rakcipher.type, "akcipher", sizeof(rakcipher.type)); return nla_put(skb, CRYPTOCFGA_REPORT_AKCIPHER, sizeof(rakcipher), &rakcipher); } static void crypto_akcipher_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_akcipher_show(struct seq_file *m, struct crypto_alg *alg) { seq_puts(m, "type : akcipher\n"); } static void crypto_akcipher_exit_tfm(struct crypto_tfm *tfm) { struct crypto_akcipher *akcipher = __crypto_akcipher_tfm(tfm); struct akcipher_alg *alg = crypto_akcipher_alg(akcipher); alg->exit(akcipher); } static int crypto_akcipher_init_tfm(struct crypto_tfm *tfm) { struct crypto_akcipher *akcipher = __crypto_akcipher_tfm(tfm); struct akcipher_alg *alg = crypto_akcipher_alg(akcipher); if (alg->exit) akcipher->base.exit = crypto_akcipher_exit_tfm; if (alg->init) return alg->init(akcipher); return 0; } static void crypto_akcipher_free_instance(struct crypto_instance *inst) { struct akcipher_instance *akcipher = akcipher_instance(inst); akcipher->free(akcipher); } static const struct crypto_type crypto_akcipher_type = { .extsize = crypto_alg_extsize, .init_tfm = crypto_akcipher_init_tfm, .free = crypto_akcipher_free_instance, #ifdef CONFIG_PROC_FS .show = crypto_akcipher_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_akcipher_report, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_AHASH_MASK, .type = CRYPTO_ALG_TYPE_AKCIPHER, .tfmsize = offsetof(struct crypto_akcipher, base), }; int crypto_grab_akcipher(struct crypto_akcipher_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { spawn->base.frontend = &crypto_akcipher_type; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } EXPORT_SYMBOL_GPL(crypto_grab_akcipher); struct crypto_akcipher *crypto_alloc_akcipher(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_akcipher_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_akcipher); static void akcipher_prepare_alg(struct akcipher_alg *alg) { struct crypto_alg *base = &alg->base; base->cra_type = &crypto_akcipher_type; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; base->cra_flags |= CRYPTO_ALG_TYPE_AKCIPHER; } static int akcipher_default_op(struct akcipher_request *req) { return -ENOSYS; } static int akcipher_default_set_key(struct crypto_akcipher *tfm, const void *key, unsigned int keylen) { return -ENOSYS; } int crypto_register_akcipher(struct akcipher_alg *alg) { struct crypto_alg *base = &alg->base; if (!alg->sign) alg->sign = akcipher_default_op; if (!alg->verify) alg->verify = akcipher_default_op; if (!alg->encrypt) alg->encrypt = akcipher_default_op; if (!alg->decrypt) alg->decrypt = akcipher_default_op; if (!alg->set_priv_key) alg->set_priv_key = akcipher_default_set_key; akcipher_prepare_alg(alg); return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_akcipher); void crypto_unregister_akcipher(struct akcipher_alg *alg) { crypto_unregister_alg(&alg->base); } EXPORT_SYMBOL_GPL(crypto_unregister_akcipher); int akcipher_register_instance(struct crypto_template *tmpl, struct akcipher_instance *inst) { if (WARN_ON(!inst->free)) return -EINVAL; akcipher_prepare_alg(&inst->alg); return crypto_register_instance(tmpl, akcipher_crypto_instance(inst)); } EXPORT_SYMBOL_GPL(akcipher_register_instance); int crypto_akcipher_sync_prep(struct crypto_akcipher_sync_data *data) { unsigned int reqsize = crypto_akcipher_reqsize(data->tfm); struct akcipher_request *req; struct scatterlist *sg; unsigned int mlen; unsigned int len; u8 *buf; if (data->dst) mlen = max(data->slen, data->dlen); else mlen = data->slen + data->dlen; len = sizeof(*req) + reqsize + mlen; if (len < mlen) return -EOVERFLOW; req = kzalloc(len, GFP_KERNEL); if (!req) return -ENOMEM; data->req = req; akcipher_request_set_tfm(req, data->tfm); buf = (u8 *)(req + 1) + reqsize; data->buf = buf; memcpy(buf, data->src, data->slen); sg = &data->sg; sg_init_one(sg, buf, mlen); akcipher_request_set_crypt(req, sg, data->dst ? sg : NULL, data->slen, data->dlen); crypto_init_wait(&data->cwait); akcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &data->cwait); return 0; } EXPORT_SYMBOL_GPL(crypto_akcipher_sync_prep); int crypto_akcipher_sync_post(struct crypto_akcipher_sync_data *data, int err) { err = crypto_wait_req(err, &data->cwait); if (data->dst) memcpy(data->dst, data->buf, data->dlen); data->dlen = data->req->dst_len; kfree_sensitive(data->req); return err; } EXPORT_SYMBOL_GPL(crypto_akcipher_sync_post); int crypto_akcipher_sync_encrypt(struct crypto_akcipher *tfm, const void *src, unsigned int slen, void *dst, unsigned int dlen) { struct crypto_akcipher_sync_data data = { .tfm = tfm, .src = src, .dst = dst, .slen = slen, .dlen = dlen, }; return crypto_akcipher_sync_prep(&data) ?: crypto_akcipher_sync_post(&data, crypto_akcipher_encrypt(data.req)); } EXPORT_SYMBOL_GPL(crypto_akcipher_sync_encrypt); int crypto_akcipher_sync_decrypt(struct crypto_akcipher *tfm, const void *src, unsigned int slen, void *dst, unsigned int dlen) { struct crypto_akcipher_sync_data data = { .tfm = tfm, .src = src, .dst = dst, .slen = slen, .dlen = dlen, }; return crypto_akcipher_sync_prep(&data) ?: crypto_akcipher_sync_post(&data, crypto_akcipher_decrypt(data.req)) ?: data.dlen; } EXPORT_SYMBOL_GPL(crypto_akcipher_sync_decrypt); static void crypto_exit_akcipher_ops_sig(struct crypto_tfm *tfm) { struct crypto_akcipher **ctx = crypto_tfm_ctx(tfm); crypto_free_akcipher(*ctx); } int crypto_init_akcipher_ops_sig(struct crypto_tfm *tfm) { struct crypto_akcipher **ctx = crypto_tfm_ctx(tfm); struct crypto_alg *calg = tfm->__crt_alg; struct crypto_akcipher *akcipher; if (!crypto_mod_get(calg)) return -EAGAIN; akcipher = crypto_create_tfm(calg, &crypto_akcipher_type); if (IS_ERR(akcipher)) { crypto_mod_put(calg); return PTR_ERR(akcipher); } *ctx = akcipher; tfm->exit = crypto_exit_akcipher_ops_sig; return 0; } EXPORT_SYMBOL_GPL(crypto_init_akcipher_ops_sig); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Generic public key cipher type"); |
| 7 7 7 3 3 1 37 36 36 2 2 1 1 36 10 1 9 35 35 8 8 6 6 8 33 33 5 4 5 1 5 2 2 3 29 6 26 29 29 29 4 26 23 26 26 26 26 22 20 22 26 1 25 25 6 37 1 1 1 1 12 2 12 12 12 12 12 12 12 12 12 12 12 10 17 17 17 10 10 10 8 3 3 5 5 1 1 1 1 1 7 2 10 2 5 5 1 4 3 3 3 3 3 3 3 3 3 3 13 5 8 7 13 13 13 13 13 13 13 8 8 8 8 8 4 8 8 8 8 8 8 8 4 8 4 4 4 4 8 4 8 8 4 4 4 8 6 6 2 2 6 4 6 4 4 4 4 4 1 3 4 4 4 4 4 4 4 4 17 2 17 10 7 7 17 1 38 38 38 4 38 8 2 8 8 4 4 8 35 34 17 17 34 34 1 1 1 1 1 4 3 3 1 3 1 3 1 3 3 1 2 4 29 29 29 35 35 35 35 35 3 34 1 33 33 29 33 33 26 26 8 1 7 2 6 6 6 6 5 6 6 35 20 8 1 20 2 20 18 18 18 18 18 7 7 7 2 3 6 22 22 11 6 34 34 34 34 34 31 8 8 28 19 4 4 1 4 4 4 4 4 2 4 4 4 2 4 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 | // SPDX-License-Identifier: GPL-2.0-or-later /* * TCP over IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on: * linux/net/ipv4/tcp.c * linux/net/ipv4/tcp_input.c * linux/net/ipv4/tcp_output.c * * Fixes: * Hideaki YOSHIFUJI : sin6_scope_id support * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind * a single port at the same time. * YOSHIFUJI Hideaki @USAGI: convert /proc/net/tcp6 to seq_file. */ #include <linux/bottom_half.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/jiffies.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/init.h> #include <linux/jhash.h> #include <linux/ipsec.h> #include <linux/times.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/random.h> #include <linux/indirect_call_wrapper.h> #include <net/tcp.h> #include <net/ndisc.h> #include <net/inet6_hashtables.h> #include <net/inet6_connection_sock.h> #include <net/ipv6.h> #include <net/transp_v6.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/ip6_checksum.h> #include <net/inet_ecn.h> #include <net/protocol.h> #include <net/xfrm.h> #include <net/snmp.h> #include <net/dsfield.h> #include <net/timewait_sock.h> #include <net/inet_common.h> #include <net/secure_seq.h> #include <net/hotdata.h> #include <net/busy_poll.h> #include <net/rstreason.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <crypto/hash.h> #include <linux/scatterlist.h> #include <trace/events/tcp.h> static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, enum sk_rst_reason reason); static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); static const struct inet_connection_sock_af_ops ipv6_mapped; const struct inet_connection_sock_af_ops ipv6_specific; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; #endif /* Helper returning the inet6 address from a given tcp socket. * It can be used in TCP stack instead of inet6_sk(sk). * This avoids a dereference and allow compiler optimizations. * It is a specialized version of inet6_sk_generic(). */ #define tcp_inet6_sk(sk) (&container_of_const(tcp_sk(sk), \ struct tcp6_sock, tcp)->inet6) static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); if (dst && dst_hold_safe(dst)) { rcu_assign_pointer(sk->sk_rx_dst, dst); sk->sk_rx_dst_ifindex = skb->skb_iif; sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); } } static u32 tcp_v6_init_seq(const struct sk_buff *skb) { return secure_tcpv6_seq(ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32, tcp_hdr(skb)->dest, tcp_hdr(skb)->source); } static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb) { return secure_tcpv6_ts_off(net, ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32); } static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { /* This check is replicated from tcp_v6_connect() and intended to * prevent BPF program called below from accessing bytes that are out * of the bound specified by user in addr_len. */ if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; sock_owned_by_me(sk); return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, &addr_len); } static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_connection_sock *icsk = inet_csk(sk); struct in6_addr *saddr = NULL, *final_p, final; struct inet_timewait_death_row *tcp_death_row; struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct ipv6_txoptions *opt; struct dst_entry *dst; struct flowi6 fl6; int addr_type; int err; if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (usin->sin6_family != AF_INET6) return -EAFNOSUPPORT; memset(&fl6, 0, sizeof(fl6)); if (inet6_test_bit(SNDFLOW, sk)) { fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; IP6_ECN_flow_init(fl6.flowlabel); if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { struct ip6_flowlabel *flowlabel; flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; fl6_sock_release(flowlabel); } } /* * connect() to INADDR_ANY means loopback (BSD'ism). */ if (ipv6_addr_any(&usin->sin6_addr)) { if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), &usin->sin6_addr); else usin->sin6_addr = in6addr_loopback; } addr_type = ipv6_addr_type(&usin->sin6_addr); if (addr_type & IPV6_ADDR_MULTICAST) return -ENETUNREACH; if (addr_type&IPV6_ADDR_LINKLOCAL) { if (addr_len >= sizeof(struct sockaddr_in6) && usin->sin6_scope_id) { /* If interface is set while binding, indices * must coincide. */ if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id)) return -EINVAL; sk->sk_bound_dev_if = usin->sin6_scope_id; } /* Connect to link-local address requires an interface */ if (!sk->sk_bound_dev_if) return -EINVAL; } if (tp->rx_opt.ts_recent_stamp && !ipv6_addr_equal(&sk->sk_v6_daddr, &usin->sin6_addr)) { tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; WRITE_ONCE(tp->write_seq, 0); } sk->sk_v6_daddr = usin->sin6_addr; np->flow_label = fl6.flowlabel; /* * TCP over IPv4 */ if (addr_type & IPV6_ADDR_MAPPED) { u32 exthdrlen = icsk->icsk_ext_hdr_len; struct sockaddr_in sin; if (ipv6_only_sock(sk)) return -ENETUNREACH; sin.sin_family = AF_INET; sin.sin_port = usin->sin6_port; sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv6_mapped); if (sk_is_mptcp(sk)) mptcpv6_handle_mapped(sk, true); sk->sk_backlog_rcv = tcp_v4_do_rcv; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tp->af_specific = &tcp_sock_ipv6_mapped_specific; #endif err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); if (err) { icsk->icsk_ext_hdr_len = exthdrlen; /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv6_specific); if (sk_is_mptcp(sk)) mptcpv6_handle_mapped(sk, false); sk->sk_backlog_rcv = tcp_v6_do_rcv; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tp->af_specific = &tcp_sock_ipv6_specific; #endif goto failure; } np->saddr = sk->sk_v6_rcv_saddr; return err; } if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) saddr = &sk->sk_v6_rcv_saddr; fl6.flowi6_proto = IPPROTO_TCP; fl6.daddr = sk->sk_v6_daddr; fl6.saddr = saddr ? *saddr : np->saddr; fl6.flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; fl6.flowi6_uid = sk->sk_uid; opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); final_p = fl6_update_dst(&fl6, opt, &final); security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto failure; } tp->tcp_usec_ts = dst_tcp_usec_ts(dst); tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (!saddr) { saddr = &fl6.saddr; err = inet_bhash2_update_saddr(sk, saddr, AF_INET6); if (err) goto failure; } /* set the source address */ np->saddr = *saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; ip6_dst_store(sk, dst, NULL, NULL); icsk->icsk_ext_hdr_len = 0; if (opt) icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); inet->inet_dport = usin->sin6_port; tcp_set_state(sk, TCP_SYN_SENT); err = inet6_hash_connect(tcp_death_row, sk); if (err) goto late_failure; sk_set_txhash(sk); if (likely(!tp->repair)) { if (!tp->write_seq) WRITE_ONCE(tp->write_seq, secure_tcpv6_seq(np->saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32, inet->inet_sport, inet->inet_dport)); tp->tsoffset = secure_tcpv6_ts_off(net, np->saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32); } if (tcp_fastopen_defer_connect(sk, &err)) return err; if (err) goto late_failure; err = tcp_connect(sk); if (err) goto late_failure; return 0; late_failure: tcp_set_state(sk, TCP_CLOSE); inet_bhash2_reset_saddr(sk); failure: inet->inet_dport = 0; sk->sk_route_caps = 0; return err; } static void tcp_v6_mtu_reduced(struct sock *sk) { struct dst_entry *dst; u32 mtu; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) return; mtu = READ_ONCE(tcp_sk(sk)->mtu_info); /* Drop requests trying to increase our current mss. * Check done in __ip6_rt_update_pmtu() is too late. */ if (tcp_mtu_to_mss(sk, mtu) >= tcp_sk(sk)->mss_cache) return; dst = inet6_csk_update_pmtu(sk, mtu); if (!dst) return; if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { tcp_sync_mss(sk, dst_mtu(dst)); tcp_simple_retransmit(sk); } } static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); struct net *net = dev_net(skb->dev); struct request_sock *fastopen; struct ipv6_pinfo *np; struct tcp_sock *tp; __u32 seq, snd_una; struct sock *sk; bool fatal; int err; sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &hdr->daddr, th->dest, &hdr->saddr, ntohs(th->source), skb->dev->ifindex, inet6_sdif(skb)); if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return -ENOENT; } if (sk->sk_state == TCP_TIME_WAIT) { /* To increase the counter of ignored icmps for TCP-AO */ tcp_ao_ignore_icmp(sk, AF_INET6, type, code); inet_twsk_put(inet_twsk(sk)); return 0; } seq = ntohl(th->seq); fatal = icmpv6_err_convert(type, code, &err); if (sk->sk_state == TCP_NEW_SYN_RECV) { tcp_req_err(sk, seq, fatal); return 0; } if (tcp_ao_ignore_icmp(sk, AF_INET6, type, code)) { sock_put(sk); return 0; } bh_lock_sock(sk); if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) goto out; if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ if (ipv6_hdr(skb)->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); goto out; } } tp = tcp_sk(sk); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ fastopen = rcu_dereference(tp->fastopen_rsk); snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && !between(seq, snd_una, tp->snd_nxt)) { __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } np = tcp_inet6_sk(sk); if (type == NDISC_REDIRECT) { if (!sock_owned_by_user(sk)) { struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); if (dst) dst->ops->redirect(dst, sk, skb); } goto out; } if (type == ICMPV6_PKT_TOOBIG) { u32 mtu = ntohl(info); /* We are not interested in TCP_LISTEN and open_requests * (SYN-ACKs send out by Linux are always <576bytes so * they should go through unfragmented). */ if (sk->sk_state == TCP_LISTEN) goto out; if (!ip6_sk_accept_pmtu(sk)) goto out; if (mtu < IPV6_MIN_MTU) goto out; WRITE_ONCE(tp->mtu_info, mtu); if (!sock_owned_by_user(sk)) tcp_v6_mtu_reduced(sk); else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); goto out; } /* Might be for an request_sock */ switch (sk->sk_state) { case TCP_SYN_SENT: case TCP_SYN_RECV: /* Only in fast or simultaneous open. If a fast open socket is * already accepted it is treated as a connected one below. */ if (fastopen && !fastopen->sk) break; ipv6_icmp_error(sk, skb, err, th->dest, ntohl(info), (u8 *)th); if (!sock_owned_by_user(sk)) { WRITE_ONCE(sk->sk_err, err); sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ tcp_done(sk); } else { WRITE_ONCE(sk->sk_err_soft, err); } goto out; case TCP_LISTEN: break; default: /* check if this ICMP message allows revert of backoff. * (see RFC 6069) */ if (!fastopen && type == ICMPV6_DEST_UNREACH && code == ICMPV6_NOROUTE) tcp_ld_RTO_revert(sk, seq); } if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) { WRITE_ONCE(sk->sk_err, err); sk_error_report(sk); } else { WRITE_ONCE(sk->sk_err_soft, err); } out: bh_unlock_sock(sk); sock_put(sk); return 0; } static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); const struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct ipv6_txoptions *opt; struct flowi6 *fl6 = &fl->u.ip6; struct sk_buff *skb; int err = -ENOMEM; u8 tclass; /* First, grab a route. */ if (!dst && (dst = inet6_csk_route_req(sk, fl6, req, IPPROTO_TCP)) == NULL) goto done; skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); fl6->daddr = ireq->ir_v6_rmt_addr; if (inet6_test_bit(REPFLOW, sk) && ireq->pktopts) fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); tclass = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ? (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | (np->tclass & INET_ECN_MASK) : np->tclass; if (!INET_ECN_is_capable(tclass) && tcp_bpf_ca_needs_ecn((struct sock *)req)) tclass |= INET_ECN_ECT_0; rcu_read_lock(); opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); err = ip6_xmit(sk, skb, fl6, skb->mark ? : READ_ONCE(sk->sk_mark), opt, tclass, READ_ONCE(sk->sk_priority)); rcu_read_unlock(); err = net_xmit_eval(err); } done: return err; } static void tcp_v6_reqsk_destructor(struct request_sock *req) { kfree(inet_rsk(req)->ipv6_opt); consume_skb(inet_rsk(req)->pktopts); } #ifdef CONFIG_TCP_MD5SIG static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, const struct in6_addr *addr, int l3index) { return tcp_md5_do_lookup(sk, l3index, (union tcp_md5_addr *)addr, AF_INET6); } static struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { int l3index; l3index = l3mdev_master_ifindex_by_index(sock_net(sk), addr_sk->sk_bound_dev_if); return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr, l3index); } static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct tcp_md5sig cmd; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr; union tcp_ao_addr *addr; int l3index = 0; u8 prefixlen; bool l3flag; u8 flags; if (optlen < sizeof(cmd)) return -EINVAL; if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) return -EFAULT; if (sin6->sin6_family != AF_INET6) return -EINVAL; flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; if (optname == TCP_MD5SIG_EXT && cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { prefixlen = cmd.tcpm_prefixlen; if (prefixlen > 128 || (ipv6_addr_v4mapped(&sin6->sin6_addr) && prefixlen > 32)) return -EINVAL; } else { prefixlen = ipv6_addr_v4mapped(&sin6->sin6_addr) ? 32 : 128; } if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); if (dev && netif_is_l3_master(dev)) l3index = dev->ifindex; rcu_read_unlock(); /* ok to reference set/not set outside of rcu; * right now device MUST be an L3 master */ if (!dev || !l3index) return -EINVAL; } if (!cmd.tcpm_keylen) { if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], AF_INET, prefixlen, l3index, flags); return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr, AF_INET6, prefixlen, l3index, flags); } if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; if (ipv6_addr_v4mapped(&sin6->sin6_addr)) { addr = (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3]; /* Don't allow keys for peers that have a matching TCP-AO key. * See the comment in tcp_ao_add_cmd() */ if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false)) return -EKEYREJECTED; return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen); } addr = (union tcp_md5_addr *)&sin6->sin6_addr; /* Don't allow keys for peers that have a matching TCP-AO key. * See the comment in tcp_ao_add_cmd() */ if (tcp_ao_required(sk, addr, AF_INET6, l3flag ? l3index : -1, false)) return -EKEYREJECTED; return tcp_md5_do_add(sk, addr, AF_INET6, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen); } static int tcp_v6_md5_hash_headers(struct tcp_sigpool *hp, const struct in6_addr *daddr, const struct in6_addr *saddr, const struct tcphdr *th, int nbytes) { struct tcp6_pseudohdr *bp; struct scatterlist sg; struct tcphdr *_th; bp = hp->scratch; /* 1. TCP pseudo-header (RFC2460) */ bp->saddr = *saddr; bp->daddr = *daddr; bp->protocol = cpu_to_be32(IPPROTO_TCP); bp->len = cpu_to_be32(nbytes); _th = (struct tcphdr *)(bp + 1); memcpy(_th, th, sizeof(*th)); _th->check = 0; sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); ahash_request_set_crypt(hp->req, &sg, NULL, sizeof(*bp) + sizeof(*th)); return crypto_ahash_update(hp->req); } static int tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, const struct in6_addr *daddr, struct in6_addr *saddr, const struct tcphdr *th) { struct tcp_sigpool hp; if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) goto clear_hash_nostart; if (crypto_ahash_init(hp.req)) goto clear_hash; if (tcp_v6_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2)) goto clear_hash; if (tcp_md5_hash_key(&hp, key)) goto clear_hash; ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); if (crypto_ahash_final(hp.req)) goto clear_hash; tcp_sigpool_end(&hp); return 0; clear_hash: tcp_sigpool_end(&hp); clear_hash_nostart: memset(md5_hash, 0, 16); return 1; } static int tcp_v6_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); const struct in6_addr *saddr, *daddr; struct tcp_sigpool hp; if (sk) { /* valid for establish/request sockets */ saddr = &sk->sk_v6_rcv_saddr; daddr = &sk->sk_v6_daddr; } else { const struct ipv6hdr *ip6h = ipv6_hdr(skb); saddr = &ip6h->saddr; daddr = &ip6h->daddr; } if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) goto clear_hash_nostart; if (crypto_ahash_init(hp.req)) goto clear_hash; if (tcp_v6_md5_hash_headers(&hp, daddr, saddr, th, skb->len)) goto clear_hash; if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2)) goto clear_hash; if (tcp_md5_hash_key(&hp, key)) goto clear_hash; ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); if (crypto_ahash_final(hp.req)) goto clear_hash; tcp_sigpool_end(&hp); return 0; clear_hash: tcp_sigpool_end(&hp); clear_hash_nostart: memset(md5_hash, 0, 16); return 1; } #endif static void tcp_v6_init_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb, u32 tw_isn) { bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); struct inet_request_sock *ireq = inet_rsk(req); const struct ipv6_pinfo *np = tcp_inet6_sk(sk_listener); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; /* So that link locals have meaning */ if ((!sk_listener->sk_bound_dev_if || l3_slave) && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); if (!tw_isn && (ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || inet6_test_bit(REPFLOW, sk_listener))) { refcount_inc(&skb->users); ireq->pktopts = skb; } } static struct dst_entry *tcp_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct request_sock *req, u32 tw_isn) { tcp_v6_init_req(req, sk, skb, tw_isn); if (security_inet_conn_request(sk, skb, req)) return NULL; return inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP); } struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .family = AF_INET6, .obj_size = sizeof(struct tcp6_request_sock), .rtx_syn_ack = tcp_rtx_synack, .send_ack = tcp_v6_reqsk_send_ack, .destructor = tcp_v6_reqsk_destructor, .send_reset = tcp_v6_send_reset, .syn_ack_timeout = tcp_syn_ack_timeout, }; const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr), #ifdef CONFIG_TCP_MD5SIG .req_md5_lookup = tcp_v6_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, #endif #ifdef CONFIG_TCP_AO .ao_lookup = tcp_v6_ao_lookup_rsk, .ao_calc_key = tcp_v6_ao_calc_key_rsk, .ao_synack_hash = tcp_v6_ao_synack_hash, #endif #ifdef CONFIG_SYN_COOKIES .cookie_init_seq = cookie_v6_init_sequence, #endif .route_req = tcp_v6_route_req, .init_seq = tcp_v6_init_seq, .init_ts_off = tcp_v6_init_ts_off, .send_synack = tcp_v6_send_synack, }; static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, int rst, u8 tclass, __be32 label, u32 priority, u32 txhash, struct tcp_key *key) { const struct tcphdr *th = tcp_hdr(skb); struct tcphdr *t1; struct sk_buff *buff; struct flowi6 fl6; struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); struct sock *ctl_sk = net->ipv6.tcp_sk; unsigned int tot_len = sizeof(struct tcphdr); __be32 mrst = 0, *topt; struct dst_entry *dst; __u32 mark = 0; if (tsecr) tot_len += TCPOLEN_TSTAMP_ALIGNED; if (tcp_key_is_md5(key)) tot_len += TCPOLEN_MD5SIG_ALIGNED; if (tcp_key_is_ao(key)) tot_len += tcp_ao_len_aligned(key->ao_key); #ifdef CONFIG_MPTCP if (rst && !tcp_key_is_md5(key)) { mrst = mptcp_reset_option(skb); if (mrst) tot_len += sizeof(__be32); } #endif buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (!buff) return; skb_reserve(buff, MAX_TCP_HEADER); t1 = skb_push(buff, tot_len); skb_reset_transport_header(buff); /* Swap the send and the receive. */ memset(t1, 0, sizeof(*t1)); t1->dest = th->source; t1->source = th->dest; t1->doff = tot_len / 4; t1->seq = htonl(seq); t1->ack_seq = htonl(ack); t1->ack = !rst || !th->ack; t1->rst = rst; t1->window = htons(win); topt = (__be32 *)(t1 + 1); if (tsecr) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); *topt++ = htonl(tsval); *topt++ = htonl(tsecr); } if (mrst) *topt++ = mrst; #ifdef CONFIG_TCP_MD5SIG if (tcp_key_is_md5(key)) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); tcp_v6_md5_hash_hdr((__u8 *)topt, key->md5_key, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, t1); } #endif #ifdef CONFIG_TCP_AO if (tcp_key_is_ao(key)) { *topt++ = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key->ao_key) << 16) | (key->ao_key->sndid << 8) | (key->rcv_next)); tcp_ao_hash_hdr(AF_INET6, (char *)topt, key->ao_key, key->traffic_key, (union tcp_ao_addr *)&ipv6_hdr(skb)->saddr, (union tcp_ao_addr *)&ipv6_hdr(skb)->daddr, t1, key->sne); } #endif memset(&fl6, 0, sizeof(fl6)); fl6.daddr = ipv6_hdr(skb)->saddr; fl6.saddr = ipv6_hdr(skb)->daddr; fl6.flowlabel = label; buff->ip_summed = CHECKSUM_PARTIAL; __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr); fl6.flowi6_proto = IPPROTO_TCP; if (rt6_need_strict(&fl6.daddr) && !oif) fl6.flowi6_oif = tcp_v6_iif(skb); else { if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) oif = skb->skb_iif; fl6.flowi6_oif = oif; } if (sk) { if (sk->sk_state == TCP_TIME_WAIT) mark = inet_twsk(sk)->tw_mark; else mark = READ_ONCE(sk->sk_mark); skb_set_delivery_time(buff, tcp_transmit_time(sk), true); } if (txhash) { /* autoflowlabel/skb_get_hash_flowi6 rely on buff->hash */ skb_set_hash(buff, txhash, PKT_HASH_TYPE_L4); } fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark; fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); /* Pass a socket to ip6_dst_lookup either it is for RST * Underlying function will use this to retrieve the network * namespace */ if (sk && sk->sk_state != TCP_TIME_WAIT) dst = ip6_dst_lookup_flow(net, sk, &fl6, NULL); /*sk's xfrm_policy can be referred*/ else dst = ip6_dst_lookup_flow(net, ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(buff, dst); ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass & ~INET_ECN_MASK, priority); TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) TCP_INC_STATS(net, TCP_MIB_OUTRSTS); return; } kfree_skb(buff); } static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, enum sk_rst_reason reason) { const struct tcphdr *th = tcp_hdr(skb); struct ipv6hdr *ipv6h = ipv6_hdr(skb); const __u8 *md5_hash_location = NULL; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) bool allocated_traffic_key = false; #endif const struct tcp_ao_hdr *aoh; struct tcp_key key = {}; u32 seq = 0, ack_seq = 0; __be32 label = 0; u32 priority = 0; struct net *net; u32 txhash = 0; int oif = 0; #ifdef CONFIG_TCP_MD5SIG unsigned char newhash[16]; int genhash; struct sock *sk1 = NULL; #endif if (th->rst) return; /* If sk not NULL, it means we did a successful lookup and incoming * route had to be correct. prequeue might have dropped our dst. */ if (!sk && !ipv6_unicast_destination(skb)) return; net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(th, &md5_hash_location, &aoh)) return; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) rcu_read_lock(); #endif #ifdef CONFIG_TCP_MD5SIG if (sk && sk_fullsock(sk)) { int l3index; /* sdif set, means packet ingressed via a device * in an L3 domain and inet_iif is set to it. */ l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr, l3index); if (key.md5_key) key.type = TCP_KEY_MD5; } else if (md5_hash_location) { int dif = tcp_v6_iif_l3_slave(skb); int sdif = tcp_v6_sdif(skb); int l3index; /* * active side is lost. Try to find listening socket through * source port, and then find md5 key through listening socket. * we are not loose security here: * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ sk1 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->source), dif, sdif); if (!sk1) goto out; /* sdif set, means packet ingressed via a device * in an L3 domain and dif is set to it. */ l3index = tcp_v6_sdif(skb) ? dif : 0; key.md5_key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr, l3index); if (!key.md5_key) goto out; key.type = TCP_KEY_MD5; genhash = tcp_v6_md5_hash_skb(newhash, key.md5_key, NULL, skb); if (genhash || memcmp(md5_hash_location, newhash, 16) != 0) goto out; } #endif if (th->ack) seq = ntohl(th->ack_seq); else ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len - (th->doff << 2); #ifdef CONFIG_TCP_AO if (aoh) { int l3index; l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, seq, &key.ao_key, &key.traffic_key, &allocated_traffic_key, &key.rcv_next, &key.sne)) goto out; key.type = TCP_KEY_AO; } #endif if (sk) { oif = sk->sk_bound_dev_if; if (sk_fullsock(sk)) { if (inet6_test_bit(REPFLOW, sk)) label = ip6_flowlabel(ipv6h); priority = READ_ONCE(sk->sk_priority); txhash = sk->sk_txhash; } if (sk->sk_state == TCP_TIME_WAIT) { label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel); priority = inet_twsk(sk)->tw_priority; txhash = inet_twsk(sk)->tw_txhash; } } else { if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET) label = ip6_flowlabel(ipv6h); } trace_tcp_send_reset(sk, skb, reason); tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1, ipv6_get_dsfield(ipv6h), label, priority, txhash, &key); #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) out: if (allocated_traffic_key) kfree(key.traffic_key); rcu_read_unlock(); #endif } static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_key *key, u8 tclass, __be32 label, u32 priority, u32 txhash) { tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, 0, tclass, label, priority, txhash, key); } static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) { struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); struct tcp_key key = {}; #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao_info; if (static_branch_unlikely(&tcp_ao_needed.key)) { /* FIXME: the segment to-be-acked is not verified yet */ ao_info = rcu_dereference(tcptw->ao_info); if (ao_info) { const struct tcp_ao_hdr *aoh; /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) goto out; if (aoh) key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1); } } if (key.ao_key) { struct tcp_ao_key *rnext_key; key.traffic_key = snd_other_key(key.ao_key); /* rcv_next switches to our rcv_next */ rnext_key = READ_ONCE(ao_info->rnext_key); key.rcv_next = rnext_key->rcvid; key.sne = READ_ONCE(ao_info->snd_sne); key.type = TCP_KEY_AO; #else if (0) { #endif #ifdef CONFIG_TCP_MD5SIG } else if (static_branch_unlikely(&tcp_md5_needed.key)) { key.md5_key = tcp_twsk_md5_key(tcptw); if (key.md5_key) key.type = TCP_KEY_MD5; #endif } tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_tw_tsval(tcptw), tcptw->tw_ts_recent, tw->tw_bound_dev_if, &key, tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority, tw->tw_txhash); #ifdef CONFIG_TCP_AO out: #endif inet_twsk_put(tw); } static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct tcp_key key = {}; #ifdef CONFIG_TCP_AO if (static_branch_unlikely(&tcp_ao_needed.key) && tcp_rsk_used_ao(req)) { const struct in6_addr *addr = &ipv6_hdr(skb)->saddr; const struct tcp_ao_hdr *aoh; int l3index; l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) return; if (!aoh) return; key.ao_key = tcp_ao_do_lookup(sk, l3index, (union tcp_ao_addr *)addr, AF_INET6, aoh->rnext_keyid, -1); if (unlikely(!key.ao_key)) { /* Send ACK with any matching MKT for the peer */ key.ao_key = tcp_ao_do_lookup(sk, l3index, (union tcp_ao_addr *)addr, AF_INET6, -1, -1); /* Matching key disappeared (user removed the key?) * let the handshake timeout. */ if (!key.ao_key) { net_info_ratelimited("TCP-AO key for (%pI6, %d)->(%pI6, %d) suddenly disappeared, won't ACK new connection\n", addr, ntohs(tcp_hdr(skb)->source), &ipv6_hdr(skb)->daddr, ntohs(tcp_hdr(skb)->dest)); return; } } key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC); if (!key.traffic_key) return; key.type = TCP_KEY_AO; key.rcv_next = aoh->keyid; tcp_v6_ao_calc_key_rsk(key.ao_key, key.traffic_key, req); #else if (0) { #endif #ifdef CONFIG_TCP_MD5SIG } else if (static_branch_unlikely(&tcp_md5_needed.key)) { int l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index); if (key.md5_key) key.type = TCP_KEY_MD5; #endif } /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, tcp_rsk(req)->rcv_nxt, tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), READ_ONCE(req->ts_recent), sk->sk_bound_dev_if, &key, ipv6_get_dsfield(ipv6_hdr(skb)), 0, READ_ONCE(sk->sk_priority), READ_ONCE(tcp_rsk(req)->txhash)); if (tcp_key_is_ao(&key)) kfree(key.traffic_key); } static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb) { #ifdef CONFIG_SYN_COOKIES const struct tcphdr *th = tcp_hdr(skb); if (!th->syn) sk = cookie_v6_check(sk, skb); #endif return sk; } u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, struct tcphdr *th, u32 *cookie) { u16 mss = 0; #ifdef CONFIG_SYN_COOKIES mss = tcp_get_syncookie_mss(&tcp6_request_sock_ops, &tcp_request_sock_ipv6_ops, sk, th); if (mss) { *cookie = __cookie_v6_init_sequence(iph, th, &mss); tcp_synq_overflow(sk); } #endif return mss; } static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_conn_request(sk, skb); if (!ipv6_unicast_destination(skb)) goto drop; if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); return 0; } return tcp_conn_request(&tcp6_request_sock_ops, &tcp_request_sock_ipv6_ops, sk, skb); drop: tcp_listendrop(sk); return 0; /* don't send reset */ } static void tcp_v6_restore_cb(struct sk_buff *skb) { /* We need to move header back to the beginning if xfrm6_policy_check() * and tcp_v6_fill_cb() are going to be called again. * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there. */ memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6, sizeof(struct inet6_skb_parm)); } static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req) { struct inet_request_sock *ireq; struct ipv6_pinfo *newnp; const struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct ipv6_txoptions *opt; struct inet_sock *newinet; bool found_dup_sk = false; struct tcp_sock *newtp; struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key; int l3index; #endif struct flowi6 fl6; if (skb->protocol == htons(ETH_P_IP)) { /* * v6 mapped */ newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst, req_unhash, own_req); if (!newsk) return NULL; inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); newnp = tcp_inet6_sk(newsk); newtp = tcp_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); newnp->saddr = newsk->sk_v6_rcv_saddr; inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; if (sk_is_mptcp(newsk)) mptcpv6_handle_mapped(newsk, true); newsk->sk_backlog_rcv = tcp_v4_do_rcv; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) newtp->af_specific = &tcp_sock_ipv6_mapped_specific; #endif newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; newnp->pktoptions = NULL; newnp->opt = NULL; newnp->mcast_oif = inet_iif(skb); newnp->mcast_hops = ip_hdr(skb)->ttl; newnp->rcv_flowinfo = 0; if (inet6_test_bit(REPFLOW, sk)) newnp->flow_label = 0; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks count * here, tcp_create_openreq_child now does this for us, see the comment in * that function for the gory details. -acme */ /* It is tricky place. Until this moment IPv4 tcp worked with IPv6 icsk.icsk_af_ops. Sync it now. */ tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); return newsk; } ireq = inet_rsk(req); if (sk_acceptq_is_full(sk)) goto out_overflow; if (!dst) { dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_TCP); if (!dst) goto out; } newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) goto out_nonewsk; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks * count here, tcp_create_openreq_child now does this for us, see the * comment in that function for the gory details. -acme */ newsk->sk_gso_type = SKB_GSO_TCPV6; inet6_sk_rx_dst_set(newsk, skb); inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); newtp = tcp_sk(newsk); newinet = inet_sk(newsk); newnp = tcp_inet6_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); ip6_dst_store(newsk, dst, NULL, NULL); newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr; newnp->saddr = ireq->ir_v6_loc_addr; newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; newsk->sk_bound_dev_if = ireq->ir_iif; /* Now IPv6 options... First: no IPv4 options. */ newinet->inet_opt = NULL; newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; /* Clone RX bits */ newnp->rxopt.all = np->rxopt.all; newnp->pktoptions = NULL; newnp->opt = NULL; newnp->mcast_oif = tcp_v6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb)); if (inet6_test_bit(REPFLOW, sk)) newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb)); /* Set ToS of the new socket based upon the value of incoming SYN. * ECT bits are set later in tcp_init_transfer(). */ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; /* Clone native IPv6 options from listening socket (if any) Yes, keeping reference count would be much more clever, but we make one more one thing there: reattach optmem to newsk. */ opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); if (opt) { opt = ipv6_dup_options(newsk, opt); RCU_INIT_POINTER(newnp->opt, opt); } inet_csk(newsk)->icsk_ext_hdr_len = 0; if (opt) inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + opt->opt_flen; tcp_ca_openreq_child(newsk, dst); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); tcp_initialize_rcv_mss(newsk); newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; newinet->inet_rcv_saddr = LOOPBACK4_IPV6; #ifdef CONFIG_TCP_MD5SIG l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); if (!tcp_rsk_used_ao(req)) { /* Copy over the MD5 key from the original socket */ key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr, l3index); if (key) { const union tcp_md5_addr *addr; addr = (union tcp_md5_addr *)&newsk->sk_v6_daddr; if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128, l3index, key)) { inet_csk_prepare_forced_close(newsk); tcp_done(newsk); goto out; } } } #endif #ifdef CONFIG_TCP_AO /* Copy over tcp_ao_info if any */ if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET6)) goto out; /* OOM */ #endif if (__inet_inherit_port(sk, newsk) < 0) { inet_csk_prepare_forced_close(newsk); tcp_done(newsk); goto out; } *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), &found_dup_sk); if (*own_req) { tcp_move_syn(newtp, req); /* Clone pktoptions received with SYN, if we own the req */ if (ireq->pktopts) { newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk); consume_skb(ireq->pktopts); ireq->pktopts = NULL; if (newnp->pktoptions) tcp_v6_restore_cb(newnp->pktoptions); } } else { if (!req_unhash && found_dup_sk) { /* This code path should only be executed in the * syncookie case only */ bh_unlock_sock(newsk); sock_put(newsk); newsk = NULL; } } return newsk; out_overflow: __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); out_nonewsk: dst_release(dst); out: tcp_listendrop(sk); return NULL; } INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); /* The socket must have it's spinlock held when we get * here, unless it is a TCP_LISTEN socket. * * We have a potential double-lock case here, so even when * doing backlog processing we use the BH locking scheme. * This is because we cannot sleep with the original spinlock * held. */ INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) { struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct sk_buff *opt_skb = NULL; enum skb_drop_reason reason; struct tcp_sock *tp; /* Imagine: socket is IPv6. IPv4 packet arrives, goes to IPv4 receive handler and backlogged. From backlog it always goes here. Kerboom... Fortunately, tcp_rcv_established and rcv_established handle them correctly, but it is not case with tcp_v6_hnd_req and tcp_v6_send_reset(). --ANK */ if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); /* * socket locking is here for SMP purposes as backlog rcv * is currently called with bh processing disabled. */ /* Do Stevens' IPV6_PKTOPTIONS. Yes, guys, it is the only place in our code, where we may make it not affecting IPv4. The rest of code is protocol independent, and I do not like idea to uglify IPv4. Actually, all the idea behind IPV6_PKTOPTIONS looks not very well thought. For now we latch options, received in the last packet, enqueued by tcp. Feel free to propose better solution. --ANK (980728) */ if (np->rxopt.all) opt_skb = skb_clone_and_charge_r(skb, sk); if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ struct dst_entry *dst; dst = rcu_dereference_protected(sk->sk_rx_dst, lockdep_sock_is_held(sk)); sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); if (dst) { if (sk->sk_rx_dst_ifindex != skb->skb_iif || INDIRECT_CALL_1(dst->ops->check, ip6_dst_check, dst, sk->sk_rx_dst_cookie) == NULL) { RCU_INIT_POINTER(sk->sk_rx_dst, NULL); dst_release(dst); } } tcp_rcv_established(sk, skb); if (opt_skb) goto ipv6_pktoptions; return 0; } if (tcp_checksum_complete(skb)) goto csum_err; if (sk->sk_state == TCP_LISTEN) { struct sock *nsk = tcp_v6_cookie_check(sk, skb); if (nsk != sk) { if (nsk) { reason = tcp_child_process(sk, nsk, skb); if (reason) goto reset; } if (opt_skb) __kfree_skb(opt_skb); return 0; } } else sock_rps_save_rxhash(sk, skb); reason = tcp_rcv_state_process(sk, skb); if (reason) goto reset; if (opt_skb) goto ipv6_pktoptions; return 0; reset: tcp_v6_send_reset(sk, skb, sk_rst_convert_drop_reason(reason)); discard: if (opt_skb) __kfree_skb(opt_skb); kfree_skb_reason(skb, reason); return 0; csum_err: reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; ipv6_pktoptions: /* Do you ask, what is it? 1. skb was enqueued by tcp. 2. skb is added to tail of read queue, rather than out of order. 3. socket is not in passive state. 4. Finally, it really contains options, which user wants to receive. */ tp = tcp_sk(sk); if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo) WRITE_ONCE(np->mcast_oif, tcp_v6_iif(opt_skb)); if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) WRITE_ONCE(np->mcast_hops, ipv6_hdr(opt_skb)->hop_limit); if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass) np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); if (inet6_test_bit(REPFLOW, sk)) np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) { tcp_v6_restore_cb(opt_skb); opt_skb = xchg(&np->pktoptions, opt_skb); } else { __kfree_skb(opt_skb); opt_skb = xchg(&np->pktoptions, NULL); } } consume_skb(opt_skb); return 0; } static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, const struct tcphdr *th) { /* This is tricky: we move IP6CB at its correct location into * TCP_SKB_CB(). It must be done after xfrm6_policy_check(), because * _decode_session6() uses IP6CB(). * barrier() makes sure compiler won't play aliasing games. */ memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb), sizeof(struct inet6_skb_parm)); barrier(); TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = skb->tstamp || skb_hwtstamps(skb)->hwtstamp; } INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) { enum skb_drop_reason drop_reason; int sdif = inet6_sdif(skb); int dif = inet6_iif(skb); const struct tcphdr *th; const struct ipv6hdr *hdr; bool refcounted; struct sock *sk; int ret; u32 isn; struct net *net = dev_net(skb->dev); drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (skb->pkt_type != PACKET_HOST) goto discard_it; /* * Count it even if it's bad. */ __TCP_INC_STATS(net, TCP_MIB_INSEGS); if (!pskb_may_pull(skb, sizeof(struct tcphdr))) goto discard_it; th = (const struct tcphdr *)skb->data; if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; goto bad_packet; } if (!pskb_may_pull(skb, th->doff*4)) goto discard_it; if (skb_checksum_init(skb, IPPROTO_TCP, ip6_compute_pseudo)) goto csum_error; th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); lookup: sk = __inet6_lookup_skb(net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th), th->source, th->dest, inet6_iif(skb), sdif, &refcounted); if (!sk) goto no_tcp_socket; if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); bool req_stolen = false; struct sock *nsk; sk = req->rsk_listener; if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) drop_reason = SKB_DROP_REASON_XFRM_POLICY; else drop_reason = tcp_inbound_hash(sk, req, skb, &hdr->saddr, &hdr->daddr, AF_INET6, dif, sdif); if (drop_reason) { sk_drops_add(sk, skb); reqsk_put(req); goto discard_it; } if (tcp_checksum_complete(skb)) { reqsk_put(req); goto csum_error; } if (unlikely(sk->sk_state != TCP_LISTEN)) { nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); if (!nsk) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } sk = nsk; /* reuseport_migrate_sock() has already held one sk_refcnt * before returning. */ } else { sock_hold(sk); } refcounted = true; nsk = NULL; if (!tcp_filter(sk, skb)) { th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); nsk = tcp_check_req(sk, skb, req, false, &req_stolen); } else { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; } if (!nsk) { reqsk_put(req); if (req_stolen) { /* Another cpu got exclusive access to req * and created a full blown socket. * Try to feed this packet to this socket * instead of discarding it. */ tcp_v6_restore_cb(skb); sock_put(sk); goto lookup; } goto discard_and_relse; } nf_reset_ct(skb); if (nsk == sk) { reqsk_put(req); tcp_v6_restore_cb(skb); } else { drop_reason = tcp_child_process(sk, nsk, skb); if (drop_reason) { enum sk_rst_reason rst_reason; rst_reason = sk_rst_convert_drop_reason(drop_reason); tcp_v6_send_reset(nsk, skb, rst_reason); goto discard_and_relse; } sock_put(sk); return 0; } } process: if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ if (unlikely(hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount))) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); drop_reason = SKB_DROP_REASON_TCP_MINTTL; goto discard_and_relse; } } if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { drop_reason = SKB_DROP_REASON_XFRM_POLICY; goto discard_and_relse; } drop_reason = tcp_inbound_hash(sk, NULL, skb, &hdr->saddr, &hdr->daddr, AF_INET6, dif, sdif); if (drop_reason) goto discard_and_relse; nf_reset_ct(skb); if (tcp_filter(sk, skb)) { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; goto discard_and_relse; } th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); skb->dev = NULL; if (sk->sk_state == TCP_LISTEN) { ret = tcp_v6_do_rcv(sk, skb); goto put_and_return; } sk_incoming_cpu_update(sk); bh_lock_sock_nested(sk); tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { ret = tcp_v6_do_rcv(sk, skb); } else { if (tcp_add_backlog(sk, skb, &drop_reason)) goto discard_and_relse; } bh_unlock_sock(sk); put_and_return: if (refcounted) sock_put(sk); return ret ? -1 : 0; no_tcp_socket: drop_reason = SKB_DROP_REASON_NO_SOCKET; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; tcp_v6_fill_cb(skb, hdr, th); if (tcp_checksum_complete(skb)) { csum_error: drop_reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); bad_packet: __TCP_INC_STATS(net, TCP_MIB_INERRS); } else { tcp_v6_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); } discard_it: SKB_DR_OR(drop_reason, NOT_SPECIFIED); kfree_skb_reason(skb, drop_reason); return 0; discard_and_relse: sk_drops_add(sk, skb); if (refcounted) sock_put(sk); goto discard_it; do_time_wait: if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { drop_reason = SKB_DROP_REASON_XFRM_POLICY; inet_twsk_put(inet_twsk(sk)); goto discard_it; } tcp_v6_fill_cb(skb, hdr, th); if (tcp_checksum_complete(skb)) { inet_twsk_put(inet_twsk(sk)); goto csum_error; } switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { case TCP_TW_SYN: { struct sock *sk2; sk2 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), tcp_v6_iif_l3_slave(skb), sdif); if (sk2) { struct inet_timewait_sock *tw = inet_twsk(sk); inet_twsk_deschedule_put(tw); sk = sk2; tcp_v6_restore_cb(skb); refcounted = false; __this_cpu_write(tcp_tw_isn, isn); goto process; } } /* to ACK */ fallthrough; case TCP_TW_ACK: tcp_v6_timewait_ack(sk, skb); break; case TCP_TW_RST: tcp_v6_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); inet_twsk_deschedule_put(inet_twsk(sk)); goto discard_it; case TCP_TW_SUCCESS: ; } goto discard_it; } void tcp_v6_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); const struct ipv6hdr *hdr; const struct tcphdr *th; struct sock *sk; if (skb->pkt_type != PACKET_HOST) return; if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) return; hdr = ipv6_hdr(skb); th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr) / 4) return; /* Note : We use inet6_iif() here, not tcp_v6_iif() */ sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), inet6_iif(skb), inet6_sdif(skb)); if (sk) { skb->sk = sk; skb->destructor = sock_edemux; if (sk_fullsock(sk)) { struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); if (dst) dst = dst_check(dst, sk->sk_rx_dst_cookie); if (dst && sk->sk_rx_dst_ifindex == skb->skb_iif) skb_dst_set_noref(skb, dst); } } } static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), .twsk_destructor = tcp_twsk_destructor, }; INDIRECT_CALLABLE_SCOPE void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb) { __tcp_v6_send_check(skb, &sk->sk_v6_rcv_saddr, &sk->sk_v6_daddr); } const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, .rebuild_header = inet6_sk_rebuild_header, .sk_rx_dst_set = inet6_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .net_header_len = sizeof(struct ipv6hdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), .mtu_reduced = tcp_v6_mtu_reduced, }; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = { #ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v6_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, .md5_parse = tcp_v6_parse_md5_keys, #endif #ifdef CONFIG_TCP_AO .ao_lookup = tcp_v6_ao_lookup, .calc_ao_hash = tcp_v6_ao_hash_skb, .ao_parse = tcp_v6_parse_ao, .ao_calc_key_sk = tcp_v6_ao_calc_key_sk, #endif }; #endif /* * TCP over IPv4 via INET6 API */ static const struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .net_header_len = sizeof(struct iphdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), .mtu_reduced = tcp_v4_mtu_reduced, }; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { #ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, .md5_parse = tcp_v6_parse_md5_keys, #endif #ifdef CONFIG_TCP_AO .ao_lookup = tcp_v6_ao_lookup, .calc_ao_hash = tcp_v4_ao_hash_skb, .ao_parse = tcp_v6_parse_ao, .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, #endif }; #endif /* NOTE: A lot of things set to zero explicitly by call to * sk_alloc() so need not be done here. */ static int tcp_v6_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_init_sock(sk); icsk->icsk_af_ops = &ipv6_specific; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific; #endif return 0; } #ifdef CONFIG_PROC_FS /* Proc filesystem TCPv6 sock list dumping. */ static void get_openreq6(struct seq_file *seq, const struct request_sock *req, int i) { long ttd = req->rsk_timer.expires - jiffies; const struct in6_addr *src = &inet_rsk(req)->ir_v6_loc_addr; const struct in6_addr *dest = &inet_rsk(req)->ir_v6_rmt_addr; if (ttd < 0) ttd = 0; seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5u %8d %d %d %pK\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], inet_rsk(req)->ir_num, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], ntohs(inet_rsk(req)->ir_rmt_port), TCP_SYN_RECV, 0, 0, /* could print option size, but that is af dependent. */ 1, /* timers active (only the expire timer) */ jiffies_to_clock_t(ttd), req->num_timeout, from_kuid_munged(seq_user_ns(seq), sock_i_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, req); } static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) { const struct in6_addr *dest, *src; __u16 destp, srcp; int timer_active; unsigned long timer_expires; const struct inet_sock *inet = inet_sk(sp); const struct tcp_sock *tp = tcp_sk(sp); const struct inet_connection_sock *icsk = inet_csk(sp); const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; int rx_queue; int state; dest = &sp->sk_v6_daddr; src = &sp->sk_v6_rcv_saddr; destp = ntohs(inet->inet_dport); srcp = ntohs(inet->inet_sport); if (icsk->icsk_pending == ICSK_TIME_RETRANS || icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = icsk->icsk_timeout; } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; timer_expires = icsk->icsk_timeout; } else if (timer_pending(&sp->sk_timer)) { timer_active = 2; timer_expires = sp->sk_timer.expires; } else { timer_active = 0; timer_expires = jiffies; } state = inet_sk_state_load(sp); if (state == TCP_LISTEN) rx_queue = READ_ONCE(sp->sk_ack_backlog); else /* Because we don't lock the socket, * we might find a transient negative value. */ rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq), 0); seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %lu %lu %u %u %d\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, state, READ_ONCE(tp->write_seq) - tp->snd_una, rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), icsk->icsk_probes_out, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, jiffies_to_clock_t(icsk->icsk_rto), jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sp), tcp_snd_cwnd(tp), state == TCP_LISTEN ? fastopenq->max_qlen : (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh) ); } static void get_timewait6_sock(struct seq_file *seq, struct inet_timewait_sock *tw, int i) { long delta = tw->tw_timer.expires - jiffies; const struct in6_addr *dest, *src; __u16 destp, srcp; dest = &tw->tw_v6_daddr; src = &tw->tw_v6_rcv_saddr; destp = ntohs(tw->tw_dport); srcp = ntohs(tw->tw_sport); seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, tw->tw_substate, 0, 0, 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, refcount_read(&tw->tw_refcnt), tw); } static int tcp6_seq_show(struct seq_file *seq, void *v) { struct tcp_iter_state *st; struct sock *sk = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, " sl " "local_address " "remote_address " "st tx_queue rx_queue tr tm->when retrnsmt" " uid timeout inode\n"); goto out; } st = seq->private; if (sk->sk_state == TCP_TIME_WAIT) get_timewait6_sock(seq, v, st->num); else if (sk->sk_state == TCP_NEW_SYN_RECV) get_openreq6(seq, v, st->num); else get_tcp6_sock(seq, v, st->num); out: return 0; } static const struct seq_operations tcp6_seq_ops = { .show = tcp6_seq_show, .start = tcp_seq_start, .next = tcp_seq_next, .stop = tcp_seq_stop, }; static struct tcp_seq_afinfo tcp6_seq_afinfo = { .family = AF_INET6, }; int __net_init tcp6_proc_init(struct net *net) { if (!proc_create_net_data("tcp6", 0444, net->proc_net, &tcp6_seq_ops, sizeof(struct tcp_iter_state), &tcp6_seq_afinfo)) return -ENOMEM; return 0; } void tcp6_proc_exit(struct net *net) { remove_proc_entry("tcp6", net->proc_net); } #endif struct proto tcpv6_prot = { .name = "TCPv6", .owner = THIS_MODULE, .close = tcp_close, .pre_connect = tcp_v6_pre_connect, .connect = tcp_v6_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v6_init_sock, .destroy = tcp_v4_destroy_sock, .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .splice_eof = tcp_splice_eof, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, .hash = inet6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .put_port = inet_put_port, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = tcp_bpf_update_proto, #endif .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, .orphan_count = &tcp_orphan_count, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), .ipv6_pinfo_offset = offsetof(struct tcp6_sock, inet6), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, .h.hashinfo = NULL, .no_autobind = true, .diag_destroy = tcp_abort, }; EXPORT_SYMBOL_GPL(tcpv6_prot); static struct inet_protosw tcpv6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_TCP, .prot = &tcpv6_prot, .ops = &inet6_stream_ops, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }; static int __net_init tcpv6_net_init(struct net *net) { return inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6, SOCK_RAW, IPPROTO_TCP, net); } static void __net_exit tcpv6_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.tcp_sk); } static struct pernet_operations tcpv6_net_ops = { .init = tcpv6_net_init, .exit = tcpv6_net_exit, }; int __init tcpv6_init(void) { int ret; net_hotdata.tcpv6_protocol = (struct inet6_protocol) { .handler = tcp_v6_rcv, .err_handler = tcp_v6_err, .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, }; ret = inet6_add_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP); if (ret) goto out; /* register inet6 protocol */ ret = inet6_register_protosw(&tcpv6_protosw); if (ret) goto out_tcpv6_protocol; ret = register_pernet_subsys(&tcpv6_net_ops); if (ret) goto out_tcpv6_protosw; ret = mptcpv6_init(); if (ret) goto out_tcpv6_pernet_subsys; out: return ret; out_tcpv6_pernet_subsys: unregister_pernet_subsys(&tcpv6_net_ops); out_tcpv6_protosw: inet6_unregister_protosw(&tcpv6_protosw); out_tcpv6_protocol: inet6_del_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP); goto out; } void tcpv6_exit(void) { unregister_pernet_subsys(&tcpv6_net_ops); inet6_unregister_protosw(&tcpv6_protosw); inet6_del_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP); } |
| 7 21 19 46 12 4 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_GENERIC_UNALIGNED_H #define __ASM_GENERIC_UNALIGNED_H /* * This is the most generic implementation of unaligned accesses * and should work almost anywhere. */ #include <linux/unaligned/packed_struct.h> #include <asm/byteorder.h> #define __get_unaligned_t(type, ptr) ({ \ const struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr); \ __pptr->x; \ }) #define __put_unaligned_t(type, val, ptr) do { \ struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr); \ __pptr->x = (val); \ } while (0) #define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr)) #define put_unaligned(val, ptr) __put_unaligned_t(typeof(*(ptr)), (val), (ptr)) static inline u16 get_unaligned_le16(const void *p) { return le16_to_cpu(__get_unaligned_t(__le16, p)); } static inline u32 get_unaligned_le32(const void *p) { return le32_to_cpu(__get_unaligned_t(__le32, p)); } static inline u64 get_unaligned_le64(const void *p) { return le64_to_cpu(__get_unaligned_t(__le64, p)); } static inline void put_unaligned_le16(u16 val, void *p) { __put_unaligned_t(__le16, cpu_to_le16(val), p); } static inline void put_unaligned_le32(u32 val, void *p) { __put_unaligned_t(__le32, cpu_to_le32(val), p); } static inline void put_unaligned_le64(u64 val, void *p) { __put_unaligned_t(__le64, cpu_to_le64(val), p); } static inline u16 get_unaligned_be16(const void *p) { return be16_to_cpu(__get_unaligned_t(__be16, p)); } static inline u32 get_unaligned_be32(const void *p) { return be32_to_cpu(__get_unaligned_t(__be32, p)); } static inline u64 get_unaligned_be64(const void *p) { return be64_to_cpu(__get_unaligned_t(__be64, p)); } static inline void put_unaligned_be16(u16 val, void *p) { __put_unaligned_t(__be16, cpu_to_be16(val), p); } static inline void put_unaligned_be32(u32 val, void *p) { __put_unaligned_t(__be32, cpu_to_be32(val), p); } static inline void put_unaligned_be64(u64 val, void *p) { __put_unaligned_t(__be64, cpu_to_be64(val), p); } static inline u32 __get_unaligned_be24(const u8 *p) { return p[0] << 16 | p[1] << 8 | p[2]; } static inline u32 get_unaligned_be24(const void *p) { return __get_unaligned_be24(p); } static inline u32 __get_unaligned_le24(const u8 *p) { return p[0] | p[1] << 8 | p[2] << 16; } static inline u32 get_unaligned_le24(const void *p) { return __get_unaligned_le24(p); } static inline void __put_unaligned_be24(const u32 val, u8 *p) { *p++ = (val >> 16) & 0xff; *p++ = (val >> 8) & 0xff; *p++ = val & 0xff; } static inline void put_unaligned_be24(const u32 val, void *p) { __put_unaligned_be24(val, p); } static inline void __put_unaligned_le24(const u32 val, u8 *p) { *p++ = val & 0xff; *p++ = (val >> 8) & 0xff; *p++ = (val >> 16) & 0xff; } static inline void put_unaligned_le24(const u32 val, void *p) { __put_unaligned_le24(val, p); } static inline void __put_unaligned_be48(const u64 val, u8 *p) { *p++ = (val >> 40) & 0xff; *p++ = (val >> 32) & 0xff; *p++ = (val >> 24) & 0xff; *p++ = (val >> 16) & 0xff; *p++ = (val >> 8) & 0xff; *p++ = val & 0xff; } static inline void put_unaligned_be48(const u64 val, void *p) { __put_unaligned_be48(val, p); } static inline u64 __get_unaligned_be48(const u8 *p) { return (u64)p[0] << 40 | (u64)p[1] << 32 | (u64)p[2] << 24 | p[3] << 16 | p[4] << 8 | p[5]; } static inline u64 get_unaligned_be48(const void *p) { return __get_unaligned_be48(p); } #endif /* __ASM_GENERIC_UNALIGNED_H */ |
| 140 2 2 1 1 2 3 3 3 1 1 1 1 1 3 3 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 140 141 1 1 3 3 3 3 7 7 7 7 1 6 4 2 1 1 1 4 1 3 2 1 1 1 3 3 3 1 3 2 3 1 1 1 3 3 3 7 7 140 140 138 140 139 141 141 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 | // SPDX-License-Identifier: GPL-2.0 /* * device_cgroup.c - device cgroup subsystem * * Copyright 2007 IBM Corp */ #include <linux/bpf-cgroup.h> #include <linux/device_cgroup.h> #include <linux/cgroup.h> #include <linux/ctype.h> #include <linux/list.h> #include <linux/uaccess.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/rcupdate.h> #include <linux/mutex.h> #ifdef CONFIG_CGROUP_DEVICE static DEFINE_MUTEX(devcgroup_mutex); enum devcg_behavior { DEVCG_DEFAULT_NONE, DEVCG_DEFAULT_ALLOW, DEVCG_DEFAULT_DENY, }; /* * exception list locking rules: * hold devcgroup_mutex for update/read. * hold rcu_read_lock() for read. */ struct dev_exception_item { u32 major, minor; short type; short access; struct list_head list; struct rcu_head rcu; }; struct dev_cgroup { struct cgroup_subsys_state css; struct list_head exceptions; enum devcg_behavior behavior; }; static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) { return s ? container_of(s, struct dev_cgroup, css) : NULL; } static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) { return css_to_devcgroup(task_css(task, devices_cgrp_id)); } /* * called under devcgroup_mutex */ static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig) { struct dev_exception_item *ex, *tmp, *new; lockdep_assert_held(&devcgroup_mutex); list_for_each_entry(ex, orig, list) { new = kmemdup(ex, sizeof(*ex), GFP_KERNEL); if (!new) goto free_and_exit; list_add_tail(&new->list, dest); } return 0; free_and_exit: list_for_each_entry_safe(ex, tmp, dest, list) { list_del(&ex->list); kfree(ex); } return -ENOMEM; } static void dev_exceptions_move(struct list_head *dest, struct list_head *orig) { struct dev_exception_item *ex, *tmp; lockdep_assert_held(&devcgroup_mutex); list_for_each_entry_safe(ex, tmp, orig, list) { list_move_tail(&ex->list, dest); } } /* * called under devcgroup_mutex */ static int dev_exception_add(struct dev_cgroup *dev_cgroup, struct dev_exception_item *ex) { struct dev_exception_item *excopy, *walk; lockdep_assert_held(&devcgroup_mutex); excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL); if (!excopy) return -ENOMEM; list_for_each_entry(walk, &dev_cgroup->exceptions, list) { if (walk->type != ex->type) continue; if (walk->major != ex->major) continue; if (walk->minor != ex->minor) continue; walk->access |= ex->access; kfree(excopy); excopy = NULL; } if (excopy != NULL) list_add_tail_rcu(&excopy->list, &dev_cgroup->exceptions); return 0; } /* * called under devcgroup_mutex */ static void dev_exception_rm(struct dev_cgroup *dev_cgroup, struct dev_exception_item *ex) { struct dev_exception_item *walk, *tmp; lockdep_assert_held(&devcgroup_mutex); list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) { if (walk->type != ex->type) continue; if (walk->major != ex->major) continue; if (walk->minor != ex->minor) continue; walk->access &= ~ex->access; if (!walk->access) { list_del_rcu(&walk->list); kfree_rcu(walk, rcu); } } } static void __dev_exception_clean(struct dev_cgroup *dev_cgroup) { struct dev_exception_item *ex, *tmp; list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) { list_del_rcu(&ex->list); kfree_rcu(ex, rcu); } } /** * dev_exception_clean - frees all entries of the exception list * @dev_cgroup: dev_cgroup with the exception list to be cleaned * * called under devcgroup_mutex */ static void dev_exception_clean(struct dev_cgroup *dev_cgroup) { lockdep_assert_held(&devcgroup_mutex); __dev_exception_clean(dev_cgroup); } static inline bool is_devcg_online(const struct dev_cgroup *devcg) { return (devcg->behavior != DEVCG_DEFAULT_NONE); } /** * devcgroup_online - initializes devcgroup's behavior and exceptions based on * parent's * @css: css getting online * returns 0 in case of success, error code otherwise */ static int devcgroup_online(struct cgroup_subsys_state *css) { struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); struct dev_cgroup *parent_dev_cgroup = css_to_devcgroup(css->parent); int ret = 0; mutex_lock(&devcgroup_mutex); if (parent_dev_cgroup == NULL) dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW; else { ret = dev_exceptions_copy(&dev_cgroup->exceptions, &parent_dev_cgroup->exceptions); if (!ret) dev_cgroup->behavior = parent_dev_cgroup->behavior; } mutex_unlock(&devcgroup_mutex); return ret; } static void devcgroup_offline(struct cgroup_subsys_state *css) { struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); mutex_lock(&devcgroup_mutex); dev_cgroup->behavior = DEVCG_DEFAULT_NONE; mutex_unlock(&devcgroup_mutex); } /* * called from kernel/cgroup/cgroup.c with cgroup_lock() held. */ static struct cgroup_subsys_state * devcgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct dev_cgroup *dev_cgroup; dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL); if (!dev_cgroup) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&dev_cgroup->exceptions); dev_cgroup->behavior = DEVCG_DEFAULT_NONE; return &dev_cgroup->css; } static void devcgroup_css_free(struct cgroup_subsys_state *css) { struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); __dev_exception_clean(dev_cgroup); kfree(dev_cgroup); } #define DEVCG_ALLOW 1 #define DEVCG_DENY 2 #define DEVCG_LIST 3 #define MAJMINLEN 13 #define ACCLEN 4 static void set_access(char *acc, short access) { int idx = 0; memset(acc, 0, ACCLEN); if (access & DEVCG_ACC_READ) acc[idx++] = 'r'; if (access & DEVCG_ACC_WRITE) acc[idx++] = 'w'; if (access & DEVCG_ACC_MKNOD) acc[idx++] = 'm'; } static char type_to_char(short type) { if (type == DEVCG_DEV_ALL) return 'a'; if (type == DEVCG_DEV_CHAR) return 'c'; if (type == DEVCG_DEV_BLOCK) return 'b'; return 'X'; } static void set_majmin(char *str, unsigned m) { if (m == ~0) strcpy(str, "*"); else sprintf(str, "%u", m); } static int devcgroup_seq_show(struct seq_file *m, void *v) { struct dev_cgroup *devcgroup = css_to_devcgroup(seq_css(m)); struct dev_exception_item *ex; char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN]; rcu_read_lock(); /* * To preserve the compatibility: * - Only show the "all devices" when the default policy is to allow * - List the exceptions in case the default policy is to deny * This way, the file remains as a "whitelist of devices" */ if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { set_access(acc, DEVCG_ACC_MASK); set_majmin(maj, ~0); set_majmin(min, ~0); seq_printf(m, "%c %s:%s %s\n", type_to_char(DEVCG_DEV_ALL), maj, min, acc); } else { list_for_each_entry_rcu(ex, &devcgroup->exceptions, list) { set_access(acc, ex->access); set_majmin(maj, ex->major); set_majmin(min, ex->minor); seq_printf(m, "%c %s:%s %s\n", type_to_char(ex->type), maj, min, acc); } } rcu_read_unlock(); return 0; } /** * match_exception - iterates the exception list trying to find a complete match * @exceptions: list of exceptions * @type: device type (DEVCG_DEV_BLOCK or DEVCG_DEV_CHAR) * @major: device file major number, ~0 to match all * @minor: device file minor number, ~0 to match all * @access: permission mask (DEVCG_ACC_READ, DEVCG_ACC_WRITE, DEVCG_ACC_MKNOD) * * It is considered a complete match if an exception is found that will * contain the entire range of provided parameters. * * Return: true in case it matches an exception completely */ static bool match_exception(struct list_head *exceptions, short type, u32 major, u32 minor, short access) { struct dev_exception_item *ex; list_for_each_entry_rcu(ex, exceptions, list) { if ((type & DEVCG_DEV_BLOCK) && !(ex->type & DEVCG_DEV_BLOCK)) continue; if ((type & DEVCG_DEV_CHAR) && !(ex->type & DEVCG_DEV_CHAR)) continue; if (ex->major != ~0 && ex->major != major) continue; if (ex->minor != ~0 && ex->minor != minor) continue; /* provided access cannot have more than the exception rule */ if (access & (~ex->access)) continue; return true; } return false; } /** * match_exception_partial - iterates the exception list trying to find a partial match * @exceptions: list of exceptions * @type: device type (DEVCG_DEV_BLOCK or DEVCG_DEV_CHAR) * @major: device file major number, ~0 to match all * @minor: device file minor number, ~0 to match all * @access: permission mask (DEVCG_ACC_READ, DEVCG_ACC_WRITE, DEVCG_ACC_MKNOD) * * It is considered a partial match if an exception's range is found to * contain *any* of the devices specified by provided parameters. This is * used to make sure no extra access is being granted that is forbidden by * any of the exception list. * * Return: true in case the provided range mat matches an exception completely */ static bool match_exception_partial(struct list_head *exceptions, short type, u32 major, u32 minor, short access) { struct dev_exception_item *ex; list_for_each_entry_rcu(ex, exceptions, list, lockdep_is_held(&devcgroup_mutex)) { if ((type & DEVCG_DEV_BLOCK) && !(ex->type & DEVCG_DEV_BLOCK)) continue; if ((type & DEVCG_DEV_CHAR) && !(ex->type & DEVCG_DEV_CHAR)) continue; /* * We must be sure that both the exception and the provided * range aren't masking all devices */ if (ex->major != ~0 && major != ~0 && ex->major != major) continue; if (ex->minor != ~0 && minor != ~0 && ex->minor != minor) continue; /* * In order to make sure the provided range isn't matching * an exception, all its access bits shouldn't match the * exception's access bits */ if (!(access & ex->access)) continue; return true; } return false; } /** * verify_new_ex - verifies if a new exception is allowed by parent cgroup's permissions * @dev_cgroup: dev cgroup to be tested against * @refex: new exception * @behavior: behavior of the exception's dev_cgroup * * This is used to make sure a child cgroup won't have more privileges * than its parent */ static bool verify_new_ex(struct dev_cgroup *dev_cgroup, struct dev_exception_item *refex, enum devcg_behavior behavior) { bool match = false; RCU_LOCKDEP_WARN(!rcu_read_lock_held() && !lockdep_is_held(&devcgroup_mutex), "device_cgroup:verify_new_ex called without proper synchronization"); if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) { if (behavior == DEVCG_DEFAULT_ALLOW) { /* * new exception in the child doesn't matter, only * adding extra restrictions */ return true; } else { /* * new exception in the child will add more devices * that can be accessed, so it can't match any of * parent's exceptions, even slightly */ match = match_exception_partial(&dev_cgroup->exceptions, refex->type, refex->major, refex->minor, refex->access); if (match) return false; return true; } } else { /* * Only behavior == DEVCG_DEFAULT_DENY allowed here, therefore * the new exception will add access to more devices and must * be contained completely in an parent's exception to be * allowed */ match = match_exception(&dev_cgroup->exceptions, refex->type, refex->major, refex->minor, refex->access); if (match) /* parent has an exception that matches the proposed */ return true; else return false; } return false; } /* * parent_has_perm: * when adding a new allow rule to a device exception list, the rule * must be allowed in the parent device */ static int parent_has_perm(struct dev_cgroup *childcg, struct dev_exception_item *ex) { struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent); if (!parent) return 1; return verify_new_ex(parent, ex, childcg->behavior); } /** * parent_allows_removal - verify if it's ok to remove an exception * @childcg: child cgroup from where the exception will be removed * @ex: exception being removed * * When removing an exception in cgroups with default ALLOW policy, it must * be checked if removing it will give the child cgroup more access than the * parent. * * Return: true if it's ok to remove exception, false otherwise */ static bool parent_allows_removal(struct dev_cgroup *childcg, struct dev_exception_item *ex) { struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent); if (!parent) return true; /* It's always allowed to remove access to devices */ if (childcg->behavior == DEVCG_DEFAULT_DENY) return true; /* * Make sure you're not removing part or a whole exception existing in * the parent cgroup */ return !match_exception_partial(&parent->exceptions, ex->type, ex->major, ex->minor, ex->access); } /** * may_allow_all - checks if it's possible to change the behavior to * allow based on parent's rules. * @parent: device cgroup's parent * returns: != 0 in case it's allowed, 0 otherwise */ static inline int may_allow_all(struct dev_cgroup *parent) { if (!parent) return 1; return parent->behavior == DEVCG_DEFAULT_ALLOW; } /** * revalidate_active_exceptions - walks through the active exception list and * revalidates the exceptions based on parent's * behavior and exceptions. The exceptions that * are no longer valid will be removed. * Called with devcgroup_mutex held. * @devcg: cgroup which exceptions will be checked * * This is one of the three key functions for hierarchy implementation. * This function is responsible for re-evaluating all the cgroup's active * exceptions due to a parent's exception change. * Refer to Documentation/admin-guide/cgroup-v1/devices.rst for more details. */ static void revalidate_active_exceptions(struct dev_cgroup *devcg) { struct dev_exception_item *ex; struct list_head *this, *tmp; list_for_each_safe(this, tmp, &devcg->exceptions) { ex = container_of(this, struct dev_exception_item, list); if (!parent_has_perm(devcg, ex)) dev_exception_rm(devcg, ex); } } /** * propagate_exception - propagates a new exception to the children * @devcg_root: device cgroup that added a new exception * @ex: new exception to be propagated * * returns: 0 in case of success, != 0 in case of error */ static int propagate_exception(struct dev_cgroup *devcg_root, struct dev_exception_item *ex) { struct cgroup_subsys_state *pos; int rc = 0; rcu_read_lock(); css_for_each_descendant_pre(pos, &devcg_root->css) { struct dev_cgroup *devcg = css_to_devcgroup(pos); /* * Because devcgroup_mutex is held, no devcg will become * online or offline during the tree walk (see on/offline * methods), and online ones are safe to access outside RCU * read lock without bumping refcnt. */ if (pos == &devcg_root->css || !is_devcg_online(devcg)) continue; rcu_read_unlock(); /* * in case both root's behavior and devcg is allow, a new * restriction means adding to the exception list */ if (devcg_root->behavior == DEVCG_DEFAULT_ALLOW && devcg->behavior == DEVCG_DEFAULT_ALLOW) { rc = dev_exception_add(devcg, ex); if (rc) return rc; } else { /* * in the other possible cases: * root's behavior: allow, devcg's: deny * root's behavior: deny, devcg's: deny * the exception will be removed */ dev_exception_rm(devcg, ex); } revalidate_active_exceptions(devcg); rcu_read_lock(); } rcu_read_unlock(); return rc; } /* * Modify the exception list using allow/deny rules. * CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD * so we can give a container CAP_MKNOD to let it create devices but not * modify the exception list. * It seems likely we'll want to add a CAP_CONTAINER capability to allow * us to also grant CAP_SYS_ADMIN to containers without giving away the * device exception list controls, but for now we'll stick with CAP_SYS_ADMIN * * Taking rules away is always allowed (given CAP_SYS_ADMIN). Granting * new access is only allowed if you're in the top-level cgroup, or your * parent cgroup has the access you're asking for. */ static int devcgroup_update_access(struct dev_cgroup *devcgroup, int filetype, char *buffer) { const char *b; char temp[12]; /* 11 + 1 characters needed for a u32 */ int count, rc = 0; struct dev_exception_item ex; struct dev_cgroup *parent = css_to_devcgroup(devcgroup->css.parent); struct dev_cgroup tmp_devcgrp; if (!capable(CAP_SYS_ADMIN)) return -EPERM; memset(&ex, 0, sizeof(ex)); memset(&tmp_devcgrp, 0, sizeof(tmp_devcgrp)); b = buffer; switch (*b) { case 'a': switch (filetype) { case DEVCG_ALLOW: if (css_has_online_children(&devcgroup->css)) return -EINVAL; if (!may_allow_all(parent)) return -EPERM; if (!parent) { devcgroup->behavior = DEVCG_DEFAULT_ALLOW; dev_exception_clean(devcgroup); break; } INIT_LIST_HEAD(&tmp_devcgrp.exceptions); rc = dev_exceptions_copy(&tmp_devcgrp.exceptions, &devcgroup->exceptions); if (rc) return rc; dev_exception_clean(devcgroup); rc = dev_exceptions_copy(&devcgroup->exceptions, &parent->exceptions); if (rc) { dev_exceptions_move(&devcgroup->exceptions, &tmp_devcgrp.exceptions); return rc; } devcgroup->behavior = DEVCG_DEFAULT_ALLOW; dev_exception_clean(&tmp_devcgrp); break; case DEVCG_DENY: if (css_has_online_children(&devcgroup->css)) return -EINVAL; dev_exception_clean(devcgroup); devcgroup->behavior = DEVCG_DEFAULT_DENY; break; default: return -EINVAL; } return 0; case 'b': ex.type = DEVCG_DEV_BLOCK; break; case 'c': ex.type = DEVCG_DEV_CHAR; break; default: return -EINVAL; } b++; if (!isspace(*b)) return -EINVAL; b++; if (*b == '*') { ex.major = ~0; b++; } else if (isdigit(*b)) { memset(temp, 0, sizeof(temp)); for (count = 0; count < sizeof(temp) - 1; count++) { temp[count] = *b; b++; if (!isdigit(*b)) break; } rc = kstrtou32(temp, 10, &ex.major); if (rc) return -EINVAL; } else { return -EINVAL; } if (*b != ':') return -EINVAL; b++; /* read minor */ if (*b == '*') { ex.minor = ~0; b++; } else if (isdigit(*b)) { memset(temp, 0, sizeof(temp)); for (count = 0; count < sizeof(temp) - 1; count++) { temp[count] = *b; b++; if (!isdigit(*b)) break; } rc = kstrtou32(temp, 10, &ex.minor); if (rc) return -EINVAL; } else { return -EINVAL; } if (!isspace(*b)) return -EINVAL; for (b++, count = 0; count < 3; count++, b++) { switch (*b) { case 'r': ex.access |= DEVCG_ACC_READ; break; case 'w': ex.access |= DEVCG_ACC_WRITE; break; case 'm': ex.access |= DEVCG_ACC_MKNOD; break; case '\n': case '\0': count = 3; break; default: return -EINVAL; } } switch (filetype) { case DEVCG_ALLOW: /* * If the default policy is to allow by default, try to remove * an matching exception instead. And be silent about it: we * don't want to break compatibility */ if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { /* Check if the parent allows removing it first */ if (!parent_allows_removal(devcgroup, &ex)) return -EPERM; dev_exception_rm(devcgroup, &ex); break; } if (!parent_has_perm(devcgroup, &ex)) return -EPERM; rc = dev_exception_add(devcgroup, &ex); break; case DEVCG_DENY: /* * If the default policy is to deny by default, try to remove * an matching exception instead. And be silent about it: we * don't want to break compatibility */ if (devcgroup->behavior == DEVCG_DEFAULT_DENY) dev_exception_rm(devcgroup, &ex); else rc = dev_exception_add(devcgroup, &ex); if (rc) break; /* we only propagate new restrictions */ rc = propagate_exception(devcgroup, &ex); break; default: rc = -EINVAL; } return rc; } static ssize_t devcgroup_access_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { int retval; mutex_lock(&devcgroup_mutex); retval = devcgroup_update_access(css_to_devcgroup(of_css(of)), of_cft(of)->private, strstrip(buf)); mutex_unlock(&devcgroup_mutex); return retval ?: nbytes; } static struct cftype dev_cgroup_files[] = { { .name = "allow", .write = devcgroup_access_write, .private = DEVCG_ALLOW, }, { .name = "deny", .write = devcgroup_access_write, .private = DEVCG_DENY, }, { .name = "list", .seq_show = devcgroup_seq_show, .private = DEVCG_LIST, }, { } /* terminate */ }; struct cgroup_subsys devices_cgrp_subsys = { .css_alloc = devcgroup_css_alloc, .css_free = devcgroup_css_free, .css_online = devcgroup_online, .css_offline = devcgroup_offline, .legacy_cftypes = dev_cgroup_files, }; /** * devcgroup_legacy_check_permission - checks if an inode operation is permitted * @type: device type * @major: device major number * @minor: device minor number * @access: combination of DEVCG_ACC_WRITE, DEVCG_ACC_READ and DEVCG_ACC_MKNOD * * returns 0 on success, -EPERM case the operation is not permitted */ static int devcgroup_legacy_check_permission(short type, u32 major, u32 minor, short access) { struct dev_cgroup *dev_cgroup; bool rc; rcu_read_lock(); dev_cgroup = task_devcgroup(current); if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) /* Can't match any of the exceptions, even partially */ rc = !match_exception_partial(&dev_cgroup->exceptions, type, major, minor, access); else /* Need to match completely one exception to be allowed */ rc = match_exception(&dev_cgroup->exceptions, type, major, minor, access); rcu_read_unlock(); if (!rc) return -EPERM; return 0; } #endif /* CONFIG_CGROUP_DEVICE */ #if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) int devcgroup_check_permission(short type, u32 major, u32 minor, short access) { int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access); if (rc) return rc; #ifdef CONFIG_CGROUP_DEVICE return devcgroup_legacy_check_permission(type, major, minor, access); #else /* CONFIG_CGROUP_DEVICE */ return 0; #endif /* CONFIG_CGROUP_DEVICE */ } EXPORT_SYMBOL(devcgroup_check_permission); #endif /* defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) */ |
| 4 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2012 Xyratex Technology Limited */ /* * This is crypto api shash wrappers to crc32_le. */ #include <asm/unaligned.h> #include <linux/crc32.h> #include <crypto/internal/hash.h> #include <linux/init.h> #include <linux/module.h> #include <linux/string.h> #include <linux/kernel.h> #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 /** No default init with ~0 */ static int crc32_cra_init(struct crypto_tfm *tfm) { u32 *key = crypto_tfm_ctx(tfm); *key = 0; return 0; } /* * Setting the seed allows arbitrary accumulators and flexible XOR policy * If your algorithm starts with ~0, then XOR with ~0 before you set * the seed. */ static int crc32_setkey(struct crypto_shash *hash, const u8 *key, unsigned int keylen) { u32 *mctx = crypto_shash_ctx(hash); if (keylen != sizeof(u32)) return -EINVAL; *mctx = get_unaligned_le32(key); return 0; } static int crc32_init(struct shash_desc *desc) { u32 *mctx = crypto_shash_ctx(desc->tfm); u32 *crcp = shash_desc_ctx(desc); *crcp = *mctx; return 0; } static int crc32_update(struct shash_desc *desc, const u8 *data, unsigned int len) { u32 *crcp = shash_desc_ctx(desc); *crcp = crc32_le(*crcp, data, len); return 0; } /* No final XOR 0xFFFFFFFF, like crc32_le */ static int __crc32_finup(u32 *crcp, const u8 *data, unsigned int len, u8 *out) { put_unaligned_le32(crc32_le(*crcp, data, len), out); return 0; } static int crc32_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return __crc32_finup(shash_desc_ctx(desc), data, len, out); } static int crc32_final(struct shash_desc *desc, u8 *out) { u32 *crcp = shash_desc_ctx(desc); put_unaligned_le32(*crcp, out); return 0; } static int crc32_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return __crc32_finup(crypto_shash_ctx(desc->tfm), data, len, out); } static struct shash_alg alg = { .setkey = crc32_setkey, .init = crc32_init, .update = crc32_update, .final = crc32_final, .finup = crc32_finup, .digest = crc32_digest, .descsize = sizeof(u32), .digestsize = CHKSUM_DIGEST_SIZE, .base = { .cra_name = "crc32", .cra_driver_name = "crc32-generic", .cra_priority = 100, .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, .cra_blocksize = CHKSUM_BLOCK_SIZE, .cra_ctxsize = sizeof(u32), .cra_module = THIS_MODULE, .cra_init = crc32_cra_init, } }; static int __init crc32_mod_init(void) { return crypto_register_shash(&alg); } static void __exit crc32_mod_fini(void) { crypto_unregister_shash(&alg); } subsys_initcall(crc32_mod_init); module_exit(crc32_mod_fini); MODULE_AUTHOR("Alexander Boyko <alexander_boyko@xyratex.com>"); MODULE_DESCRIPTION("CRC32 calculations wrapper for lib/crc32"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CRYPTO("crc32"); MODULE_ALIAS_CRYPTO("crc32-generic"); |
| 9 9 52 9 7 116 119 117 46 9 46 46 46 9 45 38 43 7 42 36 46 46 46 44 3 3 3 3 3 3 3 3 48 49 46 48 88 68 68 20 87 86 88 87 87 87 43 88 88 1 1 7 6 6 7 7 4 7 2 2 2 1 1 2 2 2 2 2 8 8 8 8 8 8 4 2 3 2 7 7 7 4 3 7 7 6 1 7 7 4 7 4 8 8 1 1 1 1 2 1 2 67 67 67 67 67 6 6 6 6 6 6 6 6 2 4 6 6 41 41 41 41 41 41 41 228 228 226 228 229 227 227 8 229 227 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 | // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/file.c - kernfs file implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/pagemap.h> #include <linux/sched/mm.h> #include <linux/fsnotify.h> #include <linux/uio.h> #include "kernfs-internal.h" struct kernfs_open_node { struct rcu_head rcu_head; atomic_t event; wait_queue_head_t poll; struct list_head files; /* goes through kernfs_open_file.list */ unsigned int nr_mmapped; unsigned int nr_to_release; }; /* * kernfs_notify() may be called from any context and bounces notifications * through a work item. To minimize space overhead in kernfs_node, the * pending queue is implemented as a singly linked list of kernfs_nodes. * The list is terminated with the self pointer so that whether a * kernfs_node is on the list or not can be determined by testing the next * pointer for %NULL. */ #define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list) static DEFINE_SPINLOCK(kernfs_notify_lock); static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL; static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node *kn) { int idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS); return &kernfs_locks->open_file_mutex[idx]; } static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn) { struct mutex *lock; lock = kernfs_open_file_mutex_ptr(kn); mutex_lock(lock); return lock; } /** * of_on - Get the kernfs_open_node of the specified kernfs_open_file * @of: target kernfs_open_file * * Return: the kernfs_open_node of the kernfs_open_file */ static struct kernfs_open_node *of_on(struct kernfs_open_file *of) { return rcu_dereference_protected(of->kn->attr.open, !list_empty(&of->list)); } /** * kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn * * @kn: target kernfs_node. * * Fetch and return ->attr.open of @kn when caller holds the * kernfs_open_file_mutex_ptr(kn). * * Update of ->attr.open happens under kernfs_open_file_mutex_ptr(kn). So when * the caller guarantees that this mutex is being held, other updaters can't * change ->attr.open and this means that we can safely deref ->attr.open * outside RCU read-side critical section. * * The caller needs to make sure that kernfs_open_file_mutex is held. * * Return: @kn->attr.open when kernfs_open_file_mutex is held. */ static struct kernfs_open_node * kernfs_deref_open_node_locked(struct kernfs_node *kn) { return rcu_dereference_protected(kn->attr.open, lockdep_is_held(kernfs_open_file_mutex_ptr(kn))); } static struct kernfs_open_file *kernfs_of(struct file *file) { return ((struct seq_file *)file->private_data)->private; } /* * Determine the kernfs_ops for the given kernfs_node. This function must * be called while holding an active reference. */ static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn) { if (kn->flags & KERNFS_LOCKDEP) lockdep_assert_held(kn); return kn->attr.ops; } /* * As kernfs_seq_stop() is also called after kernfs_seq_start() or * kernfs_seq_next() failure, it needs to distinguish whether it's stopping * a seq_file iteration which is fully initialized with an active reference * or an aborted kernfs_seq_start() due to get_active failure. The * position pointer is the only context for each seq_file iteration and * thus the stop condition should be encoded in it. As the return value is * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable * choice to indicate get_active failure. * * Unfortunately, this is complicated due to the optional custom seq_file * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop() * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or * custom seq_file operations and thus can't decide whether put_active * should be performed or not only on ERR_PTR(-ENODEV). * * This is worked around by factoring out the custom seq_stop() and * put_active part into kernfs_seq_stop_active(), skipping it from * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures * that kernfs_seq_stop_active() is skipped only after get_active failure. */ static void kernfs_seq_stop_active(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_stop) ops->seq_stop(sf, v); kernfs_put_active(of->kn); } static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops; /* * @of->mutex nests outside active ref and is primarily to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) return ERR_PTR(-ENODEV); ops = kernfs_ops(of->kn); if (ops->seq_start) { void *next = ops->seq_start(sf, ppos); /* see the comment above kernfs_seq_stop_active() */ if (next == ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, next); return next; } return single_start(sf, ppos); } static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_next) { void *next = ops->seq_next(sf, v, ppos); /* see the comment above kernfs_seq_stop_active() */ if (next == ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, next); return next; } else { /* * The same behavior and code as single_open(), always * terminate after the initial read. */ ++*ppos; return NULL; } } static void kernfs_seq_stop(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; if (v != ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, v); mutex_unlock(&of->mutex); } static int kernfs_seq_show(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; of->event = atomic_read(&of_on(of)->event); return of->kn->attr.ops->seq_show(sf, v); } static const struct seq_operations kernfs_seq_ops = { .start = kernfs_seq_start, .next = kernfs_seq_next, .stop = kernfs_seq_stop, .show = kernfs_seq_show, }; /* * As reading a bin file can have side-effects, the exact offset and bytes * specified in read(2) call should be passed to the read callback making * it difficult to use seq_file. Implement simplistic custom buffering for * bin files. */ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE); const struct kernfs_ops *ops; char *buf; buf = of->prealloc_buf; if (buf) mutex_lock(&of->prealloc_mutex); else buf = kmalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; /* * @of->mutex nests outside active ref and is used both to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { len = -ENODEV; mutex_unlock(&of->mutex); goto out_free; } of->event = atomic_read(&of_on(of)->event); ops = kernfs_ops(of->kn); if (ops->read) len = ops->read(of, buf, len, iocb->ki_pos); else len = -EINVAL; kernfs_put_active(of->kn); mutex_unlock(&of->mutex); if (len < 0) goto out_free; if (copy_to_iter(buf, len, iter) != len) { len = -EFAULT; goto out_free; } iocb->ki_pos += len; out_free: if (buf == of->prealloc_buf) mutex_unlock(&of->prealloc_mutex); else kfree(buf); return len; } static ssize_t kernfs_fop_read_iter(struct kiocb *iocb, struct iov_iter *iter) { if (kernfs_of(iocb->ki_filp)->kn->flags & KERNFS_HAS_SEQ_SHOW) return seq_read_iter(iocb, iter); return kernfs_file_read_iter(iocb, iter); } /* * Copy data in from userland and pass it to the matching kernfs write * operation. * * There is no easy way for us to know if userspace is only doing a partial * write, so we don't support them. We expect the entire buffer to come on * the first write. Hint: if you're writing a value, first read the file, * modify only the value you're changing, then write entire buffer * back. */ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter) { struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); ssize_t len = iov_iter_count(iter); const struct kernfs_ops *ops; char *buf; if (of->atomic_write_len) { if (len > of->atomic_write_len) return -E2BIG; } else { len = min_t(size_t, len, PAGE_SIZE); } buf = of->prealloc_buf; if (buf) mutex_lock(&of->prealloc_mutex); else buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; if (copy_from_iter(buf, len, iter) != len) { len = -EFAULT; goto out_free; } buf[len] = '\0'; /* guarantee string termination */ /* * @of->mutex nests outside active ref and is used both to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { mutex_unlock(&of->mutex); len = -ENODEV; goto out_free; } ops = kernfs_ops(of->kn); if (ops->write) len = ops->write(of, buf, len, iocb->ki_pos); else len = -EINVAL; kernfs_put_active(of->kn); mutex_unlock(&of->mutex); if (len > 0) iocb->ki_pos += len; out_free: if (buf == of->prealloc_buf) mutex_unlock(&of->prealloc_mutex); else kfree(buf); return len; } static void kernfs_vma_open(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); if (!of->vm_ops) return; if (!kernfs_get_active(of->kn)) return; if (of->vm_ops->open) of->vm_ops->open(vma); kernfs_put_active(of->kn); } static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); vm_fault_t ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; if (!kernfs_get_active(of->kn)) return VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS; if (of->vm_ops->fault) ret = of->vm_ops->fault(vmf); kernfs_put_active(of->kn); return ret; } static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); vm_fault_t ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; if (!kernfs_get_active(of->kn)) return VM_FAULT_SIGBUS; ret = 0; if (of->vm_ops->page_mkwrite) ret = of->vm_ops->page_mkwrite(vmf); else file_update_time(file); kernfs_put_active(of->kn); return ret; } static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); int ret; if (!of->vm_ops) return -EINVAL; if (!kernfs_get_active(of->kn)) return -EINVAL; ret = -EINVAL; if (of->vm_ops->access) ret = of->vm_ops->access(vma, addr, buf, len, write); kernfs_put_active(of->kn); return ret; } static const struct vm_operations_struct kernfs_vm_ops = { .open = kernfs_vma_open, .fault = kernfs_vma_fault, .page_mkwrite = kernfs_vma_page_mkwrite, .access = kernfs_vma_access, }; static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) { struct kernfs_open_file *of = kernfs_of(file); const struct kernfs_ops *ops; int rc; /* * mmap path and of->mutex are prone to triggering spurious lockdep * warnings and we don't want to add spurious locking dependency * between the two. Check whether mmap is actually implemented * without grabbing @of->mutex by testing HAS_MMAP flag. See the * comment in kernfs_fop_open() for more details. */ if (!(of->kn->flags & KERNFS_HAS_MMAP)) return -ENODEV; mutex_lock(&of->mutex); rc = -ENODEV; if (!kernfs_get_active(of->kn)) goto out_unlock; ops = kernfs_ops(of->kn); rc = ops->mmap(of, vma); if (rc) goto out_put; /* * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() * to satisfy versions of X which crash if the mmap fails: that * substitutes a new vm_file, and we don't then want bin_vm_ops. */ if (vma->vm_file != file) goto out_put; rc = -EINVAL; if (of->mmapped && of->vm_ops != vma->vm_ops) goto out_put; /* * It is not possible to successfully wrap close. * So error if someone is trying to use close. */ if (vma->vm_ops && vma->vm_ops->close) goto out_put; rc = 0; if (!of->mmapped) { of->mmapped = true; of_on(of)->nr_mmapped++; of->vm_ops = vma->vm_ops; } vma->vm_ops = &kernfs_vm_ops; out_put: kernfs_put_active(of->kn); out_unlock: mutex_unlock(&of->mutex); return rc; } /** * kernfs_get_open_node - get or create kernfs_open_node * @kn: target kernfs_node * @of: kernfs_open_file for this instance of open * * If @kn->attr.open exists, increment its reference count; otherwise, * create one. @of is chained to the files list. * * Locking: * Kernel thread context (may sleep). * * Return: * %0 on success, -errno on failure. */ static int kernfs_get_open_node(struct kernfs_node *kn, struct kernfs_open_file *of) { struct kernfs_open_node *on; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { /* not there, initialize a new one */ on = kzalloc(sizeof(*on), GFP_KERNEL); if (!on) { mutex_unlock(mutex); return -ENOMEM; } atomic_set(&on->event, 1); init_waitqueue_head(&on->poll); INIT_LIST_HEAD(&on->files); rcu_assign_pointer(kn->attr.open, on); } list_add_tail(&of->list, &on->files); if (kn->flags & KERNFS_HAS_RELEASE) on->nr_to_release++; mutex_unlock(mutex); return 0; } /** * kernfs_unlink_open_file - Unlink @of from @kn. * * @kn: target kernfs_node * @of: associated kernfs_open_file * @open_failed: ->open() failed, cancel ->release() * * Unlink @of from list of @kn's associated open files. If list of * associated open files becomes empty, disassociate and free * kernfs_open_node. * * LOCKING: * None. */ static void kernfs_unlink_open_file(struct kernfs_node *kn, struct kernfs_open_file *of, bool open_failed) { struct kernfs_open_node *on; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { mutex_unlock(mutex); return; } if (of) { if (kn->flags & KERNFS_HAS_RELEASE) { WARN_ON_ONCE(of->released == open_failed); if (open_failed) on->nr_to_release--; } if (of->mmapped) on->nr_mmapped--; list_del(&of->list); } if (list_empty(&on->files)) { rcu_assign_pointer(kn->attr.open, NULL); kfree_rcu(on, rcu_head); } mutex_unlock(mutex); } static int kernfs_fop_open(struct inode *inode, struct file *file) { struct kernfs_node *kn = inode->i_private; struct kernfs_root *root = kernfs_root(kn); const struct kernfs_ops *ops; struct kernfs_open_file *of; bool has_read, has_write, has_mmap; int error = -EACCES; if (!kernfs_get_active(kn)) return -ENODEV; ops = kernfs_ops(kn); has_read = ops->seq_show || ops->read || ops->mmap; has_write = ops->write || ops->mmap; has_mmap = ops->mmap; /* see the flag definition for details */ if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) { if ((file->f_mode & FMODE_WRITE) && (!(inode->i_mode & S_IWUGO) || !has_write)) goto err_out; if ((file->f_mode & FMODE_READ) && (!(inode->i_mode & S_IRUGO) || !has_read)) goto err_out; } /* allocate a kernfs_open_file for the file */ error = -ENOMEM; of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL); if (!of) goto err_out; /* * The following is done to give a different lockdep key to * @of->mutex for files which implement mmap. This is a rather * crude way to avoid false positive lockdep warning around * mm->mmap_lock - mmap nests @of->mutex under mm->mmap_lock and * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under * which mm->mmap_lock nests, while holding @of->mutex. As each * open file has a separate mutex, it's okay as long as those don't * happen on the same file. At this point, we can't easily give * each file a separate locking class. Let's differentiate on * whether the file has mmap or not for now. * * For similar reasons, writable and readonly files are given different * lockdep key, because the writable file /sys/power/resume may call vfs * lookup helpers for arbitrary paths and readonly files can be read by * overlayfs from vfs helpers when sysfs is a lower layer of overalyfs. * * All three cases look the same. They're supposed to * look that way and give @of->mutex different static lockdep keys. */ if (has_mmap) mutex_init(&of->mutex); else if (file->f_mode & FMODE_WRITE) mutex_init(&of->mutex); else mutex_init(&of->mutex); of->kn = kn; of->file = file; /* * Write path needs to atomic_write_len outside active reference. * Cache it in open_file. See kernfs_fop_write_iter() for details. */ of->atomic_write_len = ops->atomic_write_len; error = -EINVAL; /* * ->seq_show is incompatible with ->prealloc, * as seq_read does its own allocation. * ->read must be used instead. */ if (ops->prealloc && ops->seq_show) goto err_free; if (ops->prealloc) { int len = of->atomic_write_len ?: PAGE_SIZE; of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL); error = -ENOMEM; if (!of->prealloc_buf) goto err_free; mutex_init(&of->prealloc_mutex); } /* * Always instantiate seq_file even if read access doesn't use * seq_file or is not requested. This unifies private data access * and readable regular files are the vast majority anyway. */ if (ops->seq_show) error = seq_open(file, &kernfs_seq_ops); else error = seq_open(file, NULL); if (error) goto err_free; of->seq_file = file->private_data; of->seq_file->private = of; /* seq_file clears PWRITE unconditionally, restore it if WRITE */ if (file->f_mode & FMODE_WRITE) file->f_mode |= FMODE_PWRITE; /* make sure we have open node struct */ error = kernfs_get_open_node(kn, of); if (error) goto err_seq_release; if (ops->open) { /* nobody has access to @of yet, skip @of->mutex */ error = ops->open(of); if (error) goto err_put_node; } /* open succeeded, put active references */ kernfs_put_active(kn); return 0; err_put_node: kernfs_unlink_open_file(kn, of, true); err_seq_release: seq_release(inode, file); err_free: kfree(of->prealloc_buf); kfree(of); err_out: kernfs_put_active(kn); return error; } /* used from release/drain to ensure that ->release() is called exactly once */ static void kernfs_release_file(struct kernfs_node *kn, struct kernfs_open_file *of) { /* * @of is guaranteed to have no other file operations in flight and * we just want to synchronize release and drain paths. * @kernfs_open_file_mutex_ptr(kn) is enough. @of->mutex can't be used * here because drain path may be called from places which can * cause circular dependency. */ lockdep_assert_held(kernfs_open_file_mutex_ptr(kn)); if (!of->released) { /* * A file is never detached without being released and we * need to be able to release files which are deactivated * and being drained. Don't use kernfs_ops(). */ kn->attr.ops->release(of); of->released = true; of_on(of)->nr_to_release--; } } static int kernfs_fop_release(struct inode *inode, struct file *filp) { struct kernfs_node *kn = inode->i_private; struct kernfs_open_file *of = kernfs_of(filp); if (kn->flags & KERNFS_HAS_RELEASE) { struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); kernfs_release_file(kn, of); mutex_unlock(mutex); } kernfs_unlink_open_file(kn, of, false); seq_release(inode, filp); kfree(of->prealloc_buf); kfree(of); return 0; } bool kernfs_should_drain_open_files(struct kernfs_node *kn) { struct kernfs_open_node *on; bool ret; /* * @kn being deactivated guarantees that @kn->attr.open can't change * beneath us making the lockless test below safe. */ WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); rcu_read_lock(); on = rcu_dereference(kn->attr.open); ret = on && (on->nr_mmapped || on->nr_to_release); rcu_read_unlock(); return ret; } void kernfs_drain_open_files(struct kernfs_node *kn) { struct kernfs_open_node *on; struct kernfs_open_file *of; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { mutex_unlock(mutex); return; } list_for_each_entry(of, &on->files, list) { struct inode *inode = file_inode(of->file); if (of->mmapped) { unmap_mapping_range(inode->i_mapping, 0, 0, 1); of->mmapped = false; on->nr_mmapped--; } if (kn->flags & KERNFS_HAS_RELEASE) kernfs_release_file(kn, of); } WARN_ON_ONCE(on->nr_mmapped || on->nr_to_release); mutex_unlock(mutex); } /* * Kernfs attribute files are pollable. The idea is that you read * the content and then you use 'poll' or 'select' to wait for * the content to change. When the content changes (assuming the * manager for the kobject supports notification), poll will * return EPOLLERR|EPOLLPRI, and select will return the fd whether * it is waiting for read, write, or exceptions. * Once poll/select indicates that the value has changed, you * need to close and re-open the file, or seek to 0 and read again. * Reminder: this only works for attributes which actively support * it, and it is not possible to test an attribute from userspace * to see if it supports poll (Neither 'poll' nor 'select' return * an appropriate error code). When in doubt, set a suitable timeout value. */ __poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait) { struct kernfs_open_node *on = of_on(of); poll_wait(of->file, &on->poll, wait); if (of->event != atomic_read(&on->event)) return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; return DEFAULT_POLLMASK; } static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) { struct kernfs_open_file *of = kernfs_of(filp); struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); __poll_t ret; if (!kernfs_get_active(kn)) return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; if (kn->attr.ops->poll) ret = kn->attr.ops->poll(of, wait); else ret = kernfs_generic_poll(of, wait); kernfs_put_active(kn); return ret; } static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence) { struct kernfs_open_file *of = kernfs_of(file); const struct kernfs_ops *ops; loff_t ret; /* * @of->mutex nests outside active ref and is primarily to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { mutex_unlock(&of->mutex); return -ENODEV; } ops = kernfs_ops(of->kn); if (ops->llseek) ret = ops->llseek(of, offset, whence); else ret = generic_file_llseek(file, offset, whence); kernfs_put_active(of->kn); mutex_unlock(&of->mutex); return ret; } static void kernfs_notify_workfn(struct work_struct *work) { struct kernfs_node *kn; struct kernfs_super_info *info; struct kernfs_root *root; repeat: /* pop one off the notify_list */ spin_lock_irq(&kernfs_notify_lock); kn = kernfs_notify_list; if (kn == KERNFS_NOTIFY_EOL) { spin_unlock_irq(&kernfs_notify_lock); return; } kernfs_notify_list = kn->attr.notify_next; kn->attr.notify_next = NULL; spin_unlock_irq(&kernfs_notify_lock); root = kernfs_root(kn); /* kick fsnotify */ down_read(&root->kernfs_supers_rwsem); list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct kernfs_node *parent; struct inode *p_inode = NULL; struct inode *inode; struct qstr name; /* * We want fsnotify_modify() on @kn but as the * modifications aren't originating from userland don't * have the matching @file available. Look up the inodes * and generate the events manually. */ inode = ilookup(info->sb, kernfs_ino(kn)); if (!inode) continue; name = (struct qstr)QSTR_INIT(kn->name, strlen(kn->name)); parent = kernfs_get_parent(kn); if (parent) { p_inode = ilookup(info->sb, kernfs_ino(parent)); if (p_inode) { fsnotify(FS_MODIFY | FS_EVENT_ON_CHILD, inode, FSNOTIFY_EVENT_INODE, p_inode, &name, inode, 0); iput(p_inode); } kernfs_put(parent); } if (!p_inode) fsnotify_inode(inode, FS_MODIFY); iput(inode); } up_read(&root->kernfs_supers_rwsem); kernfs_put(kn); goto repeat; } /** * kernfs_notify - notify a kernfs file * @kn: file to notify * * Notify @kn such that poll(2) on @kn wakes up. Maybe be called from any * context. */ void kernfs_notify(struct kernfs_node *kn) { static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn); unsigned long flags; struct kernfs_open_node *on; if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) return; /* kick poll immediately */ rcu_read_lock(); on = rcu_dereference(kn->attr.open); if (on) { atomic_inc(&on->event); wake_up_interruptible(&on->poll); } rcu_read_unlock(); /* schedule work to kick fsnotify */ spin_lock_irqsave(&kernfs_notify_lock, flags); if (!kn->attr.notify_next) { kernfs_get(kn); kn->attr.notify_next = kernfs_notify_list; kernfs_notify_list = kn; schedule_work(&kernfs_notify_work); } spin_unlock_irqrestore(&kernfs_notify_lock, flags); } EXPORT_SYMBOL_GPL(kernfs_notify); const struct file_operations kernfs_file_fops = { .read_iter = kernfs_fop_read_iter, .write_iter = kernfs_fop_write_iter, .llseek = kernfs_fop_llseek, .mmap = kernfs_fop_mmap, .open = kernfs_fop_open, .release = kernfs_fop_release, .poll = kernfs_fop_poll, .fsync = noop_fsync, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, }; /** * __kernfs_create_file - kernfs internal function to create a file * @parent: directory to create the file in * @name: name of the file * @mode: mode of the file * @uid: uid of the file * @gid: gid of the file * @size: size of the file * @ops: kernfs operations for the file * @priv: private data for the file * @ns: optional namespace tag of the file * @key: lockdep key for the file's active_ref, %NULL to disable lockdep * * Return: the created node on success, ERR_PTR() value on error. */ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key) { struct kernfs_node *kn; unsigned flags; int rc; flags = KERNFS_FILE; kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, uid, gid, flags); if (!kn) return ERR_PTR(-ENOMEM); kn->attr.ops = ops; kn->attr.size = size; kn->ns = ns; kn->priv = priv; #ifdef CONFIG_DEBUG_LOCK_ALLOC if (key) { lockdep_init_map(&kn->dep_map, "kn->active", key, 0); kn->flags |= KERNFS_LOCKDEP; } #endif /* * kn->attr.ops is accessible only while holding active ref. We * need to know whether some ops are implemented outside active * ref. Cache their existence in flags. */ if (ops->seq_show) kn->flags |= KERNFS_HAS_SEQ_SHOW; if (ops->mmap) kn->flags |= KERNFS_HAS_MMAP; if (ops->release) kn->flags |= KERNFS_HAS_RELEASE; rc = kernfs_add_one(kn); if (rc) { kernfs_put(kn); return ERR_PTR(rc); } return kn; } |
| 3 3 3 3 3 3 3 3 2 1 1 9 9 9 9 8 2 1 2 3 3 2 1 3 3 3 1 1 1 1 1 1 1 1 1 1 3 3 3 3 2 3 3 1 1 1 1 1 1 1 1 3 1 3 1 1 1 3 3 3 1 3 3 3 2 1 1 3 3 3 3 3 3 3 2 2 4 2 4 4 4 4 3 2 4 3 3 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/balloc.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 */ #include <linux/time.h> #include <linux/capability.h> #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include "ext4.h" #include "ext4_jbd2.h" #include "mballoc.h" #include <trace/events/ext4.h> #include <kunit/static_stub.h> static unsigned ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group); /* * balloc.c contains the blocks allocation and deallocation routines */ /* * Calculate block group number for a given block number */ ext4_group_t ext4_get_group_number(struct super_block *sb, ext4_fsblk_t block) { ext4_group_t group; if (test_opt2(sb, STD_GROUP_SIZE)) group = (block - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >> (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3); else ext4_get_group_no_and_offset(sb, block, &group, NULL); return group; } /* * Calculate the block group number and offset into the block/cluster * allocation bitmap, given a block number */ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; ext4_grpblk_t offset; blocknr = blocknr - le32_to_cpu(es->s_first_data_block); offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >> EXT4_SB(sb)->s_cluster_bits; if (offsetp) *offsetp = offset; if (blockgrpp) *blockgrpp = blocknr; } /* * Check whether the 'block' lives within the 'block_group'. Returns 1 if so * and 0 otherwise. */ static inline int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, ext4_group_t block_group) { ext4_group_t actual_group; actual_group = ext4_get_group_number(sb, block); return (actual_group == block_group) ? 1 : 0; } /* * Return the number of clusters used for file system metadata; this * represents the overhead needed by the file system. */ static unsigned ext4_num_overhead_clusters(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { unsigned base_clusters, num_clusters; int block_cluster = -1, inode_cluster; int itbl_cluster_start = -1, itbl_cluster_end = -1; ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group); ext4_fsblk_t end = start + EXT4_BLOCKS_PER_GROUP(sb) - 1; ext4_fsblk_t itbl_blk_start, itbl_blk_end; struct ext4_sb_info *sbi = EXT4_SB(sb); /* This is the number of clusters used by the superblock, * block group descriptors, and reserved block group * descriptor blocks */ base_clusters = ext4_num_base_meta_clusters(sb, block_group); num_clusters = base_clusters; /* * Account and record inode table clusters if any cluster * is in the block group, or inode table cluster range is * [-1, -1] and won't overlap with block/inode bitmap cluster * accounted below. */ itbl_blk_start = ext4_inode_table(sb, gdp); itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1; if (itbl_blk_start <= end && itbl_blk_end >= start) { itbl_blk_start = max(itbl_blk_start, start); itbl_blk_end = min(itbl_blk_end, end); itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start); itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start); num_clusters += itbl_cluster_end - itbl_cluster_start + 1; /* check if border cluster is overlapped */ if (itbl_cluster_start == base_clusters - 1) num_clusters--; } /* * For the allocation bitmaps, we first need to check to see * if the block is in the block group. If it is, then check * to see if the cluster is already accounted for in the clusters * used for the base metadata cluster and inode tables cluster. * Normally all of these blocks are contiguous, so the special * case handling shouldn't be necessary except for *very* * unusual file system layouts. */ if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { block_cluster = EXT4_B2C(sbi, ext4_block_bitmap(sb, gdp) - start); if (block_cluster >= base_clusters && (block_cluster < itbl_cluster_start || block_cluster > itbl_cluster_end)) num_clusters++; } if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { inode_cluster = EXT4_B2C(sbi, ext4_inode_bitmap(sb, gdp) - start); /* * Additional check if inode bitmap is in just accounted * block_cluster */ if (inode_cluster != block_cluster && inode_cluster >= base_clusters && (inode_cluster < itbl_cluster_start || inode_cluster > itbl_cluster_end)) num_clusters++; } return num_clusters; } static unsigned int num_clusters_in_group(struct super_block *sb, ext4_group_t block_group) { unsigned int blocks; if (block_group == ext4_get_groups_count(sb) - 1) { /* * Even though mke2fs always initializes the first and * last group, just in case some other tool was used, * we need to make sure we calculate the right free * blocks. */ blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) - ext4_group_first_block_no(sb, block_group); } else blocks = EXT4_BLOCKS_PER_GROUP(sb); return EXT4_NUM_B2C(EXT4_SB(sb), blocks); } /* Initializes an uninitialized block bitmap */ static int ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t block_group, struct ext4_group_desc *gdp) { unsigned int bit, bit_max; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start, tmp; ASSERT(buffer_locked(bh)); if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT | EXT4_GROUP_INFO_IBITMAP_CORRUPT); return -EFSBADCRC; } memset(bh->b_data, 0, sb->s_blocksize); bit_max = ext4_num_base_meta_clusters(sb, block_group); if ((bit_max >> 3) >= bh->b_size) return -EFSCORRUPTED; for (bit = 0; bit < bit_max; bit++) ext4_set_bit(bit, bh->b_data); start = ext4_group_first_block_no(sb, block_group); /* Set bits for block and inode bitmaps, and inode table */ tmp = ext4_block_bitmap(sb, gdp); if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); tmp = ext4_inode_bitmap(sb, gdp); if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); tmp = ext4_inode_table(sb, gdp); for (; tmp < ext4_inode_table(sb, gdp) + sbi->s_itb_per_group; tmp++) { if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); } /* * Also if the number of blocks within the group is less than * the blocksize * 8 ( which is the size of bitmap ), set rest * of the block bitmap to 1 */ ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), sb->s_blocksize * 8, bh->b_data); return 0; } /* Return the number of free blocks in a block group. It is used when * the block bitmap is uninitialized, so we can't just count the bits * in the bitmap. */ unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { return num_clusters_in_group(sb, block_group) - ext4_num_overhead_clusters(sb, block_group, gdp); } /* * The free blocks are managed by bitmaps. A file system contains several * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap * block for inodes, N blocks for the inode table and data blocks. * * The file system contains group descriptors which are located after the * super block. Each descriptor contains the number of the bitmap block and * the free blocks count in the block. The descriptors are loaded in memory * when a file system is mounted (see ext4_fill_super). */ /** * ext4_get_group_desc() -- load group descriptor from disk * @sb: super block * @block_group: given block group * @bh: pointer to the buffer head to store the block * group descriptor */ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, ext4_group_t block_group, struct buffer_head **bh) { unsigned int group_desc; unsigned int offset; ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh_p; KUNIT_STATIC_STUB_REDIRECT(ext4_get_group_desc, sb, block_group, bh); if (block_group >= ngroups) { ext4_error(sb, "block_group >= groups_count - block_group = %u," " groups_count = %u", block_group, ngroups); return NULL; } group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); bh_p = sbi_array_rcu_deref(sbi, s_group_desc, group_desc); /* * sbi_array_rcu_deref returns with rcu unlocked, this is ok since * the pointer being dereferenced won't be dereferenced again. By * looking at the usage in add_new_gdb() the value isn't modified, * just the pointer, and so it remains valid. */ if (!bh_p) { ext4_error(sb, "Group descriptor not loaded - " "block_group = %u, group_desc = %u, desc = %u", block_group, group_desc, offset); return NULL; } desc = (struct ext4_group_desc *)( (__u8 *)bh_p->b_data + offset * EXT4_DESC_SIZE(sb)); if (bh) *bh = bh_p; return desc; } static ext4_fsblk_t ext4_valid_block_bitmap_padding(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh) { ext4_grpblk_t next_zero_bit; unsigned long bitmap_size = sb->s_blocksize * 8; unsigned int offset = num_clusters_in_group(sb, block_group); if (bitmap_size <= offset) return 0; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, bitmap_size, offset); return (next_zero_bit < bitmap_size ? next_zero_bit : 0); } struct ext4_group_info *ext4_get_group_info(struct super_block *sb, ext4_group_t group) { struct ext4_group_info **grp_info; long indexv, indexh; if (unlikely(group >= EXT4_SB(sb)->s_groups_count)) return NULL; indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv); return grp_info[indexh]; } /* * Return the block number which was discovered to be invalid, or 0 if * the block bitmap is valid. */ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, ext4_group_t block_group, struct buffer_head *bh) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t offset; ext4_grpblk_t next_zero_bit; ext4_grpblk_t max_bit = EXT4_CLUSTERS_PER_GROUP(sb); ext4_fsblk_t blk; ext4_fsblk_t group_first_block; if (ext4_has_feature_flex_bg(sb)) { /* with FLEX_BG, the inode/block bitmaps and itable * blocks may not be in the group at all * so the bitmap validation will be skipped for those groups * or it has to also read the block group where the bitmaps * are located to verify they are set. */ return 0; } group_first_block = ext4_group_first_block_no(sb, block_group); /* check whether block bitmap block number is set */ blk = ext4_block_bitmap(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; /* check whether the inode bitmap block number is set */ blk = ext4_inode_bitmap(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; /* check whether the inode table block number is set */ blk = ext4_inode_table(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) >= max_bit) return blk; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1, EXT4_B2C(sbi, offset)); if (next_zero_bit < EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1) /* bad bitmap for inode tables */ return blk; return 0; } static int ext4_validate_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, ext4_group_t block_group, struct buffer_head *bh) { ext4_fsblk_t blk; struct ext4_group_info *grp; if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return 0; grp = ext4_get_group_info(sb, block_group); if (buffer_verified(bh)) return 0; if (!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) return -EFSCORRUPTED; ext4_lock_group(sb, block_group); if (buffer_verified(bh)) goto verified; if (unlikely(!ext4_block_bitmap_csum_verify(sb, desc, bh) || ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSBADCRC; } blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); if (unlikely(blk != 0)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: invalid block bitmap", block_group, blk); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSCORRUPTED; } blk = ext4_valid_block_bitmap_padding(sb, block_group, bh); if (unlikely(blk != 0)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: padding at end of block bitmap is not set", block_group, blk); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSCORRUPTED; } set_buffer_verified(bh); verified: ext4_unlock_group(sb, block_group); return 0; } /** * ext4_read_block_bitmap_nowait() * @sb: super block * @block_group: given block group * @ignore_locked: ignore locked buffers * * Read the bitmap for a given block_group,and validate the * bits for block/inode/inode tables are set in the bitmaps * * Return buffer_head on success or an ERR_PTR in case of failure. */ struct buffer_head * ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, bool ignore_locked) { struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh; ext4_fsblk_t bitmap_blk; int err; KUNIT_STATIC_STUB_REDIRECT(ext4_read_block_bitmap_nowait, sb, block_group, ignore_locked); desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return ERR_PTR(-EFSCORRUPTED); bitmap_blk = ext4_block_bitmap(sb, desc); if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { ext4_error(sb, "Invalid block bitmap block %llu in " "block_group %u", bitmap_blk, block_group); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return ERR_PTR(-EFSCORRUPTED); } bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { ext4_warning(sb, "Cannot get buffer for block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, bitmap_blk); return ERR_PTR(-ENOMEM); } if (ignore_locked && buffer_locked(bh)) { /* buffer under IO already, return if called for prefetching */ put_bh(bh); return NULL; } if (bitmap_uptodate(bh)) goto verify; lock_buffer(bh); if (bitmap_uptodate(bh)) { unlock_buffer(bh); goto verify; } ext4_lock_group(sb, block_group); if (ext4_has_group_desc_csum(sb) && (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { if (block_group == 0) { ext4_unlock_group(sb, block_group); unlock_buffer(bh); ext4_error(sb, "Block bitmap for bg 0 marked " "uninitialized"); err = -EFSCORRUPTED; goto out; } err = ext4_init_block_bitmap(sb, bh, block_group, desc); if (err) { ext4_unlock_group(sb, block_group); unlock_buffer(bh); ext4_error(sb, "Failed to init block bitmap for group " "%u: %d", block_group, err); goto out; } set_bitmap_uptodate(bh); set_buffer_uptodate(bh); set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; } ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, * bitmap is also uptodate */ set_bitmap_uptodate(bh); unlock_buffer(bh); goto verify; } /* * submit the buffer_head for reading */ set_buffer_new(bh); trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked); ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO | (ignore_locked ? REQ_RAHEAD : 0), ext4_end_bitmap_read); return bh; verify: err = ext4_validate_block_bitmap(sb, desc, block_group, bh); if (err) goto out; return bh; out: put_bh(bh); return ERR_PTR(err); } /* Returns 0 on success, -errno on error */ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh) { struct ext4_group_desc *desc; KUNIT_STATIC_STUB_REDIRECT(ext4_wait_block_bitmap, sb, block_group, bh); if (!buffer_new(bh)) return 0; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return -EFSCORRUPTED; wait_on_buffer(bh); ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO); if (!buffer_uptodate(bh)) { ext4_error_err(sb, EIO, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, (unsigned long long) bh->b_blocknr); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EIO; } clear_buffer_new(bh); /* Panic or remount fs read-only if block bitmap is invalid */ return ext4_validate_block_bitmap(sb, desc, block_group, bh); } struct buffer_head * ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) { struct buffer_head *bh; int err; bh = ext4_read_block_bitmap_nowait(sb, block_group, false); if (IS_ERR(bh)) return bh; err = ext4_wait_block_bitmap(sb, block_group, bh); if (err) { put_bh(bh); return ERR_PTR(err); } return bh; } /** * ext4_has_free_clusters() * @sbi: in-core super block structure. * @nclusters: number of needed blocks * @flags: flags from ext4_mb_new_blocks() * * Check if filesystem has nclusters free & available for allocation. * On success return 1, return 0 on failure. */ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags) { s64 free_clusters, dirty_clusters, rsv, resv_clusters; struct percpu_counter *fcc = &sbi->s_freeclusters_counter; struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter; free_clusters = percpu_counter_read_positive(fcc); dirty_clusters = percpu_counter_read_positive(dcc); resv_clusters = atomic64_read(&sbi->s_resv_clusters); /* * r_blocks_count should always be multiple of the cluster ratio so * we are safe to do a plane bit shift only. */ rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) + resv_clusters; if (free_clusters - (nclusters + rsv + dirty_clusters) < EXT4_FREECLUSTERS_WATERMARK) { free_clusters = percpu_counter_sum_positive(fcc); dirty_clusters = percpu_counter_sum_positive(dcc); } /* Check whether we have space after accounting for current * dirty clusters & root reserved clusters. */ if (free_clusters >= (rsv + nclusters + dirty_clusters)) return 1; /* Hm, nope. Are (enough) root reserved clusters available? */ if (uid_eq(sbi->s_resuid, current_fsuid()) || (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || capable(CAP_SYS_RESOURCE) || (flags & EXT4_MB_USE_ROOT_BLOCKS)) { if (free_clusters >= (nclusters + dirty_clusters + resv_clusters)) return 1; } /* No free blocks. Let's see if we can dip into reserved pool */ if (flags & EXT4_MB_USE_RESERVED) { if (free_clusters >= (nclusters + dirty_clusters)) return 1; } return 0; } int ext4_claim_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags) { if (ext4_has_free_clusters(sbi, nclusters, flags)) { percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters); return 0; } else return -ENOSPC; } /** * ext4_should_retry_alloc() - check if a block allocation should be retried * @sb: superblock * @retries: number of retry attempts made so far * * ext4_should_retry_alloc() is called when ENOSPC is returned while * attempting to allocate blocks. If there's an indication that a pending * journal transaction might free some space and allow another attempt to * succeed, this function will wait for the current or committing transaction * to complete and then return TRUE. */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (!sbi->s_journal) return 0; if (++(*retries) > 3) { percpu_counter_inc(&sbi->s_sra_exceeded_retry_limit); return 0; } /* * if there's no indication that blocks are about to be freed it's * possible we just missed a transaction commit that did so */ smp_mb(); if (sbi->s_mb_free_pending == 0) { if (test_opt(sb, DISCARD)) { atomic_inc(&sbi->s_retry_alloc_pending); flush_work(&sbi->s_discard_work); atomic_dec(&sbi->s_retry_alloc_pending); } return ext4_has_free_clusters(sbi, 1, 0); } /* * it's possible we've just missed a transaction commit here, * so ignore the returned status */ ext4_debug("%s: retrying operation after ENOSPC\n", sb->s_id); (void) jbd2_journal_force_commit_nested(sbi->s_journal); return 1; } /* * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks * * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) * @count: pointer to total number of clusters needed * @errp: error code * * Return 1st allocated block number on success, *count stores total account * error stores in errp pointer */ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned int flags, unsigned long *count, int *errp) { struct ext4_allocation_request ar; ext4_fsblk_t ret; memset(&ar, 0, sizeof(ar)); /* Fill with neighbour allocated blocks */ ar.inode = inode; ar.goal = goal; ar.len = count ? *count : 1; ar.flags = flags; ret = ext4_mb_new_blocks(handle, &ar, errp); if (count) *count = ar.len; /* * Account for the allocated meta blocks. We will never * fail EDQUOT for metdata, but we do account for it. */ if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { dquot_alloc_block_nofail(inode, EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); } return ret; } /** * ext4_count_free_clusters() -- count filesystem free clusters * @sb: superblock * * Adds up the number of free clusters from each block group. */ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) { ext4_fsblk_t desc_count; struct ext4_group_desc *gdp; ext4_group_t i; ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_group_info *grp; #ifdef EXT4FS_DEBUG struct ext4_super_block *es; ext4_fsblk_t bitmap_count; unsigned int x; struct buffer_head *bitmap_bh = NULL; es = EXT4_SB(sb)->s_es; desc_count = 0; bitmap_count = 0; gdp = NULL; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; grp = NULL; if (EXT4_SB(sb)->s_group_info) grp = ext4_get_group_info(sb, i); if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) desc_count += ext4_free_group_clusters(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_block_bitmap(sb, i); if (IS_ERR(bitmap_bh)) { bitmap_bh = NULL; continue; } x = ext4_count_free(bitmap_bh->b_data, EXT4_CLUSTERS_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", i, ext4_free_group_clusters(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" ", computed = %llu, %llu\n", EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), desc_count, bitmap_count); return bitmap_count; #else desc_count = 0; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; grp = NULL; if (EXT4_SB(sb)->s_group_info) grp = ext4_get_group_info(sb, i); if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) desc_count += ext4_free_group_clusters(sb, gdp); } return desc_count; #endif } static inline int test_root(ext4_group_t a, int b) { while (1) { if (a < b) return 0; if (a == b) return 1; if ((a % b) != 0) return 0; a = a / b; } } /** * ext4_bg_has_super - number of blocks used by the superblock in group * @sb: superblock for filesystem * @group: group number to check * * Return the number of blocks used by the superblock (primary or backup) * in this group. Currently this will be only 0 or 1. */ int ext4_bg_has_super(struct super_block *sb, ext4_group_t group) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (group == 0) return 1; if (ext4_has_feature_sparse_super2(sb)) { if (group == le32_to_cpu(es->s_backup_bgs[0]) || group == le32_to_cpu(es->s_backup_bgs[1])) return 1; return 0; } if ((group <= 1) || !ext4_has_feature_sparse_super(sb)) return 1; if (!(group & 1)) return 0; if (test_root(group, 3) || (test_root(group, 5)) || test_root(group, 7)) return 1; return 0; } static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, ext4_group_t group) { unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb); ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1; if (group == first || group == first + 1 || group == last) return 1; return 0; } static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, ext4_group_t group) { if (!ext4_bg_has_super(sb, group)) return 0; if (ext4_has_feature_meta_bg(sb)) return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); else return EXT4_SB(sb)->s_gdb_count; } /** * ext4_bg_num_gdb - number of blocks used by the group table in group * @sb: superblock for filesystem * @group: group number to check * * Return the number of blocks used by the group descriptor table * (primary or backup) in this group. In the future there may be a * different number of descriptor blocks in each group. */ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) { unsigned long first_meta_bg = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) return ext4_bg_num_gdb_nometa(sb, group); return ext4_bg_num_gdb_meta(sb,group); } /* * This function returns the number of file system metadata blocks at * the beginning of a block group, including the reserved gdt blocks. */ unsigned int ext4_num_base_meta_blocks(struct super_block *sb, ext4_group_t block_group) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned num; /* Check for superblock and gdt backups in this group */ num = ext4_bg_has_super(sb, block_group); if (!ext4_has_feature_meta_bg(sb) || block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * sbi->s_desc_per_block) { if (num) { num += ext4_bg_num_gdb_nometa(sb, block_group); num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); } } else { /* For META_BG_BLOCK_GROUPS */ num += ext4_bg_num_gdb_meta(sb, block_group); } return num; } static unsigned int ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group) { return EXT4_NUM_B2C(EXT4_SB(sb), ext4_num_base_meta_blocks(sb, block_group)); } /** * ext4_inode_to_goal_block - return a hint for block allocation * @inode: inode for block allocation * * Return the ideal location to start allocating blocks for a * newly created inode. */ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); ext4_group_t block_group; ext4_grpblk_t colour; int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); ext4_fsblk_t bg_start; ext4_fsblk_t last_block; block_group = ei->i_block_group; if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { /* * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME * block groups per flexgroup, reserve the first block * group for directories and special files. Regular * files will start at the second block group. This * tends to speed up directory access and improves * fsck times. */ block_group &= ~(flex_size-1); if (S_ISREG(inode->i_mode)) block_group++; } bg_start = ext4_group_first_block_no(inode->i_sb, block_group); last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; /* * If we are doing delayed allocation, we don't need take * colour into account. */ if (test_opt(inode->i_sb, DELALLOC)) return bg_start; if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) colour = (task_pid_nr(current) % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); else colour = (task_pid_nr(current) % 16) * ((last_block - bg_start) / 16); return bg_start + colour; } |
| 6 2 52 3240 61 3228 3 15 3 73 3081 4 5 5 341 13 2343 5 16 2 2 2040 20 3271 5 2584 27 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Berkeley style UIO structures - Alan Cox 1994. */ #ifndef __LINUX_UIO_H #define __LINUX_UIO_H #include <linux/kernel.h> #include <linux/thread_info.h> #include <linux/mm_types.h> #include <uapi/linux/uio.h> struct page; typedef unsigned int __bitwise iov_iter_extraction_t; struct kvec { void *iov_base; /* and that should *never* hold a userland pointer */ size_t iov_len; }; enum iter_type { /* iter types */ ITER_UBUF, ITER_IOVEC, ITER_BVEC, ITER_KVEC, ITER_XARRAY, ITER_DISCARD, }; #define ITER_SOURCE 1 // == WRITE #define ITER_DEST 0 // == READ struct iov_iter_state { size_t iov_offset; size_t count; unsigned long nr_segs; }; struct iov_iter { u8 iter_type; bool nofault; bool data_source; size_t iov_offset; /* * Hack alert: overlay ubuf_iovec with iovec + count, so * that the members resolve correctly regardless of the type * of iterator used. This means that you can use: * * &iter->__ubuf_iovec or iter->__iov * * interchangably for the user_backed cases, hence simplifying * some of the cases that need to deal with both. */ union { /* * This really should be a const, but we cannot do that without * also modifying any of the zero-filling iter init functions. * Leave it non-const for now, but it should be treated as such. */ struct iovec __ubuf_iovec; struct { union { /* use iter_iov() to get the current vec */ const struct iovec *__iov; const struct kvec *kvec; const struct bio_vec *bvec; struct xarray *xarray; void __user *ubuf; }; size_t count; }; }; union { unsigned long nr_segs; loff_t xarray_start; }; }; static inline const struct iovec *iter_iov(const struct iov_iter *iter) { if (iter->iter_type == ITER_UBUF) return (const struct iovec *) &iter->__ubuf_iovec; return iter->__iov; } #define iter_iov_addr(iter) (iter_iov(iter)->iov_base + (iter)->iov_offset) #define iter_iov_len(iter) (iter_iov(iter)->iov_len - (iter)->iov_offset) static inline enum iter_type iov_iter_type(const struct iov_iter *i) { return i->iter_type; } static inline void iov_iter_save_state(struct iov_iter *iter, struct iov_iter_state *state) { state->iov_offset = iter->iov_offset; state->count = iter->count; state->nr_segs = iter->nr_segs; } static inline bool iter_is_ubuf(const struct iov_iter *i) { return iov_iter_type(i) == ITER_UBUF; } static inline bool iter_is_iovec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_IOVEC; } static inline bool iov_iter_is_kvec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_KVEC; } static inline bool iov_iter_is_bvec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_BVEC; } static inline bool iov_iter_is_discard(const struct iov_iter *i) { return iov_iter_type(i) == ITER_DISCARD; } static inline bool iov_iter_is_xarray(const struct iov_iter *i) { return iov_iter_type(i) == ITER_XARRAY; } static inline unsigned char iov_iter_rw(const struct iov_iter *i) { return i->data_source ? WRITE : READ; } static inline bool user_backed_iter(const struct iov_iter *i) { return iter_is_ubuf(i) || iter_is_iovec(i); } /* * Total number of bytes covered by an iovec. * * NOTE that it is not safe to use this function until all the iovec's * segment lengths have been validated. Because the individual lengths can * overflow a size_t when added together. */ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs) { unsigned long seg; size_t ret = 0; for (seg = 0; seg < nr_segs; seg++) ret += iov[seg].iov_len; return ret; } size_t copy_page_from_iter_atomic(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); void iov_iter_advance(struct iov_iter *i, size_t bytes); void iov_iter_revert(struct iov_iter *i, size_t bytes); size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes); size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes); size_t iov_iter_single_seg_count(const struct iov_iter *i); size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i); static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { return copy_page_to_iter(&folio->page, offset, bytes, i); } static inline size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { return copy_page_from_iter_atomic(&folio->page, offset, bytes, i); } size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i); static __always_inline __must_check size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, true)) return _copy_to_iter(addr, bytes, i); return 0; } static __always_inline __must_check size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, false)) return _copy_from_iter(addr, bytes, i); return 0; } static __always_inline __must_check bool copy_to_iter_full(const void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_to_iter(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } static __always_inline __must_check bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_from_iter(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } static __always_inline __must_check size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, false)) return _copy_from_iter_nocache(addr, bytes, i); return 0; } static __always_inline __must_check bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_from_iter_nocache(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE /* * Note, users like pmem that depend on the stricter semantics of * _copy_from_iter_flushcache() than _copy_from_iter_nocache() must check for * IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the * destination is flushed from the cache on return. */ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i); #else #define _copy_from_iter_flushcache _copy_from_iter_nocache #endif #ifdef CONFIG_ARCH_HAS_COPY_MC size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i); #else #define _copy_mc_to_iter _copy_to_iter #endif size_t iov_iter_zero(size_t bytes, struct iov_iter *); bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask); unsigned long iov_iter_alignment(const struct iov_iter *i); unsigned long iov_iter_gap_alignment(const struct iov_iter *i); void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count); void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count); void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count); void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count); ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start); int iov_iter_npages(const struct iov_iter *i, int maxpages); void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags); static inline size_t iov_iter_count(const struct iov_iter *i) { return i->count; } /* * Cap the iov_iter by given limit; note that the second argument is * *not* the new size - it's upper limit for such. Passing it a value * greater than the amount of data in iov_iter is fine - it'll just do * nothing in that case. */ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) { /* * count doesn't have to fit in size_t - comparison extends both * operands to u64 here and any value that would be truncated by * conversion in assignement is by definition greater than all * values of size_t, including old i->count. */ if (i->count > count) i->count = count; } /* * reexpand a previously truncated iterator; count must be no more than how much * we had shrunk it. */ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) { i->count = count; } static inline int iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes) { size_t shorted = 0; int npages; if (iov_iter_count(i) > max_bytes) { shorted = iov_iter_count(i) - max_bytes; iov_iter_truncate(i, max_bytes); } npages = iov_iter_npages(i, maxpages); if (shorted) iov_iter_reexpand(i, iov_iter_count(i) + shorted); return npages; } struct iovec *iovec_from_user(const struct iovec __user *uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_iov, bool compat); ssize_t import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i); ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat); int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i); static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction, void __user *buf, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter) { .iter_type = ITER_UBUF, .data_source = direction, .ubuf = buf, .count = count, .nr_segs = 1 }; } /* Flags for iov_iter_get/extract_pages*() */ /* Allow P2PDMA on the extracted pages */ #define ITER_ALLOW_P2PDMA ((__force iov_iter_extraction_t)0x01) ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0); /** * iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained * @iter: The iterator * * Examine the iterator and indicate by returning true or false as to how, if * at all, pages extracted from the iterator will be retained by the extraction * function. * * %true indicates that the pages will have a pin placed in them that the * caller must unpin. This is must be done for DMA/async DIO to force fork() * to forcibly copy a page for the child (the parent must retain the original * page). * * %false indicates that no measures are taken and that it's up to the caller * to retain the pages. */ static inline bool iov_iter_extract_will_pin(const struct iov_iter *iter) { return user_backed_iter(iter); } struct sg_table; ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t len, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags); #endif |
| 2 5 9 5 15 5 24 4 1 8 9 8 20 1 25 70 118 10 2 10 10 9 18 3 2 2 2 1 2 16 2 16 16 17 12 3 5 29 28 3 3 21 21 20 1 1 6 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_IP_TUNNELS_H #define __NET_IP_TUNNELS_H 1 #include <linux/if_tunnel.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/types.h> #include <linux/u64_stats_sync.h> #include <linux/bitops.h> #include <net/dsfield.h> #include <net/gro_cells.h> #include <net/inet_ecn.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/lwtunnel.h> #include <net/dst_cache.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #endif /* Keep error state on tunnel for 30 sec */ #define IPTUNNEL_ERR_TIMEO (30*HZ) /* Used to memset ip_tunnel padding. */ #define IP_TUNNEL_KEY_SIZE offsetofend(struct ip_tunnel_key, tp_dst) /* Used to memset ipv4 address padding. */ #define IP_TUNNEL_KEY_IPV4_PAD offsetofend(struct ip_tunnel_key, u.ipv4.dst) #define IP_TUNNEL_KEY_IPV4_PAD_LEN \ (sizeof_field(struct ip_tunnel_key, u) - \ sizeof_field(struct ip_tunnel_key, u.ipv4)) #define __ipt_flag_op(op, ...) \ op(__VA_ARGS__, __IP_TUNNEL_FLAG_NUM) #define IP_TUNNEL_DECLARE_FLAGS(...) \ __ipt_flag_op(DECLARE_BITMAP, __VA_ARGS__) #define ip_tunnel_flags_zero(...) __ipt_flag_op(bitmap_zero, __VA_ARGS__) #define ip_tunnel_flags_copy(...) __ipt_flag_op(bitmap_copy, __VA_ARGS__) #define ip_tunnel_flags_and(...) __ipt_flag_op(bitmap_and, __VA_ARGS__) #define ip_tunnel_flags_or(...) __ipt_flag_op(bitmap_or, __VA_ARGS__) #define ip_tunnel_flags_empty(...) \ __ipt_flag_op(bitmap_empty, __VA_ARGS__) #define ip_tunnel_flags_intersect(...) \ __ipt_flag_op(bitmap_intersects, __VA_ARGS__) #define ip_tunnel_flags_subset(...) \ __ipt_flag_op(bitmap_subset, __VA_ARGS__) struct ip_tunnel_key { __be64 tun_id; union { struct { __be32 src; __be32 dst; } ipv4; struct { struct in6_addr src; struct in6_addr dst; } ipv6; } u; IP_TUNNEL_DECLARE_FLAGS(tun_flags); __be32 label; /* Flow Label for IPv6 */ u32 nhid; u8 tos; /* TOS for IPv4, TC for IPv6 */ u8 ttl; /* TTL for IPv4, HL for IPv6 */ __be16 tp_src; __be16 tp_dst; __u8 flow_flags; }; struct ip_tunnel_encap { u16 type; u16 flags; __be16 sport; __be16 dport; }; /* Flags for ip_tunnel_info mode. */ #define IP_TUNNEL_INFO_TX 0x01 /* represents tx tunnel parameters */ #define IP_TUNNEL_INFO_IPV6 0x02 /* key contains IPv6 addresses */ #define IP_TUNNEL_INFO_BRIDGE 0x04 /* represents a bridged tunnel id */ /* Maximum tunnel options length. */ #define IP_TUNNEL_OPTS_MAX \ GENMASK((sizeof_field(struct ip_tunnel_info, \ options_len) * BITS_PER_BYTE) - 1, 0) #define ip_tunnel_info_opts(info) \ _Generic(info, \ const struct ip_tunnel_info * : ((const void *)((info) + 1)),\ struct ip_tunnel_info * : ((void *)((info) + 1))\ ) struct ip_tunnel_info { struct ip_tunnel_key key; struct ip_tunnel_encap encap; #ifdef CONFIG_DST_CACHE struct dst_cache dst_cache; #endif u8 options_len; u8 mode; }; /* 6rd prefix/relay information */ #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd_parm { struct in6_addr prefix; __be32 relay_prefix; u16 prefixlen; u16 relay_prefixlen; }; #endif struct ip_tunnel_prl_entry { struct ip_tunnel_prl_entry __rcu *next; __be32 addr; u16 flags; struct rcu_head rcu_head; }; struct metadata_dst; /* Kernel-side variant of ip_tunnel_parm */ struct ip_tunnel_parm_kern { char name[IFNAMSIZ]; IP_TUNNEL_DECLARE_FLAGS(i_flags); IP_TUNNEL_DECLARE_FLAGS(o_flags); __be32 i_key; __be32 o_key; int link; struct iphdr iph; }; struct ip_tunnel { struct ip_tunnel __rcu *next; struct hlist_node hash_node; struct net_device *dev; netdevice_tracker dev_tracker; struct net *net; /* netns for packet i/o */ unsigned long err_time; /* Time when the last ICMP error * arrived */ int err_count; /* Number of arrived ICMP errors */ /* These four fields used only by GRE */ u32 i_seqno; /* The last seen seqno */ atomic_t o_seqno; /* The last output seqno */ int tun_hlen; /* Precalculated header length */ /* These four fields used only by ERSPAN */ u32 index; /* ERSPAN type II index */ u8 erspan_ver; /* ERSPAN version */ u8 dir; /* ERSPAN direction */ u16 hwid; /* ERSPAN hardware ID */ struct dst_cache dst_cache; struct ip_tunnel_parm_kern parms; int mlink; int encap_hlen; /* Encap header length (FOU,GUE) */ int hlen; /* tun_hlen + encap_hlen */ struct ip_tunnel_encap encap; /* for SIT */ #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd_parm ip6rd; #endif struct ip_tunnel_prl_entry __rcu *prl; /* potential router list */ unsigned int prl_count; /* # of entries in PRL */ unsigned int ip_tnl_net_id; struct gro_cells gro_cells; __u32 fwmark; bool collect_md; bool ignore_df; }; struct tnl_ptk_info { IP_TUNNEL_DECLARE_FLAGS(flags); __be16 proto; __be32 key; __be32 seq; int hdr_len; }; #define PACKET_RCVD 0 #define PACKET_REJECT 1 #define PACKET_NEXT 2 #define IP_TNL_HASH_BITS 7 #define IP_TNL_HASH_SIZE (1 << IP_TNL_HASH_BITS) struct ip_tunnel_net { struct net_device *fb_tunnel_dev; struct rtnl_link_ops *rtnl_link_ops; struct hlist_head tunnels[IP_TNL_HASH_SIZE]; struct ip_tunnel __rcu *collect_md_tun; int type; }; static inline void ip_tunnel_set_options_present(unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(present) = { }; __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); ip_tunnel_flags_or(flags, flags, present); } static inline void ip_tunnel_clear_options_present(unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(present) = { }; __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); __ipt_flag_op(bitmap_andnot, flags, flags, present); } static inline bool ip_tunnel_is_options_present(const unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(present) = { }; __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); return ip_tunnel_flags_intersect(flags, present); } static inline bool ip_tunnel_flags_is_be16_compat(const unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(supp) = { }; bitmap_set(supp, 0, BITS_PER_TYPE(__be16)); __set_bit(IP_TUNNEL_VTI_BIT, supp); return ip_tunnel_flags_subset(flags, supp); } static inline void ip_tunnel_flags_from_be16(unsigned long *dst, __be16 flags) { ip_tunnel_flags_zero(dst); bitmap_write(dst, be16_to_cpu(flags), 0, BITS_PER_TYPE(__be16)); __assign_bit(IP_TUNNEL_VTI_BIT, dst, flags & VTI_ISVTI); } static inline __be16 ip_tunnel_flags_to_be16(const unsigned long *flags) { __be16 ret; ret = cpu_to_be16(bitmap_read(flags, 0, BITS_PER_TYPE(__be16))); if (test_bit(IP_TUNNEL_VTI_BIT, flags)) ret |= VTI_ISVTI; return ret; } static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, __be32 saddr, __be32 daddr, u8 tos, u8 ttl, __be32 label, __be16 tp_src, __be16 tp_dst, __be64 tun_id, const unsigned long *tun_flags) { key->tun_id = tun_id; key->u.ipv4.src = saddr; key->u.ipv4.dst = daddr; memset((unsigned char *)key + IP_TUNNEL_KEY_IPV4_PAD, 0, IP_TUNNEL_KEY_IPV4_PAD_LEN); key->tos = tos; key->ttl = ttl; key->label = label; ip_tunnel_flags_copy(key->tun_flags, tun_flags); /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of * the upper tunnel are used. * E.g: GRE over IPSEC, the tp_src and tp_port are zero. */ key->tp_src = tp_src; key->tp_dst = tp_dst; /* Clear struct padding. */ if (sizeof(*key) != IP_TUNNEL_KEY_SIZE) memset((unsigned char *)key + IP_TUNNEL_KEY_SIZE, 0, sizeof(*key) - IP_TUNNEL_KEY_SIZE); } static inline bool ip_tunnel_dst_cache_usable(const struct sk_buff *skb, const struct ip_tunnel_info *info) { if (skb->mark) return false; return !info || !test_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags); } static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info *tun_info) { return tun_info->mode & IP_TUNNEL_INFO_IPV6 ? AF_INET6 : AF_INET; } static inline __be64 key32_to_tunnel_id(__be32 key) { #ifdef __BIG_ENDIAN return (__force __be64)key; #else return (__force __be64)((__force u64)key << 32); #endif } /* Returns the least-significant 32 bits of a __be64. */ static inline __be32 tunnel_id_to_key32(__be64 tun_id) { #ifdef __BIG_ENDIAN return (__force __be32)tun_id; #else return (__force __be32)((__force u64)tun_id >> 32); #endif } #ifdef CONFIG_INET static inline void ip_tunnel_init_flow(struct flowi4 *fl4, int proto, __be32 daddr, __be32 saddr, __be32 key, __u8 tos, struct net *net, int oif, __u32 mark, __u32 tun_inner_hash, __u8 flow_flags) { memset(fl4, 0, sizeof(*fl4)); if (oif) { fl4->flowi4_l3mdev = l3mdev_master_upper_ifindex_by_index_rcu(net, oif); /* Legacy VRF/l3mdev use case */ fl4->flowi4_oif = fl4->flowi4_l3mdev ? 0 : oif; } fl4->daddr = daddr; fl4->saddr = saddr; fl4->flowi4_tos = tos; fl4->flowi4_proto = proto; fl4->fl4_gre_key = key; fl4->flowi4_mark = mark; fl4->flowi4_multipath_hash = tun_inner_hash; fl4->flowi4_flags = flow_flags; } int ip_tunnel_init(struct net_device *dev); void ip_tunnel_uninit(struct net_device *dev); void ip_tunnel_dellink(struct net_device *dev, struct list_head *head); struct net *ip_tunnel_get_link_net(const struct net_device *dev); int ip_tunnel_get_iflink(const struct net_device *dev); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); void ip_tunnel_delete_nets(struct list_head *list_net, unsigned int id, struct rtnl_link_ops *ops, struct list_head *dev_to_kill); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const u8 proto, int tunnel_hlen); int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd); bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp, const void __user *data); bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp); int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, int link, const unsigned long *flags, __be32 remote, __be32 local, __be32 key); void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info); int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_error); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, __u32 fwmark); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, __u32 fwmark); void ip_tunnel_setup(struct net_device *dev, unsigned int net_id); bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *encap); void ip_tunnel_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm_kern *parms); extern const struct header_ops ip_tunnel_header_ops; __be16 ip_tunnel_parse_protocol(const struct sk_buff *skb); struct ip_tunnel_encap_ops { size_t (*encap_hlen)(struct ip_tunnel_encap *e); int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4); int (*err_handler)(struct sk_buff *skb, u32 info); }; #define MAX_IPTUN_ENCAP_OPS 8 extern const struct ip_tunnel_encap_ops __rcu * iptun_encaps[MAX_IPTUN_ENCAP_OPS]; int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *op, unsigned int num); int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, unsigned int num); int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap); static inline bool pskb_inet_may_pull(struct sk_buff *skb) { int nhlen; switch (skb->protocol) { #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): nhlen = sizeof(struct ipv6hdr); break; #endif case htons(ETH_P_IP): nhlen = sizeof(struct iphdr); break; default: nhlen = 0; } return pskb_network_may_pull(skb, nhlen); } /* Variant of pskb_inet_may_pull(). */ static inline bool skb_vlan_inet_prepare(struct sk_buff *skb, bool inner_proto_inherit) { int nhlen = 0, maclen = inner_proto_inherit ? 0 : ETH_HLEN; __be16 type = skb->protocol; /* Essentially this is skb_protocol(skb, true) * And we get MAC len. */ if (eth_type_vlan(type)) type = __vlan_get_protocol(skb, type, &maclen); switch (type) { #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): nhlen = sizeof(struct ipv6hdr); break; #endif case htons(ETH_P_IP): nhlen = sizeof(struct iphdr); break; } /* For ETH_P_IPV6/ETH_P_IP we make sure to pull * a base network header in skb->head. */ if (!pskb_may_pull(skb, maclen + nhlen)) return false; skb_set_network_header(skb, maclen); return true; } static inline int ip_encap_hlen(struct ip_tunnel_encap *e) { const struct ip_tunnel_encap_ops *ops; int hlen = -EINVAL; if (e->type == TUNNEL_ENCAP_NONE) return 0; if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(iptun_encaps[e->type]); if (likely(ops && ops->encap_hlen)) hlen = ops->encap_hlen(e); rcu_read_unlock(); return hlen; } static inline int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { const struct ip_tunnel_encap_ops *ops; int ret = -EINVAL; if (e->type == TUNNEL_ENCAP_NONE) return 0; if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(iptun_encaps[e->type]); if (likely(ops && ops->build_header)) ret = ops->build_header(skb, e, protocol, fl4); rcu_read_unlock(); return ret; } /* Extract dsfield from inner protocol */ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, const struct sk_buff *skb) { __be16 payload_protocol = skb_protocol(skb, true); if (payload_protocol == htons(ETH_P_IP)) return iph->tos; else if (payload_protocol == htons(ETH_P_IPV6)) return ipv6_get_dsfield((const struct ipv6hdr *)iph); else return 0; } static inline __be32 ip_tunnel_get_flowlabel(const struct iphdr *iph, const struct sk_buff *skb) { __be16 payload_protocol = skb_protocol(skb, true); if (payload_protocol == htons(ETH_P_IPV6)) return ip6_flowlabel((const struct ipv6hdr *)iph); else return 0; } static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph, const struct sk_buff *skb) { __be16 payload_protocol = skb_protocol(skb, true); if (payload_protocol == htons(ETH_P_IP)) return iph->ttl; else if (payload_protocol == htons(ETH_P_IPV6)) return ((const struct ipv6hdr *)iph)->hop_limit; else return 0; } /* Propogate ECN bits out */ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, const struct sk_buff *skb) { u8 inner = ip_tunnel_get_dsfield(iph, skb); return INET_ECN_encapsulate(tos, inner); } int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, bool raw_proto, bool xnet); static inline int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, bool xnet) { return __iptunnel_pull_header(skb, hdr_len, inner_proto, false, xnet); } void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, u8 proto, u8 tos, u8 ttl, __be16 df, bool xnet); struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags); int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, int headroom, bool reply); int iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask); static inline int iptunnel_pull_offloads(struct sk_buff *skb) { if (skb_is_gso(skb)) { int err; err = skb_unclone(skb, GFP_ATOMIC); if (unlikely(err)) return err; skb_shinfo(skb)->gso_type &= ~(NETIF_F_GSO_ENCAP_ALL >> NETIF_F_GSO_SHIFT); } skb->encapsulation = 0; return 0; } static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len) { if (pkt_len > 0) { struct pcpu_sw_netstats *tstats = get_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); u64_stats_add(&tstats->tx_bytes, pkt_len); u64_stats_inc(&tstats->tx_packets); u64_stats_update_end(&tstats->syncp); put_cpu_ptr(tstats); return; } if (pkt_len < 0) { DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_aborted_errors); } else { DEV_STATS_INC(dev, tx_dropped); } } static inline void ip_tunnel_info_opts_get(void *to, const struct ip_tunnel_info *info) { memcpy(to, info + 1, info->options_len); } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, const void *from, int len, const unsigned long *flags) { info->options_len = len; if (len > 0) { memcpy(ip_tunnel_info_opts(info), from, len); ip_tunnel_flags_or(info->key.tun_flags, info->key.tun_flags, flags); } } static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) { return (struct ip_tunnel_info *)lwtstate->data; } DECLARE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt); /* Returns > 0 if metadata should be collected */ static inline int ip_tunnel_collect_metadata(void) { return static_branch_unlikely(&ip_tunnel_metadata_cnt); } void __init ip_tunnel_core_init(void); void ip_tunnel_need_metadata(void); void ip_tunnel_unneed_metadata(void); #else /* CONFIG_INET */ static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) { return NULL; } static inline void ip_tunnel_need_metadata(void) { } static inline void ip_tunnel_unneed_metadata(void) { } static inline void ip_tunnel_info_opts_get(void *to, const struct ip_tunnel_info *info) { } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, const void *from, int len, const unsigned long *flags) { info->options_len = 0; } #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */ |
| 3 3 2 3 1 3 2 3 5 5 5 2 2 2 9 2 2 2 1 2 9 9 8 8 3 3 3 3 3 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 | // SPDX-License-Identifier: GPL-2.0 OR MIT /* * Copyright (c) 2006-2009 VMware, Inc., Palo Alto, CA., USA * Copyright (c) 2012 David Airlie <airlied@linux.ie> * Copyright (c) 2013 David Herrmann <dh.herrmann@gmail.com> * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #include <linux/mm.h> #include <linux/module.h> #include <linux/rbtree.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/types.h> #include <drm/drm_mm.h> #include <drm/drm_vma_manager.h> /** * DOC: vma offset manager * * The vma-manager is responsible to map arbitrary driver-dependent memory * regions into the linear user address-space. It provides offsets to the * caller which can then be used on the address_space of the drm-device. It * takes care to not overlap regions, size them appropriately and to not * confuse mm-core by inconsistent fake vm_pgoff fields. * Drivers shouldn't use this for object placement in VMEM. This manager should * only be used to manage mappings into linear user-space VMs. * * We use drm_mm as backend to manage object allocations. But it is highly * optimized for alloc/free calls, not lookups. Hence, we use an rb-tree to * speed up offset lookups. * * You must not use multiple offset managers on a single address_space. * Otherwise, mm-core will be unable to tear down memory mappings as the VM will * no longer be linear. * * This offset manager works on page-based addresses. That is, every argument * and return code (with the exception of drm_vma_node_offset_addr()) is given * in number of pages, not number of bytes. That means, object sizes and offsets * must always be page-aligned (as usual). * If you want to get a valid byte-based user-space address for a given offset, * please see drm_vma_node_offset_addr(). * * Additionally to offset management, the vma offset manager also handles access * management. For every open-file context that is allowed to access a given * node, you must call drm_vma_node_allow(). Otherwise, an mmap() call on this * open-file with the offset of the node will fail with -EACCES. To revoke * access again, use drm_vma_node_revoke(). However, the caller is responsible * for destroying already existing mappings, if required. */ /** * drm_vma_offset_manager_init - Initialize new offset-manager * @mgr: Manager object * @page_offset: Offset of available memory area (page-based) * @size: Size of available address space range (page-based) * * Initialize a new offset-manager. The offset and area size available for the * manager are given as @page_offset and @size. Both are interpreted as * page-numbers, not bytes. * * Adding/removing nodes from the manager is locked internally and protected * against concurrent access. However, node allocation and destruction is left * for the caller. While calling into the vma-manager, a given node must * always be guaranteed to be referenced. */ void drm_vma_offset_manager_init(struct drm_vma_offset_manager *mgr, unsigned long page_offset, unsigned long size) { rwlock_init(&mgr->vm_lock); drm_mm_init(&mgr->vm_addr_space_mm, page_offset, size); } EXPORT_SYMBOL(drm_vma_offset_manager_init); /** * drm_vma_offset_manager_destroy() - Destroy offset manager * @mgr: Manager object * * Destroy an object manager which was previously created via * drm_vma_offset_manager_init(). The caller must remove all allocated nodes * before destroying the manager. Otherwise, drm_mm will refuse to free the * requested resources. * * The manager must not be accessed after this function is called. */ void drm_vma_offset_manager_destroy(struct drm_vma_offset_manager *mgr) { drm_mm_takedown(&mgr->vm_addr_space_mm); } EXPORT_SYMBOL(drm_vma_offset_manager_destroy); /** * drm_vma_offset_lookup_locked() - Find node in offset space * @mgr: Manager object * @start: Start address for object (page-based) * @pages: Size of object (page-based) * * Find a node given a start address and object size. This returns the _best_ * match for the given node. That is, @start may point somewhere into a valid * region and the given node will be returned, as long as the node spans the * whole requested area (given the size in number of pages as @pages). * * Note that before lookup the vma offset manager lookup lock must be acquired * with drm_vma_offset_lock_lookup(). See there for an example. This can then be * used to implement weakly referenced lookups using kref_get_unless_zero(). * * Example: * * :: * * drm_vma_offset_lock_lookup(mgr); * node = drm_vma_offset_lookup_locked(mgr); * if (node) * kref_get_unless_zero(container_of(node, sth, entr)); * drm_vma_offset_unlock_lookup(mgr); * * RETURNS: * Returns NULL if no suitable node can be found. Otherwise, the best match * is returned. It's the caller's responsibility to make sure the node doesn't * get destroyed before the caller can access it. */ struct drm_vma_offset_node *drm_vma_offset_lookup_locked(struct drm_vma_offset_manager *mgr, unsigned long start, unsigned long pages) { struct drm_mm_node *node, *best; struct rb_node *iter; unsigned long offset; iter = mgr->vm_addr_space_mm.interval_tree.rb_root.rb_node; best = NULL; while (likely(iter)) { node = rb_entry(iter, struct drm_mm_node, rb); offset = node->start; if (start >= offset) { iter = iter->rb_right; best = node; if (start == offset) break; } else { iter = iter->rb_left; } } /* verify that the node spans the requested area */ if (best) { offset = best->start + best->size; if (offset < start + pages) best = NULL; } if (!best) return NULL; return container_of(best, struct drm_vma_offset_node, vm_node); } EXPORT_SYMBOL(drm_vma_offset_lookup_locked); /** * drm_vma_offset_add() - Add offset node to manager * @mgr: Manager object * @node: Node to be added * @pages: Allocation size visible to user-space (in number of pages) * * Add a node to the offset-manager. If the node was already added, this does * nothing and return 0. @pages is the size of the object given in number of * pages. * After this call succeeds, you can access the offset of the node until it * is removed again. * * If this call fails, it is safe to retry the operation or call * drm_vma_offset_remove(), anyway. However, no cleanup is required in that * case. * * @pages is not required to be the same size as the underlying memory object * that you want to map. It only limits the size that user-space can map into * their address space. * * RETURNS: * 0 on success, negative error code on failure. */ int drm_vma_offset_add(struct drm_vma_offset_manager *mgr, struct drm_vma_offset_node *node, unsigned long pages) { int ret = 0; write_lock(&mgr->vm_lock); if (!drm_mm_node_allocated(&node->vm_node)) ret = drm_mm_insert_node(&mgr->vm_addr_space_mm, &node->vm_node, pages); write_unlock(&mgr->vm_lock); return ret; } EXPORT_SYMBOL(drm_vma_offset_add); /** * drm_vma_offset_remove() - Remove offset node from manager * @mgr: Manager object * @node: Node to be removed * * Remove a node from the offset manager. If the node wasn't added before, this * does nothing. After this call returns, the offset and size will be 0 until a * new offset is allocated via drm_vma_offset_add() again. Helper functions like * drm_vma_node_start() and drm_vma_node_offset_addr() will return 0 if no * offset is allocated. */ void drm_vma_offset_remove(struct drm_vma_offset_manager *mgr, struct drm_vma_offset_node *node) { write_lock(&mgr->vm_lock); if (drm_mm_node_allocated(&node->vm_node)) { drm_mm_remove_node(&node->vm_node); memset(&node->vm_node, 0, sizeof(node->vm_node)); } write_unlock(&mgr->vm_lock); } EXPORT_SYMBOL(drm_vma_offset_remove); static int vma_node_allow(struct drm_vma_offset_node *node, struct drm_file *tag, bool ref_counted) { struct rb_node **iter; struct rb_node *parent = NULL; struct drm_vma_offset_file *new, *entry; int ret = 0; /* Preallocate entry to avoid atomic allocations below. It is quite * unlikely that an open-file is added twice to a single node so we * don't optimize for this case. OOM is checked below only if the entry * is actually used. */ new = kmalloc(sizeof(*entry), GFP_KERNEL); write_lock(&node->vm_lock); iter = &node->vm_files.rb_node; while (likely(*iter)) { parent = *iter; entry = rb_entry(*iter, struct drm_vma_offset_file, vm_rb); if (tag == entry->vm_tag) { if (ref_counted) entry->vm_count++; goto unlock; } else if (tag > entry->vm_tag) { iter = &(*iter)->rb_right; } else { iter = &(*iter)->rb_left; } } if (!new) { ret = -ENOMEM; goto unlock; } new->vm_tag = tag; new->vm_count = 1; rb_link_node(&new->vm_rb, parent, iter); rb_insert_color(&new->vm_rb, &node->vm_files); new = NULL; unlock: write_unlock(&node->vm_lock); kfree(new); return ret; } /** * drm_vma_node_allow - Add open-file to list of allowed users * @node: Node to modify * @tag: Tag of file to remove * * Add @tag to the list of allowed open-files for this node. If @tag is * already on this list, the ref-count is incremented. * * The list of allowed-users is preserved across drm_vma_offset_add() and * drm_vma_offset_remove() calls. You may even call it if the node is currently * not added to any offset-manager. * * You must remove all open-files the same number of times as you added them * before destroying the node. Otherwise, you will leak memory. * * This is locked against concurrent access internally. * * RETURNS: * 0 on success, negative error code on internal failure (out-of-mem) */ int drm_vma_node_allow(struct drm_vma_offset_node *node, struct drm_file *tag) { return vma_node_allow(node, tag, true); } EXPORT_SYMBOL(drm_vma_node_allow); /** * drm_vma_node_allow_once - Add open-file to list of allowed users * @node: Node to modify * @tag: Tag of file to remove * * Add @tag to the list of allowed open-files for this node. * * The list of allowed-users is preserved across drm_vma_offset_add() and * drm_vma_offset_remove() calls. You may even call it if the node is currently * not added to any offset-manager. * * This is not ref-counted unlike drm_vma_node_allow() hence drm_vma_node_revoke() * should only be called once after this. * * This is locked against concurrent access internally. * * RETURNS: * 0 on success, negative error code on internal failure (out-of-mem) */ int drm_vma_node_allow_once(struct drm_vma_offset_node *node, struct drm_file *tag) { return vma_node_allow(node, tag, false); } EXPORT_SYMBOL(drm_vma_node_allow_once); /** * drm_vma_node_revoke - Remove open-file from list of allowed users * @node: Node to modify * @tag: Tag of file to remove * * Decrement the ref-count of @tag in the list of allowed open-files on @node. * If the ref-count drops to zero, remove @tag from the list. You must call * this once for every drm_vma_node_allow() on @tag. * * This is locked against concurrent access internally. * * If @tag is not on the list, nothing is done. */ void drm_vma_node_revoke(struct drm_vma_offset_node *node, struct drm_file *tag) { struct drm_vma_offset_file *entry; struct rb_node *iter; write_lock(&node->vm_lock); iter = node->vm_files.rb_node; while (likely(iter)) { entry = rb_entry(iter, struct drm_vma_offset_file, vm_rb); if (tag == entry->vm_tag) { if (!--entry->vm_count) { rb_erase(&entry->vm_rb, &node->vm_files); kfree(entry); } break; } else if (tag > entry->vm_tag) { iter = iter->rb_right; } else { iter = iter->rb_left; } } write_unlock(&node->vm_lock); } EXPORT_SYMBOL(drm_vma_node_revoke); /** * drm_vma_node_is_allowed - Check whether an open-file is granted access * @node: Node to check * @tag: Tag of file to remove * * Search the list in @node whether @tag is currently on the list of allowed * open-files (see drm_vma_node_allow()). * * This is locked against concurrent access internally. * * RETURNS: * true if @filp is on the list */ bool drm_vma_node_is_allowed(struct drm_vma_offset_node *node, struct drm_file *tag) { struct drm_vma_offset_file *entry; struct rb_node *iter; read_lock(&node->vm_lock); iter = node->vm_files.rb_node; while (likely(iter)) { entry = rb_entry(iter, struct drm_vma_offset_file, vm_rb); if (tag == entry->vm_tag) break; else if (tag > entry->vm_tag) iter = iter->rb_right; else iter = iter->rb_left; } read_unlock(&node->vm_lock); return iter; } EXPORT_SYMBOL(drm_vma_node_is_allowed); |
| 5 5 5 4 5 5 4 4 4 4 4 14 14 14 7 7 17 17 17 17 1 1 1 1 1 1 6 6 6 11 11 11 11 11 11 4 4 4 4 4 4 4 4 4 4 3 1 3 3 7 7 7 7 7 7 3 7 1 2 2 4 4 6 11 38 11 11 4 11 11 11 11 11 4 12 12 12 12 11 11 11 11 11 11 11 11 11 11 11 4 4 4 4 4 4 4 4 12 12 12 2 2 11 12 4 4 4 13 13 11 4 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 | // SPDX-License-Identifier: GPL-2.0 /* * udc.c - Core UDC Framework * * Copyright (C) 2010 Texas Instruments * Author: Felipe Balbi <balbi@ti.com> */ #define pr_fmt(fmt) "UDC core: " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/device.h> #include <linux/list.h> #include <linux/idr.h> #include <linux/err.h> #include <linux/dma-mapping.h> #include <linux/sched/task_stack.h> #include <linux/workqueue.h> #include <linux/usb/ch9.h> #include <linux/usb/gadget.h> #include <linux/usb.h> #include "trace.h" static DEFINE_IDA(gadget_id_numbers); static const struct bus_type gadget_bus_type; /** * struct usb_udc - describes one usb device controller * @driver: the gadget driver pointer. For use by the class code * @dev: the child device to the actual controller * @gadget: the gadget. For use by the class code * @list: for use by the udc class driver * @vbus: for udcs who care about vbus status, this value is real vbus status; * for udcs who do not care about vbus status, this value is always true * @started: the UDC's started state. True if the UDC had started. * @allow_connect: Indicates whether UDC is allowed to be pulled up. * Set/cleared by gadget_(un)bind_driver() after gadget driver is bound or * unbound. * @vbus_work: work routine to handle VBUS status change notifications. * @connect_lock: protects udc->started, gadget->connect, * gadget->allow_connect and gadget->deactivate. The routines * usb_gadget_connect_locked(), usb_gadget_disconnect_locked(), * usb_udc_connect_control_locked(), usb_gadget_udc_start_locked() and * usb_gadget_udc_stop_locked() are called with this lock held. * * This represents the internal data structure which is used by the UDC-class * to hold information about udc driver and gadget together. */ struct usb_udc { struct usb_gadget_driver *driver; struct usb_gadget *gadget; struct device dev; struct list_head list; bool vbus; bool started; bool allow_connect; struct work_struct vbus_work; struct mutex connect_lock; }; static const struct class udc_class; static LIST_HEAD(udc_list); /* Protects udc_list, udc->driver, driver->is_bound, and related calls */ static DEFINE_MUTEX(udc_lock); /* ------------------------------------------------------------------------- */ /** * usb_ep_set_maxpacket_limit - set maximum packet size limit for endpoint * @ep:the endpoint being configured * @maxpacket_limit:value of maximum packet size limit * * This function should be used only in UDC drivers to initialize endpoint * (usually in probe function). */ void usb_ep_set_maxpacket_limit(struct usb_ep *ep, unsigned maxpacket_limit) { ep->maxpacket_limit = maxpacket_limit; ep->maxpacket = maxpacket_limit; trace_usb_ep_set_maxpacket_limit(ep, 0); } EXPORT_SYMBOL_GPL(usb_ep_set_maxpacket_limit); /** * usb_ep_enable - configure endpoint, making it usable * @ep:the endpoint being configured. may not be the endpoint named "ep0". * drivers discover endpoints through the ep_list of a usb_gadget. * * When configurations are set, or when interface settings change, the driver * will enable or disable the relevant endpoints. while it is enabled, an * endpoint may be used for i/o until the driver receives a disconnect() from * the host or until the endpoint is disabled. * * the ep0 implementation (which calls this routine) must ensure that the * hardware capabilities of each endpoint match the descriptor provided * for it. for example, an endpoint named "ep2in-bulk" would be usable * for interrupt transfers as well as bulk, but it likely couldn't be used * for iso transfers or for endpoint 14. some endpoints are fully * configurable, with more generic names like "ep-a". (remember that for * USB, "in" means "towards the USB host".) * * This routine may be called in an atomic (interrupt) context. * * returns zero, or a negative error code. */ int usb_ep_enable(struct usb_ep *ep) { int ret = 0; if (ep->enabled) goto out; /* UDC drivers can't handle endpoints with maxpacket size 0 */ if (usb_endpoint_maxp(ep->desc) == 0) { /* * We should log an error message here, but we can't call * dev_err() because there's no way to find the gadget * given only ep. */ ret = -EINVAL; goto out; } ret = ep->ops->enable(ep, ep->desc); if (ret) goto out; ep->enabled = true; out: trace_usb_ep_enable(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_enable); /** * usb_ep_disable - endpoint is no longer usable * @ep:the endpoint being unconfigured. may not be the endpoint named "ep0". * * no other task may be using this endpoint when this is called. * any pending and uncompleted requests will complete with status * indicating disconnect (-ESHUTDOWN) before this call returns. * gadget drivers must call usb_ep_enable() again before queueing * requests to the endpoint. * * This routine may be called in an atomic (interrupt) context. * * returns zero, or a negative error code. */ int usb_ep_disable(struct usb_ep *ep) { int ret = 0; if (!ep->enabled) goto out; ret = ep->ops->disable(ep); if (ret) goto out; ep->enabled = false; out: trace_usb_ep_disable(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_disable); /** * usb_ep_alloc_request - allocate a request object to use with this endpoint * @ep:the endpoint to be used with with the request * @gfp_flags:GFP_* flags to use * * Request objects must be allocated with this call, since they normally * need controller-specific setup and may even need endpoint-specific * resources such as allocation of DMA descriptors. * Requests may be submitted with usb_ep_queue(), and receive a single * completion callback. Free requests with usb_ep_free_request(), when * they are no longer needed. * * Returns the request, or null if one could not be allocated. */ struct usb_request *usb_ep_alloc_request(struct usb_ep *ep, gfp_t gfp_flags) { struct usb_request *req = NULL; req = ep->ops->alloc_request(ep, gfp_flags); trace_usb_ep_alloc_request(ep, req, req ? 0 : -ENOMEM); return req; } EXPORT_SYMBOL_GPL(usb_ep_alloc_request); /** * usb_ep_free_request - frees a request object * @ep:the endpoint associated with the request * @req:the request being freed * * Reverses the effect of usb_ep_alloc_request(). * Caller guarantees the request is not queued, and that it will * no longer be requeued (or otherwise used). */ void usb_ep_free_request(struct usb_ep *ep, struct usb_request *req) { trace_usb_ep_free_request(ep, req, 0); ep->ops->free_request(ep, req); } EXPORT_SYMBOL_GPL(usb_ep_free_request); /** * usb_ep_queue - queues (submits) an I/O request to an endpoint. * @ep:the endpoint associated with the request * @req:the request being submitted * @gfp_flags: GFP_* flags to use in case the lower level driver couldn't * pre-allocate all necessary memory with the request. * * This tells the device controller to perform the specified request through * that endpoint (reading or writing a buffer). When the request completes, * including being canceled by usb_ep_dequeue(), the request's completion * routine is called to return the request to the driver. Any endpoint * (except control endpoints like ep0) may have more than one transfer * request queued; they complete in FIFO order. Once a gadget driver * submits a request, that request may not be examined or modified until it * is given back to that driver through the completion callback. * * Each request is turned into one or more packets. The controller driver * never merges adjacent requests into the same packet. OUT transfers * will sometimes use data that's already buffered in the hardware. * Drivers can rely on the fact that the first byte of the request's buffer * always corresponds to the first byte of some USB packet, for both * IN and OUT transfers. * * Bulk endpoints can queue any amount of data; the transfer is packetized * automatically. The last packet will be short if the request doesn't fill it * out completely. Zero length packets (ZLPs) should be avoided in portable * protocols since not all usb hardware can successfully handle zero length * packets. (ZLPs may be explicitly written, and may be implicitly written if * the request 'zero' flag is set.) Bulk endpoints may also be used * for interrupt transfers; but the reverse is not true, and some endpoints * won't support every interrupt transfer. (Such as 768 byte packets.) * * Interrupt-only endpoints are less functional than bulk endpoints, for * example by not supporting queueing or not handling buffers that are * larger than the endpoint's maxpacket size. They may also treat data * toggle differently. * * Control endpoints ... after getting a setup() callback, the driver queues * one response (even if it would be zero length). That enables the * status ack, after transferring data as specified in the response. Setup * functions may return negative error codes to generate protocol stalls. * (Note that some USB device controllers disallow protocol stall responses * in some cases.) When control responses are deferred (the response is * written after the setup callback returns), then usb_ep_set_halt() may be * used on ep0 to trigger protocol stalls. Depending on the controller, * it may not be possible to trigger a status-stage protocol stall when the * data stage is over, that is, from within the response's completion * routine. * * For periodic endpoints, like interrupt or isochronous ones, the usb host * arranges to poll once per interval, and the gadget driver usually will * have queued some data to transfer at that time. * * Note that @req's ->complete() callback must never be called from * within usb_ep_queue() as that can create deadlock situations. * * This routine may be called in interrupt context. * * Returns zero, or a negative error code. Endpoints that are not enabled * report errors; errors will also be * reported when the usb peripheral is disconnected. * * If and only if @req is successfully queued (the return value is zero), * @req->complete() will be called exactly once, when the Gadget core and * UDC are finished with the request. When the completion function is called, * control of the request is returned to the device driver which submitted it. * The completion handler may then immediately free or reuse @req. */ int usb_ep_queue(struct usb_ep *ep, struct usb_request *req, gfp_t gfp_flags) { int ret = 0; if (!ep->enabled && ep->address) { pr_debug("USB gadget: queue request to disabled ep 0x%x (%s)\n", ep->address, ep->name); ret = -ESHUTDOWN; goto out; } ret = ep->ops->queue(ep, req, gfp_flags); out: trace_usb_ep_queue(ep, req, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_queue); /** * usb_ep_dequeue - dequeues (cancels, unlinks) an I/O request from an endpoint * @ep:the endpoint associated with the request * @req:the request being canceled * * If the request is still active on the endpoint, it is dequeued and * eventually its completion routine is called (with status -ECONNRESET); * else a negative error code is returned. This routine is asynchronous, * that is, it may return before the completion routine runs. * * Note that some hardware can't clear out write fifos (to unlink the request * at the head of the queue) except as part of disconnecting from usb. Such * restrictions prevent drivers from supporting configuration changes, * even to configuration zero (a "chapter 9" requirement). * * This routine may be called in interrupt context. */ int usb_ep_dequeue(struct usb_ep *ep, struct usb_request *req) { int ret; ret = ep->ops->dequeue(ep, req); trace_usb_ep_dequeue(ep, req, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_dequeue); /** * usb_ep_set_halt - sets the endpoint halt feature. * @ep: the non-isochronous endpoint being stalled * * Use this to stall an endpoint, perhaps as an error report. * Except for control endpoints, * the endpoint stays halted (will not stream any data) until the host * clears this feature; drivers may need to empty the endpoint's request * queue first, to make sure no inappropriate transfers happen. * * Note that while an endpoint CLEAR_FEATURE will be invisible to the * gadget driver, a SET_INTERFACE will not be. To reset endpoints for the * current altsetting, see usb_ep_clear_halt(). When switching altsettings, * it's simplest to use usb_ep_enable() or usb_ep_disable() for the endpoints. * * This routine may be called in interrupt context. * * Returns zero, or a negative error code. On success, this call sets * underlying hardware state that blocks data transfers. * Attempts to halt IN endpoints will fail (returning -EAGAIN) if any * transfer requests are still queued, or if the controller hardware * (usually a FIFO) still holds bytes that the host hasn't collected. */ int usb_ep_set_halt(struct usb_ep *ep) { int ret; ret = ep->ops->set_halt(ep, 1); trace_usb_ep_set_halt(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_set_halt); /** * usb_ep_clear_halt - clears endpoint halt, and resets toggle * @ep:the bulk or interrupt endpoint being reset * * Use this when responding to the standard usb "set interface" request, * for endpoints that aren't reconfigured, after clearing any other state * in the endpoint's i/o queue. * * This routine may be called in interrupt context. * * Returns zero, or a negative error code. On success, this call clears * the underlying hardware state reflecting endpoint halt and data toggle. * Note that some hardware can't support this request (like pxa2xx_udc), * and accordingly can't correctly implement interface altsettings. */ int usb_ep_clear_halt(struct usb_ep *ep) { int ret; ret = ep->ops->set_halt(ep, 0); trace_usb_ep_clear_halt(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_clear_halt); /** * usb_ep_set_wedge - sets the halt feature and ignores clear requests * @ep: the endpoint being wedged * * Use this to stall an endpoint and ignore CLEAR_FEATURE(HALT_ENDPOINT) * requests. If the gadget driver clears the halt status, it will * automatically unwedge the endpoint. * * This routine may be called in interrupt context. * * Returns zero on success, else negative errno. */ int usb_ep_set_wedge(struct usb_ep *ep) { int ret; if (ep->ops->set_wedge) ret = ep->ops->set_wedge(ep); else ret = ep->ops->set_halt(ep, 1); trace_usb_ep_set_wedge(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_set_wedge); /** * usb_ep_fifo_status - returns number of bytes in fifo, or error * @ep: the endpoint whose fifo status is being checked. * * FIFO endpoints may have "unclaimed data" in them in certain cases, * such as after aborted transfers. Hosts may not have collected all * the IN data written by the gadget driver (and reported by a request * completion). The gadget driver may not have collected all the data * written OUT to it by the host. Drivers that need precise handling for * fault reporting or recovery may need to use this call. * * This routine may be called in interrupt context. * * This returns the number of such bytes in the fifo, or a negative * errno if the endpoint doesn't use a FIFO or doesn't support such * precise handling. */ int usb_ep_fifo_status(struct usb_ep *ep) { int ret; if (ep->ops->fifo_status) ret = ep->ops->fifo_status(ep); else ret = -EOPNOTSUPP; trace_usb_ep_fifo_status(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_fifo_status); /** * usb_ep_fifo_flush - flushes contents of a fifo * @ep: the endpoint whose fifo is being flushed. * * This call may be used to flush the "unclaimed data" that may exist in * an endpoint fifo after abnormal transaction terminations. The call * must never be used except when endpoint is not being used for any * protocol translation. * * This routine may be called in interrupt context. */ void usb_ep_fifo_flush(struct usb_ep *ep) { if (ep->ops->fifo_flush) ep->ops->fifo_flush(ep); trace_usb_ep_fifo_flush(ep, 0); } EXPORT_SYMBOL_GPL(usb_ep_fifo_flush); /* ------------------------------------------------------------------------- */ /** * usb_gadget_frame_number - returns the current frame number * @gadget: controller that reports the frame number * * Returns the usb frame number, normally eleven bits from a SOF packet, * or negative errno if this device doesn't support this capability. */ int usb_gadget_frame_number(struct usb_gadget *gadget) { int ret; ret = gadget->ops->get_frame(gadget); trace_usb_gadget_frame_number(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_frame_number); /** * usb_gadget_wakeup - tries to wake up the host connected to this gadget * @gadget: controller used to wake up the host * * Returns zero on success, else negative error code if the hardware * doesn't support such attempts, or its support has not been enabled * by the usb host. Drivers must return device descriptors that report * their ability to support this, or hosts won't enable it. * * This may also try to use SRP to wake the host and start enumeration, * even if OTG isn't otherwise in use. OTG devices may also start * remote wakeup even when hosts don't explicitly enable it. */ int usb_gadget_wakeup(struct usb_gadget *gadget) { int ret = 0; if (!gadget->ops->wakeup) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->wakeup(gadget); out: trace_usb_gadget_wakeup(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_wakeup); /** * usb_gadget_set_remote_wakeup - configures the device remote wakeup feature. * @gadget:the device being configured for remote wakeup * @set:value to be configured. * * set to one to enable remote wakeup feature and zero to disable it. * * returns zero on success, else negative errno. */ int usb_gadget_set_remote_wakeup(struct usb_gadget *gadget, int set) { int ret = 0; if (!gadget->ops->set_remote_wakeup) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->set_remote_wakeup(gadget, set); out: trace_usb_gadget_set_remote_wakeup(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_set_remote_wakeup); /** * usb_gadget_set_selfpowered - sets the device selfpowered feature. * @gadget:the device being declared as self-powered * * this affects the device status reported by the hardware driver * to reflect that it now has a local power supply. * * returns zero on success, else negative errno. */ int usb_gadget_set_selfpowered(struct usb_gadget *gadget) { int ret = 0; if (!gadget->ops->set_selfpowered) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->set_selfpowered(gadget, 1); out: trace_usb_gadget_set_selfpowered(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_set_selfpowered); /** * usb_gadget_clear_selfpowered - clear the device selfpowered feature. * @gadget:the device being declared as bus-powered * * this affects the device status reported by the hardware driver. * some hardware may not support bus-powered operation, in which * case this feature's value can never change. * * returns zero on success, else negative errno. */ int usb_gadget_clear_selfpowered(struct usb_gadget *gadget) { int ret = 0; if (!gadget->ops->set_selfpowered) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->set_selfpowered(gadget, 0); out: trace_usb_gadget_clear_selfpowered(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_clear_selfpowered); /** * usb_gadget_vbus_connect - Notify controller that VBUS is powered * @gadget:The device which now has VBUS power. * Context: can sleep * * This call is used by a driver for an external transceiver (or GPIO) * that detects a VBUS power session starting. Common responses include * resuming the controller, activating the D+ (or D-) pullup to let the * host detect that a USB device is attached, and starting to draw power * (8mA or possibly more, especially after SET_CONFIGURATION). * * Returns zero on success, else negative errno. */ int usb_gadget_vbus_connect(struct usb_gadget *gadget) { int ret = 0; if (!gadget->ops->vbus_session) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->vbus_session(gadget, 1); out: trace_usb_gadget_vbus_connect(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_vbus_connect); /** * usb_gadget_vbus_draw - constrain controller's VBUS power usage * @gadget:The device whose VBUS usage is being described * @mA:How much current to draw, in milliAmperes. This should be twice * the value listed in the configuration descriptor bMaxPower field. * * This call is used by gadget drivers during SET_CONFIGURATION calls, * reporting how much power the device may consume. For example, this * could affect how quickly batteries are recharged. * * Returns zero on success, else negative errno. */ int usb_gadget_vbus_draw(struct usb_gadget *gadget, unsigned mA) { int ret = 0; if (!gadget->ops->vbus_draw) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->vbus_draw(gadget, mA); if (!ret) gadget->mA = mA; out: trace_usb_gadget_vbus_draw(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_vbus_draw); /** * usb_gadget_vbus_disconnect - notify controller about VBUS session end * @gadget:the device whose VBUS supply is being described * Context: can sleep * * This call is used by a driver for an external transceiver (or GPIO) * that detects a VBUS power session ending. Common responses include * reversing everything done in usb_gadget_vbus_connect(). * * Returns zero on success, else negative errno. */ int usb_gadget_vbus_disconnect(struct usb_gadget *gadget) { int ret = 0; if (!gadget->ops->vbus_session) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->vbus_session(gadget, 0); out: trace_usb_gadget_vbus_disconnect(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_vbus_disconnect); static int usb_gadget_connect_locked(struct usb_gadget *gadget) __must_hold(&gadget->udc->connect_lock) { int ret = 0; if (!gadget->ops->pullup) { ret = -EOPNOTSUPP; goto out; } if (gadget->deactivated || !gadget->udc->allow_connect || !gadget->udc->started) { /* * If the gadget isn't usable (because it is deactivated, * unbound, or not yet started), we only save the new state. * The gadget will be connected automatically when it is * activated/bound/started. */ gadget->connected = true; goto out; } ret = gadget->ops->pullup(gadget, 1); if (!ret) gadget->connected = 1; out: trace_usb_gadget_connect(gadget, ret); return ret; } /** * usb_gadget_connect - software-controlled connect to USB host * @gadget:the peripheral being connected * * Enables the D+ (or potentially D-) pullup. The host will start * enumerating this gadget when the pullup is active and a VBUS session * is active (the link is powered). * * Returns zero on success, else negative errno. */ int usb_gadget_connect(struct usb_gadget *gadget) { int ret; mutex_lock(&gadget->udc->connect_lock); ret = usb_gadget_connect_locked(gadget); mutex_unlock(&gadget->udc->connect_lock); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_connect); static int usb_gadget_disconnect_locked(struct usb_gadget *gadget) __must_hold(&gadget->udc->connect_lock) { int ret = 0; if (!gadget->ops->pullup) { ret = -EOPNOTSUPP; goto out; } if (!gadget->connected) goto out; if (gadget->deactivated || !gadget->udc->started) { /* * If gadget is deactivated we only save new state. * Gadget will stay disconnected after activation. */ gadget->connected = false; goto out; } ret = gadget->ops->pullup(gadget, 0); if (!ret) gadget->connected = 0; mutex_lock(&udc_lock); if (gadget->udc->driver) gadget->udc->driver->disconnect(gadget); mutex_unlock(&udc_lock); out: trace_usb_gadget_disconnect(gadget, ret); return ret; } /** * usb_gadget_disconnect - software-controlled disconnect from USB host * @gadget:the peripheral being disconnected * * Disables the D+ (or potentially D-) pullup, which the host may see * as a disconnect (when a VBUS session is active). Not all systems * support software pullup controls. * * Following a successful disconnect, invoke the ->disconnect() callback * for the current gadget driver so that UDC drivers don't need to. * * Returns zero on success, else negative errno. */ int usb_gadget_disconnect(struct usb_gadget *gadget) { int ret; mutex_lock(&gadget->udc->connect_lock); ret = usb_gadget_disconnect_locked(gadget); mutex_unlock(&gadget->udc->connect_lock); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_disconnect); /** * usb_gadget_deactivate - deactivate function which is not ready to work * @gadget: the peripheral being deactivated * * This routine may be used during the gadget driver bind() call to prevent * the peripheral from ever being visible to the USB host, unless later * usb_gadget_activate() is called. For example, user mode components may * need to be activated before the system can talk to hosts. * * This routine may sleep; it must not be called in interrupt context * (such as from within a gadget driver's disconnect() callback). * * Returns zero on success, else negative errno. */ int usb_gadget_deactivate(struct usb_gadget *gadget) { int ret = 0; mutex_lock(&gadget->udc->connect_lock); if (gadget->deactivated) goto unlock; if (gadget->connected) { ret = usb_gadget_disconnect_locked(gadget); if (ret) goto unlock; /* * If gadget was being connected before deactivation, we want * to reconnect it in usb_gadget_activate(). */ gadget->connected = true; } gadget->deactivated = true; unlock: mutex_unlock(&gadget->udc->connect_lock); trace_usb_gadget_deactivate(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_deactivate); /** * usb_gadget_activate - activate function which is not ready to work * @gadget: the peripheral being activated * * This routine activates gadget which was previously deactivated with * usb_gadget_deactivate() call. It calls usb_gadget_connect() if needed. * * This routine may sleep; it must not be called in interrupt context. * * Returns zero on success, else negative errno. */ int usb_gadget_activate(struct usb_gadget *gadget) { int ret = 0; mutex_lock(&gadget->udc->connect_lock); if (!gadget->deactivated) goto unlock; gadget->deactivated = false; /* * If gadget has been connected before deactivation, or became connected * while it was being deactivated, we call usb_gadget_connect(). */ if (gadget->connected) ret = usb_gadget_connect_locked(gadget); unlock: mutex_unlock(&gadget->udc->connect_lock); trace_usb_gadget_activate(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_activate); /* ------------------------------------------------------------------------- */ #ifdef CONFIG_HAS_DMA int usb_gadget_map_request_by_dev(struct device *dev, struct usb_request *req, int is_in) { if (req->length == 0) return 0; if (req->sg_was_mapped) { req->num_mapped_sgs = req->num_sgs; return 0; } if (req->num_sgs) { int mapped; mapped = dma_map_sg(dev, req->sg, req->num_sgs, is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); if (mapped == 0) { dev_err(dev, "failed to map SGs\n"); return -EFAULT; } req->num_mapped_sgs = mapped; } else { if (is_vmalloc_addr(req->buf)) { dev_err(dev, "buffer is not dma capable\n"); return -EFAULT; } else if (object_is_on_stack(req->buf)) { dev_err(dev, "buffer is on stack\n"); return -EFAULT; } req->dma = dma_map_single(dev, req->buf, req->length, is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); if (dma_mapping_error(dev, req->dma)) { dev_err(dev, "failed to map buffer\n"); return -EFAULT; } req->dma_mapped = 1; } return 0; } EXPORT_SYMBOL_GPL(usb_gadget_map_request_by_dev); int usb_gadget_map_request(struct usb_gadget *gadget, struct usb_request *req, int is_in) { return usb_gadget_map_request_by_dev(gadget->dev.parent, req, is_in); } EXPORT_SYMBOL_GPL(usb_gadget_map_request); void usb_gadget_unmap_request_by_dev(struct device *dev, struct usb_request *req, int is_in) { if (req->length == 0 || req->sg_was_mapped) return; if (req->num_mapped_sgs) { dma_unmap_sg(dev, req->sg, req->num_sgs, is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); req->num_mapped_sgs = 0; } else if (req->dma_mapped) { dma_unmap_single(dev, req->dma, req->length, is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); req->dma_mapped = 0; } } EXPORT_SYMBOL_GPL(usb_gadget_unmap_request_by_dev); void usb_gadget_unmap_request(struct usb_gadget *gadget, struct usb_request *req, int is_in) { usb_gadget_unmap_request_by_dev(gadget->dev.parent, req, is_in); } EXPORT_SYMBOL_GPL(usb_gadget_unmap_request); #endif /* CONFIG_HAS_DMA */ /* ------------------------------------------------------------------------- */ /** * usb_gadget_giveback_request - give the request back to the gadget layer * @ep: the endpoint to be used with with the request * @req: the request being given back * * This is called by device controller drivers in order to return the * completed request back to the gadget layer. */ void usb_gadget_giveback_request(struct usb_ep *ep, struct usb_request *req) { if (likely(req->status == 0)) usb_led_activity(USB_LED_EVENT_GADGET); trace_usb_gadget_giveback_request(ep, req, 0); req->complete(ep, req); } EXPORT_SYMBOL_GPL(usb_gadget_giveback_request); /* ------------------------------------------------------------------------- */ /** * gadget_find_ep_by_name - returns ep whose name is the same as sting passed * in second parameter or NULL if searched endpoint not found * @g: controller to check for quirk * @name: name of searched endpoint */ struct usb_ep *gadget_find_ep_by_name(struct usb_gadget *g, const char *name) { struct usb_ep *ep; gadget_for_each_ep(ep, g) { if (!strcmp(ep->name, name)) return ep; } return NULL; } EXPORT_SYMBOL_GPL(gadget_find_ep_by_name); /* ------------------------------------------------------------------------- */ int usb_gadget_ep_match_desc(struct usb_gadget *gadget, struct usb_ep *ep, struct usb_endpoint_descriptor *desc, struct usb_ss_ep_comp_descriptor *ep_comp) { u8 type; u16 max; int num_req_streams = 0; /* endpoint already claimed? */ if (ep->claimed) return 0; type = usb_endpoint_type(desc); max = usb_endpoint_maxp(desc); if (usb_endpoint_dir_in(desc) && !ep->caps.dir_in) return 0; if (usb_endpoint_dir_out(desc) && !ep->caps.dir_out) return 0; if (max > ep->maxpacket_limit) return 0; /* "high bandwidth" works only at high speed */ if (!gadget_is_dualspeed(gadget) && usb_endpoint_maxp_mult(desc) > 1) return 0; switch (type) { case USB_ENDPOINT_XFER_CONTROL: /* only support ep0 for portable CONTROL traffic */ return 0; case USB_ENDPOINT_XFER_ISOC: if (!ep->caps.type_iso) return 0; /* ISO: limit 1023 bytes full speed, 1024 high/super speed */ if (!gadget_is_dualspeed(gadget) && max > 1023) return 0; break; case USB_ENDPOINT_XFER_BULK: if (!ep->caps.type_bulk) return 0; if (ep_comp && gadget_is_superspeed(gadget)) { /* Get the number of required streams from the * EP companion descriptor and see if the EP * matches it */ num_req_streams = ep_comp->bmAttributes & 0x1f; if (num_req_streams > ep->max_streams) return 0; } break; case USB_ENDPOINT_XFER_INT: /* Bulk endpoints handle interrupt transfers, * except the toggle-quirky iso-synch kind */ if (!ep->caps.type_int && !ep->caps.type_bulk) return 0; /* INT: limit 64 bytes full speed, 1024 high/super speed */ if (!gadget_is_dualspeed(gadget) && max > 64) return 0; break; } return 1; } EXPORT_SYMBOL_GPL(usb_gadget_ep_match_desc); /** * usb_gadget_check_config - checks if the UDC can support the binded * configuration * @gadget: controller to check the USB configuration * * Ensure that a UDC is able to support the requested resources by a * configuration, and that there are no resource limitations, such as * internal memory allocated to all requested endpoints. * * Returns zero on success, else a negative errno. */ int usb_gadget_check_config(struct usb_gadget *gadget) { if (gadget->ops->check_config) return gadget->ops->check_config(gadget); return 0; } EXPORT_SYMBOL_GPL(usb_gadget_check_config); /* ------------------------------------------------------------------------- */ static void usb_gadget_state_work(struct work_struct *work) { struct usb_gadget *gadget = work_to_gadget(work); struct usb_udc *udc = gadget->udc; if (udc) sysfs_notify(&udc->dev.kobj, NULL, "state"); } void usb_gadget_set_state(struct usb_gadget *gadget, enum usb_device_state state) { gadget->state = state; schedule_work(&gadget->work); } EXPORT_SYMBOL_GPL(usb_gadget_set_state); /* ------------------------------------------------------------------------- */ /* Acquire connect_lock before calling this function. */ static int usb_udc_connect_control_locked(struct usb_udc *udc) __must_hold(&udc->connect_lock) { if (udc->vbus) return usb_gadget_connect_locked(udc->gadget); else return usb_gadget_disconnect_locked(udc->gadget); } static void vbus_event_work(struct work_struct *work) { struct usb_udc *udc = container_of(work, struct usb_udc, vbus_work); mutex_lock(&udc->connect_lock); usb_udc_connect_control_locked(udc); mutex_unlock(&udc->connect_lock); } /** * usb_udc_vbus_handler - updates the udc core vbus status, and try to * connect or disconnect gadget * @gadget: The gadget which vbus change occurs * @status: The vbus status * * The udc driver calls it when it wants to connect or disconnect gadget * according to vbus status. * * This function can be invoked from interrupt context by irq handlers of * the gadget drivers, however, usb_udc_connect_control() has to run in * non-atomic context due to the following: * a. Some of the gadget driver implementations expect the ->pullup * callback to be invoked in non-atomic context. * b. usb_gadget_disconnect() acquires udc_lock which is a mutex. * Hence offload invocation of usb_udc_connect_control() to workqueue. */ void usb_udc_vbus_handler(struct usb_gadget *gadget, bool status) { struct usb_udc *udc = gadget->udc; if (udc) { udc->vbus = status; schedule_work(&udc->vbus_work); } } EXPORT_SYMBOL_GPL(usb_udc_vbus_handler); /** * usb_gadget_udc_reset - notifies the udc core that bus reset occurs * @gadget: The gadget which bus reset occurs * @driver: The gadget driver we want to notify * * If the udc driver has bus reset handler, it needs to call this when the bus * reset occurs, it notifies the gadget driver that the bus reset occurs as * well as updates gadget state. */ void usb_gadget_udc_reset(struct usb_gadget *gadget, struct usb_gadget_driver *driver) { driver->reset(gadget); usb_gadget_set_state(gadget, USB_STATE_DEFAULT); } EXPORT_SYMBOL_GPL(usb_gadget_udc_reset); /** * usb_gadget_udc_start_locked - tells usb device controller to start up * @udc: The UDC to be started * * This call is issued by the UDC Class driver when it's about * to register a gadget driver to the device controller, before * calling gadget driver's bind() method. * * It allows the controller to be powered off until strictly * necessary to have it powered on. * * Returns zero on success, else negative errno. * * Caller should acquire connect_lock before invoking this function. */ static inline int usb_gadget_udc_start_locked(struct usb_udc *udc) __must_hold(&udc->connect_lock) { int ret; if (udc->started) { dev_err(&udc->dev, "UDC had already started\n"); return -EBUSY; } ret = udc->gadget->ops->udc_start(udc->gadget, udc->driver); if (!ret) udc->started = true; return ret; } /** * usb_gadget_udc_stop_locked - tells usb device controller we don't need it anymore * @udc: The UDC to be stopped * * This call is issued by the UDC Class driver after calling * gadget driver's unbind() method. * * The details are implementation specific, but it can go as * far as powering off UDC completely and disable its data * line pullups. * * Caller should acquire connect lock before invoking this function. */ static inline void usb_gadget_udc_stop_locked(struct usb_udc *udc) __must_hold(&udc->connect_lock) { if (!udc->started) { dev_err(&udc->dev, "UDC had already stopped\n"); return; } udc->gadget->ops->udc_stop(udc->gadget); udc->started = false; } /** * usb_gadget_udc_set_speed - tells usb device controller speed supported by * current driver * @udc: The device we want to set maximum speed * @speed: The maximum speed to allowed to run * * This call is issued by the UDC Class driver before calling * usb_gadget_udc_start() in order to make sure that we don't try to * connect on speeds the gadget driver doesn't support. */ static inline void usb_gadget_udc_set_speed(struct usb_udc *udc, enum usb_device_speed speed) { struct usb_gadget *gadget = udc->gadget; enum usb_device_speed s; if (speed == USB_SPEED_UNKNOWN) s = gadget->max_speed; else s = min(speed, gadget->max_speed); if (s == USB_SPEED_SUPER_PLUS && gadget->ops->udc_set_ssp_rate) gadget->ops->udc_set_ssp_rate(gadget, gadget->max_ssp_rate); else if (gadget->ops->udc_set_speed) gadget->ops->udc_set_speed(gadget, s); } /** * usb_gadget_enable_async_callbacks - tell usb device controller to enable asynchronous callbacks * @udc: The UDC which should enable async callbacks * * This routine is used when binding gadget drivers. It undoes the effect * of usb_gadget_disable_async_callbacks(); the UDC driver should enable IRQs * (if necessary) and resume issuing callbacks. * * This routine will always be called in process context. */ static inline void usb_gadget_enable_async_callbacks(struct usb_udc *udc) { struct usb_gadget *gadget = udc->gadget; if (gadget->ops->udc_async_callbacks) gadget->ops->udc_async_callbacks(gadget, true); } /** * usb_gadget_disable_async_callbacks - tell usb device controller to disable asynchronous callbacks * @udc: The UDC which should disable async callbacks * * This routine is used when unbinding gadget drivers. It prevents a race: * The UDC driver doesn't know when the gadget driver's ->unbind callback * runs, so unless it is told to disable asynchronous callbacks, it might * issue a callback (such as ->disconnect) after the unbind has completed. * * After this function runs, the UDC driver must suppress all ->suspend, * ->resume, ->disconnect, ->reset, and ->setup callbacks to the gadget driver * until async callbacks are again enabled. A simple-minded but effective * way to accomplish this is to tell the UDC hardware not to generate any * more IRQs. * * Request completion callbacks must still be issued. However, it's okay * to defer them until the request is cancelled, since the pull-up will be * turned off during the time period when async callbacks are disabled. * * This routine will always be called in process context. */ static inline void usb_gadget_disable_async_callbacks(struct usb_udc *udc) { struct usb_gadget *gadget = udc->gadget; if (gadget->ops->udc_async_callbacks) gadget->ops->udc_async_callbacks(gadget, false); } /** * usb_udc_release - release the usb_udc struct * @dev: the dev member within usb_udc * * This is called by driver's core in order to free memory once the last * reference is released. */ static void usb_udc_release(struct device *dev) { struct usb_udc *udc; udc = container_of(dev, struct usb_udc, dev); dev_dbg(dev, "releasing '%s'\n", dev_name(dev)); kfree(udc); } static const struct attribute_group *usb_udc_attr_groups[]; static void usb_udc_nop_release(struct device *dev) { dev_vdbg(dev, "%s\n", __func__); } /** * usb_initialize_gadget - initialize a gadget and its embedded struct device * @parent: the parent device to this udc. Usually the controller driver's * device. * @gadget: the gadget to be initialized. * @release: a gadget release function. */ void usb_initialize_gadget(struct device *parent, struct usb_gadget *gadget, void (*release)(struct device *dev)) { INIT_WORK(&gadget->work, usb_gadget_state_work); gadget->dev.parent = parent; if (release) gadget->dev.release = release; else gadget->dev.release = usb_udc_nop_release; device_initialize(&gadget->dev); gadget->dev.bus = &gadget_bus_type; } EXPORT_SYMBOL_GPL(usb_initialize_gadget); /** * usb_add_gadget - adds a new gadget to the udc class driver list * @gadget: the gadget to be added to the list. * * Returns zero on success, negative errno otherwise. * Does not do a final usb_put_gadget() if an error occurs. */ int usb_add_gadget(struct usb_gadget *gadget) { struct usb_udc *udc; int ret = -ENOMEM; udc = kzalloc(sizeof(*udc), GFP_KERNEL); if (!udc) goto error; device_initialize(&udc->dev); udc->dev.release = usb_udc_release; udc->dev.class = &udc_class; udc->dev.groups = usb_udc_attr_groups; udc->dev.parent = gadget->dev.parent; ret = dev_set_name(&udc->dev, "%s", kobject_name(&gadget->dev.parent->kobj)); if (ret) goto err_put_udc; udc->gadget = gadget; gadget->udc = udc; mutex_init(&udc->connect_lock); udc->started = false; mutex_lock(&udc_lock); list_add_tail(&udc->list, &udc_list); mutex_unlock(&udc_lock); INIT_WORK(&udc->vbus_work, vbus_event_work); ret = device_add(&udc->dev); if (ret) goto err_unlist_udc; usb_gadget_set_state(gadget, USB_STATE_NOTATTACHED); udc->vbus = true; ret = ida_alloc(&gadget_id_numbers, GFP_KERNEL); if (ret < 0) goto err_del_udc; gadget->id_number = ret; dev_set_name(&gadget->dev, "gadget.%d", ret); ret = device_add(&gadget->dev); if (ret) goto err_free_id; ret = sysfs_create_link(&udc->dev.kobj, &gadget->dev.kobj, "gadget"); if (ret) goto err_del_gadget; return 0; err_del_gadget: device_del(&gadget->dev); err_free_id: ida_free(&gadget_id_numbers, gadget->id_number); err_del_udc: flush_work(&gadget->work); device_del(&udc->dev); err_unlist_udc: mutex_lock(&udc_lock); list_del(&udc->list); mutex_unlock(&udc_lock); err_put_udc: put_device(&udc->dev); error: return ret; } EXPORT_SYMBOL_GPL(usb_add_gadget); /** * usb_add_gadget_udc_release - adds a new gadget to the udc class driver list * @parent: the parent device to this udc. Usually the controller driver's * device. * @gadget: the gadget to be added to the list. * @release: a gadget release function. * * Returns zero on success, negative errno otherwise. * Calls the gadget release function in the latter case. */ int usb_add_gadget_udc_release(struct device *parent, struct usb_gadget *gadget, void (*release)(struct device *dev)) { int ret; usb_initialize_gadget(parent, gadget, release); ret = usb_add_gadget(gadget); if (ret) usb_put_gadget(gadget); return ret; } EXPORT_SYMBOL_GPL(usb_add_gadget_udc_release); /** * usb_get_gadget_udc_name - get the name of the first UDC controller * This functions returns the name of the first UDC controller in the system. * Please note that this interface is usefull only for legacy drivers which * assume that there is only one UDC controller in the system and they need to * get its name before initialization. There is no guarantee that the UDC * of the returned name will be still available, when gadget driver registers * itself. * * Returns pointer to string with UDC controller name on success, NULL * otherwise. Caller should kfree() returned string. */ char *usb_get_gadget_udc_name(void) { struct usb_udc *udc; char *name = NULL; /* For now we take the first available UDC */ mutex_lock(&udc_lock); list_for_each_entry(udc, &udc_list, list) { if (!udc->driver) { name = kstrdup(udc->gadget->name, GFP_KERNEL); break; } } mutex_unlock(&udc_lock); return name; } EXPORT_SYMBOL_GPL(usb_get_gadget_udc_name); /** * usb_add_gadget_udc - adds a new gadget to the udc class driver list * @parent: the parent device to this udc. Usually the controller * driver's device. * @gadget: the gadget to be added to the list * * Returns zero on success, negative errno otherwise. */ int usb_add_gadget_udc(struct device *parent, struct usb_gadget *gadget) { return usb_add_gadget_udc_release(parent, gadget, NULL); } EXPORT_SYMBOL_GPL(usb_add_gadget_udc); /** * usb_del_gadget - deletes a gadget and unregisters its udc * @gadget: the gadget to be deleted. * * This will unbind @gadget, if it is bound. * It will not do a final usb_put_gadget(). */ void usb_del_gadget(struct usb_gadget *gadget) { struct usb_udc *udc = gadget->udc; if (!udc) return; dev_vdbg(gadget->dev.parent, "unregistering gadget\n"); mutex_lock(&udc_lock); list_del(&udc->list); mutex_unlock(&udc_lock); kobject_uevent(&udc->dev.kobj, KOBJ_REMOVE); sysfs_remove_link(&udc->dev.kobj, "gadget"); flush_work(&gadget->work); device_del(&gadget->dev); ida_free(&gadget_id_numbers, gadget->id_number); cancel_work_sync(&udc->vbus_work); device_unregister(&udc->dev); } EXPORT_SYMBOL_GPL(usb_del_gadget); /** * usb_del_gadget_udc - unregisters a gadget * @gadget: the gadget to be unregistered. * * Calls usb_del_gadget() and does a final usb_put_gadget(). */ void usb_del_gadget_udc(struct usb_gadget *gadget) { usb_del_gadget(gadget); usb_put_gadget(gadget); } EXPORT_SYMBOL_GPL(usb_del_gadget_udc); /* ------------------------------------------------------------------------- */ static int gadget_match_driver(struct device *dev, struct device_driver *drv) { struct usb_gadget *gadget = dev_to_usb_gadget(dev); struct usb_udc *udc = gadget->udc; struct usb_gadget_driver *driver = container_of(drv, struct usb_gadget_driver, driver); /* If the driver specifies a udc_name, it must match the UDC's name */ if (driver->udc_name && strcmp(driver->udc_name, dev_name(&udc->dev)) != 0) return 0; /* If the driver is already bound to a gadget, it doesn't match */ if (driver->is_bound) return 0; /* Otherwise any gadget driver matches any UDC */ return 1; } static int gadget_bind_driver(struct device *dev) { struct usb_gadget *gadget = dev_to_usb_gadget(dev); struct usb_udc *udc = gadget->udc; struct usb_gadget_driver *driver = container_of(dev->driver, struct usb_gadget_driver, driver); int ret = 0; mutex_lock(&udc_lock); if (driver->is_bound) { mutex_unlock(&udc_lock); return -ENXIO; /* Driver binds to only one gadget */ } driver->is_bound = true; udc->driver = driver; mutex_unlock(&udc_lock); dev_dbg(&udc->dev, "binding gadget driver [%s]\n", driver->function); usb_gadget_udc_set_speed(udc, driver->max_speed); ret = driver->bind(udc->gadget, driver); if (ret) goto err_bind; mutex_lock(&udc->connect_lock); ret = usb_gadget_udc_start_locked(udc); if (ret) { mutex_unlock(&udc->connect_lock); goto err_start; } usb_gadget_enable_async_callbacks(udc); udc->allow_connect = true; ret = usb_udc_connect_control_locked(udc); if (ret) goto err_connect_control; mutex_unlock(&udc->connect_lock); kobject_uevent(&udc->dev.kobj, KOBJ_CHANGE); return 0; err_connect_control: udc->allow_connect = false; usb_gadget_disable_async_callbacks(udc); if (gadget->irq) synchronize_irq(gadget->irq); usb_gadget_udc_stop_locked(udc); mutex_unlock(&udc->connect_lock); err_start: driver->unbind(udc->gadget); err_bind: if (ret != -EISNAM) dev_err(&udc->dev, "failed to start %s: %d\n", driver->function, ret); mutex_lock(&udc_lock); udc->driver = NULL; driver->is_bound = false; mutex_unlock(&udc_lock); return ret; } static void gadget_unbind_driver(struct device *dev) { struct usb_gadget *gadget = dev_to_usb_gadget(dev); struct usb_udc *udc = gadget->udc; struct usb_gadget_driver *driver = udc->driver; dev_dbg(&udc->dev, "unbinding gadget driver [%s]\n", driver->function); udc->allow_connect = false; cancel_work_sync(&udc->vbus_work); mutex_lock(&udc->connect_lock); usb_gadget_disconnect_locked(gadget); usb_gadget_disable_async_callbacks(udc); if (gadget->irq) synchronize_irq(gadget->irq); mutex_unlock(&udc->connect_lock); udc->driver->unbind(gadget); mutex_lock(&udc->connect_lock); usb_gadget_udc_stop_locked(udc); mutex_unlock(&udc->connect_lock); mutex_lock(&udc_lock); driver->is_bound = false; udc->driver = NULL; mutex_unlock(&udc_lock); kobject_uevent(&udc->dev.kobj, KOBJ_CHANGE); } /* ------------------------------------------------------------------------- */ int usb_gadget_register_driver_owner(struct usb_gadget_driver *driver, struct module *owner, const char *mod_name) { int ret; if (!driver || !driver->bind || !driver->setup) return -EINVAL; driver->driver.bus = &gadget_bus_type; driver->driver.owner = owner; driver->driver.mod_name = mod_name; ret = driver_register(&driver->driver); if (ret) { pr_warn("%s: driver registration failed: %d\n", driver->function, ret); return ret; } mutex_lock(&udc_lock); if (!driver->is_bound) { if (driver->match_existing_only) { pr_warn("%s: couldn't find an available UDC or it's busy\n", driver->function); ret = -EBUSY; } else { pr_info("%s: couldn't find an available UDC\n", driver->function); ret = 0; } } mutex_unlock(&udc_lock); if (ret) driver_unregister(&driver->driver); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_register_driver_owner); int usb_gadget_unregister_driver(struct usb_gadget_driver *driver) { if (!driver || !driver->unbind) return -EINVAL; driver_unregister(&driver->driver); return 0; } EXPORT_SYMBOL_GPL(usb_gadget_unregister_driver); /* ------------------------------------------------------------------------- */ static ssize_t srp_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t n) { struct usb_udc *udc = container_of(dev, struct usb_udc, dev); if (sysfs_streq(buf, "1")) usb_gadget_wakeup(udc->gadget); return n; } static DEVICE_ATTR_WO(srp); static ssize_t soft_connect_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t n) { struct usb_udc *udc = container_of(dev, struct usb_udc, dev); ssize_t ret; device_lock(&udc->gadget->dev); if (!udc->driver) { dev_err(dev, "soft-connect without a gadget driver\n"); ret = -EOPNOTSUPP; goto out; } if (sysfs_streq(buf, "connect")) { mutex_lock(&udc->connect_lock); usb_gadget_udc_start_locked(udc); usb_gadget_connect_locked(udc->gadget); mutex_unlock(&udc->connect_lock); } else if (sysfs_streq(buf, "disconnect")) { mutex_lock(&udc->connect_lock); usb_gadget_disconnect_locked(udc->gadget); usb_gadget_udc_stop_locked(udc); mutex_unlock(&udc->connect_lock); } else { dev_err(dev, "unsupported command '%s'\n", buf); ret = -EINVAL; goto out; } ret = n; out: device_unlock(&udc->gadget->dev); return ret; } static DEVICE_ATTR_WO(soft_connect); static ssize_t state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_udc *udc = container_of(dev, struct usb_udc, dev); struct usb_gadget *gadget = udc->gadget; return sprintf(buf, "%s\n", usb_state_string(gadget->state)); } static DEVICE_ATTR_RO(state); static ssize_t function_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_udc *udc = container_of(dev, struct usb_udc, dev); struct usb_gadget_driver *drv; int rc = 0; mutex_lock(&udc_lock); drv = udc->driver; if (drv && drv->function) rc = scnprintf(buf, PAGE_SIZE, "%s\n", drv->function); mutex_unlock(&udc_lock); return rc; } static DEVICE_ATTR_RO(function); #define USB_UDC_SPEED_ATTR(name, param) \ ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct usb_udc *udc = container_of(dev, struct usb_udc, dev); \ return scnprintf(buf, PAGE_SIZE, "%s\n", \ usb_speed_string(udc->gadget->param)); \ } \ static DEVICE_ATTR_RO(name) static USB_UDC_SPEED_ATTR(current_speed, speed); static USB_UDC_SPEED_ATTR(maximum_speed, max_speed); #define USB_UDC_ATTR(name) \ ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct usb_udc *udc = container_of(dev, struct usb_udc, dev); \ struct usb_gadget *gadget = udc->gadget; \ \ return scnprintf(buf, PAGE_SIZE, "%d\n", gadget->name); \ } \ static DEVICE_ATTR_RO(name) static USB_UDC_ATTR(is_otg); static USB_UDC_ATTR(is_a_peripheral); static USB_UDC_ATTR(b_hnp_enable); static USB_UDC_ATTR(a_hnp_support); static USB_UDC_ATTR(a_alt_hnp_support); static USB_UDC_ATTR(is_selfpowered); static struct attribute *usb_udc_attrs[] = { &dev_attr_srp.attr, &dev_attr_soft_connect.attr, &dev_attr_state.attr, &dev_attr_function.attr, &dev_attr_current_speed.attr, &dev_attr_maximum_speed.attr, &dev_attr_is_otg.attr, &dev_attr_is_a_peripheral.attr, &dev_attr_b_hnp_enable.attr, &dev_attr_a_hnp_support.attr, &dev_attr_a_alt_hnp_support.attr, &dev_attr_is_selfpowered.attr, NULL, }; static const struct attribute_group usb_udc_attr_group = { .attrs = usb_udc_attrs, }; static const struct attribute_group *usb_udc_attr_groups[] = { &usb_udc_attr_group, NULL, }; static int usb_udc_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct usb_udc *udc = container_of(dev, struct usb_udc, dev); int ret; ret = add_uevent_var(env, "USB_UDC_NAME=%s", udc->gadget->name); if (ret) { dev_err(dev, "failed to add uevent USB_UDC_NAME\n"); return ret; } mutex_lock(&udc_lock); if (udc->driver) ret = add_uevent_var(env, "USB_UDC_DRIVER=%s", udc->driver->function); mutex_unlock(&udc_lock); if (ret) { dev_err(dev, "failed to add uevent USB_UDC_DRIVER\n"); return ret; } return 0; } static const struct class udc_class = { .name = "udc", .dev_uevent = usb_udc_uevent, }; static const struct bus_type gadget_bus_type = { .name = "gadget", .probe = gadget_bind_driver, .remove = gadget_unbind_driver, .match = gadget_match_driver, }; static int __init usb_udc_init(void) { int rc; rc = class_register(&udc_class); if (rc) return rc; rc = bus_register(&gadget_bus_type); if (rc) class_unregister(&udc_class); return rc; } subsys_initcall(usb_udc_init); static void __exit usb_udc_exit(void) { bus_unregister(&gadget_bus_type); class_unregister(&udc_class); } module_exit(usb_udc_exit); MODULE_DESCRIPTION("UDC Framework"); MODULE_AUTHOR("Felipe Balbi <balbi@ti.com>"); MODULE_LICENSE("GPL v2"); |
| 4 4 4 4 4 4 4 4 11 11 11 11 11 11 15 15 15 4 4 4 4 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 | // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor function for pathnames * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. */ #include <linux/magic.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/nsproxy.h> #include <linux/path.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/fs_struct.h> #include "include/apparmor.h" #include "include/path.h" #include "include/policy.h" /* modified from dcache.c */ static int prepend(char **buffer, int buflen, const char *str, int namelen) { buflen -= namelen; if (buflen < 0) return -ENAMETOOLONG; *buffer -= namelen; memcpy(*buffer, str, namelen); return 0; } #define CHROOT_NSCONNECT (PATH_CHROOT_REL | PATH_CHROOT_NSCONNECT) /* If the path is not connected to the expected root, * check if it is a sysctl and handle specially else remove any * leading / that __d_path may have returned. * Unless * specifically directed to connect the path, * OR * if in a chroot and doing chroot relative paths and the path * resolves to the namespace root (would be connected outside * of chroot) and specifically directed to connect paths to * namespace root. */ static int disconnect(const struct path *path, char *buf, char **name, int flags, const char *disconnected) { int error = 0; if (!(flags & PATH_CONNECT_PATH) && !(((flags & CHROOT_NSCONNECT) == CHROOT_NSCONNECT) && our_mnt(path->mnt))) { /* disconnected path, don't return pathname starting * with '/' */ error = -EACCES; if (**name == '/') *name = *name + 1; } else { if (**name != '/') /* CONNECT_PATH with missing root */ error = prepend(name, *name - buf, "/", 1); if (!error && disconnected) error = prepend(name, *name - buf, disconnected, strlen(disconnected)); } return error; } /** * d_namespace_path - lookup a name associated with a given path * @path: path to lookup (NOT NULL) * @buf: buffer to store path to (NOT NULL) * @name: Returns - pointer for start of path name with in @buf (NOT NULL) * @flags: flags controlling path lookup * @disconnected: string to prefix to disconnected paths * * Handle path name lookup. * * Returns: %0 else error code if path lookup fails * When no error the path name is returned in @name which points to * a position in @buf */ static int d_namespace_path(const struct path *path, char *buf, char **name, int flags, const char *disconnected) { char *res; int error = 0; int connected = 1; int isdir = (flags & PATH_IS_DIR) ? 1 : 0; int buflen = aa_g_path_max - isdir; if (path->mnt->mnt_flags & MNT_INTERNAL) { /* it's not mounted anywhere */ res = dentry_path(path->dentry, buf, buflen); *name = res; if (IS_ERR(res)) { *name = buf; return PTR_ERR(res); } if (path->dentry->d_sb->s_magic == PROC_SUPER_MAGIC && strncmp(*name, "/sys/", 5) == 0) { /* TODO: convert over to using a per namespace * control instead of hard coded /proc */ error = prepend(name, *name - buf, "/proc", 5); goto out; } else error = disconnect(path, buf, name, flags, disconnected); goto out; } /* resolve paths relative to chroot?*/ if (flags & PATH_CHROOT_REL) { struct path root; get_fs_root(current->fs, &root); res = __d_path(path, &root, buf, buflen); path_put(&root); } else { res = d_absolute_path(path, buf, buflen); if (!our_mnt(path->mnt)) connected = 0; } /* handle error conditions - and still allow a partial path to * be returned. */ if (!res || IS_ERR(res)) { if (PTR_ERR(res) == -ENAMETOOLONG) { error = -ENAMETOOLONG; *name = buf; goto out; } connected = 0; res = dentry_path_raw(path->dentry, buf, buflen); if (IS_ERR(res)) { error = PTR_ERR(res); *name = buf; goto out; } } else if (!our_mnt(path->mnt)) connected = 0; *name = res; if (!connected) error = disconnect(path, buf, name, flags, disconnected); /* Handle two cases: * 1. A deleted dentry && profile is not allowing mediation of deleted * 2. On some filesystems, newly allocated dentries appear to the * security_path hooks as a deleted dentry except without an inode * allocated. */ if (d_unlinked(path->dentry) && d_is_positive(path->dentry) && !(flags & (PATH_MEDIATE_DELETED | PATH_DELEGATE_DELETED))) { error = -ENOENT; goto out; } out: /* * Append "/" to the pathname. The root directory is a special * case; it already ends in slash. */ if (!error && isdir && ((*name)[1] != '\0' || (*name)[0] != '/')) strcpy(&buf[aa_g_path_max - 2], "/"); return error; } /** * aa_path_name - get the pathname to a buffer ensure dir / is appended * @path: path the file (NOT NULL) * @flags: flags controlling path name generation * @buffer: buffer to put name in (NOT NULL) * @name: Returns - the generated path name if !error (NOT NULL) * @info: Returns - information on why the path lookup failed (MAYBE NULL) * @disconnected: string to prepend to disconnected paths * * @name is a pointer to the beginning of the pathname (which usually differs * from the beginning of the buffer), or NULL. If there is an error @name * may contain a partial or invalid name that can be used for audit purposes, * but it can not be used for mediation. * * We need PATH_IS_DIR to indicate whether the file is a directory or not * because the file may not yet exist, and so we cannot check the inode's * file type. * * Returns: %0 else error code if could retrieve name */ int aa_path_name(const struct path *path, int flags, char *buffer, const char **name, const char **info, const char *disconnected) { char *str = NULL; int error = d_namespace_path(path, buffer, &str, flags, disconnected); if (info && error) { if (error == -ENOENT) *info = "Failed name lookup - deleted entry"; else if (error == -EACCES) *info = "Failed name lookup - disconnected path"; else if (error == -ENAMETOOLONG) *info = "Failed name lookup - name too long"; else *info = "Failed name lookup"; } *name = str; return error; } |
| 283 201 191 1 96 94 6 2 3 3 122 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIME64_H #define _LINUX_TIME64_H #include <linux/math64.h> #include <vdso/time64.h> typedef __s64 time64_t; typedef __u64 timeu64_t; #include <uapi/linux/time.h> struct timespec64 { time64_t tv_sec; /* seconds */ long tv_nsec; /* nanoseconds */ }; struct itimerspec64 { struct timespec64 it_interval; struct timespec64 it_value; }; /* Parameters used to convert the timespec values: */ #define PSEC_PER_NSEC 1000L /* Located here for timespec[64]_valid_strict */ #define TIME64_MAX ((s64)~((u64)1 << 63)) #define TIME64_MIN (-TIME64_MAX - 1) #define KTIME_MAX ((s64)~((u64)1 << 63)) #define KTIME_MIN (-KTIME_MAX - 1) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) #define KTIME_SEC_MIN (KTIME_MIN / NSEC_PER_SEC) /* * Limits for settimeofday(): * * To prevent setting the time close to the wraparound point time setting * is limited so a reasonable uptime can be accomodated. Uptime of 30 years * should be really sufficient, which means the cutoff is 2232. At that * point the cutoff is just a small part of the larger problem. */ #define TIME_UPTIME_SEC_MAX (30LL * 365 * 24 *3600) #define TIME_SETTOD_SEC_MAX (KTIME_SEC_MAX - TIME_UPTIME_SEC_MAX) static inline int timespec64_equal(const struct timespec64 *a, const struct timespec64 *b) { return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); } /* * lhs < rhs: return <0 * lhs == rhs: return 0 * lhs > rhs: return >0 */ static inline int timespec64_compare(const struct timespec64 *lhs, const struct timespec64 *rhs) { if (lhs->tv_sec < rhs->tv_sec) return -1; if (lhs->tv_sec > rhs->tv_sec) return 1; return lhs->tv_nsec - rhs->tv_nsec; } extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec); static inline struct timespec64 timespec64_add(struct timespec64 lhs, struct timespec64 rhs) { struct timespec64 ts_delta; set_normalized_timespec64(&ts_delta, lhs.tv_sec + rhs.tv_sec, lhs.tv_nsec + rhs.tv_nsec); return ts_delta; } /* * sub = lhs - rhs, in normalized form */ static inline struct timespec64 timespec64_sub(struct timespec64 lhs, struct timespec64 rhs) { struct timespec64 ts_delta; set_normalized_timespec64(&ts_delta, lhs.tv_sec - rhs.tv_sec, lhs.tv_nsec - rhs.tv_nsec); return ts_delta; } /* * Returns true if the timespec64 is norm, false if denorm: */ static inline bool timespec64_valid(const struct timespec64 *ts) { /* Dates before 1970 are bogus */ if (ts->tv_sec < 0) return false; /* Can't have more nanoseconds then a second */ if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return false; return true; } static inline bool timespec64_valid_strict(const struct timespec64 *ts) { if (!timespec64_valid(ts)) return false; /* Disallow values that could overflow ktime_t */ if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) return false; return true; } static inline bool timespec64_valid_settod(const struct timespec64 *ts) { if (!timespec64_valid(ts)) return false; /* Disallow values which cause overflow issues vs. CLOCK_REALTIME */ if ((unsigned long long)ts->tv_sec >= TIME_SETTOD_SEC_MAX) return false; return true; } /** * timespec64_to_ns - Convert timespec64 to nanoseconds * @ts: pointer to the timespec64 variable to be converted * * Returns the scalar nanosecond representation of the timespec64 * parameter. */ static inline s64 timespec64_to_ns(const struct timespec64 *ts) { /* Prevent multiplication overflow / underflow */ if (ts->tv_sec >= KTIME_SEC_MAX) return KTIME_MAX; if (ts->tv_sec <= KTIME_SEC_MIN) return KTIME_MIN; return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; } /** * ns_to_timespec64 - Convert nanoseconds to timespec64 * @nsec: the nanoseconds value to be converted * * Returns the timespec64 representation of the nsec parameter. */ extern struct timespec64 ns_to_timespec64(s64 nsec); /** * timespec64_add_ns - Adds nanoseconds to a timespec64 * @a: pointer to timespec64 to be incremented * @ns: unsigned nanoseconds value to be added * * This must always be inlined because its used from the x86-64 vdso, * which cannot call other kernel functions. */ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns) { a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); a->tv_nsec = ns; } /* * timespec64_add_safe assumes both values are positive and checks for * overflow. It will return TIME64_MAX in case of overflow. */ extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs, const struct timespec64 rhs); #endif /* _LINUX_TIME64_H */ |
| 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_TC_MIR_H #define __NET_TC_MIR_H #include <net/act_api.h> #include <linux/tc_act/tc_mirred.h> struct tcf_mirred { struct tc_action common; int tcfm_eaction; u32 tcfm_blockid; bool tcfm_mac_header_xmit; struct net_device __rcu *tcfm_dev; netdevice_tracker tcfm_dev_tracker; struct list_head tcfm_list; }; #define to_mirred(a) ((struct tcf_mirred *)a) static inline bool is_tcf_mirred_egress_redirect(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT if (a->ops && a->ops->id == TCA_ID_MIRRED) return to_mirred(a)->tcfm_eaction == TCA_EGRESS_REDIR; #endif return false; } static inline bool is_tcf_mirred_egress_mirror(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT if (a->ops && a->ops->id == TCA_ID_MIRRED) return to_mirred(a)->tcfm_eaction == TCA_EGRESS_MIRROR; #endif return false; } static inline bool is_tcf_mirred_ingress_redirect(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT if (a->ops && a->ops->id == TCA_ID_MIRRED) return to_mirred(a)->tcfm_eaction == TCA_INGRESS_REDIR; #endif return false; } static inline bool is_tcf_mirred_ingress_mirror(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT if (a->ops && a->ops->id == TCA_ID_MIRRED) return to_mirred(a)->tcfm_eaction == TCA_INGRESS_MIRROR; #endif return false; } static inline struct net_device *tcf_mirred_dev(const struct tc_action *a) { return rtnl_dereference(to_mirred(a)->tcfm_dev); } #endif /* __NET_TC_MIR_H */ |
| 30 32 29 28 29 33 33 33 1 1 6 7 7 7 7 6 6 6 2 1 1 1 1 1 1 1 1 1 2 7 1 8 7 7 7 7 7 6 8 1 6 6 2 2 1 1 6 6 6 6 6 6 6 4 3 3 2 2 2 3 3 3 3 2 2 3 9 9 8 8 1 6 9 7 7 7 7 7 1 6 7 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 1 2 1 1 2 2 1 1 1 1 3 3 3 3 3 3 1 1 2 2 1 1 1 1 2 1 1 1 1 1 1 1 2 2 2 2 3 2 1 1 1 2 1 8 8 8 2 13 13 12 12 1 11 11 12 13 1 1 1 1 1 1 1 1 2 2 1 1 49 1 49 50 49 50 50 50 50 19 5 5 5 5 2 2 2 3 3 3 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 | /* * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/completion.h> #include <linux/file.h> #include <linux/mutex.h> #include <linux/poll.h> #include <linux/sched.h> #include <linux/idr.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/miscdevice.h> #include <linux/slab.h> #include <linux/sysctl.h> #include <linux/module.h> #include <linux/nsproxy.h> #include <linux/nospec.h> #include <rdma/rdma_user_cm.h> #include <rdma/ib_marshall.h> #include <rdma/rdma_cm.h> #include <rdma/rdma_cm_ib.h> #include <rdma/ib_addr.h> #include <rdma/ib.h> #include <rdma/ib_cm.h> #include <rdma/rdma_netlink.h> #include "core_priv.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access"); MODULE_LICENSE("Dual BSD/GPL"); static unsigned int max_backlog = 1024; static struct ctl_table_header *ucma_ctl_table_hdr; static struct ctl_table ucma_ctl_table[] = { { .procname = "max_backlog", .data = &max_backlog, .maxlen = sizeof max_backlog, .mode = 0644, .proc_handler = proc_dointvec, }, }; struct ucma_file { struct mutex mut; struct file *filp; struct list_head ctx_list; struct list_head event_list; wait_queue_head_t poll_wait; }; struct ucma_context { u32 id; struct completion comp; refcount_t ref; int events_reported; atomic_t backlog; struct ucma_file *file; struct rdma_cm_id *cm_id; struct mutex mutex; u64 uid; struct list_head list; struct list_head mc_list; struct work_struct close_work; }; struct ucma_multicast { struct ucma_context *ctx; u32 id; int events_reported; u64 uid; u8 join_state; struct list_head list; struct sockaddr_storage addr; }; struct ucma_event { struct ucma_context *ctx; struct ucma_context *conn_req_ctx; struct ucma_multicast *mc; struct list_head list; struct rdma_ucm_event_resp resp; }; static DEFINE_XARRAY_ALLOC(ctx_table); static DEFINE_XARRAY_ALLOC(multicast_table); static const struct file_operations ucma_fops; static int ucma_destroy_private_ctx(struct ucma_context *ctx); static inline struct ucma_context *_ucma_find_context(int id, struct ucma_file *file) { struct ucma_context *ctx; ctx = xa_load(&ctx_table, id); if (!ctx) ctx = ERR_PTR(-ENOENT); else if (ctx->file != file) ctx = ERR_PTR(-EINVAL); return ctx; } static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id) { struct ucma_context *ctx; xa_lock(&ctx_table); ctx = _ucma_find_context(id, file); if (!IS_ERR(ctx)) if (!refcount_inc_not_zero(&ctx->ref)) ctx = ERR_PTR(-ENXIO); xa_unlock(&ctx_table); return ctx; } static void ucma_put_ctx(struct ucma_context *ctx) { if (refcount_dec_and_test(&ctx->ref)) complete(&ctx->comp); } /* * Same as ucm_get_ctx but requires that ->cm_id->device is valid, eg that the * CM_ID is bound. */ static struct ucma_context *ucma_get_ctx_dev(struct ucma_file *file, int id) { struct ucma_context *ctx = ucma_get_ctx(file, id); if (IS_ERR(ctx)) return ctx; if (!ctx->cm_id->device) { ucma_put_ctx(ctx); return ERR_PTR(-EINVAL); } return ctx; } static void ucma_close_id(struct work_struct *work) { struct ucma_context *ctx = container_of(work, struct ucma_context, close_work); /* once all inflight tasks are finished, we close all underlying * resources. The context is still alive till its explicit destryoing * by its creator. This puts back the xarray's reference. */ ucma_put_ctx(ctx); wait_for_completion(&ctx->comp); /* No new events will be generated after destroying the id. */ rdma_destroy_id(ctx->cm_id); /* Reading the cm_id without holding a positive ref is not allowed */ ctx->cm_id = NULL; } static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) { struct ucma_context *ctx; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return NULL; INIT_WORK(&ctx->close_work, ucma_close_id); init_completion(&ctx->comp); INIT_LIST_HEAD(&ctx->mc_list); /* So list_del() will work if we don't do ucma_finish_ctx() */ INIT_LIST_HEAD(&ctx->list); ctx->file = file; mutex_init(&ctx->mutex); if (xa_alloc(&ctx_table, &ctx->id, NULL, xa_limit_32b, GFP_KERNEL)) { kfree(ctx); return NULL; } return ctx; } static void ucma_set_ctx_cm_id(struct ucma_context *ctx, struct rdma_cm_id *cm_id) { refcount_set(&ctx->ref, 1); ctx->cm_id = cm_id; } static void ucma_finish_ctx(struct ucma_context *ctx) { lockdep_assert_held(&ctx->file->mut); list_add_tail(&ctx->list, &ctx->file->ctx_list); xa_store(&ctx_table, ctx->id, ctx, GFP_KERNEL); } static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst, struct rdma_conn_param *src) { if (src->private_data_len) memcpy(dst->private_data, src->private_data, src->private_data_len); dst->private_data_len = src->private_data_len; dst->responder_resources = src->responder_resources; dst->initiator_depth = src->initiator_depth; dst->flow_control = src->flow_control; dst->retry_count = src->retry_count; dst->rnr_retry_count = src->rnr_retry_count; dst->srq = src->srq; dst->qp_num = src->qp_num; } static void ucma_copy_ud_event(struct ib_device *device, struct rdma_ucm_ud_param *dst, struct rdma_ud_param *src) { if (src->private_data_len) memcpy(dst->private_data, src->private_data, src->private_data_len); dst->private_data_len = src->private_data_len; ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr); dst->qp_num = src->qp_num; dst->qkey = src->qkey; } static struct ucma_event *ucma_create_uevent(struct ucma_context *ctx, struct rdma_cm_event *event) { struct ucma_event *uevent; uevent = kzalloc(sizeof(*uevent), GFP_KERNEL); if (!uevent) return NULL; uevent->ctx = ctx; switch (event->event) { case RDMA_CM_EVENT_MULTICAST_JOIN: case RDMA_CM_EVENT_MULTICAST_ERROR: uevent->mc = (struct ucma_multicast *) event->param.ud.private_data; uevent->resp.uid = uevent->mc->uid; uevent->resp.id = uevent->mc->id; break; default: uevent->resp.uid = ctx->uid; uevent->resp.id = ctx->id; break; } uevent->resp.event = event->event; uevent->resp.status = event->status; if (ctx->cm_id->qp_type == IB_QPT_UD) ucma_copy_ud_event(ctx->cm_id->device, &uevent->resp.param.ud, &event->param.ud); else ucma_copy_conn_event(&uevent->resp.param.conn, &event->param.conn); uevent->resp.ece.vendor_id = event->ece.vendor_id; uevent->resp.ece.attr_mod = event->ece.attr_mod; return uevent; } static int ucma_connect_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { struct ucma_context *listen_ctx = cm_id->context; struct ucma_context *ctx; struct ucma_event *uevent; if (!atomic_add_unless(&listen_ctx->backlog, -1, 0)) return -ENOMEM; ctx = ucma_alloc_ctx(listen_ctx->file); if (!ctx) goto err_backlog; ucma_set_ctx_cm_id(ctx, cm_id); uevent = ucma_create_uevent(listen_ctx, event); if (!uevent) goto err_alloc; uevent->conn_req_ctx = ctx; uevent->resp.id = ctx->id; ctx->cm_id->context = ctx; mutex_lock(&ctx->file->mut); ucma_finish_ctx(ctx); list_add_tail(&uevent->list, &ctx->file->event_list); mutex_unlock(&ctx->file->mut); wake_up_interruptible(&ctx->file->poll_wait); return 0; err_alloc: ucma_destroy_private_ctx(ctx); err_backlog: atomic_inc(&listen_ctx->backlog); /* Returning error causes the new ID to be destroyed */ return -ENOMEM; } static int ucma_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { struct ucma_event *uevent; struct ucma_context *ctx = cm_id->context; if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) return ucma_connect_event_handler(cm_id, event); /* * We ignore events for new connections until userspace has set their * context. This can only happen if an error occurs on a new connection * before the user accepts it. This is okay, since the accept will just * fail later. However, we do need to release the underlying HW * resources in case of a device removal event. */ if (ctx->uid) { uevent = ucma_create_uevent(ctx, event); if (!uevent) return 0; mutex_lock(&ctx->file->mut); list_add_tail(&uevent->list, &ctx->file->event_list); mutex_unlock(&ctx->file->mut); wake_up_interruptible(&ctx->file->poll_wait); } if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) { xa_lock(&ctx_table); if (xa_load(&ctx_table, ctx->id) == ctx) queue_work(system_unbound_wq, &ctx->close_work); xa_unlock(&ctx_table); } return 0; } static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_get_event cmd; struct ucma_event *uevent; /* * Old 32 bit user space does not send the 4 byte padding in the * reserved field. We don't care, allow it to keep working. */ if (out_len < sizeof(uevent->resp) - sizeof(uevent->resp.reserved) - sizeof(uevent->resp.ece)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; mutex_lock(&file->mut); while (list_empty(&file->event_list)) { mutex_unlock(&file->mut); if (file->filp->f_flags & O_NONBLOCK) return -EAGAIN; if (wait_event_interruptible(file->poll_wait, !list_empty(&file->event_list))) return -ERESTARTSYS; mutex_lock(&file->mut); } uevent = list_first_entry(&file->event_list, struct ucma_event, list); if (copy_to_user(u64_to_user_ptr(cmd.response), &uevent->resp, min_t(size_t, out_len, sizeof(uevent->resp)))) { mutex_unlock(&file->mut); return -EFAULT; } list_del(&uevent->list); uevent->ctx->events_reported++; if (uevent->mc) uevent->mc->events_reported++; if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) atomic_inc(&uevent->ctx->backlog); mutex_unlock(&file->mut); kfree(uevent); return 0; } static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_type) { switch (cmd->ps) { case RDMA_PS_TCP: *qp_type = IB_QPT_RC; return 0; case RDMA_PS_UDP: case RDMA_PS_IPOIB: *qp_type = IB_QPT_UD; return 0; case RDMA_PS_IB: *qp_type = cmd->qp_type; return 0; default: return -EINVAL; } } static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_create_id cmd; struct rdma_ucm_create_id_resp resp; struct ucma_context *ctx; struct rdma_cm_id *cm_id; enum ib_qp_type qp_type; int ret; if (out_len < sizeof(resp)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ret = ucma_get_qp_type(&cmd, &qp_type); if (ret) return ret; ctx = ucma_alloc_ctx(file); if (!ctx) return -ENOMEM; ctx->uid = cmd.uid; cm_id = rdma_create_user_id(ucma_event_handler, ctx, cmd.ps, qp_type); if (IS_ERR(cm_id)) { ret = PTR_ERR(cm_id); goto err1; } ucma_set_ctx_cm_id(ctx, cm_id); resp.id = ctx->id; if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) { ret = -EFAULT; goto err1; } mutex_lock(&file->mut); ucma_finish_ctx(ctx); mutex_unlock(&file->mut); return 0; err1: ucma_destroy_private_ctx(ctx); return ret; } static void ucma_cleanup_multicast(struct ucma_context *ctx) { struct ucma_multicast *mc, *tmp; xa_lock(&multicast_table); list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) { list_del(&mc->list); /* * At this point mc->ctx->ref is 0 so the mc cannot leave the * lock on the reader and this is enough serialization */ __xa_erase(&multicast_table, mc->id); kfree(mc); } xa_unlock(&multicast_table); } static void ucma_cleanup_mc_events(struct ucma_multicast *mc) { struct ucma_event *uevent, *tmp; rdma_lock_handler(mc->ctx->cm_id); mutex_lock(&mc->ctx->file->mut); list_for_each_entry_safe(uevent, tmp, &mc->ctx->file->event_list, list) { if (uevent->mc != mc) continue; list_del(&uevent->list); kfree(uevent); } mutex_unlock(&mc->ctx->file->mut); rdma_unlock_handler(mc->ctx->cm_id); } static int ucma_cleanup_ctx_events(struct ucma_context *ctx) { int events_reported; struct ucma_event *uevent, *tmp; LIST_HEAD(list); /* Cleanup events not yet reported to the user.*/ mutex_lock(&ctx->file->mut); list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) { if (uevent->ctx != ctx) continue; if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST && xa_cmpxchg(&ctx_table, uevent->conn_req_ctx->id, uevent->conn_req_ctx, XA_ZERO_ENTRY, GFP_KERNEL) == uevent->conn_req_ctx) { list_move_tail(&uevent->list, &list); continue; } list_del(&uevent->list); kfree(uevent); } list_del(&ctx->list); events_reported = ctx->events_reported; mutex_unlock(&ctx->file->mut); /* * If this was a listening ID then any connections spawned from it that * have not been delivered to userspace are cleaned up too. Must be done * outside any locks. */ list_for_each_entry_safe(uevent, tmp, &list, list) { ucma_destroy_private_ctx(uevent->conn_req_ctx); kfree(uevent); } return events_reported; } /* * When this is called the xarray must have a XA_ZERO_ENTRY in the ctx->id (ie * the ctx is not public to the user). This either because: * - ucma_finish_ctx() hasn't been called * - xa_cmpxchg() succeed to remove the entry (only one thread can succeed) */ static int ucma_destroy_private_ctx(struct ucma_context *ctx) { int events_reported; /* * Destroy the underlying cm_id. New work queuing is prevented now by * the removal from the xarray. Once the work is cancled ref will either * be 0 because the work ran to completion and consumed the ref from the * xarray, or it will be positive because we still have the ref from the * xarray. This can also be 0 in cases where cm_id was never set */ cancel_work_sync(&ctx->close_work); if (refcount_read(&ctx->ref)) ucma_close_id(&ctx->close_work); events_reported = ucma_cleanup_ctx_events(ctx); ucma_cleanup_multicast(ctx); WARN_ON(xa_cmpxchg(&ctx_table, ctx->id, XA_ZERO_ENTRY, NULL, GFP_KERNEL) != NULL); mutex_destroy(&ctx->mutex); kfree(ctx); return events_reported; } static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_destroy_id cmd; struct rdma_ucm_destroy_id_resp resp; struct ucma_context *ctx; int ret = 0; if (out_len < sizeof(resp)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; xa_lock(&ctx_table); ctx = _ucma_find_context(cmd.id, file); if (!IS_ERR(ctx)) { if (__xa_cmpxchg(&ctx_table, ctx->id, ctx, XA_ZERO_ENTRY, GFP_KERNEL) != ctx) ctx = ERR_PTR(-ENOENT); } xa_unlock(&ctx_table); if (IS_ERR(ctx)) return PTR_ERR(ctx); resp.events_reported = ucma_destroy_private_ctx(ctx); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; return ret; } static ssize_t ucma_bind_ip(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_bind_ip cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if (!rdma_addr_size_in6(&cmd.addr)) return -EINVAL; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_bind(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_bind cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if (cmd.reserved || !cmd.addr_size || cmd.addr_size != rdma_addr_size_kss(&cmd.addr)) return -EINVAL; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_resolve_ip(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_resolve_ip cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if ((cmd.src_addr.sin6_family && !rdma_addr_size_in6(&cmd.src_addr)) || !rdma_addr_size_in6(&cmd.dst_addr)) return -EINVAL; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, (struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_resolve_addr(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_resolve_addr cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if (cmd.reserved || (cmd.src_size && (cmd.src_size != rdma_addr_size_kss(&cmd.src_addr))) || !cmd.dst_size || (cmd.dst_size != rdma_addr_size_kss(&cmd.dst_addr))) return -EINVAL; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, (struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_resolve_route(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_resolve_route cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); ret = rdma_resolve_route(ctx->cm_id, cmd.timeout_ms); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp, struct rdma_route *route) { struct rdma_dev_addr *dev_addr; resp->num_paths = route->num_pri_alt_paths; switch (route->num_pri_alt_paths) { case 0: dev_addr = &route->addr.dev_addr; rdma_addr_get_dgid(dev_addr, (union ib_gid *) &resp->ib_route[0].dgid); rdma_addr_get_sgid(dev_addr, (union ib_gid *) &resp->ib_route[0].sgid); resp->ib_route[0].pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); break; case 2: ib_copy_path_rec_to_user(&resp->ib_route[1], &route->path_rec[1]); fallthrough; case 1: ib_copy_path_rec_to_user(&resp->ib_route[0], &route->path_rec[0]); break; default: break; } } static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, struct rdma_route *route) { resp->num_paths = route->num_pri_alt_paths; switch (route->num_pri_alt_paths) { case 0: rdma_ip2gid((struct sockaddr *)&route->addr.dst_addr, (union ib_gid *)&resp->ib_route[0].dgid); rdma_ip2gid((struct sockaddr *)&route->addr.src_addr, (union ib_gid *)&resp->ib_route[0].sgid); resp->ib_route[0].pkey = cpu_to_be16(0xffff); break; case 2: ib_copy_path_rec_to_user(&resp->ib_route[1], &route->path_rec[1]); fallthrough; case 1: ib_copy_path_rec_to_user(&resp->ib_route[0], &route->path_rec[0]); break; default: break; } } static void ucma_copy_iw_route(struct rdma_ucm_query_route_resp *resp, struct rdma_route *route) { struct rdma_dev_addr *dev_addr; dev_addr = &route->addr.dev_addr; rdma_addr_get_dgid(dev_addr, (union ib_gid *) &resp->ib_route[0].dgid); rdma_addr_get_sgid(dev_addr, (union ib_gid *) &resp->ib_route[0].sgid); } static ssize_t ucma_query_route(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_query cmd; struct rdma_ucm_query_route_resp resp; struct ucma_context *ctx; struct sockaddr *addr; int ret = 0; if (out_len < offsetof(struct rdma_ucm_query_route_resp, ibdev_index)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); memset(&resp, 0, sizeof resp); addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr; memcpy(&resp.src_addr, addr, addr->sa_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6)); addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr; memcpy(&resp.dst_addr, addr, addr->sa_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6)); if (!ctx->cm_id->device) goto out; resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid; resp.ibdev_index = ctx->cm_id->device->index; resp.port_num = ctx->cm_id->port_num; if (rdma_cap_ib_sa(ctx->cm_id->device, ctx->cm_id->port_num)) ucma_copy_ib_route(&resp, &ctx->cm_id->route); else if (rdma_protocol_roce(ctx->cm_id->device, ctx->cm_id->port_num)) ucma_copy_iboe_route(&resp, &ctx->cm_id->route); else if (rdma_protocol_iwarp(ctx->cm_id->device, ctx->cm_id->port_num)) ucma_copy_iw_route(&resp, &ctx->cm_id->route); out: mutex_unlock(&ctx->mutex); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; ucma_put_ctx(ctx); return ret; } static void ucma_query_device_addr(struct rdma_cm_id *cm_id, struct rdma_ucm_query_addr_resp *resp) { if (!cm_id->device) return; resp->node_guid = (__force __u64) cm_id->device->node_guid; resp->ibdev_index = cm_id->device->index; resp->port_num = cm_id->port_num; resp->pkey = (__force __u16) cpu_to_be16( ib_addr_get_pkey(&cm_id->route.addr.dev_addr)); } static ssize_t ucma_query_addr(struct ucma_context *ctx, void __user *response, int out_len) { struct rdma_ucm_query_addr_resp resp; struct sockaddr *addr; int ret = 0; if (out_len < offsetof(struct rdma_ucm_query_addr_resp, ibdev_index)) return -ENOSPC; memset(&resp, 0, sizeof resp); addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr; resp.src_size = rdma_addr_size(addr); memcpy(&resp.src_addr, addr, resp.src_size); addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr; resp.dst_size = rdma_addr_size(addr); memcpy(&resp.dst_addr, addr, resp.dst_size); ucma_query_device_addr(ctx->cm_id, &resp); if (copy_to_user(response, &resp, min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; return ret; } static ssize_t ucma_query_path(struct ucma_context *ctx, void __user *response, int out_len) { struct rdma_ucm_query_path_resp *resp; int i, ret = 0; if (out_len < sizeof(*resp)) return -ENOSPC; resp = kzalloc(out_len, GFP_KERNEL); if (!resp) return -ENOMEM; resp->num_paths = ctx->cm_id->route.num_pri_alt_paths; for (i = 0, out_len -= sizeof(*resp); i < resp->num_paths && out_len > sizeof(struct ib_path_rec_data); i++, out_len -= sizeof(struct ib_path_rec_data)) { struct sa_path_rec *rec = &ctx->cm_id->route.path_rec[i]; resp->path_data[i].flags = IB_PATH_GMP | IB_PATH_PRIMARY | IB_PATH_BIDIRECTIONAL; if (rec->rec_type == SA_PATH_REC_TYPE_OPA) { struct sa_path_rec ib; sa_convert_path_opa_to_ib(&ib, rec); ib_sa_pack_path(&ib, &resp->path_data[i].path_rec); } else { ib_sa_pack_path(rec, &resp->path_data[i].path_rec); } } if (copy_to_user(response, resp, struct_size(resp, path_data, i))) ret = -EFAULT; kfree(resp); return ret; } static ssize_t ucma_query_gid(struct ucma_context *ctx, void __user *response, int out_len) { struct rdma_ucm_query_addr_resp resp; struct sockaddr_ib *addr; int ret = 0; if (out_len < offsetof(struct rdma_ucm_query_addr_resp, ibdev_index)) return -ENOSPC; memset(&resp, 0, sizeof resp); ucma_query_device_addr(ctx->cm_id, &resp); addr = (struct sockaddr_ib *) &resp.src_addr; resp.src_size = sizeof(*addr); if (ctx->cm_id->route.addr.src_addr.ss_family == AF_IB) { memcpy(addr, &ctx->cm_id->route.addr.src_addr, resp.src_size); } else { addr->sib_family = AF_IB; addr->sib_pkey = (__force __be16) resp.pkey; rdma_read_gids(ctx->cm_id, (union ib_gid *)&addr->sib_addr, NULL); addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) &ctx->cm_id->route.addr.src_addr); } addr = (struct sockaddr_ib *) &resp.dst_addr; resp.dst_size = sizeof(*addr); if (ctx->cm_id->route.addr.dst_addr.ss_family == AF_IB) { memcpy(addr, &ctx->cm_id->route.addr.dst_addr, resp.dst_size); } else { addr->sib_family = AF_IB; addr->sib_pkey = (__force __be16) resp.pkey; rdma_read_gids(ctx->cm_id, NULL, (union ib_gid *)&addr->sib_addr); addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr); } if (copy_to_user(response, &resp, min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; return ret; } static ssize_t ucma_query(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_query cmd; struct ucma_context *ctx; void __user *response; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; response = u64_to_user_ptr(cmd.response); ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); switch (cmd.option) { case RDMA_USER_CM_QUERY_ADDR: ret = ucma_query_addr(ctx, response, out_len); break; case RDMA_USER_CM_QUERY_PATH: ret = ucma_query_path(ctx, response, out_len); break; case RDMA_USER_CM_QUERY_GID: ret = ucma_query_gid(ctx, response, out_len); break; default: ret = -ENOSYS; break; } mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static void ucma_copy_conn_param(struct rdma_cm_id *id, struct rdma_conn_param *dst, struct rdma_ucm_conn_param *src) { dst->private_data = src->private_data; dst->private_data_len = src->private_data_len; dst->responder_resources = src->responder_resources; dst->initiator_depth = src->initiator_depth; dst->flow_control = src->flow_control; dst->retry_count = src->retry_count; dst->rnr_retry_count = src->rnr_retry_count; dst->srq = src->srq; dst->qp_num = src->qp_num & 0xFFFFFF; dst->qkey = (id->route.addr.src_addr.ss_family == AF_IB) ? src->qkey : 0; } static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_conn_param conn_param; struct rdma_ucm_ece ece = {}; struct rdma_ucm_connect cmd; struct ucma_context *ctx; size_t in_size; int ret; if (in_len < offsetofend(typeof(cmd), reserved)) return -EINVAL; in_size = min_t(size_t, in_len, sizeof(cmd)); if (copy_from_user(&cmd, inbuf, in_size)) return -EFAULT; if (!cmd.conn_param.valid) return -EINVAL; ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); if (offsetofend(typeof(cmd), ece) <= in_size) { ece.vendor_id = cmd.ece.vendor_id; ece.attr_mod = cmd.ece.attr_mod; } mutex_lock(&ctx->mutex); ret = rdma_connect_ece(ctx->cm_id, &conn_param, &ece); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_listen cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); if (cmd.backlog <= 0 || cmd.backlog > max_backlog) cmd.backlog = max_backlog; atomic_set(&ctx->backlog, cmd.backlog); mutex_lock(&ctx->mutex); ret = rdma_listen(ctx->cm_id, cmd.backlog); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_accept cmd; struct rdma_conn_param conn_param; struct rdma_ucm_ece ece = {}; struct ucma_context *ctx; size_t in_size; int ret; if (in_len < offsetofend(typeof(cmd), reserved)) return -EINVAL; in_size = min_t(size_t, in_len, sizeof(cmd)); if (copy_from_user(&cmd, inbuf, in_size)) return -EFAULT; ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); if (offsetofend(typeof(cmd), ece) <= in_size) { ece.vendor_id = cmd.ece.vendor_id; ece.attr_mod = cmd.ece.attr_mod; } if (cmd.conn_param.valid) { ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); mutex_lock(&ctx->mutex); rdma_lock_handler(ctx->cm_id); ret = rdma_accept_ece(ctx->cm_id, &conn_param, &ece); if (!ret) { /* The uid must be set atomically with the handler */ ctx->uid = cmd.uid; } rdma_unlock_handler(ctx->cm_id); mutex_unlock(&ctx->mutex); } else { mutex_lock(&ctx->mutex); rdma_lock_handler(ctx->cm_id); ret = rdma_accept_ece(ctx->cm_id, NULL, &ece); rdma_unlock_handler(ctx->cm_id); mutex_unlock(&ctx->mutex); } ucma_put_ctx(ctx); return ret; } static ssize_t ucma_reject(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_reject cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if (!cmd.reason) cmd.reason = IB_CM_REJ_CONSUMER_DEFINED; switch (cmd.reason) { case IB_CM_REJ_CONSUMER_DEFINED: case IB_CM_REJ_VENDOR_OPTION_NOT_SUPPORTED: break; default: return -EINVAL; } ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len, cmd.reason); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_disconnect(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_disconnect cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); ret = rdma_disconnect(ctx->cm_id); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_init_qp_attr(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_init_qp_attr cmd; struct ib_uverbs_qp_attr resp; struct ucma_context *ctx; struct ib_qp_attr qp_attr; int ret; if (out_len < sizeof(resp)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if (cmd.qp_state > IB_QPS_ERR) return -EINVAL; ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); resp.qp_attr_mask = 0; memset(&qp_attr, 0, sizeof qp_attr); qp_attr.qp_state = cmd.qp_state; mutex_lock(&ctx->mutex); ret = rdma_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask); mutex_unlock(&ctx->mutex); if (ret) goto out; ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; out: ucma_put_ctx(ctx); return ret; } static int ucma_set_option_id(struct ucma_context *ctx, int optname, void *optval, size_t optlen) { int ret = 0; switch (optname) { case RDMA_OPTION_ID_TOS: if (optlen != sizeof(u8)) { ret = -EINVAL; break; } rdma_set_service_type(ctx->cm_id, *((u8 *) optval)); break; case RDMA_OPTION_ID_REUSEADDR: if (optlen != sizeof(int)) { ret = -EINVAL; break; } ret = rdma_set_reuseaddr(ctx->cm_id, *((int *) optval) ? 1 : 0); break; case RDMA_OPTION_ID_AFONLY: if (optlen != sizeof(int)) { ret = -EINVAL; break; } ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 1 : 0); break; case RDMA_OPTION_ID_ACK_TIMEOUT: if (optlen != sizeof(u8)) { ret = -EINVAL; break; } ret = rdma_set_ack_timeout(ctx->cm_id, *((u8 *)optval)); break; default: ret = -ENOSYS; } return ret; } static int ucma_set_ib_path(struct ucma_context *ctx, struct ib_path_rec_data *path_data, size_t optlen) { struct sa_path_rec sa_path; struct rdma_cm_event event; int ret; if (optlen % sizeof(*path_data)) return -EINVAL; for (; optlen; optlen -= sizeof(*path_data), path_data++) { if (path_data->flags == (IB_PATH_GMP | IB_PATH_PRIMARY | IB_PATH_BIDIRECTIONAL)) break; } if (!optlen) return -EINVAL; if (!ctx->cm_id->device) return -EINVAL; memset(&sa_path, 0, sizeof(sa_path)); sa_path.rec_type = SA_PATH_REC_TYPE_IB; ib_sa_unpack_path(path_data->path_rec, &sa_path); if (rdma_cap_opa_ah(ctx->cm_id->device, ctx->cm_id->port_num)) { struct sa_path_rec opa; sa_convert_path_ib_to_opa(&opa, &sa_path); mutex_lock(&ctx->mutex); ret = rdma_set_ib_path(ctx->cm_id, &opa); mutex_unlock(&ctx->mutex); } else { mutex_lock(&ctx->mutex); ret = rdma_set_ib_path(ctx->cm_id, &sa_path); mutex_unlock(&ctx->mutex); } if (ret) return ret; memset(&event, 0, sizeof event); event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; return ucma_event_handler(ctx->cm_id, &event); } static int ucma_set_option_ib(struct ucma_context *ctx, int optname, void *optval, size_t optlen) { int ret; switch (optname) { case RDMA_OPTION_IB_PATH: ret = ucma_set_ib_path(ctx, optval, optlen); break; default: ret = -ENOSYS; } return ret; } static int ucma_set_option_level(struct ucma_context *ctx, int level, int optname, void *optval, size_t optlen) { int ret; switch (level) { case RDMA_OPTION_ID: mutex_lock(&ctx->mutex); ret = ucma_set_option_id(ctx, optname, optval, optlen); mutex_unlock(&ctx->mutex); break; case RDMA_OPTION_IB: ret = ucma_set_option_ib(ctx, optname, optval, optlen); break; default: ret = -ENOSYS; } return ret; } static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_set_option cmd; struct ucma_context *ctx; void *optval; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if (unlikely(cmd.optlen > KMALLOC_MAX_SIZE)) return -EINVAL; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); optval = memdup_user(u64_to_user_ptr(cmd.optval), cmd.optlen); if (IS_ERR(optval)) { ret = PTR_ERR(optval); goto out; } ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval, cmd.optlen); kfree(optval); out: ucma_put_ctx(ctx); return ret; } static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_notify cmd; struct ucma_context *ctx; int ret = -EINVAL; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); if (ctx->cm_id->device) ret = rdma_notify(ctx->cm_id, (enum ib_event_type)cmd.event); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_process_join(struct ucma_file *file, struct rdma_ucm_join_mcast *cmd, int out_len) { struct rdma_ucm_create_id_resp resp; struct ucma_context *ctx; struct ucma_multicast *mc; struct sockaddr *addr; int ret; u8 join_state; if (out_len < sizeof(resp)) return -ENOSPC; addr = (struct sockaddr *) &cmd->addr; if (cmd->addr_size != rdma_addr_size(addr)) return -EINVAL; if (cmd->join_flags == RDMA_MC_JOIN_FLAG_FULLMEMBER) join_state = BIT(FULLMEMBER_JOIN); else if (cmd->join_flags == RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER) join_state = BIT(SENDONLY_FULLMEMBER_JOIN); else return -EINVAL; ctx = ucma_get_ctx_dev(file, cmd->id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) { ret = -ENOMEM; goto err_put_ctx; } mc->ctx = ctx; mc->join_state = join_state; mc->uid = cmd->uid; memcpy(&mc->addr, addr, cmd->addr_size); xa_lock(&multicast_table); if (__xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b, GFP_KERNEL)) { ret = -ENOMEM; goto err_free_mc; } list_add_tail(&mc->list, &ctx->mc_list); xa_unlock(&multicast_table); mutex_lock(&ctx->mutex); ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr, join_state, mc); mutex_unlock(&ctx->mutex); if (ret) goto err_xa_erase; resp.id = mc->id; if (copy_to_user(u64_to_user_ptr(cmd->response), &resp, sizeof(resp))) { ret = -EFAULT; goto err_leave_multicast; } xa_store(&multicast_table, mc->id, mc, 0); ucma_put_ctx(ctx); return 0; err_leave_multicast: mutex_lock(&ctx->mutex); rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr); mutex_unlock(&ctx->mutex); ucma_cleanup_mc_events(mc); err_xa_erase: xa_lock(&multicast_table); list_del(&mc->list); __xa_erase(&multicast_table, mc->id); err_free_mc: xa_unlock(&multicast_table); kfree(mc); err_put_ctx: ucma_put_ctx(ctx); return ret; } static ssize_t ucma_join_ip_multicast(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_join_ip_mcast cmd; struct rdma_ucm_join_mcast join_cmd; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; join_cmd.response = cmd.response; join_cmd.uid = cmd.uid; join_cmd.id = cmd.id; join_cmd.addr_size = rdma_addr_size_in6(&cmd.addr); if (!join_cmd.addr_size) return -EINVAL; join_cmd.join_flags = RDMA_MC_JOIN_FLAG_FULLMEMBER; memcpy(&join_cmd.addr, &cmd.addr, join_cmd.addr_size); return ucma_process_join(file, &join_cmd, out_len); } static ssize_t ucma_join_multicast(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_join_mcast cmd; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if (!rdma_addr_size_kss(&cmd.addr)) return -EINVAL; return ucma_process_join(file, &cmd, out_len); } static ssize_t ucma_leave_multicast(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_destroy_id cmd; struct rdma_ucm_destroy_id_resp resp; struct ucma_multicast *mc; int ret = 0; if (out_len < sizeof(resp)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; xa_lock(&multicast_table); mc = xa_load(&multicast_table, cmd.id); if (!mc) mc = ERR_PTR(-ENOENT); else if (READ_ONCE(mc->ctx->file) != file) mc = ERR_PTR(-EINVAL); else if (!refcount_inc_not_zero(&mc->ctx->ref)) mc = ERR_PTR(-ENXIO); if (IS_ERR(mc)) { xa_unlock(&multicast_table); ret = PTR_ERR(mc); goto out; } list_del(&mc->list); __xa_erase(&multicast_table, mc->id); xa_unlock(&multicast_table); mutex_lock(&mc->ctx->mutex); rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr); mutex_unlock(&mc->ctx->mutex); ucma_cleanup_mc_events(mc); ucma_put_ctx(mc->ctx); resp.events_reported = mc->events_reported; kfree(mc); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; out: return ret; } static ssize_t ucma_migrate_id(struct ucma_file *new_file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_migrate_id cmd; struct rdma_ucm_migrate_resp resp; struct ucma_event *uevent, *tmp; struct ucma_context *ctx; LIST_HEAD(event_list); struct fd f; struct ucma_file *cur_file; int ret = 0; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; /* Get current fd to protect against it being closed */ f = fdget(cmd.fd); if (!f.file) return -ENOENT; if (f.file->f_op != &ucma_fops) { ret = -EINVAL; goto file_put; } cur_file = f.file->private_data; /* Validate current fd and prevent destruction of id. */ ctx = ucma_get_ctx(cur_file, cmd.id); if (IS_ERR(ctx)) { ret = PTR_ERR(ctx); goto file_put; } rdma_lock_handler(ctx->cm_id); /* * ctx->file can only be changed under the handler & xa_lock. xa_load() * must be checked again to ensure the ctx hasn't begun destruction * since the ucma_get_ctx(). */ xa_lock(&ctx_table); if (_ucma_find_context(cmd.id, cur_file) != ctx) { xa_unlock(&ctx_table); ret = -ENOENT; goto err_unlock; } ctx->file = new_file; xa_unlock(&ctx_table); mutex_lock(&cur_file->mut); list_del(&ctx->list); /* * At this point lock_handler() prevents addition of new uevents for * this ctx. */ list_for_each_entry_safe(uevent, tmp, &cur_file->event_list, list) if (uevent->ctx == ctx) list_move_tail(&uevent->list, &event_list); resp.events_reported = ctx->events_reported; mutex_unlock(&cur_file->mut); mutex_lock(&new_file->mut); list_add_tail(&ctx->list, &new_file->ctx_list); list_splice_tail(&event_list, &new_file->event_list); mutex_unlock(&new_file->mut); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; err_unlock: rdma_unlock_handler(ctx->cm_id); ucma_put_ctx(ctx); file_put: fdput(f); return ret; } static ssize_t (*ucma_cmd_table[])(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) = { [RDMA_USER_CM_CMD_CREATE_ID] = ucma_create_id, [RDMA_USER_CM_CMD_DESTROY_ID] = ucma_destroy_id, [RDMA_USER_CM_CMD_BIND_IP] = ucma_bind_ip, [RDMA_USER_CM_CMD_RESOLVE_IP] = ucma_resolve_ip, [RDMA_USER_CM_CMD_RESOLVE_ROUTE] = ucma_resolve_route, [RDMA_USER_CM_CMD_QUERY_ROUTE] = ucma_query_route, [RDMA_USER_CM_CMD_CONNECT] = ucma_connect, [RDMA_USER_CM_CMD_LISTEN] = ucma_listen, [RDMA_USER_CM_CMD_ACCEPT] = ucma_accept, [RDMA_USER_CM_CMD_REJECT] = ucma_reject, [RDMA_USER_CM_CMD_DISCONNECT] = ucma_disconnect, [RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr, [RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event, [RDMA_USER_CM_CMD_GET_OPTION] = NULL, [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option, [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify, [RDMA_USER_CM_CMD_JOIN_IP_MCAST] = ucma_join_ip_multicast, [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast, [RDMA_USER_CM_CMD_MIGRATE_ID] = ucma_migrate_id, [RDMA_USER_CM_CMD_QUERY] = ucma_query, [RDMA_USER_CM_CMD_BIND] = ucma_bind, [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr, [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast }; static ssize_t ucma_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos) { struct ucma_file *file = filp->private_data; struct rdma_ucm_cmd_hdr hdr; ssize_t ret; if (!ib_safe_file_access(filp)) { pr_err_once("%s: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n", __func__, task_tgid_vnr(current), current->comm); return -EACCES; } if (len < sizeof(hdr)) return -EINVAL; if (copy_from_user(&hdr, buf, sizeof(hdr))) return -EFAULT; if (hdr.cmd >= ARRAY_SIZE(ucma_cmd_table)) return -EINVAL; hdr.cmd = array_index_nospec(hdr.cmd, ARRAY_SIZE(ucma_cmd_table)); if (hdr.in + sizeof(hdr) > len) return -EINVAL; if (!ucma_cmd_table[hdr.cmd]) return -ENOSYS; ret = ucma_cmd_table[hdr.cmd](file, buf + sizeof(hdr), hdr.in, hdr.out); if (!ret) ret = len; return ret; } static __poll_t ucma_poll(struct file *filp, struct poll_table_struct *wait) { struct ucma_file *file = filp->private_data; __poll_t mask = 0; poll_wait(filp, &file->poll_wait, wait); if (!list_empty(&file->event_list)) mask = EPOLLIN | EPOLLRDNORM; return mask; } /* * ucma_open() does not need the BKL: * * - no global state is referred to; * - there is no ioctl method to race against; * - no further module initialization is required for open to work * after the device is registered. */ static int ucma_open(struct inode *inode, struct file *filp) { struct ucma_file *file; file = kmalloc(sizeof *file, GFP_KERNEL); if (!file) return -ENOMEM; INIT_LIST_HEAD(&file->event_list); INIT_LIST_HEAD(&file->ctx_list); init_waitqueue_head(&file->poll_wait); mutex_init(&file->mut); filp->private_data = file; file->filp = filp; return stream_open(inode, filp); } static int ucma_close(struct inode *inode, struct file *filp) { struct ucma_file *file = filp->private_data; /* * All paths that touch ctx_list or ctx_list starting from write() are * prevented by this being a FD release function. The list_add_tail() in * ucma_connect_event_handler() can run concurrently, however it only * adds to the list *after* a listening ID. By only reading the first of * the list, and relying on ucma_destroy_private_ctx() to block * ucma_connect_event_handler(), no additional locking is needed. */ while (!list_empty(&file->ctx_list)) { struct ucma_context *ctx = list_first_entry( &file->ctx_list, struct ucma_context, list); WARN_ON(xa_cmpxchg(&ctx_table, ctx->id, ctx, XA_ZERO_ENTRY, GFP_KERNEL) != ctx); ucma_destroy_private_ctx(ctx); } kfree(file); return 0; } static const struct file_operations ucma_fops = { .owner = THIS_MODULE, .open = ucma_open, .release = ucma_close, .write = ucma_write, .poll = ucma_poll, .llseek = no_llseek, }; static struct miscdevice ucma_misc = { .minor = MISC_DYNAMIC_MINOR, .name = "rdma_cm", .nodename = "infiniband/rdma_cm", .mode = 0666, .fops = &ucma_fops, }; static int ucma_get_global_nl_info(struct ib_client_nl_info *res) { res->abi = RDMA_USER_CM_ABI_VERSION; res->cdev = ucma_misc.this_device; return 0; } static struct ib_client rdma_cma_client = { .name = "rdma_cm", .get_global_nl_info = ucma_get_global_nl_info, }; MODULE_ALIAS_RDMA_CLIENT("rdma_cm"); static ssize_t abi_version_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", RDMA_USER_CM_ABI_VERSION); } static DEVICE_ATTR_RO(abi_version); static int __init ucma_init(void) { int ret; ret = misc_register(&ucma_misc); if (ret) return ret; ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version); if (ret) { pr_err("rdma_ucm: couldn't create abi_version attr\n"); goto err1; } ucma_ctl_table_hdr = register_net_sysctl(&init_net, "net/rdma_ucm", ucma_ctl_table); if (!ucma_ctl_table_hdr) { pr_err("rdma_ucm: couldn't register sysctl paths\n"); ret = -ENOMEM; goto err2; } ret = ib_register_client(&rdma_cma_client); if (ret) goto err3; return 0; err3: unregister_net_sysctl_table(ucma_ctl_table_hdr); err2: device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); err1: misc_deregister(&ucma_misc); return ret; } static void __exit ucma_cleanup(void) { ib_unregister_client(&rdma_cma_client); unregister_net_sysctl_table(ucma_ctl_table_hdr); device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); misc_deregister(&ucma_misc); } module_init(ucma_init); module_exit(ucma_cleanup); |
| 11 18 17 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 | /* * Copyright (c) 2016 Intel Corporation * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting documentation, and * that the name of the copyright holders not be used in advertising or * publicity pertaining to distribution of the software without specific, * written prior permission. The copyright holders make no representations * about the suitability of this software for any purpose. It is provided "as * is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THIS SOFTWARE. */ #ifndef __DRM_PLANE_H__ #define __DRM_PLANE_H__ #include <linux/list.h> #include <linux/ctype.h> #include <linux/kmsg_dump.h> #include <drm/drm_mode_object.h> #include <drm/drm_color_mgmt.h> #include <drm/drm_rect.h> #include <drm/drm_modeset_lock.h> #include <drm/drm_util.h> struct drm_crtc; struct drm_plane_size_hint; struct drm_printer; struct drm_modeset_acquire_ctx; enum drm_scaling_filter { DRM_SCALING_FILTER_DEFAULT, DRM_SCALING_FILTER_NEAREST_NEIGHBOR, }; /** * struct drm_plane_state - mutable plane state * * Please note that the destination coordinates @crtc_x, @crtc_y, @crtc_h and * @crtc_w and the source coordinates @src_x, @src_y, @src_h and @src_w are the * raw coordinates provided by userspace. Drivers should use * drm_atomic_helper_check_plane_state() and only use the derived rectangles in * @src and @dst to program the hardware. */ struct drm_plane_state { /** @plane: backpointer to the plane */ struct drm_plane *plane; /** * @crtc: * * Currently bound CRTC, NULL if disabled. Do not write this directly, * use drm_atomic_set_crtc_for_plane() */ struct drm_crtc *crtc; /** * @fb: * * Currently bound framebuffer. Do not write this directly, use * drm_atomic_set_fb_for_plane() */ struct drm_framebuffer *fb; /** * @fence: * * Optional fence to wait for before scanning out @fb. The core atomic * code will set this when userspace is using explicit fencing. Do not * write this field directly for a driver's implicit fence. * * Drivers should store any implicit fence in this from their * &drm_plane_helper_funcs.prepare_fb callback. See * drm_gem_plane_helper_prepare_fb() for a suitable helper. */ struct dma_fence *fence; /** * @crtc_x: * * Left position of visible portion of plane on crtc, signed dest * location allows it to be partially off screen. */ int32_t crtc_x; /** * @crtc_y: * * Upper position of visible portion of plane on crtc, signed dest * location allows it to be partially off screen. */ int32_t crtc_y; /** @crtc_w: width of visible portion of plane on crtc */ /** @crtc_h: height of visible portion of plane on crtc */ uint32_t crtc_w, crtc_h; /** * @src_x: left position of visible portion of plane within plane (in * 16.16 fixed point). */ uint32_t src_x; /** * @src_y: upper position of visible portion of plane within plane (in * 16.16 fixed point). */ uint32_t src_y; /** @src_w: width of visible portion of plane (in 16.16) */ /** @src_h: height of visible portion of plane (in 16.16) */ uint32_t src_h, src_w; /** @hotspot_x: x offset to mouse cursor hotspot */ /** @hotspot_y: y offset to mouse cursor hotspot */ int32_t hotspot_x, hotspot_y; /** * @alpha: * Opacity of the plane with 0 as completely transparent and 0xffff as * completely opaque. See drm_plane_create_alpha_property() for more * details. */ u16 alpha; /** * @pixel_blend_mode: * The alpha blending equation selection, describing how the pixels from * the current plane are composited with the background. Value can be * one of DRM_MODE_BLEND_* */ uint16_t pixel_blend_mode; /** * @rotation: * Rotation of the plane. See drm_plane_create_rotation_property() for * more details. */ unsigned int rotation; /** * @zpos: * Priority of the given plane on crtc (optional). * * User-space may set mutable zpos properties so that multiple active * planes on the same CRTC have identical zpos values. This is a * user-space bug, but drivers can solve the conflict by comparing the * plane object IDs; the plane with a higher ID is stacked on top of a * plane with a lower ID. * * See drm_plane_create_zpos_property() and * drm_plane_create_zpos_immutable_property() for more details. */ unsigned int zpos; /** * @normalized_zpos: * Normalized value of zpos: unique, range from 0 to N-1 where N is the * number of active planes for given crtc. Note that the driver must set * &drm_mode_config.normalize_zpos or call drm_atomic_normalize_zpos() to * update this before it can be trusted. */ unsigned int normalized_zpos; /** * @color_encoding: * * Color encoding for non RGB formats */ enum drm_color_encoding color_encoding; /** * @color_range: * * Color range for non RGB formats */ enum drm_color_range color_range; /** * @fb_damage_clips: * * Blob representing damage (area in plane framebuffer that changed * since last plane update) as an array of &drm_mode_rect in framebuffer * coodinates of the attached framebuffer. Note that unlike plane src, * damage clips are not in 16.16 fixed point. * * See drm_plane_get_damage_clips() and * drm_plane_get_damage_clips_count() for accessing these. */ struct drm_property_blob *fb_damage_clips; /** * @ignore_damage_clips: * * Set by drivers to indicate the drm_atomic_helper_damage_iter_init() * helper that the @fb_damage_clips blob property should be ignored. * * See :ref:`damage_tracking_properties` for more information. */ bool ignore_damage_clips; /** * @src: * * source coordinates of the plane (in 16.16). * * When using drm_atomic_helper_check_plane_state(), * the coordinates are clipped, but the driver may choose * to use unclipped coordinates instead when the hardware * performs the clipping automatically. */ /** * @dst: * * clipped destination coordinates of the plane. * * When using drm_atomic_helper_check_plane_state(), * the coordinates are clipped, but the driver may choose * to use unclipped coordinates instead when the hardware * performs the clipping automatically. */ struct drm_rect src, dst; /** * @visible: * * Visibility of the plane. This can be false even if fb!=NULL and * crtc!=NULL, due to clipping. */ bool visible; /** * @scaling_filter: * * Scaling filter to be applied */ enum drm_scaling_filter scaling_filter; /** * @commit: Tracks the pending commit to prevent use-after-free conditions, * and for async plane updates. * * May be NULL. */ struct drm_crtc_commit *commit; /** @state: backpointer to global drm_atomic_state */ struct drm_atomic_state *state; /** * @color_mgmt_changed: Color management properties have changed. Used * by the atomic helpers and drivers to steer the atomic commit control * flow. */ bool color_mgmt_changed : 1; }; static inline struct drm_rect drm_plane_state_src(const struct drm_plane_state *state) { struct drm_rect src = { .x1 = state->src_x, .y1 = state->src_y, .x2 = state->src_x + state->src_w, .y2 = state->src_y + state->src_h, }; return src; } static inline struct drm_rect drm_plane_state_dest(const struct drm_plane_state *state) { struct drm_rect dest = { .x1 = state->crtc_x, .y1 = state->crtc_y, .x2 = state->crtc_x + state->crtc_w, .y2 = state->crtc_y + state->crtc_h, }; return dest; } /** * struct drm_plane_funcs - driver plane control functions */ struct drm_plane_funcs { /** * @update_plane: * * This is the legacy entry point to enable and configure the plane for * the given CRTC and framebuffer. It is never called to disable the * plane, i.e. the passed-in crtc and fb paramters are never NULL. * * The source rectangle in frame buffer memory coordinates is given by * the src_x, src_y, src_w and src_h parameters (as 16.16 fixed point * values). Devices that don't support subpixel plane coordinates can * ignore the fractional part. * * The destination rectangle in CRTC coordinates is given by the * crtc_x, crtc_y, crtc_w and crtc_h parameters (as integer values). * Devices scale the source rectangle to the destination rectangle. If * scaling is not supported, and the source rectangle size doesn't match * the destination rectangle size, the driver must return a * -<errorname>EINVAL</errorname> error. * * Drivers implementing atomic modeset should use * drm_atomic_helper_update_plane() to implement this hook. * * RETURNS: * * 0 on success or a negative error code on failure. */ int (*update_plane)(struct drm_plane *plane, struct drm_crtc *crtc, struct drm_framebuffer *fb, int crtc_x, int crtc_y, unsigned int crtc_w, unsigned int crtc_h, uint32_t src_x, uint32_t src_y, uint32_t src_w, uint32_t src_h, struct drm_modeset_acquire_ctx *ctx); /** * @disable_plane: * * This is the legacy entry point to disable the plane. The DRM core * calls this method in response to a DRM_IOCTL_MODE_SETPLANE IOCTL call * with the frame buffer ID set to 0. Disabled planes must not be * processed by the CRTC. * * Drivers implementing atomic modeset should use * drm_atomic_helper_disable_plane() to implement this hook. * * RETURNS: * * 0 on success or a negative error code on failure. */ int (*disable_plane)(struct drm_plane *plane, struct drm_modeset_acquire_ctx *ctx); /** * @destroy: * * Clean up plane resources. This is only called at driver unload time * through drm_mode_config_cleanup() since a plane cannot be hotplugged * in DRM. */ void (*destroy)(struct drm_plane *plane); /** * @reset: * * Reset plane hardware and software state to off. This function isn't * called by the core directly, only through drm_mode_config_reset(). * It's not a helper hook only for historical reasons. * * Atomic drivers can use drm_atomic_helper_plane_reset() to reset * atomic state using this hook. */ void (*reset)(struct drm_plane *plane); /** * @set_property: * * This is the legacy entry point to update a property attached to the * plane. * * This callback is optional if the driver does not support any legacy * driver-private properties. For atomic drivers it is not used because * property handling is done entirely in the DRM core. * * RETURNS: * * 0 on success or a negative error code on failure. */ int (*set_property)(struct drm_plane *plane, struct drm_property *property, uint64_t val); /** * @atomic_duplicate_state: * * Duplicate the current atomic state for this plane and return it. * The core and helpers guarantee that any atomic state duplicated with * this hook and still owned by the caller (i.e. not transferred to the * driver by calling &drm_mode_config_funcs.atomic_commit) will be * cleaned up by calling the @atomic_destroy_state hook in this * structure. * * This callback is mandatory for atomic drivers. * * Atomic drivers which don't subclass &struct drm_plane_state should use * drm_atomic_helper_plane_duplicate_state(). Drivers that subclass the * state structure to extend it with driver-private state should use * __drm_atomic_helper_plane_duplicate_state() to make sure shared state is * duplicated in a consistent fashion across drivers. * * It is an error to call this hook before &drm_plane.state has been * initialized correctly. * * NOTE: * * If the duplicate state references refcounted resources this hook must * acquire a reference for each of them. The driver must release these * references again in @atomic_destroy_state. * * RETURNS: * * Duplicated atomic state or NULL when the allocation failed. */ struct drm_plane_state *(*atomic_duplicate_state)(struct drm_plane *plane); /** * @atomic_destroy_state: * * Destroy a state duplicated with @atomic_duplicate_state and release * or unreference all resources it references * * This callback is mandatory for atomic drivers. */ void (*atomic_destroy_state)(struct drm_plane *plane, struct drm_plane_state *state); /** * @atomic_set_property: * * Decode a driver-private property value and store the decoded value * into the passed-in state structure. Since the atomic core decodes all * standardized properties (even for extensions beyond the core set of * properties which might not be implemented by all drivers) this * requires drivers to subclass the state structure. * * Such driver-private properties should really only be implemented for * truly hardware/vendor specific state. Instead it is preferred to * standardize atomic extension and decode the properties used to expose * such an extension in the core. * * Do not call this function directly, use * drm_atomic_plane_set_property() instead. * * This callback is optional if the driver does not support any * driver-private atomic properties. * * NOTE: * * This function is called in the state assembly phase of atomic * modesets, which can be aborted for any reason (including on * userspace's request to just check whether a configuration would be * possible). Drivers MUST NOT touch any persistent state (hardware or * software) or data structures except the passed in @state parameter. * * Also since userspace controls in which order properties are set this * function must not do any input validation (since the state update is * incomplete and hence likely inconsistent). Instead any such input * validation must be done in the various atomic_check callbacks. * * RETURNS: * * 0 if the property has been found, -EINVAL if the property isn't * implemented by the driver (which shouldn't ever happen, the core only * asks for properties attached to this plane). No other validation is * allowed by the driver. The core already checks that the property * value is within the range (integer, valid enum value, ...) the driver * set when registering the property. */ int (*atomic_set_property)(struct drm_plane *plane, struct drm_plane_state *state, struct drm_property *property, uint64_t val); /** * @atomic_get_property: * * Reads out the decoded driver-private property. This is used to * implement the GETPLANE IOCTL. * * Do not call this function directly, use * drm_atomic_plane_get_property() instead. * * This callback is optional if the driver does not support any * driver-private atomic properties. * * RETURNS: * * 0 on success, -EINVAL if the property isn't implemented by the * driver (which should never happen, the core only asks for * properties attached to this plane). */ int (*atomic_get_property)(struct drm_plane *plane, const struct drm_plane_state *state, struct drm_property *property, uint64_t *val); /** * @late_register: * * This optional hook can be used to register additional userspace * interfaces attached to the plane like debugfs interfaces. * It is called late in the driver load sequence from drm_dev_register(). * Everything added from this callback should be unregistered in * the early_unregister callback. * * Returns: * * 0 on success, or a negative error code on failure. */ int (*late_register)(struct drm_plane *plane); /** * @early_unregister: * * This optional hook should be used to unregister the additional * userspace interfaces attached to the plane from * @late_register. It is called from drm_dev_unregister(), * early in the driver unload sequence to disable userspace access * before data structures are torndown. */ void (*early_unregister)(struct drm_plane *plane); /** * @atomic_print_state: * * If driver subclasses &struct drm_plane_state, it should implement * this optional hook for printing additional driver specific state. * * Do not call this directly, use drm_atomic_plane_print_state() * instead. */ void (*atomic_print_state)(struct drm_printer *p, const struct drm_plane_state *state); /** * @format_mod_supported: * * This optional hook is used for the DRM to determine if the given * format/modifier combination is valid for the plane. This allows the * DRM to generate the correct format bitmask (which formats apply to * which modifier), and to validate modifiers at atomic_check time. * * If not present, then any modifier in the plane's modifier * list is allowed with any of the plane's formats. * * Returns: * * True if the given modifier is valid for that format on the plane. * False otherwise. */ bool (*format_mod_supported)(struct drm_plane *plane, uint32_t format, uint64_t modifier); }; /** * enum drm_plane_type - uapi plane type enumeration * * For historical reasons not all planes are made the same. This enumeration is * used to tell the different types of planes apart to implement the different * uapi semantics for them. For userspace which is universal plane aware and * which is using that atomic IOCTL there's no difference between these planes * (beyong what the driver and hardware can support of course). * * For compatibility with legacy userspace, only overlay planes are made * available to userspace by default. Userspace clients may set the * &DRM_CLIENT_CAP_UNIVERSAL_PLANES client capability bit to indicate that they * wish to receive a universal plane list containing all plane types. See also * drm_for_each_legacy_plane(). * * In addition to setting each plane's type, drivers need to setup the * &drm_crtc.primary and optionally &drm_crtc.cursor pointers for legacy * IOCTLs. See drm_crtc_init_with_planes(). * * WARNING: The values of this enum is UABI since they're exposed in the "type" * property. */ enum drm_plane_type { /** * @DRM_PLANE_TYPE_OVERLAY: * * Overlay planes represent all non-primary, non-cursor planes. Some * drivers refer to these types of planes as "sprites" internally. */ DRM_PLANE_TYPE_OVERLAY, /** * @DRM_PLANE_TYPE_PRIMARY: * * A primary plane attached to a CRTC is the most likely to be able to * light up the CRTC when no scaling/cropping is used and the plane * covers the whole CRTC. */ DRM_PLANE_TYPE_PRIMARY, /** * @DRM_PLANE_TYPE_CURSOR: * * A cursor plane attached to a CRTC is more likely to be able to be * enabled when no scaling/cropping is used and the framebuffer has the * size indicated by &drm_mode_config.cursor_width and * &drm_mode_config.cursor_height. Additionally, if the driver doesn't * support modifiers, the framebuffer should have a linear layout. */ DRM_PLANE_TYPE_CURSOR, }; /** * struct drm_plane - central DRM plane control structure * * Planes represent the scanout hardware of a display block. They receive their * input data from a &drm_framebuffer and feed it to a &drm_crtc. Planes control * the color conversion, see `Plane Composition Properties`_ for more details, * and are also involved in the color conversion of input pixels, see `Color * Management Properties`_ for details on that. */ struct drm_plane { /** @dev: DRM device this plane belongs to */ struct drm_device *dev; /** * @head: * * List of all planes on @dev, linked from &drm_mode_config.plane_list. * Invariant over the lifetime of @dev and therefore does not need * locking. */ struct list_head head; /** @name: human readable name, can be overwritten by the driver */ char *name; /** * @mutex: * * Protects modeset plane state, together with the &drm_crtc.mutex of * CRTC this plane is linked to (when active, getting activated or * getting disabled). * * For atomic drivers specifically this protects @state. */ struct drm_modeset_lock mutex; /** @base: base mode object */ struct drm_mode_object base; /** * @possible_crtcs: pipes this plane can be bound to constructed from * drm_crtc_mask() */ uint32_t possible_crtcs; /** @format_types: array of formats supported by this plane */ uint32_t *format_types; /** @format_count: Size of the array pointed at by @format_types. */ unsigned int format_count; /** * @format_default: driver hasn't supplied supported formats for the * plane. Used by the non-atomic driver compatibility wrapper only. */ bool format_default; /** @modifiers: array of modifiers supported by this plane */ uint64_t *modifiers; /** @modifier_count: Size of the array pointed at by @modifier_count. */ unsigned int modifier_count; /** * @crtc: * * Currently bound CRTC, only meaningful for non-atomic drivers. For * atomic drivers this is forced to be NULL, atomic drivers should * instead check &drm_plane_state.crtc. */ struct drm_crtc *crtc; /** * @fb: * * Currently bound framebuffer, only meaningful for non-atomic drivers. * For atomic drivers this is forced to be NULL, atomic drivers should * instead check &drm_plane_state.fb. */ struct drm_framebuffer *fb; /** * @old_fb: * * Temporary tracking of the old fb while a modeset is ongoing. Only * used by non-atomic drivers, forced to be NULL for atomic drivers. */ struct drm_framebuffer *old_fb; /** @funcs: plane control functions */ const struct drm_plane_funcs *funcs; /** @properties: property tracking for this plane */ struct drm_object_properties properties; /** @type: Type of plane, see &enum drm_plane_type for details. */ enum drm_plane_type type; /** * @index: Position inside the mode_config.list, can be used as an array * index. It is invariant over the lifetime of the plane. */ unsigned index; /** @helper_private: mid-layer private data */ const struct drm_plane_helper_funcs *helper_private; /** * @state: * * Current atomic state for this plane. * * This is protected by @mutex. Note that nonblocking atomic commits * access the current plane state without taking locks. Either by going * through the &struct drm_atomic_state pointers, see * for_each_oldnew_plane_in_state(), for_each_old_plane_in_state() and * for_each_new_plane_in_state(). Or through careful ordering of atomic * commit operations as implemented in the atomic helpers, see * &struct drm_crtc_commit. */ struct drm_plane_state *state; /** * @alpha_property: * Optional alpha property for this plane. See * drm_plane_create_alpha_property(). */ struct drm_property *alpha_property; /** * @zpos_property: * Optional zpos property for this plane. See * drm_plane_create_zpos_property(). */ struct drm_property *zpos_property; /** * @rotation_property: * Optional rotation property for this plane. See * drm_plane_create_rotation_property(). */ struct drm_property *rotation_property; /** * @blend_mode_property: * Optional "pixel blend mode" enum property for this plane. * Blend mode property represents the alpha blending equation selection, * describing how the pixels from the current plane are composited with * the background. */ struct drm_property *blend_mode_property; /** * @color_encoding_property: * * Optional "COLOR_ENCODING" enum property for specifying * color encoding for non RGB formats. * See drm_plane_create_color_properties(). */ struct drm_property *color_encoding_property; /** * @color_range_property: * * Optional "COLOR_RANGE" enum property for specifying * color range for non RGB formats. * See drm_plane_create_color_properties(). */ struct drm_property *color_range_property; /** * @scaling_filter_property: property to apply a particular filter while * scaling. */ struct drm_property *scaling_filter_property; /** * @hotspot_x_property: property to set mouse hotspot x offset. */ struct drm_property *hotspot_x_property; /** * @hotspot_y_property: property to set mouse hotspot y offset. */ struct drm_property *hotspot_y_property; /** * @kmsg_panic: Used to register a panic notifier for this plane */ struct kmsg_dumper kmsg_panic; }; #define obj_to_plane(x) container_of(x, struct drm_plane, base) __printf(9, 10) int drm_universal_plane_init(struct drm_device *dev, struct drm_plane *plane, uint32_t possible_crtcs, const struct drm_plane_funcs *funcs, const uint32_t *formats, unsigned int format_count, const uint64_t *format_modifiers, enum drm_plane_type type, const char *name, ...); void drm_plane_cleanup(struct drm_plane *plane); __printf(10, 11) void *__drmm_universal_plane_alloc(struct drm_device *dev, size_t size, size_t offset, uint32_t possible_crtcs, const struct drm_plane_funcs *funcs, const uint32_t *formats, unsigned int format_count, const uint64_t *format_modifiers, enum drm_plane_type plane_type, const char *name, ...); /** * drmm_universal_plane_alloc - Allocate and initialize an universal plane object * @dev: DRM device * @type: the type of the struct which contains struct &drm_plane * @member: the name of the &drm_plane within @type * @possible_crtcs: bitmask of possible CRTCs * @funcs: callbacks for the new plane * @formats: array of supported formats (DRM_FORMAT\_\*) * @format_count: number of elements in @formats * @format_modifiers: array of struct drm_format modifiers terminated by * DRM_FORMAT_MOD_INVALID * @plane_type: type of plane (overlay, primary, cursor) * @name: printf style format string for the plane name, or NULL for default name * * Allocates and initializes a plane object of type @type. Cleanup is * automatically handled through registering drm_plane_cleanup() with * drmm_add_action(). * * The @drm_plane_funcs.destroy hook must be NULL. * * Drivers that only support the DRM_FORMAT_MOD_LINEAR modifier support may set * @format_modifiers to NULL. The plane will advertise the linear modifier. * * Returns: * Pointer to new plane, or ERR_PTR on failure. */ #define drmm_universal_plane_alloc(dev, type, member, possible_crtcs, funcs, formats, \ format_count, format_modifiers, plane_type, name, ...) \ ((type *)__drmm_universal_plane_alloc(dev, sizeof(type), \ offsetof(type, member), \ possible_crtcs, funcs, formats, \ format_count, format_modifiers, \ plane_type, name, ##__VA_ARGS__)) __printf(10, 11) void *__drm_universal_plane_alloc(struct drm_device *dev, size_t size, size_t offset, uint32_t possible_crtcs, const struct drm_plane_funcs *funcs, const uint32_t *formats, unsigned int format_count, const uint64_t *format_modifiers, enum drm_plane_type plane_type, const char *name, ...); /** * drm_universal_plane_alloc() - Allocate and initialize an universal plane object * @dev: DRM device * @type: the type of the struct which contains struct &drm_plane * @member: the name of the &drm_plane within @type * @possible_crtcs: bitmask of possible CRTCs * @funcs: callbacks for the new plane * @formats: array of supported formats (DRM_FORMAT\_\*) * @format_count: number of elements in @formats * @format_modifiers: array of struct drm_format modifiers terminated by * DRM_FORMAT_MOD_INVALID * @plane_type: type of plane (overlay, primary, cursor) * @name: printf style format string for the plane name, or NULL for default name * * Allocates and initializes a plane object of type @type. The caller * is responsible for releasing the allocated memory with kfree(). * * Drivers are encouraged to use drmm_universal_plane_alloc() instead. * * Drivers that only support the DRM_FORMAT_MOD_LINEAR modifier support may set * @format_modifiers to NULL. The plane will advertise the linear modifier. * * Returns: * Pointer to new plane, or ERR_PTR on failure. */ #define drm_universal_plane_alloc(dev, type, member, possible_crtcs, funcs, formats, \ format_count, format_modifiers, plane_type, name, ...) \ ((type *)__drm_universal_plane_alloc(dev, sizeof(type), \ offsetof(type, member), \ possible_crtcs, funcs, formats, \ format_count, format_modifiers, \ plane_type, name, ##__VA_ARGS__)) /** * drm_plane_index - find the index of a registered plane * @plane: plane to find index for * * Given a registered plane, return the index of that plane within a DRM * device's list of planes. */ static inline unsigned int drm_plane_index(const struct drm_plane *plane) { return plane->index; } /** * drm_plane_mask - find the mask of a registered plane * @plane: plane to find mask for */ static inline u32 drm_plane_mask(const struct drm_plane *plane) { return 1 << drm_plane_index(plane); } struct drm_plane * drm_plane_from_index(struct drm_device *dev, int idx); void drm_plane_force_disable(struct drm_plane *plane); int drm_mode_plane_set_obj_prop(struct drm_plane *plane, struct drm_property *property, uint64_t value); /** * drm_plane_find - find a &drm_plane * @dev: DRM device * @file_priv: drm file to check for lease against. * @id: plane id * * Returns the plane with @id, NULL if it doesn't exist. Simple wrapper around * drm_mode_object_find(). */ static inline struct drm_plane *drm_plane_find(struct drm_device *dev, struct drm_file *file_priv, uint32_t id) { struct drm_mode_object *mo; mo = drm_mode_object_find(dev, file_priv, id, DRM_MODE_OBJECT_PLANE); return mo ? obj_to_plane(mo) : NULL; } /** * drm_for_each_plane_mask - iterate over planes specified by bitmask * @plane: the loop cursor * @dev: the DRM device * @plane_mask: bitmask of plane indices * * Iterate over all planes specified by bitmask. */ #define drm_for_each_plane_mask(plane, dev, plane_mask) \ list_for_each_entry((plane), &(dev)->mode_config.plane_list, head) \ for_each_if ((plane_mask) & drm_plane_mask(plane)) /** * drm_for_each_legacy_plane - iterate over all planes for legacy userspace * @plane: the loop cursor * @dev: the DRM device * * Iterate over all legacy planes of @dev, excluding primary and cursor planes. * This is useful for implementing userspace apis when userspace is not * universal plane aware. See also &enum drm_plane_type. */ #define drm_for_each_legacy_plane(plane, dev) \ list_for_each_entry(plane, &(dev)->mode_config.plane_list, head) \ for_each_if (plane->type == DRM_PLANE_TYPE_OVERLAY) /** * drm_for_each_plane - iterate over all planes * @plane: the loop cursor * @dev: the DRM device * * Iterate over all planes of @dev, include primary and cursor planes. */ #define drm_for_each_plane(plane, dev) \ list_for_each_entry(plane, &(dev)->mode_config.plane_list, head) bool drm_any_plane_has_format(struct drm_device *dev, u32 format, u64 modifier); void drm_plane_enable_fb_damage_clips(struct drm_plane *plane); unsigned int drm_plane_get_damage_clips_count(const struct drm_plane_state *state); struct drm_mode_rect * drm_plane_get_damage_clips(const struct drm_plane_state *state); int drm_plane_create_scaling_filter_property(struct drm_plane *plane, unsigned int supported_filters); int drm_plane_add_size_hints_property(struct drm_plane *plane, const struct drm_plane_size_hint *hints, int num_hints); #endif |
| 649 642 636 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _X_TABLES_H #define _X_TABLES_H #include <linux/netdevice.h> #include <linux/static_key.h> #include <linux/netfilter.h> #include <uapi/linux/netfilter/x_tables.h> /* Test a struct->invflags and a boolean for inequality */ #define NF_INVF(ptr, flag, boolean) \ ((boolean) ^ !!((ptr)->invflags & (flag))) /** * struct xt_action_param - parameters for matches/targets * * @match: the match extension * @target: the target extension * @matchinfo: per-match data * @targetinfo: per-target data * @state: pointer to hook state this packet came from * @fragoff: packet is a fragment, this is the data offset * @thoff: position of transport header relative to skb->data * * Fields written to by extensions: * * @hotdrop: drop packet if we had inspection problems */ struct xt_action_param { union { const struct xt_match *match; const struct xt_target *target; }; union { const void *matchinfo, *targinfo; }; const struct nf_hook_state *state; unsigned int thoff; u16 fragoff; bool hotdrop; }; static inline struct net *xt_net(const struct xt_action_param *par) { return par->state->net; } static inline struct net_device *xt_in(const struct xt_action_param *par) { return par->state->in; } static inline const char *xt_inname(const struct xt_action_param *par) { return par->state->in->name; } static inline struct net_device *xt_out(const struct xt_action_param *par) { return par->state->out; } static inline const char *xt_outname(const struct xt_action_param *par) { return par->state->out->name; } static inline unsigned int xt_hooknum(const struct xt_action_param *par) { return par->state->hook; } static inline u_int8_t xt_family(const struct xt_action_param *par) { return par->state->pf; } /** * struct xt_mtchk_param - parameters for match extensions' * checkentry functions * * @net: network namespace through which the check was invoked * @table: table the rule is tried to be inserted into * @entryinfo: the family-specific rule data * (struct ipt_ip, ip6t_ip, arpt_arp or (note) ebt_entry) * @match: struct xt_match through which this function was invoked * @matchinfo: per-match data * @hook_mask: via which hooks the new rule is reachable * Other fields as above. */ struct xt_mtchk_param { struct net *net; const char *table; const void *entryinfo; const struct xt_match *match; void *matchinfo; unsigned int hook_mask; u_int8_t family; bool nft_compat; }; /** * struct xt_mdtor_param - match destructor parameters * Fields as above. */ struct xt_mtdtor_param { struct net *net; const struct xt_match *match; void *matchinfo; u_int8_t family; }; /** * struct xt_tgchk_param - parameters for target extensions' * checkentry functions * * @entryinfo: the family-specific rule data * (struct ipt_entry, ip6t_entry, arpt_entry, ebt_entry) * * Other fields see above. */ struct xt_tgchk_param { struct net *net; const char *table; const void *entryinfo; const struct xt_target *target; void *targinfo; unsigned int hook_mask; u_int8_t family; bool nft_compat; }; /* Target destructor parameters */ struct xt_tgdtor_param { struct net *net; const struct xt_target *target; void *targinfo; u_int8_t family; }; struct xt_match { struct list_head list; const char name[XT_EXTENSION_MAXNAMELEN]; u_int8_t revision; /* Return true or false: return FALSE and set *hotdrop = 1 to force immediate packet drop. */ /* Arguments changed since 2.6.9, as this must now handle non-linear skb, using skb_header_pointer and skb_ip_make_writable. */ bool (*match)(const struct sk_buff *skb, struct xt_action_param *); /* Called when user tries to insert an entry of this type. */ int (*checkentry)(const struct xt_mtchk_param *); /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_mtdtor_param *); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT /* Called when userspace align differs from kernel space one */ void (*compat_from_user)(void *dst, const void *src); int (*compat_to_user)(void __user *dst, const void *src); #endif /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; const char *table; unsigned int matchsize; unsigned int usersize; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT unsigned int compatsize; #endif unsigned int hooks; unsigned short proto; unsigned short family; }; /* Registration hooks for targets. */ struct xt_target { struct list_head list; const char name[XT_EXTENSION_MAXNAMELEN]; u_int8_t revision; /* Returns verdict. Argument order changed since 2.6.9, as this must now handle non-linear skbs, using skb_copy_bits and skb_ip_make_writable. */ unsigned int (*target)(struct sk_buff *skb, const struct xt_action_param *); /* Called when user tries to insert an entry of this type: hook_mask is a bitmask of hooks from which it can be called. */ /* Should return 0 on success or an error code otherwise (-Exxxx). */ int (*checkentry)(const struct xt_tgchk_param *); /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_tgdtor_param *); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT /* Called when userspace align differs from kernel space one */ void (*compat_from_user)(void *dst, const void *src); int (*compat_to_user)(void __user *dst, const void *src); #endif /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; const char *table; unsigned int targetsize; unsigned int usersize; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT unsigned int compatsize; #endif unsigned int hooks; unsigned short proto; unsigned short family; }; /* Furniture shopping... */ struct xt_table { struct list_head list; /* What hooks you will enter on */ unsigned int valid_hooks; /* Man behind the curtain... */ struct xt_table_info *private; /* hook ops that register the table with the netfilter core */ struct nf_hook_ops *ops; /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; u_int8_t af; /* address/protocol family */ int priority; /* hook order */ /* A unique name... */ const char name[XT_TABLE_MAXNAMELEN]; }; #include <linux/netfilter_ipv4.h> /* The table itself */ struct xt_table_info { /* Size per table */ unsigned int size; /* Number of entries: FIXME. --RR */ unsigned int number; /* Initial number of entries. Needed for module usage count */ unsigned int initial_entries; /* Entry points and underflows */ unsigned int hook_entry[NF_INET_NUMHOOKS]; unsigned int underflow[NF_INET_NUMHOOKS]; /* * Number of user chains. Since tables cannot have loops, at most * @stacksize jumps (number of user chains) can possibly be made. */ unsigned int stacksize; void ***jumpstack; unsigned char entries[] __aligned(8); }; int xt_register_target(struct xt_target *target); void xt_unregister_target(struct xt_target *target); int xt_register_targets(struct xt_target *target, unsigned int n); void xt_unregister_targets(struct xt_target *target, unsigned int n); int xt_register_match(struct xt_match *target); void xt_unregister_match(struct xt_match *target); int xt_register_matches(struct xt_match *match, unsigned int n); void xt_unregister_matches(struct xt_match *match, unsigned int n); int xt_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset); int xt_check_table_hooks(const struct xt_table_info *info, unsigned int valid_hooks); unsigned int *xt_alloc_entry_offsets(unsigned int size); bool xt_find_jump_offset(const unsigned int *offsets, unsigned int target, unsigned int size); int xt_check_proc_name(const char *name, unsigned int size); int xt_check_match(struct xt_mtchk_param *, unsigned int size, u16 proto, bool inv_proto); int xt_check_target(struct xt_tgchk_param *, unsigned int size, u16 proto, bool inv_proto); int xt_match_to_user(const struct xt_entry_match *m, struct xt_entry_match __user *u); int xt_target_to_user(const struct xt_entry_target *t, struct xt_entry_target __user *u); int xt_data_to_user(void __user *dst, const void *src, int usersize, int size, int aligned_size); void *xt_copy_counters(sockptr_t arg, unsigned int len, struct xt_counters_info *info); struct xt_counters *xt_counters_alloc(unsigned int counters); struct xt_table *xt_register_table(struct net *net, const struct xt_table *table, struct xt_table_info *bootstrap, struct xt_table_info *newinfo); void *xt_unregister_table(struct xt_table *table); struct xt_table_info *xt_replace_table(struct xt_table *table, unsigned int num_counters, struct xt_table_info *newinfo, int *error); struct xt_match *xt_find_match(u8 af, const char *name, u8 revision); struct xt_match *xt_request_find_match(u8 af, const char *name, u8 revision); struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision); int xt_find_revision(u8 af, const char *name, u8 revision, int target, int *err); struct xt_table *xt_find_table(struct net *net, u8 af, const char *name); struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name); struct xt_table *xt_request_find_table_lock(struct net *net, u_int8_t af, const char *name); void xt_table_unlock(struct xt_table *t); int xt_proto_init(struct net *net, u_int8_t af); void xt_proto_fini(struct net *net, u_int8_t af); struct xt_table_info *xt_alloc_table_info(unsigned int size); void xt_free_table_info(struct xt_table_info *info); /** * xt_recseq - recursive seqcount for netfilter use * * Packet processing changes the seqcount only if no recursion happened * get_counters() can use read_seqcount_begin()/read_seqcount_retry(), * because we use the normal seqcount convention : * Low order bit set to 1 if a writer is active. */ DECLARE_PER_CPU(seqcount_t, xt_recseq); /* xt_tee_enabled - true if x_tables needs to handle reentrancy * * Enabled if current ip(6)tables ruleset has at least one -j TEE rule. */ extern struct static_key xt_tee_enabled; /** * xt_write_recseq_begin - start of a write section * * Begin packet processing : all readers must wait the end * 1) Must be called with preemption disabled * 2) softirqs must be disabled too (or we should use this_cpu_add()) * Returns : * 1 if no recursion on this cpu * 0 if recursion detected */ static inline unsigned int xt_write_recseq_begin(void) { unsigned int addend; /* * Low order bit of sequence is set if we already * called xt_write_recseq_begin(). */ addend = (__this_cpu_read(xt_recseq.sequence) + 1) & 1; /* * This is kind of a write_seqcount_begin(), but addend is 0 or 1 * We dont check addend value to avoid a test and conditional jump, * since addend is most likely 1 */ __this_cpu_add(xt_recseq.sequence, addend); smp_mb(); return addend; } /** * xt_write_recseq_end - end of a write section * @addend: return value from previous xt_write_recseq_begin() * * End packet processing : all readers can proceed * 1) Must be called with preemption disabled * 2) softirqs must be disabled too (or we should use this_cpu_add()) */ static inline void xt_write_recseq_end(unsigned int addend) { /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */ smp_wmb(); __this_cpu_add(xt_recseq.sequence, addend); } /* * This helper is performance critical and must be inlined */ static inline unsigned long ifname_compare_aligned(const char *_a, const char *_b, const char *_mask) { const unsigned long *a = (const unsigned long *)_a; const unsigned long *b = (const unsigned long *)_b; const unsigned long *mask = (const unsigned long *)_mask; unsigned long ret; ret = (a[0] ^ b[0]) & mask[0]; if (IFNAMSIZ > sizeof(unsigned long)) ret |= (a[1] ^ b[1]) & mask[1]; if (IFNAMSIZ > 2 * sizeof(unsigned long)) ret |= (a[2] ^ b[2]) & mask[2]; if (IFNAMSIZ > 3 * sizeof(unsigned long)) ret |= (a[3] ^ b[3]) & mask[3]; BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long)); return ret; } struct xt_percpu_counter_alloc_state { unsigned int off; const char __percpu *mem; }; bool xt_percpu_counter_alloc(struct xt_percpu_counter_alloc_state *state, struct xt_counters *counter); void xt_percpu_counter_free(struct xt_counters *cnt); static inline struct xt_counters * xt_get_this_cpu_counter(struct xt_counters *cnt) { if (nr_cpu_ids > 1) return this_cpu_ptr((void __percpu *) (unsigned long) cnt->pcnt); return cnt; } static inline struct xt_counters * xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu) { if (nr_cpu_ids > 1) return per_cpu_ptr((void __percpu *) (unsigned long) cnt->pcnt, cpu); return cnt; } struct nf_hook_ops *xt_hook_ops_alloc(const struct xt_table *, nf_hookfn *); int xt_register_template(const struct xt_table *t, int(*table_init)(struct net *net)); void xt_unregister_template(const struct xt_table *t); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT #include <net/compat.h> struct compat_xt_entry_match { union { struct { u_int16_t match_size; char name[XT_FUNCTION_MAXNAMELEN - 1]; u_int8_t revision; } user; struct { u_int16_t match_size; compat_uptr_t match; } kernel; u_int16_t match_size; } u; unsigned char data[]; }; struct compat_xt_entry_target { union { struct { u_int16_t target_size; char name[XT_FUNCTION_MAXNAMELEN - 1]; u_int8_t revision; } user; struct { u_int16_t target_size; compat_uptr_t target; } kernel; u_int16_t target_size; } u; unsigned char data[]; }; /* FIXME: this works only on 32 bit tasks * need to change whole approach in order to calculate align as function of * current task alignment */ struct compat_xt_counters { compat_u64 pcnt, bcnt; /* Packet and byte counters */ }; struct compat_xt_counters_info { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t num_counters; struct compat_xt_counters counters[]; }; struct _compat_xt_align { __u8 u8; __u16 u16; __u32 u32; compat_u64 u64; }; #define COMPAT_XT_ALIGN(s) __ALIGN_KERNEL((s), __alignof__(struct _compat_xt_align)) void xt_compat_lock(u_int8_t af); void xt_compat_unlock(u_int8_t af); int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta); void xt_compat_flush_offsets(u_int8_t af); int xt_compat_init_offsets(u8 af, unsigned int number); int xt_compat_calc_jump(u_int8_t af, unsigned int offset); int xt_compat_match_offset(const struct xt_match *match); void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, unsigned int *size); int xt_compat_match_to_user(const struct xt_entry_match *m, void __user **dstptr, unsigned int *size); int xt_compat_target_offset(const struct xt_target *target); void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr, unsigned int *size); int xt_compat_target_to_user(const struct xt_entry_target *t, void __user **dstptr, unsigned int *size); int xt_compat_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset); #endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ #endif /* _X_TABLES_H */ |
| 1 1 1 1 1 1 3 2 2 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 | // SPDX-License-Identifier: GPL-2.0 /* * Implement the manual drop-all-pagecache function */ #include <linux/pagemap.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/writeback.h> #include <linux/sysctl.h> #include <linux/gfp.h> #include <linux/swap.h> #include "internal.h" /* A global variable is a bit ugly, but it keeps the code simple */ int sysctl_drop_caches; static void drop_pagecache_sb(struct super_block *sb, void *unused) { struct inode *inode, *toput_inode = NULL; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); /* * We must skip inodes in unusual state. We may also skip * inodes without pages but we deliberately won't in case * we need to reschedule to avoid softlockups. */ if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || (mapping_empty(inode->i_mapping) && !need_resched())) { spin_unlock(&inode->i_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); iput(toput_inode); } int drop_caches_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int ret; ret = proc_dointvec_minmax(table, write, buffer, length, ppos); if (ret) return ret; if (write) { static int stfu; if (sysctl_drop_caches & 1) { lru_add_drain_all(); iterate_supers(drop_pagecache_sb, NULL); count_vm_event(DROP_PAGECACHE); } if (sysctl_drop_caches & 2) { drop_slab(); count_vm_event(DROP_SLAB); } if (!stfu) { pr_info("%s (%d): drop_caches: %d\n", current->comm, task_pid_nr(current), sysctl_drop_caches); } stfu |= sysctl_drop_caches & 4; } return 0; } |
| 8 39 39 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Global definitions for the ARP (RFC 826) protocol. * * Version: @(#)if_arp.h 1.0.1 04/16/93 * * Authors: Original taken from Berkeley UNIX 4.3, (c) UCB 1986-1988 * Portions taken from the KA9Q/NOS (v2.00m PA0GRI) source. * Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Florian La Roche, * Jonathan Layes <layes@loran.com> * Arnaldo Carvalho de Melo <acme@conectiva.com.br> ARPHRD_HWX25 */ #ifndef _LINUX_IF_ARP_H #define _LINUX_IF_ARP_H #include <linux/skbuff.h> #include <uapi/linux/if_arp.h> static inline struct arphdr *arp_hdr(const struct sk_buff *skb) { return (struct arphdr *)skb_network_header(skb); } static inline unsigned int arp_hdr_len(const struct net_device *dev) { switch (dev->type) { #if IS_ENABLED(CONFIG_FIREWIRE_NET) case ARPHRD_IEEE1394: /* ARP header, device address and 2 IP addresses */ return sizeof(struct arphdr) + dev->addr_len + sizeof(u32) * 2; #endif default: /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ return sizeof(struct arphdr) + (dev->addr_len + sizeof(u32)) * 2; } } static inline bool dev_is_mac_header_xmit(const struct net_device *dev) { switch (dev->type) { case ARPHRD_TUNNEL: case ARPHRD_TUNNEL6: case ARPHRD_SIT: case ARPHRD_IPGRE: case ARPHRD_IP6GRE: case ARPHRD_VOID: case ARPHRD_NONE: case ARPHRD_RAWIP: case ARPHRD_PIMREG: /* PPP adds its l2 header automatically in ppp_start_xmit(). * This makes it look like an l3 device to __bpf_redirect() and tcf_mirred_init(). */ case ARPHRD_PPP: return false; default: return true; } } #endif /* _LINUX_IF_ARP_H */ |
| 1 2 2 1 1 1 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 | // SPDX-License-Identifier: GPL-2.0 #ifndef IOU_FILE_TABLE_H #define IOU_FILE_TABLE_H #include <linux/file.h> #include <linux/io_uring_types.h> bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); void io_free_file_tables(struct io_file_table *table); int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, struct file *file, unsigned int file_slot); int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file, unsigned int file_slot); int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset); int io_register_file_alloc_range(struct io_ring_ctx *ctx, struct io_uring_file_index_range __user *arg); io_req_flags_t io_file_get_flags(struct file *file); static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) { WARN_ON_ONCE(!test_bit(bit, table->bitmap)); __clear_bit(bit, table->bitmap); table->alloc_hint = bit; } static inline void io_file_bitmap_set(struct io_file_table *table, int bit) { WARN_ON_ONCE(test_bit(bit, table->bitmap)); __set_bit(bit, table->bitmap); table->alloc_hint = bit + 1; } static inline struct io_fixed_file * io_fixed_file_slot(struct io_file_table *table, unsigned i) { return &table->files[i]; } #define FFS_NOWAIT 0x1UL #define FFS_ISREG 0x2UL #define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) static inline unsigned int io_slot_flags(struct io_fixed_file *slot) { return (slot->file_ptr & ~FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT; } static inline struct file *io_slot_file(struct io_fixed_file *slot) { return (struct file *)(slot->file_ptr & FFS_MASK); } static inline struct file *io_file_from_index(struct io_file_table *table, int index) { return io_slot_file(io_fixed_file_slot(table, index)); } static inline void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file) { file_slot->file_ptr = (unsigned long)file | (io_file_get_flags(file) >> REQ_F_SUPPORT_NOWAIT_BIT); } static inline void io_reset_alloc_hint(struct io_ring_ctx *ctx) { ctx->file_table.alloc_hint = ctx->file_alloc_start; } static inline void io_file_table_set_alloc_range(struct io_ring_ctx *ctx, unsigned off, unsigned len) { ctx->file_alloc_start = off; ctx->file_alloc_end = off + len; io_reset_alloc_hint(ctx); } #endif |
| 16 17 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Virtio memory mapped device driver * * Copyright 2011-2014, ARM Ltd. * * This module allows virtio devices to be used over a virtual, memory mapped * platform device. * * The guest device(s) may be instantiated in one of three equivalent ways: * * 1. Static platform device in board's code, eg.: * * static struct platform_device v2m_virtio_device = { * .name = "virtio-mmio", * .id = -1, * .num_resources = 2, * .resource = (struct resource []) { * { * .start = 0x1001e000, * .end = 0x1001e0ff, * .flags = IORESOURCE_MEM, * }, { * .start = 42 + 32, * .end = 42 + 32, * .flags = IORESOURCE_IRQ, * }, * } * }; * * 2. Device Tree node, eg.: * * virtio_block@1e000 { * compatible = "virtio,mmio"; * reg = <0x1e000 0x100>; * interrupts = <42>; * } * * 3. Kernel module (or command line) parameter. Can be used more than once - * one device will be created for each one. Syntax: * * [virtio_mmio.]device=<size>@<baseaddr>:<irq>[:<id>] * where: * <size> := size (can use standard suffixes like K, M or G) * <baseaddr> := physical base address * <irq> := interrupt number (as passed to request_irq()) * <id> := (optional) platform device id * eg.: * virtio_mmio.device=0x100@0x100b0000:48 \ * virtio_mmio.device=1K@0x1001e000:74 * * Based on Virtio PCI driver by Anthony Liguori, copyright IBM Corp. 2007 */ #define pr_fmt(fmt) "virtio-mmio: " fmt #include <linux/acpi.h> #include <linux/dma-mapping.h> #include <linux/highmem.h> #include <linux/interrupt.h> #include <linux/io.h> #include <linux/list.h> #include <linux/module.h> #include <linux/of.h> #include <linux/platform_device.h> #include <linux/pm.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/virtio.h> #include <linux/virtio_config.h> #include <uapi/linux/virtio_mmio.h> #include <linux/virtio_ring.h> /* The alignment to use between consumer and producer parts of vring. * Currently hardcoded to the page size. */ #define VIRTIO_MMIO_VRING_ALIGN PAGE_SIZE #define to_virtio_mmio_device(_plat_dev) \ container_of(_plat_dev, struct virtio_mmio_device, vdev) struct virtio_mmio_device { struct virtio_device vdev; struct platform_device *pdev; void __iomem *base; unsigned long version; /* a list of queues so we can dispatch IRQs */ spinlock_t lock; struct list_head virtqueues; }; struct virtio_mmio_vq_info { /* the actual virtqueue */ struct virtqueue *vq; /* the list node for the virtqueues list */ struct list_head node; }; /* Configuration interface */ static u64 vm_get_features(struct virtio_device *vdev) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); u64 features; writel(1, vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES_SEL); features = readl(vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES); features <<= 32; writel(0, vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES_SEL); features |= readl(vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES); return features; } static int vm_finalize_features(struct virtio_device *vdev) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); /* Give virtio_ring a chance to accept features. */ vring_transport_features(vdev); /* Make sure there are no mixed devices */ if (vm_dev->version == 2 && !__virtio_test_bit(vdev, VIRTIO_F_VERSION_1)) { dev_err(&vdev->dev, "New virtio-mmio devices (version 2) must provide VIRTIO_F_VERSION_1 feature!\n"); return -EINVAL; } writel(1, vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES_SEL); writel((u32)(vdev->features >> 32), vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES); writel(0, vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES_SEL); writel((u32)vdev->features, vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES); return 0; } static void vm_get(struct virtio_device *vdev, unsigned int offset, void *buf, unsigned int len) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); void __iomem *base = vm_dev->base + VIRTIO_MMIO_CONFIG; u8 b; __le16 w; __le32 l; if (vm_dev->version == 1) { u8 *ptr = buf; int i; for (i = 0; i < len; i++) ptr[i] = readb(base + offset + i); return; } switch (len) { case 1: b = readb(base + offset); memcpy(buf, &b, sizeof b); break; case 2: w = cpu_to_le16(readw(base + offset)); memcpy(buf, &w, sizeof w); break; case 4: l = cpu_to_le32(readl(base + offset)); memcpy(buf, &l, sizeof l); break; case 8: l = cpu_to_le32(readl(base + offset)); memcpy(buf, &l, sizeof l); l = cpu_to_le32(ioread32(base + offset + sizeof l)); memcpy(buf + sizeof l, &l, sizeof l); break; default: BUG(); } } static void vm_set(struct virtio_device *vdev, unsigned int offset, const void *buf, unsigned int len) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); void __iomem *base = vm_dev->base + VIRTIO_MMIO_CONFIG; u8 b; __le16 w; __le32 l; if (vm_dev->version == 1) { const u8 *ptr = buf; int i; for (i = 0; i < len; i++) writeb(ptr[i], base + offset + i); return; } switch (len) { case 1: memcpy(&b, buf, sizeof b); writeb(b, base + offset); break; case 2: memcpy(&w, buf, sizeof w); writew(le16_to_cpu(w), base + offset); break; case 4: memcpy(&l, buf, sizeof l); writel(le32_to_cpu(l), base + offset); break; case 8: memcpy(&l, buf, sizeof l); writel(le32_to_cpu(l), base + offset); memcpy(&l, buf + sizeof l, sizeof l); writel(le32_to_cpu(l), base + offset + sizeof l); break; default: BUG(); } } static u32 vm_generation(struct virtio_device *vdev) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); if (vm_dev->version == 1) return 0; else return readl(vm_dev->base + VIRTIO_MMIO_CONFIG_GENERATION); } static u8 vm_get_status(struct virtio_device *vdev) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); return readl(vm_dev->base + VIRTIO_MMIO_STATUS) & 0xff; } static void vm_set_status(struct virtio_device *vdev, u8 status) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); /* We should never be setting status to 0. */ BUG_ON(status == 0); /* * Per memory-barriers.txt, wmb() is not needed to guarantee * that the cache coherent memory writes have completed * before writing to the MMIO region. */ writel(status, vm_dev->base + VIRTIO_MMIO_STATUS); } static void vm_reset(struct virtio_device *vdev) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); /* 0 status means a reset. */ writel(0, vm_dev->base + VIRTIO_MMIO_STATUS); } /* Transport interface */ /* the notify function used when creating a virt queue */ static bool vm_notify(struct virtqueue *vq) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); /* We write the queue's selector into the notification register to * signal the other end */ writel(vq->index, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY); return true; } static bool vm_notify_with_data(struct virtqueue *vq) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); u32 data = vring_notification_data(vq); writel(data, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY); return true; } /* Notify all virtqueues on an interrupt. */ static irqreturn_t vm_interrupt(int irq, void *opaque) { struct virtio_mmio_device *vm_dev = opaque; struct virtio_mmio_vq_info *info; unsigned long status; unsigned long flags; irqreturn_t ret = IRQ_NONE; /* Read and acknowledge interrupts */ status = readl(vm_dev->base + VIRTIO_MMIO_INTERRUPT_STATUS); writel(status, vm_dev->base + VIRTIO_MMIO_INTERRUPT_ACK); if (unlikely(status & VIRTIO_MMIO_INT_CONFIG)) { virtio_config_changed(&vm_dev->vdev); ret = IRQ_HANDLED; } if (likely(status & VIRTIO_MMIO_INT_VRING)) { spin_lock_irqsave(&vm_dev->lock, flags); list_for_each_entry(info, &vm_dev->virtqueues, node) ret |= vring_interrupt(irq, info->vq); spin_unlock_irqrestore(&vm_dev->lock, flags); } return ret; } static void vm_del_vq(struct virtqueue *vq) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); struct virtio_mmio_vq_info *info = vq->priv; unsigned long flags; unsigned int index = vq->index; spin_lock_irqsave(&vm_dev->lock, flags); list_del(&info->node); spin_unlock_irqrestore(&vm_dev->lock, flags); /* Select and deactivate the queue */ writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL); if (vm_dev->version == 1) { writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); } else { writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_READY); WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY)); } vring_del_virtqueue(vq); kfree(info); } static void vm_del_vqs(struct virtio_device *vdev) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); struct virtqueue *vq, *n; list_for_each_entry_safe(vq, n, &vdev->vqs, list) vm_del_vq(vq); free_irq(platform_get_irq(vm_dev->pdev, 0), vm_dev); } static void vm_synchronize_cbs(struct virtio_device *vdev) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); synchronize_irq(platform_get_irq(vm_dev->pdev, 0)); } static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int index, void (*callback)(struct virtqueue *vq), const char *name, bool ctx) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); bool (*notify)(struct virtqueue *vq); struct virtio_mmio_vq_info *info; struct virtqueue *vq; unsigned long flags; unsigned int num; int err; if (__virtio_test_bit(vdev, VIRTIO_F_NOTIFICATION_DATA)) notify = vm_notify_with_data; else notify = vm_notify; if (!name) return NULL; /* Select the queue we're interested in */ writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL); /* Queue shouldn't already be set up. */ if (readl(vm_dev->base + (vm_dev->version == 1 ? VIRTIO_MMIO_QUEUE_PFN : VIRTIO_MMIO_QUEUE_READY))) { err = -ENOENT; goto error_available; } /* Allocate and fill out our active queue description */ info = kmalloc(sizeof(*info), GFP_KERNEL); if (!info) { err = -ENOMEM; goto error_kmalloc; } num = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NUM_MAX); if (num == 0) { err = -ENOENT; goto error_new_virtqueue; } /* Create the vring */ vq = vring_create_virtqueue(index, num, VIRTIO_MMIO_VRING_ALIGN, vdev, true, true, ctx, notify, callback, name); if (!vq) { err = -ENOMEM; goto error_new_virtqueue; } vq->num_max = num; /* Activate the queue */ writel(virtqueue_get_vring_size(vq), vm_dev->base + VIRTIO_MMIO_QUEUE_NUM); if (vm_dev->version == 1) { u64 q_pfn = virtqueue_get_desc_addr(vq) >> PAGE_SHIFT; /* * virtio-mmio v1 uses a 32bit QUEUE PFN. If we have something * that doesn't fit in 32bit, fail the setup rather than * pretending to be successful. */ if (q_pfn >> 32) { dev_err(&vdev->dev, "platform bug: legacy virtio-mmio must not be used with RAM above 0x%llxGB\n", 0x1ULL << (32 + PAGE_SHIFT - 30)); err = -E2BIG; goto error_bad_pfn; } writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_QUEUE_ALIGN); writel(q_pfn, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); } else { u64 addr; addr = virtqueue_get_desc_addr(vq); writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_DESC_LOW); writel((u32)(addr >> 32), vm_dev->base + VIRTIO_MMIO_QUEUE_DESC_HIGH); addr = virtqueue_get_avail_addr(vq); writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_AVAIL_LOW); writel((u32)(addr >> 32), vm_dev->base + VIRTIO_MMIO_QUEUE_AVAIL_HIGH); addr = virtqueue_get_used_addr(vq); writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_USED_LOW); writel((u32)(addr >> 32), vm_dev->base + VIRTIO_MMIO_QUEUE_USED_HIGH); writel(1, vm_dev->base + VIRTIO_MMIO_QUEUE_READY); } vq->priv = info; info->vq = vq; spin_lock_irqsave(&vm_dev->lock, flags); list_add(&info->node, &vm_dev->virtqueues); spin_unlock_irqrestore(&vm_dev->lock, flags); return vq; error_bad_pfn: vring_del_virtqueue(vq); error_new_virtqueue: if (vm_dev->version == 1) { writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); } else { writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_READY); WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY)); } kfree(info); error_kmalloc: error_available: return ERR_PTR(err); } static int vm_find_vqs(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], const char * const names[], const bool *ctx, struct irq_affinity *desc) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); int irq = platform_get_irq(vm_dev->pdev, 0); int i, err, queue_idx = 0; if (irq < 0) return irq; err = request_irq(irq, vm_interrupt, IRQF_SHARED, dev_name(&vdev->dev), vm_dev); if (err) return err; if (of_property_read_bool(vm_dev->pdev->dev.of_node, "wakeup-source")) enable_irq_wake(irq); for (i = 0; i < nvqs; ++i) { if (!names[i]) { vqs[i] = NULL; continue; } vqs[i] = vm_setup_vq(vdev, queue_idx++, callbacks[i], names[i], ctx ? ctx[i] : false); if (IS_ERR(vqs[i])) { vm_del_vqs(vdev); return PTR_ERR(vqs[i]); } } return 0; } static const char *vm_bus_name(struct virtio_device *vdev) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); return vm_dev->pdev->name; } static bool vm_get_shm_region(struct virtio_device *vdev, struct virtio_shm_region *region, u8 id) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); u64 len, addr; /* Select the region we're interested in */ writel(id, vm_dev->base + VIRTIO_MMIO_SHM_SEL); /* Read the region size */ len = (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_LEN_LOW); len |= (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_LEN_HIGH) << 32; region->len = len; /* Check if region length is -1. If that's the case, the shared memory * region does not exist and there is no need to proceed further. */ if (len == ~(u64)0) return false; /* Read the region base address */ addr = (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_BASE_LOW); addr |= (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_BASE_HIGH) << 32; region->addr = addr; return true; } static const struct virtio_config_ops virtio_mmio_config_ops = { .get = vm_get, .set = vm_set, .generation = vm_generation, .get_status = vm_get_status, .set_status = vm_set_status, .reset = vm_reset, .find_vqs = vm_find_vqs, .del_vqs = vm_del_vqs, .get_features = vm_get_features, .finalize_features = vm_finalize_features, .bus_name = vm_bus_name, .get_shm_region = vm_get_shm_region, .synchronize_cbs = vm_synchronize_cbs, }; #ifdef CONFIG_PM_SLEEP static int virtio_mmio_freeze(struct device *dev) { struct virtio_mmio_device *vm_dev = dev_get_drvdata(dev); return virtio_device_freeze(&vm_dev->vdev); } static int virtio_mmio_restore(struct device *dev) { struct virtio_mmio_device *vm_dev = dev_get_drvdata(dev); if (vm_dev->version == 1) writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_GUEST_PAGE_SIZE); return virtio_device_restore(&vm_dev->vdev); } static const struct dev_pm_ops virtio_mmio_pm_ops = { SET_SYSTEM_SLEEP_PM_OPS(virtio_mmio_freeze, virtio_mmio_restore) }; #endif static void virtio_mmio_release_dev(struct device *_d) { struct virtio_device *vdev = container_of(_d, struct virtio_device, dev); struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); kfree(vm_dev); } /* Platform device */ static int virtio_mmio_probe(struct platform_device *pdev) { struct virtio_mmio_device *vm_dev; unsigned long magic; int rc; vm_dev = kzalloc(sizeof(*vm_dev), GFP_KERNEL); if (!vm_dev) return -ENOMEM; vm_dev->vdev.dev.parent = &pdev->dev; vm_dev->vdev.dev.release = virtio_mmio_release_dev; vm_dev->vdev.config = &virtio_mmio_config_ops; vm_dev->pdev = pdev; INIT_LIST_HEAD(&vm_dev->virtqueues); spin_lock_init(&vm_dev->lock); vm_dev->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(vm_dev->base)) { rc = PTR_ERR(vm_dev->base); goto free_vm_dev; } /* Check magic value */ magic = readl(vm_dev->base + VIRTIO_MMIO_MAGIC_VALUE); if (magic != ('v' | 'i' << 8 | 'r' << 16 | 't' << 24)) { dev_warn(&pdev->dev, "Wrong magic value 0x%08lx!\n", magic); rc = -ENODEV; goto free_vm_dev; } /* Check device version */ vm_dev->version = readl(vm_dev->base + VIRTIO_MMIO_VERSION); if (vm_dev->version < 1 || vm_dev->version > 2) { dev_err(&pdev->dev, "Version %ld not supported!\n", vm_dev->version); rc = -ENXIO; goto free_vm_dev; } vm_dev->vdev.id.device = readl(vm_dev->base + VIRTIO_MMIO_DEVICE_ID); if (vm_dev->vdev.id.device == 0) { /* * virtio-mmio device with an ID 0 is a (dummy) placeholder * with no function. End probing now with no error reported. */ rc = -ENODEV; goto free_vm_dev; } vm_dev->vdev.id.vendor = readl(vm_dev->base + VIRTIO_MMIO_VENDOR_ID); if (vm_dev->version == 1) { writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_GUEST_PAGE_SIZE); rc = dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)); /* * In the legacy case, ensure our coherently-allocated virtio * ring will be at an address expressable as a 32-bit PFN. */ if (!rc) dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32 + PAGE_SHIFT)); } else { rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); } if (rc) rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); if (rc) dev_warn(&pdev->dev, "Failed to enable 64-bit or 32-bit DMA. Trying to continue, but this might not work.\n"); platform_set_drvdata(pdev, vm_dev); rc = register_virtio_device(&vm_dev->vdev); if (rc) put_device(&vm_dev->vdev.dev); return rc; free_vm_dev: kfree(vm_dev); return rc; } static void virtio_mmio_remove(struct platform_device *pdev) { struct virtio_mmio_device *vm_dev = platform_get_drvdata(pdev); unregister_virtio_device(&vm_dev->vdev); } /* Devices list parameter */ #if defined(CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES) static struct device vm_cmdline_parent = { .init_name = "virtio-mmio-cmdline", }; static int vm_cmdline_parent_registered; static int vm_cmdline_id; static int vm_cmdline_set(const char *device, const struct kernel_param *kp) { int err; struct resource resources[2] = {}; char *str; long long base, size; unsigned int irq; int processed, consumed = 0; struct platform_device *pdev; /* Consume "size" part of the command line parameter */ size = memparse(device, &str); /* Get "@<base>:<irq>[:<id>]" chunks */ processed = sscanf(str, "@%lli:%u%n:%d%n", &base, &irq, &consumed, &vm_cmdline_id, &consumed); /* * sscanf() must process at least 2 chunks; also there * must be no extra characters after the last chunk, so * str[consumed] must be '\0' */ if (processed < 2 || str[consumed] || irq == 0) return -EINVAL; resources[0].flags = IORESOURCE_MEM; resources[0].start = base; resources[0].end = base + size - 1; resources[1].flags = IORESOURCE_IRQ; resources[1].start = resources[1].end = irq; if (!vm_cmdline_parent_registered) { err = device_register(&vm_cmdline_parent); if (err) { put_device(&vm_cmdline_parent); pr_err("Failed to register parent device!\n"); return err; } vm_cmdline_parent_registered = 1; } pr_info("Registering device virtio-mmio.%d at 0x%llx-0x%llx, IRQ %d.\n", vm_cmdline_id, (unsigned long long)resources[0].start, (unsigned long long)resources[0].end, (int)resources[1].start); pdev = platform_device_register_resndata(&vm_cmdline_parent, "virtio-mmio", vm_cmdline_id++, resources, ARRAY_SIZE(resources), NULL, 0); return PTR_ERR_OR_ZERO(pdev); } static int vm_cmdline_get_device(struct device *dev, void *data) { char *buffer = data; unsigned int len = strlen(buffer); struct platform_device *pdev = to_platform_device(dev); snprintf(buffer + len, PAGE_SIZE - len, "0x%llx@0x%llx:%llu:%d\n", pdev->resource[0].end - pdev->resource[0].start + 1ULL, (unsigned long long)pdev->resource[0].start, (unsigned long long)pdev->resource[1].start, pdev->id); return 0; } static int vm_cmdline_get(char *buffer, const struct kernel_param *kp) { buffer[0] = '\0'; device_for_each_child(&vm_cmdline_parent, buffer, vm_cmdline_get_device); return strlen(buffer) + 1; } static const struct kernel_param_ops vm_cmdline_param_ops = { .set = vm_cmdline_set, .get = vm_cmdline_get, }; device_param_cb(device, &vm_cmdline_param_ops, NULL, S_IRUSR); static int vm_unregister_cmdline_device(struct device *dev, void *data) { platform_device_unregister(to_platform_device(dev)); return 0; } static void vm_unregister_cmdline_devices(void) { if (vm_cmdline_parent_registered) { device_for_each_child(&vm_cmdline_parent, NULL, vm_unregister_cmdline_device); device_unregister(&vm_cmdline_parent); vm_cmdline_parent_registered = 0; } } #else static void vm_unregister_cmdline_devices(void) { } #endif /* Platform driver */ static const struct of_device_id virtio_mmio_match[] = { { .compatible = "virtio,mmio", }, {}, }; MODULE_DEVICE_TABLE(of, virtio_mmio_match); #ifdef CONFIG_ACPI static const struct acpi_device_id virtio_mmio_acpi_match[] = { { "LNRO0005", }, { } }; MODULE_DEVICE_TABLE(acpi, virtio_mmio_acpi_match); #endif static struct platform_driver virtio_mmio_driver = { .probe = virtio_mmio_probe, .remove_new = virtio_mmio_remove, .driver = { .name = "virtio-mmio", .of_match_table = virtio_mmio_match, .acpi_match_table = ACPI_PTR(virtio_mmio_acpi_match), #ifdef CONFIG_PM_SLEEP .pm = &virtio_mmio_pm_ops, #endif }, }; static int __init virtio_mmio_init(void) { return platform_driver_register(&virtio_mmio_driver); } static void __exit virtio_mmio_exit(void) { platform_driver_unregister(&virtio_mmio_driver); vm_unregister_cmdline_devices(); } module_init(virtio_mmio_init); module_exit(virtio_mmio_exit); MODULE_AUTHOR("Pawel Moll <pawel.moll@arm.com>"); MODULE_DESCRIPTION("Platform bus driver for memory mapped virtio devices"); MODULE_LICENSE("GPL"); |
| 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 | // SPDX-License-Identifier: GPL-2.0 /* Shared Memory Communications Direct over ISM devices (SMC-D) * * Functions for ISM device. * * Copyright IBM Corp. 2018 */ #include <linux/if_vlan.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/slab.h> #include <asm/page.h> #include "smc.h" #include "smc_core.h" #include "smc_ism.h" #include "smc_pnet.h" #include "smc_netlink.h" #include "linux/ism.h" struct smcd_dev_list smcd_dev_list = { .list = LIST_HEAD_INIT(smcd_dev_list.list), .mutex = __MUTEX_INITIALIZER(smcd_dev_list.mutex) }; static bool smc_ism_v2_capable; static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN]; #if IS_ENABLED(CONFIG_ISM) static void smcd_register_dev(struct ism_dev *ism); static void smcd_unregister_dev(struct ism_dev *ism); static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, u16 dmbemask); static struct ism_client smc_ism_client = { .name = "SMC-D", .add = smcd_register_dev, .remove = smcd_unregister_dev, .handle_event = smcd_handle_event, .handle_irq = smcd_handle_irq, }; #endif static void smc_ism_create_system_eid(void) { struct smc_ism_seid *seid = (struct smc_ism_seid *)smc_ism_v2_system_eid; #if IS_ENABLED(CONFIG_S390) struct cpuid id; u16 ident_tail; char tmp[5]; memcpy(seid->seid_string, "IBM-SYSZ-ISMSEID00000000", 24); get_cpu_id(&id); ident_tail = (u16)(id.ident & SMC_ISM_IDENT_MASK); snprintf(tmp, 5, "%04X", ident_tail); memcpy(seid->serial_number, tmp, 4); snprintf(tmp, 5, "%04X", id.machine); memcpy(seid->type, tmp, 4); #else memset(seid, 0, SMC_MAX_EID_LEN); #endif } /* Test if an ISM communication is possible - same CPC */ int smc_ism_cantalk(struct smcd_gid *peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) { return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0, vlan_id); } void smc_ism_get_system_eid(u8 **eid) { if (!smc_ism_v2_capable) *eid = NULL; else *eid = smc_ism_v2_system_eid; } u16 smc_ism_get_chid(struct smcd_dev *smcd) { return smcd->ops->get_chid(smcd); } /* HW supports ISM V2 and thus System EID is defined */ bool smc_ism_is_v2_capable(void) { return smc_ism_v2_capable; } void smc_ism_set_v2_capable(void) { smc_ism_v2_capable = true; } /* Set a connection using this DMBE. */ void smc_ism_set_conn(struct smc_connection *conn) { unsigned long flags; spin_lock_irqsave(&conn->lgr->smcd->lock, flags); conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = conn; spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags); } /* Unset a connection using this DMBE. */ void smc_ism_unset_conn(struct smc_connection *conn) { unsigned long flags; if (!conn->rmb_desc) return; spin_lock_irqsave(&conn->lgr->smcd->lock, flags); conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = NULL; spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags); } /* Register a VLAN identifier with the ISM device. Use a reference count * and add a VLAN identifier only when the first DMB using this VLAN is * registered. */ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) { struct smc_ism_vlanid *new_vlan, *vlan; unsigned long flags; int rc = 0; if (!vlanid) /* No valid vlan id */ return -EINVAL; if (!smcd->ops->add_vlan_id) return -EOPNOTSUPP; /* create new vlan entry, in case we need it */ new_vlan = kzalloc(sizeof(*new_vlan), GFP_KERNEL); if (!new_vlan) return -ENOMEM; new_vlan->vlanid = vlanid; refcount_set(&new_vlan->refcnt, 1); /* if there is an existing entry, increase count and return */ spin_lock_irqsave(&smcd->lock, flags); list_for_each_entry(vlan, &smcd->vlan, list) { if (vlan->vlanid == vlanid) { refcount_inc(&vlan->refcnt); kfree(new_vlan); goto out; } } /* no existing entry found. * add new entry to device; might fail, e.g., if HW limit reached */ if (smcd->ops->add_vlan_id(smcd, vlanid)) { kfree(new_vlan); rc = -EIO; goto out; } list_add_tail(&new_vlan->list, &smcd->vlan); out: spin_unlock_irqrestore(&smcd->lock, flags); return rc; } /* Unregister a VLAN identifier with the ISM device. Use a reference count * and remove a VLAN identifier only when the last DMB using this VLAN is * unregistered. */ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) { struct smc_ism_vlanid *vlan; unsigned long flags; bool found = false; int rc = 0; if (!vlanid) /* No valid vlan id */ return -EINVAL; if (!smcd->ops->del_vlan_id) return -EOPNOTSUPP; spin_lock_irqsave(&smcd->lock, flags); list_for_each_entry(vlan, &smcd->vlan, list) { if (vlan->vlanid == vlanid) { if (!refcount_dec_and_test(&vlan->refcnt)) goto out; found = true; break; } } if (!found) { rc = -ENOENT; goto out; /* VLAN id not in table */ } /* Found and the last reference just gone */ if (smcd->ops->del_vlan_id(smcd, vlanid)) rc = -EIO; list_del(&vlan->list); kfree(vlan); out: spin_unlock_irqrestore(&smcd->lock, flags); return rc; } int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) { struct smcd_dmb dmb; int rc = 0; if (!dmb_desc->dma_addr) return rc; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = dmb_desc->token; dmb.sba_idx = dmb_desc->sba_idx; dmb.cpu_addr = dmb_desc->cpu_addr; dmb.dma_addr = dmb_desc->dma_addr; dmb.dmb_len = dmb_desc->len; rc = smcd->ops->unregister_dmb(smcd, &dmb); if (!rc || rc == ISM_ERROR) { dmb_desc->cpu_addr = NULL; dmb_desc->dma_addr = 0; } return rc; } int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, struct smc_buf_desc *dmb_desc) { struct smcd_dmb dmb; int rc; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_len = dmb_len; dmb.sba_idx = dmb_desc->sba_idx; dmb.vlan_id = lgr->vlan_id; dmb.rgid = lgr->peer_gid.gid; rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, lgr->smcd->client); if (!rc) { dmb_desc->sba_idx = dmb.sba_idx; dmb_desc->token = dmb.dmb_tok; dmb_desc->cpu_addr = dmb.cpu_addr; dmb_desc->dma_addr = dmb.dma_addr; dmb_desc->len = dmb.dmb_len; } return rc; } bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd) { /* for now only loopback-ism supports * merging sndbuf with peer DMB to avoid * data copies between them. */ return (smcd->ops->support_dmb_nocopy && smcd->ops->support_dmb_nocopy(smcd)); } int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, struct smc_buf_desc *dmb_desc) { struct smcd_dmb dmb; int rc = 0; if (!dev->ops->attach_dmb) return -EINVAL; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = token; rc = dev->ops->attach_dmb(dev, &dmb); if (!rc) { dmb_desc->sba_idx = dmb.sba_idx; dmb_desc->token = dmb.dmb_tok; dmb_desc->cpu_addr = dmb.cpu_addr; dmb_desc->dma_addr = dmb.dma_addr; dmb_desc->len = dmb.dmb_len; } return rc; } int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token) { if (!dev->ops->detach_dmb) return -EINVAL; return dev->ops->detach_dmb(dev, token); } static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, struct sk_buff *skb, struct netlink_callback *cb) { char smc_pnet[SMC_MAX_PNETID_LEN + 1]; struct smc_pci_dev smc_pci_dev; struct nlattr *port_attrs; struct nlattr *attrs; struct ism_dev *ism; int use_cnt = 0; void *nlh; ism = smcd->priv; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_DEV_SMCD); if (!nlh) goto errmsg; attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCD); if (!attrs) goto errout; use_cnt = atomic_read(&smcd->lgr_cnt); if (nla_put_u32(skb, SMC_NLA_DEV_USE_CNT, use_cnt)) goto errattr; if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0)) goto errattr; memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); smc_set_pci_values(to_pci_dev(ism->dev.parent), &smc_pci_dev); if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev.pci_vendor)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev.pci_device)) goto errattr; if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev.pci_id)) goto errattr; port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT); if (!port_attrs) goto errattr; if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR, smcd->pnetid_by_user)) goto errportattr; memcpy(smc_pnet, smcd->pnetid, SMC_MAX_PNETID_LEN); smc_pnet[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet)) goto errportattr; nla_nest_end(skb, port_attrs); nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); return 0; errportattr: nla_nest_cancel(skb, port_attrs); errattr: nla_nest_cancel(skb, attrs); errout: nlmsg_cancel(skb, nlh); errmsg: return -EMSGSIZE; } static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list, struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); int snum = cb_ctx->pos[0]; struct smcd_dev *smcd; int num = 0; mutex_lock(&dev_list->mutex); list_for_each_entry(smcd, &dev_list->list, list) { if (num < snum) goto next; if (smc_ism_is_loopback(smcd)) goto next; if (smc_nl_handle_smcd_dev(smcd, skb, cb)) goto errout; next: num++; } errout: mutex_unlock(&dev_list->mutex); cb_ctx->pos[0] = num; } int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) { smc_nl_prep_smcd_dev(&smcd_dev_list, skb, cb); return skb->len; } #if IS_ENABLED(CONFIG_ISM) struct smc_ism_event_work { struct work_struct work; struct smcd_dev *smcd; struct ism_event event; }; #define ISM_EVENT_REQUEST 0x0001 #define ISM_EVENT_RESPONSE 0x0002 #define ISM_EVENT_REQUEST_IR 0x00000001 #define ISM_EVENT_CODE_SHUTDOWN 0x80 #define ISM_EVENT_CODE_TESTLINK 0x83 union smcd_sw_event_info { u64 info; struct { u8 uid[SMC_LGR_ID_SIZE]; unsigned short vlan_id; u16 code; }; }; static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) { struct smcd_gid peer_gid = { .gid = wrk->event.tok, .gid_ext = 0 }; union smcd_sw_event_info ev_info; ev_info.info = wrk->event.info; switch (wrk->event.code) { case ISM_EVENT_CODE_SHUTDOWN: /* Peer shut down DMBs */ smc_smcd_terminate(wrk->smcd, &peer_gid, ev_info.vlan_id); break; case ISM_EVENT_CODE_TESTLINK: /* Activity timer */ if (ev_info.code == ISM_EVENT_REQUEST && wrk->smcd->ops->signal_event) { ev_info.code = ISM_EVENT_RESPONSE; wrk->smcd->ops->signal_event(wrk->smcd, &peer_gid, ISM_EVENT_REQUEST_IR, ISM_EVENT_CODE_TESTLINK, ev_info.info); } break; } } /* worker for SMC-D events */ static void smc_ism_event_work(struct work_struct *work) { struct smc_ism_event_work *wrk = container_of(work, struct smc_ism_event_work, work); struct smcd_gid smcd_gid = { .gid = wrk->event.tok, .gid_ext = 0 }; switch (wrk->event.type) { case ISM_EVENT_GID: /* GID event, token is peer GID */ smc_smcd_terminate(wrk->smcd, &smcd_gid, VLAN_VID_MASK); break; case ISM_EVENT_DMB: break; case ISM_EVENT_SWR: /* Software defined event */ smcd_handle_sw_event(wrk); break; } kfree(wrk); } static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, const struct smcd_ops *ops, int max_dmbs) { struct smcd_dev *smcd; smcd = devm_kzalloc(parent, sizeof(*smcd), GFP_KERNEL); if (!smcd) return NULL; smcd->conn = devm_kcalloc(parent, max_dmbs, sizeof(struct smc_connection *), GFP_KERNEL); if (!smcd->conn) return NULL; smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", WQ_MEM_RECLAIM, name); if (!smcd->event_wq) return NULL; smcd->ops = ops; spin_lock_init(&smcd->lock); spin_lock_init(&smcd->lgr_lock); INIT_LIST_HEAD(&smcd->vlan); INIT_LIST_HEAD(&smcd->lgr_list); init_waitqueue_head(&smcd->lgrs_deleted); return smcd; } static void smcd_register_dev(struct ism_dev *ism) { const struct smcd_ops *ops = ism_get_smcd_ops(); struct smcd_dev *smcd, *fentry; if (!ops) return; smcd = smcd_alloc_dev(&ism->pdev->dev, dev_name(&ism->pdev->dev), ops, ISM_NR_DMBS); if (!smcd) return; smcd->priv = ism; smcd->client = &smc_ism_client; ism_set_priv(ism, &smc_ism_client, smcd); if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) smc_pnetid_by_table_smcd(smcd); if (smcd->ops->supports_v2()) smc_ism_set_v2_capable(); mutex_lock(&smcd_dev_list.mutex); /* sort list: * - devices without pnetid before devices with pnetid; * - loopback-ism always at the very beginning; */ if (!smcd->pnetid[0]) { fentry = list_first_entry_or_null(&smcd_dev_list.list, struct smcd_dev, list); if (fentry && smc_ism_is_loopback(fentry)) list_add(&smcd->list, &fentry->list); else list_add(&smcd->list, &smcd_dev_list.list); } else { list_add_tail(&smcd->list, &smcd_dev_list.list); } mutex_unlock(&smcd_dev_list.mutex); pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", dev_name(&ism->dev), smcd->pnetid, smcd->pnetid_by_user ? " (user defined)" : ""); return; } static void smcd_unregister_dev(struct ism_dev *ism) { struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); pr_warn_ratelimited("smc: removing smcd device %s\n", dev_name(&ism->dev)); smcd->going_away = 1; smc_smcd_terminate_all(smcd); mutex_lock(&smcd_dev_list.mutex); list_del_init(&smcd->list); mutex_unlock(&smcd_dev_list.mutex); destroy_workqueue(smcd->event_wq); } /* SMCD Device event handler. Called from ISM device interrupt handler. * Parameters are ism device pointer, * - event->type (0 --> DMB, 1 --> GID), * - event->code (event code), * - event->tok (either DMB token when event type 0, or GID when event type 1) * - event->time (time of day) * - event->info (debug info). * * Context: * - Function called in IRQ context from ISM device driver event handler. */ static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) { struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); struct smc_ism_event_work *wrk; if (smcd->going_away) return; /* copy event to event work queue, and let it be handled there */ wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); if (!wrk) return; INIT_WORK(&wrk->work, smc_ism_event_work); wrk->smcd = smcd; wrk->event = *event; queue_work(smcd->event_wq, &wrk->work); } /* SMCD Device interrupt handler. Called from ISM device interrupt handler. * Parameters are the ism device pointer, DMB number, and the DMBE bitmask. * Find the connection and schedule the tasklet for this connection. * * Context: * - Function called in IRQ context from ISM device driver IRQ handler. */ static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, u16 dmbemask) { struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); struct smc_connection *conn = NULL; unsigned long flags; spin_lock_irqsave(&smcd->lock, flags); conn = smcd->conn[dmbno]; if (conn && !conn->killed) tasklet_schedule(&conn->rx_tsklet); spin_unlock_irqrestore(&smcd->lock, flags); } #endif int smc_ism_signal_shutdown(struct smc_link_group *lgr) { int rc = 0; #if IS_ENABLED(CONFIG_ISM) union smcd_sw_event_info ev_info; if (lgr->peer_shutdown) return 0; if (!lgr->smcd->ops->signal_event) return 0; memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); ev_info.vlan_id = lgr->vlan_id; ev_info.code = ISM_EVENT_REQUEST; rc = lgr->smcd->ops->signal_event(lgr->smcd, &lgr->peer_gid, ISM_EVENT_REQUEST_IR, ISM_EVENT_CODE_SHUTDOWN, ev_info.info); #endif return rc; } int smc_ism_init(void) { int rc = 0; smc_ism_v2_capable = false; smc_ism_create_system_eid(); #if IS_ENABLED(CONFIG_ISM) rc = ism_register_client(&smc_ism_client); #endif return rc; } void smc_ism_exit(void) { #if IS_ENABLED(CONFIG_ISM) ism_unregister_client(&smc_ism_client); #endif } |
| 1 1 1 1 1 2 2 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 | // SPDX-License-Identifier: GPL-2.0-only /* * Transparent proxy support for Linux/iptables * * Copyright (c) 2006-2010 BalaBit IT Ltd. * Author: Balazs Scheidler, Krisztian Kovacs */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <net/checksum.h> #include <net/udp.h> #include <net/tcp.h> #include <net/inet_sock.h> #include <net/inet_hashtables.h> #include <linux/inetdevice.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <net/netfilter/ipv4/nf_defrag_ipv4.h> #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) #define XT_TPROXY_HAVE_IPV6 1 #include <net/if_inet6.h> #include <net/addrconf.h> #include <net/inet6_hashtables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #endif #include <net/netfilter/nf_tproxy.h> #include <linux/netfilter/xt_TPROXY.h> static unsigned int tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, u_int32_t mark_mask, u_int32_t mark_value) { const struct iphdr *iph = ip_hdr(skb); struct udphdr _hdr, *hp; struct sock *sk; hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr); if (hp == NULL) return NF_DROP; /* check if there's an ongoing connection on the packet * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol, iph->saddr, iph->daddr, hp->source, hp->dest, skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED); laddr = nf_tproxy_laddr4(skb, laddr, iph->daddr); if (!lport) lport = hp->dest; /* UDP has no TCP_TIME_WAIT state, so we never enter here */ if (sk && sk->sk_state == TCP_TIME_WAIT) /* reopening a TIME_WAIT connection needs special handling */ sk = nf_tproxy_handle_time_wait4(net, skb, laddr, lport, sk); else if (!sk) /* no, there's no established connection, check if * there's a listener on the redirected addr/port */ sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol, iph->saddr, laddr, hp->source, lport, skb->dev, NF_TPROXY_LOOKUP_LISTENER); /* NOTE: assign_sock consumes our sk reference */ if (sk && nf_tproxy_sk_is_transparent(sk)) { /* This should be in a separate target, but we don't do multiple targets on the same rule yet */ skb->mark = (skb->mark & ~mark_mask) ^ mark_value; nf_tproxy_assign_sock(skb, sk); return NF_ACCEPT; } return NF_DROP; } static unsigned int tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tproxy_target_info *tgi = par->targinfo; return tproxy_tg4(xt_net(par), skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value); } static unsigned int tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; return tproxy_tg4(xt_net(par), skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value); } #ifdef XT_TPROXY_HAVE_IPV6 static unsigned int tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct ipv6hdr *iph = ipv6_hdr(skb); const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; struct udphdr _hdr, *hp; struct sock *sk; const struct in6_addr *laddr; __be16 lport; int thoff = 0; int tproto; tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); if (tproto < 0) return NF_DROP; hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr); if (!hp) return NF_DROP; /* check if there's an ongoing connection on the packet * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, tproto, &iph->saddr, &iph->daddr, hp->source, hp->dest, xt_in(par), NF_TPROXY_LOOKUP_ESTABLISHED); laddr = nf_tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr); lport = tgi->lport ? tgi->lport : hp->dest; /* UDP has no TCP_TIME_WAIT state, so we never enter here */ if (sk && sk->sk_state == TCP_TIME_WAIT) { const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; /* reopening a TIME_WAIT connection needs special handling */ sk = nf_tproxy_handle_time_wait6(skb, tproto, thoff, xt_net(par), &tgi->laddr.in6, tgi->lport, sk); } else if (!sk) /* no there's no established connection, check if * there's a listener on the redirected addr/port */ sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, tproto, &iph->saddr, laddr, hp->source, lport, xt_in(par), NF_TPROXY_LOOKUP_LISTENER); /* NOTE: assign_sock consumes our sk reference */ if (sk && nf_tproxy_sk_is_transparent(sk)) { /* This should be in a separate target, but we don't do multiple targets on the same rule yet */ skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value; nf_tproxy_assign_sock(skb, sk); return NF_ACCEPT; } return NF_DROP; } static int tproxy_tg6_check(const struct xt_tgchk_param *par) { const struct ip6t_ip6 *i = par->entryinfo; int err; err = nf_defrag_ipv6_enable(par->net); if (err) return err; if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) && !(i->invflags & IP6T_INV_PROTO)) return 0; pr_info_ratelimited("Can be used only with -p tcp or -p udp\n"); return -EINVAL; } static void tproxy_tg6_destroy(const struct xt_tgdtor_param *par) { nf_defrag_ipv6_disable(par->net); } #endif static int tproxy_tg4_check(const struct xt_tgchk_param *par) { const struct ipt_ip *i = par->entryinfo; int err; err = nf_defrag_ipv4_enable(par->net); if (err) return err; if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) && !(i->invflags & IPT_INV_PROTO)) return 0; pr_info_ratelimited("Can be used only with -p tcp or -p udp\n"); return -EINVAL; } static void tproxy_tg4_destroy(const struct xt_tgdtor_param *par) { nf_defrag_ipv4_disable(par->net); } static struct xt_target tproxy_tg_reg[] __read_mostly = { { .name = "TPROXY", .family = NFPROTO_IPV4, .table = "mangle", .target = tproxy_tg4_v0, .revision = 0, .targetsize = sizeof(struct xt_tproxy_target_info), .checkentry = tproxy_tg4_check, .destroy = tproxy_tg4_destroy, .hooks = 1 << NF_INET_PRE_ROUTING, .me = THIS_MODULE, }, { .name = "TPROXY", .family = NFPROTO_IPV4, .table = "mangle", .target = tproxy_tg4_v1, .revision = 1, .targetsize = sizeof(struct xt_tproxy_target_info_v1), .checkentry = tproxy_tg4_check, .destroy = tproxy_tg4_destroy, .hooks = 1 << NF_INET_PRE_ROUTING, .me = THIS_MODULE, }, #ifdef XT_TPROXY_HAVE_IPV6 { .name = "TPROXY", .family = NFPROTO_IPV6, .table = "mangle", .target = tproxy_tg6_v1, .revision = 1, .targetsize = sizeof(struct xt_tproxy_target_info_v1), .checkentry = tproxy_tg6_check, .destroy = tproxy_tg6_destroy, .hooks = 1 << NF_INET_PRE_ROUTING, .me = THIS_MODULE, }, #endif }; static int __init tproxy_tg_init(void) { return xt_register_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg)); } static void __exit tproxy_tg_exit(void) { xt_unregister_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg)); } module_init(tproxy_tg_init); module_exit(tproxy_tg_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs"); MODULE_DESCRIPTION("Netfilter transparent proxy (TPROXY) target module."); MODULE_ALIAS("ipt_TPROXY"); MODULE_ALIAS("ip6t_TPROXY"); |
| 4 4 4 4 4 3 3 3 3 3 2 2 1 1 1 1 4 2 2 2 1 1 1 1 1 1 1 2 2 2 2 2 2 2 5 6 6 4 4 3 3 2 1 1 1 1 2 6 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 5 4 4 3 3 1 3 2 1 3 1 3 4 5 1 1 1 1 1 1 1 1 1 3 3 2 2 1 1 1 1 1 3 1 1 1 1 1 1 1 4 4 4 4 4 4 4 4 4 5 5 4 1 1 3 4 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Intel Corporation. All rights reserved. */ #define pr_fmt(fmt) "llcp: %s: " fmt, __func__ #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/nfc.h> #include <linux/sched/signal.h> #include "nfc.h" #include "llcp.h" static int sock_wait_state(struct sock *sk, int state, unsigned long timeo) { DECLARE_WAITQUEUE(wait, current); int err = 0; pr_debug("sk %p", sk); add_wait_queue(sk_sleep(sk), &wait); set_current_state(TASK_INTERRUPTIBLE); while (sk->sk_state != state) { if (!timeo) { err = -EINPROGRESS; break; } if (signal_pending(current)) { err = sock_intr_errno(timeo); break; } release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); set_current_state(TASK_INTERRUPTIBLE); err = sock_error(sk); if (err) break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return err; } static struct proto llcp_sock_proto = { .name = "NFC_LLCP", .owner = THIS_MODULE, .obj_size = sizeof(struct nfc_llcp_sock), }; static int llcp_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); struct nfc_llcp_local *local; struct nfc_dev *dev; struct sockaddr_nfc_llcp llcp_addr; int len, ret = 0; if (!addr || alen < offsetofend(struct sockaddr, sa_family) || addr->sa_family != AF_NFC) return -EINVAL; pr_debug("sk %p addr %p family %d\n", sk, addr, addr->sa_family); memset(&llcp_addr, 0, sizeof(llcp_addr)); len = min_t(unsigned int, sizeof(llcp_addr), alen); memcpy(&llcp_addr, addr, len); /* This is going to be a listening socket, dsap must be 0 */ if (llcp_addr.dsap != 0) return -EINVAL; lock_sock(sk); if (sk->sk_state != LLCP_CLOSED) { ret = -EBADFD; goto error; } dev = nfc_get_device(llcp_addr.dev_idx); if (dev == NULL) { ret = -ENODEV; goto error; } local = nfc_llcp_find_local(dev); if (local == NULL) { ret = -ENODEV; goto put_dev; } llcp_sock->dev = dev; llcp_sock->local = local; llcp_sock->nfc_protocol = llcp_addr.nfc_protocol; llcp_sock->service_name_len = min_t(unsigned int, llcp_addr.service_name_len, NFC_LLCP_MAX_SERVICE_NAME); llcp_sock->service_name = kmemdup(llcp_addr.service_name, llcp_sock->service_name_len, GFP_KERNEL); if (!llcp_sock->service_name) { ret = -ENOMEM; goto sock_llcp_put_local; } llcp_sock->ssap = nfc_llcp_get_sdp_ssap(local, llcp_sock); if (llcp_sock->ssap == LLCP_SAP_MAX) { ret = -EADDRINUSE; goto free_service_name; } llcp_sock->reserved_ssap = llcp_sock->ssap; nfc_llcp_sock_link(&local->sockets, sk); pr_debug("Socket bound to SAP %d\n", llcp_sock->ssap); sk->sk_state = LLCP_BOUND; nfc_put_device(dev); release_sock(sk); return 0; free_service_name: kfree(llcp_sock->service_name); llcp_sock->service_name = NULL; sock_llcp_put_local: nfc_llcp_local_put(llcp_sock->local); llcp_sock->local = NULL; llcp_sock->dev = NULL; put_dev: nfc_put_device(dev); error: release_sock(sk); return ret; } static int llcp_raw_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); struct nfc_llcp_local *local; struct nfc_dev *dev; struct sockaddr_nfc_llcp llcp_addr; int len, ret = 0; if (!addr || alen < offsetofend(struct sockaddr, sa_family) || addr->sa_family != AF_NFC) return -EINVAL; pr_debug("sk %p addr %p family %d\n", sk, addr, addr->sa_family); memset(&llcp_addr, 0, sizeof(llcp_addr)); len = min_t(unsigned int, sizeof(llcp_addr), alen); memcpy(&llcp_addr, addr, len); lock_sock(sk); if (sk->sk_state != LLCP_CLOSED) { ret = -EBADFD; goto error; } dev = nfc_get_device(llcp_addr.dev_idx); if (dev == NULL) { ret = -ENODEV; goto error; } local = nfc_llcp_find_local(dev); if (local == NULL) { ret = -ENODEV; goto put_dev; } llcp_sock->dev = dev; llcp_sock->local = local; llcp_sock->nfc_protocol = llcp_addr.nfc_protocol; nfc_llcp_sock_link(&local->raw_sockets, sk); sk->sk_state = LLCP_BOUND; put_dev: nfc_put_device(dev); error: release_sock(sk); return ret; } static int llcp_sock_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int ret = 0; pr_debug("sk %p backlog %d\n", sk, backlog); lock_sock(sk); if ((sock->type != SOCK_SEQPACKET && sock->type != SOCK_STREAM) || sk->sk_state != LLCP_BOUND) { ret = -EBADFD; goto error; } sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; pr_debug("Socket listening\n"); sk->sk_state = LLCP_LISTEN; error: release_sock(sk); return ret; } static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); u32 opt; int err = 0; pr_debug("%p optname %d\n", sk, optname); if (level != SOL_NFC) return -ENOPROTOOPT; lock_sock(sk); switch (optname) { case NFC_LLCP_RW: if (sk->sk_state == LLCP_CONNECTED || sk->sk_state == LLCP_BOUND || sk->sk_state == LLCP_LISTEN) { err = -EINVAL; break; } err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt > LLCP_MAX_RW) { err = -EINVAL; break; } llcp_sock->rw = (u8) opt; break; case NFC_LLCP_MIUX: if (sk->sk_state == LLCP_CONNECTED || sk->sk_state == LLCP_BOUND || sk->sk_state == LLCP_LISTEN) { err = -EINVAL; break; } err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt > LLCP_MAX_MIUX) { err = -EINVAL; break; } llcp_sock->miux = cpu_to_be16((u16) opt); break; default: err = -ENOPROTOOPT; break; } release_sock(sk); pr_debug("%p rw %d miux %d\n", llcp_sock, llcp_sock->rw, llcp_sock->miux); return err; } static int nfc_llcp_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct nfc_llcp_local *local; struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); int len, err = 0; u16 miux, remote_miu; u8 rw; pr_debug("%p optname %d\n", sk, optname); if (level != SOL_NFC) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; local = llcp_sock->local; if (!local) return -ENODEV; len = min_t(u32, len, sizeof(u32)); lock_sock(sk); switch (optname) { case NFC_LLCP_RW: rw = llcp_sock->rw > LLCP_MAX_RW ? local->rw : llcp_sock->rw; if (put_user(rw, (u32 __user *) optval)) err = -EFAULT; break; case NFC_LLCP_MIUX: miux = be16_to_cpu(llcp_sock->miux) > LLCP_MAX_MIUX ? be16_to_cpu(local->miux) : be16_to_cpu(llcp_sock->miux); if (put_user(miux, (u32 __user *) optval)) err = -EFAULT; break; case NFC_LLCP_REMOTE_MIU: remote_miu = llcp_sock->remote_miu > LLCP_MAX_MIU ? local->remote_miu : llcp_sock->remote_miu; if (put_user(remote_miu, (u32 __user *) optval)) err = -EFAULT; break; case NFC_LLCP_REMOTE_LTO: if (put_user(local->remote_lto / 10, (u32 __user *) optval)) err = -EFAULT; break; case NFC_LLCP_REMOTE_RW: if (put_user(llcp_sock->remote_rw, (u32 __user *) optval)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); if (put_user(len, optlen)) return -EFAULT; return err; } void nfc_llcp_accept_unlink(struct sock *sk) { struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); pr_debug("state %d\n", sk->sk_state); list_del_init(&llcp_sock->accept_queue); sk_acceptq_removed(llcp_sock->parent); llcp_sock->parent = NULL; sock_put(sk); } void nfc_llcp_accept_enqueue(struct sock *parent, struct sock *sk) { struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); struct nfc_llcp_sock *llcp_sock_parent = nfc_llcp_sock(parent); /* Lock will be free from unlink */ sock_hold(sk); list_add_tail(&llcp_sock->accept_queue, &llcp_sock_parent->accept_queue); llcp_sock->parent = parent; sk_acceptq_added(parent); } struct sock *nfc_llcp_accept_dequeue(struct sock *parent, struct socket *newsock) { struct nfc_llcp_sock *lsk, *n, *llcp_parent; struct sock *sk; llcp_parent = nfc_llcp_sock(parent); list_for_each_entry_safe(lsk, n, &llcp_parent->accept_queue, accept_queue) { sk = &lsk->sk; lock_sock(sk); if (sk->sk_state == LLCP_CLOSED) { release_sock(sk); nfc_llcp_accept_unlink(sk); continue; } if (sk->sk_state == LLCP_CONNECTED || !newsock) { list_del_init(&lsk->accept_queue); sock_put(sk); if (newsock) sock_graft(sk, newsock); release_sock(sk); pr_debug("Returning sk state %d\n", sk->sk_state); sk_acceptq_removed(parent); return sk; } release_sock(sk); } return NULL; } static int llcp_sock_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { DECLARE_WAITQUEUE(wait, current); struct sock *sk = sock->sk, *new_sk; long timeo; int ret = 0; pr_debug("parent %p\n", sk); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); if (sk->sk_state != LLCP_LISTEN) { ret = -EBADFD; goto error; } timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); /* Wait for an incoming connection. */ add_wait_queue_exclusive(sk_sleep(sk), &wait); while (!(new_sk = nfc_llcp_accept_dequeue(sk, newsock))) { set_current_state(TASK_INTERRUPTIBLE); if (!timeo) { ret = -EAGAIN; break; } if (signal_pending(current)) { ret = sock_intr_errno(timeo); break; } release_sock(sk); timeo = schedule_timeout(timeo); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); if (ret) goto error; newsock->state = SS_CONNECTED; pr_debug("new socket %p\n", new_sk); error: release_sock(sk); return ret; } static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); DECLARE_SOCKADDR(struct sockaddr_nfc_llcp *, llcp_addr, uaddr); if (llcp_sock == NULL || llcp_sock->dev == NULL) return -EBADFD; pr_debug("%p %d %d %d\n", sk, llcp_sock->target_idx, llcp_sock->dsap, llcp_sock->ssap); memset(llcp_addr, 0, sizeof(*llcp_addr)); lock_sock(sk); if (!llcp_sock->dev) { release_sock(sk); return -EBADFD; } llcp_addr->sa_family = AF_NFC; llcp_addr->dev_idx = llcp_sock->dev->idx; llcp_addr->target_idx = llcp_sock->target_idx; llcp_addr->nfc_protocol = llcp_sock->nfc_protocol; llcp_addr->dsap = llcp_sock->dsap; llcp_addr->ssap = llcp_sock->ssap; llcp_addr->service_name_len = llcp_sock->service_name_len; memcpy(llcp_addr->service_name, llcp_sock->service_name, llcp_addr->service_name_len); release_sock(sk); return sizeof(struct sockaddr_nfc_llcp); } static inline __poll_t llcp_accept_poll(struct sock *parent) { struct nfc_llcp_sock *llcp_sock, *parent_sock; struct sock *sk; parent_sock = nfc_llcp_sock(parent); list_for_each_entry(llcp_sock, &parent_sock->accept_queue, accept_queue) { sk = &llcp_sock->sk; if (sk->sk_state == LLCP_CONNECTED) return EPOLLIN | EPOLLRDNORM; } return 0; } static __poll_t llcp_sock_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; pr_debug("%p\n", sk); sock_poll_wait(file, sock, wait); if (sk->sk_state == LLCP_LISTEN) return llcp_accept_poll(sk); if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (sk->sk_state == LLCP_CLOSED) mask |= EPOLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; if (sock_writeable(sk) && sk->sk_state == LLCP_CONNECTED) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; else sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); pr_debug("mask 0x%x\n", mask); return mask; } static int llcp_sock_release(struct socket *sock) { struct sock *sk = sock->sk; struct nfc_llcp_local *local; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); int err = 0; if (!sk) return 0; pr_debug("%p\n", sk); local = llcp_sock->local; if (local == NULL) { err = -ENODEV; goto out; } lock_sock(sk); /* Send a DISC */ if (sk->sk_state == LLCP_CONNECTED) nfc_llcp_send_disconnect(llcp_sock); if (sk->sk_state == LLCP_LISTEN) { struct nfc_llcp_sock *lsk, *n; struct sock *accept_sk; list_for_each_entry_safe(lsk, n, &llcp_sock->accept_queue, accept_queue) { accept_sk = &lsk->sk; lock_sock(accept_sk); nfc_llcp_send_disconnect(lsk); nfc_llcp_accept_unlink(accept_sk); release_sock(accept_sk); } } if (sock->type == SOCK_RAW) nfc_llcp_sock_unlink(&local->raw_sockets, sk); else nfc_llcp_sock_unlink(&local->sockets, sk); if (llcp_sock->reserved_ssap < LLCP_SAP_MAX) nfc_llcp_put_ssap(llcp_sock->local, llcp_sock->ssap); release_sock(sk); out: sock_orphan(sk); sock_put(sk); return err; } static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, int len, int flags) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); struct sockaddr_nfc_llcp *addr = (struct sockaddr_nfc_llcp *)_addr; struct nfc_dev *dev; struct nfc_llcp_local *local; int ret = 0; pr_debug("sock %p sk %p flags 0x%x\n", sock, sk, flags); if (!addr || len < sizeof(*addr) || addr->sa_family != AF_NFC) return -EINVAL; if (addr->service_name_len == 0 && addr->dsap == 0) return -EINVAL; pr_debug("addr dev_idx=%u target_idx=%u protocol=%u\n", addr->dev_idx, addr->target_idx, addr->nfc_protocol); lock_sock(sk); if (sk->sk_state == LLCP_CONNECTED) { ret = -EISCONN; goto error; } if (sk->sk_state == LLCP_CONNECTING) { ret = -EINPROGRESS; goto error; } dev = nfc_get_device(addr->dev_idx); if (dev == NULL) { ret = -ENODEV; goto error; } local = nfc_llcp_find_local(dev); if (local == NULL) { ret = -ENODEV; goto put_dev; } device_lock(&dev->dev); if (dev->dep_link_up == false) { ret = -ENOLINK; device_unlock(&dev->dev); goto sock_llcp_put_local; } device_unlock(&dev->dev); if (local->rf_mode == NFC_RF_INITIATOR && addr->target_idx != local->target_idx) { ret = -ENOLINK; goto sock_llcp_put_local; } llcp_sock->dev = dev; llcp_sock->local = local; llcp_sock->ssap = nfc_llcp_get_local_ssap(local); if (llcp_sock->ssap == LLCP_SAP_MAX) { ret = -ENOMEM; goto sock_llcp_nullify; } llcp_sock->reserved_ssap = llcp_sock->ssap; if (addr->service_name_len == 0) llcp_sock->dsap = addr->dsap; else llcp_sock->dsap = LLCP_SAP_SDP; llcp_sock->nfc_protocol = addr->nfc_protocol; llcp_sock->service_name_len = min_t(unsigned int, addr->service_name_len, NFC_LLCP_MAX_SERVICE_NAME); llcp_sock->service_name = kmemdup(addr->service_name, llcp_sock->service_name_len, GFP_KERNEL); if (!llcp_sock->service_name) { ret = -ENOMEM; goto sock_llcp_release; } nfc_llcp_sock_link(&local->connecting_sockets, sk); ret = nfc_llcp_send_connect(llcp_sock); if (ret) goto sock_unlink; sk->sk_state = LLCP_CONNECTING; ret = sock_wait_state(sk, LLCP_CONNECTED, sock_sndtimeo(sk, flags & O_NONBLOCK)); if (ret && ret != -EINPROGRESS) goto sock_unlink; release_sock(sk); return ret; sock_unlink: nfc_llcp_sock_unlink(&local->connecting_sockets, sk); kfree(llcp_sock->service_name); llcp_sock->service_name = NULL; sock_llcp_release: nfc_llcp_put_ssap(local, llcp_sock->ssap); sock_llcp_nullify: llcp_sock->local = NULL; llcp_sock->dev = NULL; sock_llcp_put_local: nfc_llcp_local_put(local); put_dev: nfc_put_device(dev); error: release_sock(sk); return ret; } static int llcp_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); int ret; pr_debug("sock %p sk %p", sock, sk); ret = sock_error(sk); if (ret) return ret; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; lock_sock(sk); if (!llcp_sock->local) { release_sock(sk); return -ENODEV; } if (sk->sk_type == SOCK_DGRAM) { if (sk->sk_state != LLCP_BOUND) { release_sock(sk); return -ENOTCONN; } DECLARE_SOCKADDR(struct sockaddr_nfc_llcp *, addr, msg->msg_name); if (msg->msg_namelen < sizeof(*addr)) { release_sock(sk); return -EINVAL; } release_sock(sk); return nfc_llcp_send_ui_frame(llcp_sock, addr->dsap, addr->ssap, msg, len); } if (sk->sk_state != LLCP_CONNECTED) { release_sock(sk); return -ENOTCONN; } release_sock(sk); return nfc_llcp_send_i_frame(llcp_sock, msg, len); } static int llcp_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; unsigned int copied, rlen; struct sk_buff *skb, *cskb; int err = 0; pr_debug("%p %zu\n", sk, len); lock_sock(sk); if (sk->sk_state == LLCP_CLOSED && skb_queue_empty(&sk->sk_receive_queue)) { release_sock(sk); return 0; } release_sock(sk); if (flags & (MSG_OOB)) return -EOPNOTSUPP; skb = skb_recv_datagram(sk, flags, &err); if (!skb) { pr_err("Recv datagram failed state %d %d %d", sk->sk_state, err, sock_error(sk)); if (sk->sk_shutdown & RCV_SHUTDOWN) return 0; return err; } rlen = skb->len; /* real length of skb */ copied = min_t(unsigned int, rlen, len); cskb = skb; if (skb_copy_datagram_msg(cskb, 0, msg, copied)) { if (!(flags & MSG_PEEK)) skb_queue_head(&sk->sk_receive_queue, skb); return -EFAULT; } sock_recv_timestamp(msg, sk, skb); if (sk->sk_type == SOCK_DGRAM && msg->msg_name) { struct nfc_llcp_ui_cb *ui_cb = nfc_llcp_ui_skb_cb(skb); DECLARE_SOCKADDR(struct sockaddr_nfc_llcp *, sockaddr, msg->msg_name); msg->msg_namelen = sizeof(struct sockaddr_nfc_llcp); pr_debug("Datagram socket %d %d\n", ui_cb->dsap, ui_cb->ssap); memset(sockaddr, 0, sizeof(*sockaddr)); sockaddr->sa_family = AF_NFC; sockaddr->nfc_protocol = NFC_PROTO_NFC_DEP; sockaddr->dsap = ui_cb->dsap; sockaddr->ssap = ui_cb->ssap; } /* Mark read part of skb as used */ if (!(flags & MSG_PEEK)) { /* SOCK_STREAM: re-queue skb if it contains unreceived data */ if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) { skb_pull(skb, copied); if (skb->len) { skb_queue_head(&sk->sk_receive_queue, skb); goto done; } } kfree_skb(skb); } /* XXX Queue backlogged skbs */ done: /* SOCK_SEQPACKET: return real length if MSG_TRUNC is set */ if (sk->sk_type == SOCK_SEQPACKET && (flags & MSG_TRUNC)) copied = rlen; return copied; } static const struct proto_ops llcp_sock_ops = { .family = PF_NFC, .owner = THIS_MODULE, .bind = llcp_sock_bind, .connect = llcp_sock_connect, .release = llcp_sock_release, .socketpair = sock_no_socketpair, .accept = llcp_sock_accept, .getname = llcp_sock_getname, .poll = llcp_sock_poll, .ioctl = sock_no_ioctl, .listen = llcp_sock_listen, .shutdown = sock_no_shutdown, .setsockopt = nfc_llcp_setsockopt, .getsockopt = nfc_llcp_getsockopt, .sendmsg = llcp_sock_sendmsg, .recvmsg = llcp_sock_recvmsg, .mmap = sock_no_mmap, }; static const struct proto_ops llcp_rawsock_ops = { .family = PF_NFC, .owner = THIS_MODULE, .bind = llcp_raw_sock_bind, .connect = sock_no_connect, .release = llcp_sock_release, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = llcp_sock_getname, .poll = llcp_sock_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = sock_no_sendmsg, .recvmsg = llcp_sock_recvmsg, .mmap = sock_no_mmap, }; static void llcp_sock_destruct(struct sock *sk) { struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); pr_debug("%p\n", sk); if (sk->sk_state == LLCP_CONNECTED) nfc_put_device(llcp_sock->dev); skb_queue_purge(&sk->sk_receive_queue); nfc_llcp_sock_free(llcp_sock); if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Freeing alive NFC LLCP socket %p\n", sk); return; } } struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp, int kern) { struct sock *sk; struct nfc_llcp_sock *llcp_sock; sk = sk_alloc(&init_net, PF_NFC, gfp, &llcp_sock_proto, kern); if (!sk) return NULL; llcp_sock = nfc_llcp_sock(sk); sock_init_data(sock, sk); sk->sk_state = LLCP_CLOSED; sk->sk_protocol = NFC_SOCKPROTO_LLCP; sk->sk_type = type; sk->sk_destruct = llcp_sock_destruct; llcp_sock->ssap = 0; llcp_sock->dsap = LLCP_SAP_SDP; llcp_sock->rw = LLCP_MAX_RW + 1; llcp_sock->miux = cpu_to_be16(LLCP_MAX_MIUX + 1); llcp_sock->send_n = llcp_sock->send_ack_n = 0; llcp_sock->recv_n = llcp_sock->recv_ack_n = 0; llcp_sock->remote_ready = 1; llcp_sock->reserved_ssap = LLCP_SAP_MAX; nfc_llcp_socket_remote_param_init(llcp_sock); skb_queue_head_init(&llcp_sock->tx_queue); skb_queue_head_init(&llcp_sock->tx_pending_queue); INIT_LIST_HEAD(&llcp_sock->accept_queue); if (sock != NULL) sock->state = SS_UNCONNECTED; return sk; } void nfc_llcp_sock_free(struct nfc_llcp_sock *sock) { kfree(sock->service_name); skb_queue_purge(&sock->tx_queue); skb_queue_purge(&sock->tx_pending_queue); list_del_init(&sock->accept_queue); sock->parent = NULL; nfc_llcp_local_put(sock->local); } static int llcp_sock_create(struct net *net, struct socket *sock, const struct nfc_protocol *nfc_proto, int kern) { struct sock *sk; pr_debug("%p\n", sock); if (sock->type != SOCK_STREAM && sock->type != SOCK_DGRAM && sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; if (sock->type == SOCK_RAW) { if (!capable(CAP_NET_RAW)) return -EPERM; sock->ops = &llcp_rawsock_ops; } else { sock->ops = &llcp_sock_ops; } sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC, kern); if (sk == NULL) return -ENOMEM; return 0; } static const struct nfc_protocol llcp_nfc_proto = { .id = NFC_SOCKPROTO_LLCP, .proto = &llcp_sock_proto, .owner = THIS_MODULE, .create = llcp_sock_create }; int __init nfc_llcp_sock_init(void) { return nfc_proto_register(&llcp_nfc_proto); } void nfc_llcp_sock_exit(void) { nfc_proto_unregister(&llcp_nfc_proto); } |
| 1 1 1 1 1 1 1 3 3 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 | // SPDX-License-Identifier: LGPL-2.1 /* * Copyright (c) 2012 Taobao. * Written by Tao Ma <boyu.mt@taobao.com> */ #include <linux/iomap.h> #include <linux/fiemap.h> #include <linux/namei.h> #include <linux/iversion.h> #include <linux/sched/mm.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" #include "truncate.h" #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) #define EXT4_INLINE_DOTDOT_OFFSET 2 #define EXT4_INLINE_DOTDOT_SIZE 4 static int ext4_get_inline_size(struct inode *inode) { if (EXT4_I(inode)->i_inline_off) return EXT4_I(inode)->i_inline_size; return 0; } static int get_max_inline_xattr_value_size(struct inode *inode, struct ext4_iloc *iloc) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; struct ext4_inode *raw_inode; void *end; int free, min_offs; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return 0; min_offs = EXT4_SB(inode->i_sb)->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE - EXT4_I(inode)->i_extra_isize - sizeof(struct ext4_xattr_ibody_header); /* * We need to subtract another sizeof(__u32) since an in-inode xattr * needs an empty 4 bytes to indicate the gap between the xattr entry * and the name/value pair. */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return EXT4_XATTR_SIZE(min_offs - EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) - EXT4_XATTR_ROUND - sizeof(__u32)); raw_inode = ext4_raw_inode(iloc); header = IHDR(inode, raw_inode); entry = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; /* Compute min_offs. */ while (!IS_LAST_ENTRY(entry)) { void *next = EXT4_XATTR_NEXT(entry); if (next >= end) { EXT4_ERROR_INODE(inode, "corrupt xattr in inline inode"); return 0; } if (!entry->e_value_inum && entry->e_value_size) { size_t offs = le16_to_cpu(entry->e_value_offs); if (offs < min_offs) min_offs = offs; } entry = next; } free = min_offs - ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32); if (EXT4_I(inode)->i_inline_off) { entry = (struct ext4_xattr_entry *) ((void *)raw_inode + EXT4_I(inode)->i_inline_off); free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); goto out; } free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)); if (free > EXT4_XATTR_ROUND) free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND); else free = 0; out: return free; } /* * Get the maximum size we now can store in an inode. * If we can't find the space for a xattr entry, don't use the space * of the extents since we have no space to indicate the inline data. */ int ext4_get_max_inline_size(struct inode *inode) { int error, max_inline_size; struct ext4_iloc iloc; if (EXT4_I(inode)->i_extra_isize == 0) return 0; error = ext4_get_inode_loc(inode, &iloc); if (error) { ext4_error_inode_err(inode, __func__, __LINE__, 0, -error, "can't get inode location %lu", inode->i_ino); return 0; } down_read(&EXT4_I(inode)->xattr_sem); max_inline_size = get_max_inline_xattr_value_size(inode, &iloc); up_read(&EXT4_I(inode)->xattr_sem); brelse(iloc.bh); if (!max_inline_size) return 0; return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE; } /* * this function does not take xattr_sem, which is OK because it is * currently only used in a code path coming form ext4_iget, before * the new inode has been unlocked */ int ext4_find_inline_data_nolock(struct inode *inode) { struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; int error; if (EXT4_I(inode)->i_extra_isize == 0) return 0; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; if (!is.s.not_found) { if (is.s.here->e_value_inum) { EXT4_ERROR_INODE(inode, "inline data xattr refers " "to an external xattr inode"); error = -EFSCORRUPTED; goto out; } EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - (void *)ext4_raw_inode(&is.iloc)); EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + le32_to_cpu(is.s.here->e_value_size); } out: brelse(is.iloc.bh); return error; } static int ext4_read_inline_data(struct inode *inode, void *buffer, unsigned int len, struct ext4_iloc *iloc) { struct ext4_xattr_entry *entry; struct ext4_xattr_ibody_header *header; int cp_len = 0; struct ext4_inode *raw_inode; if (!len) return 0; BUG_ON(len > EXT4_I(inode)->i_inline_size); cp_len = min_t(unsigned int, len, EXT4_MIN_INLINE_DATA_SIZE); raw_inode = ext4_raw_inode(iloc); memcpy(buffer, (void *)(raw_inode->i_block), cp_len); len -= cp_len; buffer += cp_len; if (!len) goto out; header = IHDR(inode, raw_inode); entry = (struct ext4_xattr_entry *)((void *)raw_inode + EXT4_I(inode)->i_inline_off); len = min_t(unsigned int, len, (unsigned int)le32_to_cpu(entry->e_value_size)); memcpy(buffer, (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len); cp_len += len; out: return cp_len; } /* * write the buffer to the inline inode. * If 'create' is set, we don't need to do the extra copy in the xattr * value since it is already handled by ext4_xattr_ibody_set. * That saves us one memcpy. */ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, void *buffer, loff_t pos, unsigned int len) { struct ext4_xattr_entry *entry; struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; int cp_len = 0; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; BUG_ON(!EXT4_I(inode)->i_inline_off); BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); raw_inode = ext4_raw_inode(iloc); buffer += pos; if (pos < EXT4_MIN_INLINE_DATA_SIZE) { cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ? EXT4_MIN_INLINE_DATA_SIZE - pos : len; memcpy((void *)raw_inode->i_block + pos, buffer, cp_len); len -= cp_len; buffer += cp_len; pos += cp_len; } if (!len) return; pos -= EXT4_MIN_INLINE_DATA_SIZE; header = IHDR(inode, raw_inode); entry = (struct ext4_xattr_entry *)((void *)raw_inode + EXT4_I(inode)->i_inline_off); memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos, buffer, len); } static int ext4_create_inline_data(handle_t *handle, struct inode *inode, unsigned len) { int error; void *value = NULL; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, EXT4_JTR_NONE); if (error) goto out; if (len > EXT4_MIN_INLINE_DATA_SIZE) { value = EXT4_ZERO_XATTR_VALUE; len -= EXT4_MIN_INLINE_DATA_SIZE; } else { value = ""; len = 0; } /* Insert the xttr entry. */ i.value = value; i.value_len = len; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; BUG_ON(!is.s.not_found); error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) { if (error == -ENOSPC) ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); goto out; } memset((void *)ext4_raw_inode(&is.iloc)->i_block, 0, EXT4_MIN_INLINE_DATA_SIZE); EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - (void *)ext4_raw_inode(&is.iloc)); EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); out: brelse(is.iloc.bh); return error; } static int ext4_update_inline_data(handle_t *handle, struct inode *inode, unsigned int len) { int error; void *value = NULL; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; /* If the old space is ok, write the data directly. */ if (len <= EXT4_I(inode)->i_inline_size) return 0; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; BUG_ON(is.s.not_found); len -= EXT4_MIN_INLINE_DATA_SIZE; value = kzalloc(len, GFP_NOFS); if (!value) { error = -ENOMEM; goto out; } error = ext4_xattr_ibody_get(inode, i.name_index, i.name, value, len); if (error < 0) goto out; BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, EXT4_JTR_NONE); if (error) goto out; /* Update the xattr entry. */ i.value = value; i.value_len = len; error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) goto out; EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - (void *)ext4_raw_inode(&is.iloc)); EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + le32_to_cpu(is.s.here->e_value_size); ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); out: kfree(value); brelse(is.iloc.bh); return error; } static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, unsigned int len) { int ret, size, no_expand; struct ext4_inode_info *ei = EXT4_I(inode); if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) return -ENOSPC; size = ext4_get_max_inline_size(inode); if (size < len) return -ENOSPC; ext4_write_lock_xattr(inode, &no_expand); if (ei->i_inline_off) ret = ext4_update_inline_data(handle, inode, len); else ret = ext4_create_inline_data(handle, inode, len); ext4_write_unlock_xattr(inode, &no_expand); return ret; } static int ext4_destroy_inline_data_nolock(handle_t *handle, struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_xattr_ibody_find is = { .s = { .not_found = 0, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, .value = NULL, .value_len = 0, }; int error; if (!ei->i_inline_off) return 0; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, EXT4_JTR_NONE); if (error) goto out; error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) goto out; memset((void *)ext4_raw_inode(&is.iloc)->i_block, 0, EXT4_MIN_INLINE_DATA_SIZE); memset(ei->i_data, 0, EXT4_MIN_INLINE_DATA_SIZE); if (ext4_has_feature_extents(inode->i_sb)) { if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_ext_tree_init(handle, inode); } } ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); EXT4_I(inode)->i_inline_off = 0; EXT4_I(inode)->i_inline_size = 0; ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); out: brelse(is.iloc.bh); if (error == -ENODATA) error = 0; return error; } static int ext4_read_inline_folio(struct inode *inode, struct folio *folio) { void *kaddr; int ret = 0; size_t len; struct ext4_iloc iloc; BUG_ON(!folio_test_locked(folio)); BUG_ON(!ext4_has_inline_data(inode)); BUG_ON(folio->index); if (!EXT4_I(inode)->i_inline_off) { ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.", inode->i_ino); goto out; } ret = ext4_get_inode_loc(inode, &iloc); if (ret) goto out; len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode)); BUG_ON(len > PAGE_SIZE); kaddr = kmap_local_folio(folio, 0); ret = ext4_read_inline_data(inode, kaddr, len, &iloc); kaddr = folio_zero_tail(folio, len, kaddr + len); kunmap_local(kaddr); folio_mark_uptodate(folio); brelse(iloc.bh); out: return ret; } int ext4_readpage_inline(struct inode *inode, struct folio *folio) { int ret = 0; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { up_read(&EXT4_I(inode)->xattr_sem); return -EAGAIN; } /* * Current inline data can only exist in the 1st page, * So for all the other pages, just set them uptodate. */ if (!folio->index) ret = ext4_read_inline_folio(inode, folio); else if (!folio_test_uptodate(folio)) { folio_zero_segment(folio, 0, folio_size(folio)); folio_mark_uptodate(folio); } up_read(&EXT4_I(inode)->xattr_sem); folio_unlock(folio); return ret >= 0 ? 0 : ret; } static int ext4_convert_inline_data_to_extent(struct address_space *mapping, struct inode *inode) { int ret, needed_blocks, no_expand; handle_t *handle = NULL; int retries = 0, sem_held = 0; struct folio *folio = NULL; unsigned from, to; struct ext4_iloc iloc; if (!ext4_has_inline_data(inode)) { /* * clear the flag so that no new write * will trap here again. */ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return 0; } needed_blocks = ext4_writepage_trans_blocks(inode); ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; retry: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; goto out; } /* We cannot recurse into the filesystem as the transaction is already * started */ folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto out_nofolio; } ext4_write_lock_xattr(inode, &no_expand); sem_held = 1; /* If some one has already done this for us, just exit. */ if (!ext4_has_inline_data(inode)) { ret = 0; goto out; } from = 0; to = ext4_get_inline_size(inode); if (!folio_test_uptodate(folio)) { ret = ext4_read_inline_folio(inode, folio); if (ret < 0) goto out; } ret = ext4_destroy_inline_data_nolock(handle, inode); if (ret) goto out; if (ext4_should_dioread_nolock(inode)) { ret = __block_write_begin(&folio->page, from, to, ext4_get_block_unwritten); } else ret = __block_write_begin(&folio->page, from, to, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, to, NULL, do_journal_get_write_access); } if (ret) { folio_unlock(folio); folio_put(folio); folio = NULL; ext4_orphan_add(handle, inode); ext4_write_unlock_xattr(inode, &no_expand); sem_held = 0; ext4_journal_stop(handle); handle = NULL; ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might * still be on the orphan list; we need to * make sure the inode is removed from the * orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; if (folio) block_commit_write(&folio->page, from, to); out: if (folio) { folio_unlock(folio); folio_put(folio); } out_nofolio: if (sem_held) ext4_write_unlock_xattr(inode, &no_expand); if (handle) ext4_journal_stop(handle); brelse(iloc.bh); return ret; } /* * Try to write data in the inode. * If the inode has inline data, check whether the new write can be * in the inode also. If not, create the page the handle, move the data * to the page make it update and let the later codes create extent for it. */ int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, struct page **pagep) { int ret; handle_t *handle; struct folio *folio; struct ext4_iloc iloc; if (pos + len > ext4_get_max_inline_size(inode)) goto convert; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; /* * The possible write could happen in the inode, * so try to reserve the space in inode first. */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; goto out; } ret = ext4_prepare_inline_data(handle, inode, pos + len); if (ret && ret != -ENOSPC) goto out; /* We don't have space in inline inode, so convert it to extent. */ if (ret == -ENOSPC) { ext4_journal_stop(handle); brelse(iloc.bh); goto convert; } ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE); if (ret) goto out; folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto out; } *pagep = &folio->page; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ret = 0; folio_unlock(folio); folio_put(folio); goto out_up_read; } if (!folio_test_uptodate(folio)) { ret = ext4_read_inline_folio(inode, folio); if (ret < 0) { folio_unlock(folio); folio_put(folio); goto out_up_read; } } ret = 1; handle = NULL; out_up_read: up_read(&EXT4_I(inode)->xattr_sem); out: if (handle && (ret != 1)) ext4_journal_stop(handle); brelse(iloc.bh); return ret; convert: return ext4_convert_inline_data_to_extent(mapping, inode); } int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct folio *folio) { handle_t *handle = ext4_journal_current_handle(); int no_expand; void *kaddr; struct ext4_iloc iloc; int ret = 0, ret2; if (unlikely(copied < len) && !folio_test_uptodate(folio)) copied = 0; if (likely(copied)) { ret = ext4_get_inode_loc(inode, &iloc); if (ret) { folio_unlock(folio); folio_put(folio); ext4_std_error(inode->i_sb, ret); goto out; } ext4_write_lock_xattr(inode, &no_expand); BUG_ON(!ext4_has_inline_data(inode)); /* * ei->i_inline_off may have changed since * ext4_write_begin() called * ext4_try_to_write_inline_data() */ (void) ext4_find_inline_data_nolock(inode); kaddr = kmap_local_folio(folio, 0); ext4_write_inline_data(inode, &iloc, kaddr, pos, copied); kunmap_local(kaddr); folio_mark_uptodate(folio); /* clear dirty flag so that writepages wouldn't work for us. */ folio_clear_dirty(folio); ext4_write_unlock_xattr(inode, &no_expand); brelse(iloc.bh); /* * It's important to update i_size while still holding folio * lock: page writeout could otherwise come in and zero * beyond i_size. */ ext4_update_inode_size(inode, pos + copied); } folio_unlock(folio); folio_put(folio); /* * Don't mark the inode dirty under folio lock. First, it unnecessarily * makes the holding time of folio lock longer. Second, it forces lock * ordering of folio lock and transaction start for journaling * filesystems. */ if (likely(copied)) mark_inode_dirty(inode); out: /* * If we didn't copy as much data as expected, we need to trim back * size of xattr containing inline data. */ if (pos + len > inode->i_size && ext4_can_truncate(inode)) ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } /* * Try to make the page cache and handle ready for the inline data case. * We can call this function in 2 cases: * 1. The inode is created and the first write exceeds inline size. We can * clear the inode state safely. * 2. The inode has inline data, then we need to read the data, make it * update and dirty so that ext4_da_writepages can handle it. We don't * need to start the journal since the file's metadata isn't changed now. */ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, struct inode *inode, void **fsdata) { int ret = 0, inline_size; struct folio *folio; folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); goto out; } inline_size = ext4_get_inline_size(inode); if (!folio_test_uptodate(folio)) { ret = ext4_read_inline_folio(inode, folio); if (ret < 0) goto out; } ret = __block_write_begin(&folio->page, 0, inline_size, ext4_da_get_block_prep); if (ret) { up_read(&EXT4_I(inode)->xattr_sem); folio_unlock(folio); folio_put(folio); ext4_truncate_failed_write(inode); return ret; } folio_mark_dirty(folio); folio_mark_uptodate(folio); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); *fsdata = (void *)CONVERT_INLINE_DATA; out: up_read(&EXT4_I(inode)->xattr_sem); if (folio) { folio_unlock(folio); folio_put(folio); } return ret; } /* * Prepare the write for the inline data. * If the data can be written into the inode, we just read * the page and make it uptodate, and start the journal. * Otherwise read the page, makes it dirty so that it can be * handle in writepages(the i_disksize update is left to the * normal ext4_da_write_end). */ int ext4_da_write_inline_data_begin(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret; handle_t *handle; struct folio *folio; struct ext4_iloc iloc; int retries = 0; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; retry_journal: handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } ret = ext4_prepare_inline_data(handle, inode, pos + len); if (ret && ret != -ENOSPC) goto out_journal; if (ret == -ENOSPC) { ext4_journal_stop(handle); ret = ext4_da_convert_inline_data_to_extent(mapping, inode, fsdata); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; goto out; } /* * We cannot recurse into the filesystem as the transaction * is already started. */ folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto out_journal; } down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ret = 0; goto out_release_page; } if (!folio_test_uptodate(folio)) { ret = ext4_read_inline_folio(inode, folio); if (ret < 0) goto out_release_page; } ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE); if (ret) goto out_release_page; up_read(&EXT4_I(inode)->xattr_sem); *pagep = &folio->page; brelse(iloc.bh); return 1; out_release_page: up_read(&EXT4_I(inode)->xattr_sem); folio_unlock(folio); folio_put(folio); out_journal: ext4_journal_stop(handle); out: brelse(iloc.bh); return ret; } #ifdef INLINE_DIR_DEBUG void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, void *inline_start, int inline_size) { int offset; unsigned short de_len; struct ext4_dir_entry_2 *de = inline_start; void *dlimit = inline_start + inline_size; trace_printk("inode %lu\n", dir->i_ino); offset = 0; while ((void *)de < dlimit) { de_len = ext4_rec_len_from_disk(de->rec_len, inline_size); trace_printk("de: off %u rlen %u name %.*s nlen %u ino %u\n", offset, de_len, de->name_len, de->name, de->name_len, le32_to_cpu(de->inode)); if (ext4_check_dir_entry(dir, NULL, de, bh, inline_start, inline_size, offset)) BUG(); offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } } #else #define ext4_show_inline_dir(dir, bh, inline_start, inline_size) #endif /* * Add a new entry into a inline dir. * It will return -ENOSPC if no space is available, and -EIO * and -EEXIST if directory entry already exists. */ static int ext4_add_dirent_to_inline(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode, struct ext4_iloc *iloc, void *inline_start, int inline_size) { int err; struct ext4_dir_entry_2 *de; err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, inline_size, fname, &de); if (err) return err; BUFFER_TRACE(iloc->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, iloc->bh, EXT4_JTR_NONE); if (err) return err; ext4_insert_dentry(dir, inode, de, inline_size, fname); ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); /* * XXX shouldn't update any times until successful * completion of syscall, but too many callers depend * on this. * * XXX similarly, too many callers depend on * ext4_new_inode() setting the times, but error * recovery deletes the inode, so the worst that can * happen is that the times are slightly out of date * and/or different from the directory change time. */ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); ext4_update_dx_flag(dir); inode_inc_iversion(dir); return 1; } static void *ext4_get_inline_xattr_pos(struct inode *inode, struct ext4_iloc *iloc) { struct ext4_xattr_entry *entry; struct ext4_xattr_ibody_header *header; BUG_ON(!EXT4_I(inode)->i_inline_off); header = IHDR(inode, ext4_raw_inode(iloc)); entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) + EXT4_I(inode)->i_inline_off); return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs); } /* Set the final de to cover the whole block. */ static void ext4_update_final_de(void *de_buf, int old_size, int new_size) { struct ext4_dir_entry_2 *de, *prev_de; void *limit; int de_len; de = de_buf; if (old_size) { limit = de_buf + old_size; do { prev_de = de; de_len = ext4_rec_len_from_disk(de->rec_len, old_size); de_buf += de_len; de = de_buf; } while (de_buf < limit); prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size - old_size, new_size); } else { /* this is just created, so create an empty entry. */ de->inode = 0; de->rec_len = ext4_rec_len_to_disk(new_size, new_size); } } static int ext4_update_inline_dir(handle_t *handle, struct inode *dir, struct ext4_iloc *iloc) { int ret; int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; int new_size = get_max_inline_xattr_value_size(dir, iloc); if (new_size - old_size <= ext4_dir_rec_len(1, NULL)) return -ENOSPC; ret = ext4_update_inline_data(handle, dir, new_size + EXT4_MIN_INLINE_DATA_SIZE); if (ret) return ret; ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size, EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE); dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size; return 0; } static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc, void *buf, int inline_size) { int ret; ret = ext4_create_inline_data(handle, inode, inline_size); if (ret) { ext4_msg(inode->i_sb, KERN_EMERG, "error restoring inline_data for inode -- potential data loss! (inode %lu, error %d)", inode->i_ino, ret); return; } ext4_write_inline_data(inode, iloc, buf, 0, inline_size); ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); } static int ext4_finish_convert_inline_dir(handle_t *handle, struct inode *inode, struct buffer_head *dir_block, void *buf, int inline_size) { int err, csum_size = 0, header_size = 0; struct ext4_dir_entry_2 *de; void *target = dir_block->b_data; /* * First create "." and ".." and then copy the dir information * back to the block. */ de = target; de = ext4_init_dot_dotdot(inode, de, inode->i_sb->s_blocksize, csum_size, le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1); header_size = (void *)de - target; memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, inline_size - EXT4_INLINE_DOTDOT_SIZE); if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); inode->i_size = inode->i_sb->s_blocksize; i_size_write(inode, inode->i_sb->s_blocksize); EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; ext4_update_final_de(dir_block->b_data, inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size, inode->i_sb->s_blocksize - csum_size); if (csum_size) ext4_initialize_dirent_tail(dir_block, inode->i_sb->s_blocksize); set_buffer_uptodate(dir_block); unlock_buffer(dir_block); err = ext4_handle_dirty_dirblock(handle, inode, dir_block); if (err) return err; set_buffer_verified(dir_block); return ext4_mark_inode_dirty(handle, inode); } static int ext4_convert_inline_data_nolock(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { int error; void *buf = NULL; struct buffer_head *data_bh = NULL; struct ext4_map_blocks map; int inline_size; inline_size = ext4_get_inline_size(inode); buf = kmalloc(inline_size, GFP_NOFS); if (!buf) { error = -ENOMEM; goto out; } error = ext4_read_inline_data(inode, buf, inline_size, iloc); if (error < 0) goto out; /* * Make sure the inline directory entries pass checks before we try to * convert them, so that we avoid touching stuff that needs fsck. */ if (S_ISDIR(inode->i_mode)) { error = ext4_check_all_de(inode, iloc->bh, buf + EXT4_INLINE_DOTDOT_SIZE, inline_size - EXT4_INLINE_DOTDOT_SIZE); if (error) goto out; } error = ext4_destroy_inline_data_nolock(handle, inode); if (error) goto out; map.m_lblk = 0; map.m_len = 1; map.m_flags = 0; error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE); if (error < 0) goto out_restore; if (!(map.m_flags & EXT4_MAP_MAPPED)) { error = -EIO; goto out_restore; } data_bh = sb_getblk(inode->i_sb, map.m_pblk); if (!data_bh) { error = -ENOMEM; goto out_restore; } lock_buffer(data_bh); error = ext4_journal_get_create_access(handle, inode->i_sb, data_bh, EXT4_JTR_NONE); if (error) { unlock_buffer(data_bh); error = -EIO; goto out_restore; } memset(data_bh->b_data, 0, inode->i_sb->s_blocksize); if (!S_ISDIR(inode->i_mode)) { memcpy(data_bh->b_data, buf, inline_size); set_buffer_uptodate(data_bh); unlock_buffer(data_bh); error = ext4_handle_dirty_metadata(handle, inode, data_bh); } else { error = ext4_finish_convert_inline_dir(handle, inode, data_bh, buf, inline_size); } out_restore: if (error) ext4_restore_inline_data(handle, inode, iloc, buf, inline_size); out: brelse(data_bh); kfree(buf); return error; } /* * Try to add the new entry to the inline data. * If succeeds, return 0. If not, extended the inline dir and copied data to * the new created block. */ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode) { int ret, ret2, inline_size, no_expand; void *inline_start; struct ext4_iloc iloc; ret = ext4_get_inode_loc(dir, &iloc); if (ret) return ret; ext4_write_lock_xattr(dir, &no_expand); if (!ext4_has_inline_data(dir)) goto out; inline_start = (void *)ext4_raw_inode(&iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, inline_start, inline_size); if (ret != -ENOSPC) goto out; /* check whether it can be inserted to inline xattr space. */ inline_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; if (!inline_size) { /* Try to use the xattr space.*/ ret = ext4_update_inline_dir(handle, dir, &iloc); if (ret && ret != -ENOSPC) goto out; inline_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; } if (inline_size) { inline_start = ext4_get_inline_xattr_pos(dir, &iloc); ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, inline_start, inline_size); if (ret != -ENOSPC) goto out; } /* * The inline space is filled up, so create a new block for it. * As the extent tree will be created, we have to save the inline * dir first. */ ret = ext4_convert_inline_data_nolock(handle, dir, &iloc); out: ext4_write_unlock_xattr(dir, &no_expand); ret2 = ext4_mark_inode_dirty(handle, dir); if (unlikely(ret2 && !ret)) ret = ret2; brelse(iloc.bh); return ret; } /* * This function fills a red-black tree with information from an * inlined dir. It returns the number directory entries loaded * into the tree. If there is an error it is returned in err. */ int ext4_inlinedir_to_tree(struct file *dir_file, struct inode *dir, ext4_lblk_t block, struct dx_hash_info *hinfo, __u32 start_hash, __u32 start_minor_hash, int *has_inline_data) { int err = 0, count = 0; unsigned int parent_ino; int pos; struct ext4_dir_entry_2 *de; struct inode *inode = file_inode(dir_file); int ret, inline_size = 0; struct ext4_iloc iloc; void *dir_buf = NULL; struct ext4_dir_entry_2 fake; struct fscrypt_str tmp_str; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { up_read(&EXT4_I(inode)->xattr_sem); *has_inline_data = 0; goto out; } inline_size = ext4_get_inline_size(inode); dir_buf = kmalloc(inline_size, GFP_NOFS); if (!dir_buf) { ret = -ENOMEM; up_read(&EXT4_I(inode)->xattr_sem); goto out; } ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); up_read(&EXT4_I(inode)->xattr_sem); if (ret < 0) goto out; pos = 0; parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); while (pos < inline_size) { /* * As inlined dir doesn't store any information about '.' and * only the inode number of '..' is stored, we have to handle * them differently. */ if (pos == 0) { fake.inode = cpu_to_le32(inode->i_ino); fake.name_len = 1; strcpy(fake.name, "."); fake.rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(fake.name_len, NULL), inline_size); ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); de = &fake; pos = EXT4_INLINE_DOTDOT_OFFSET; } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) { fake.inode = cpu_to_le32(parent_ino); fake.name_len = 2; strcpy(fake.name, ".."); fake.rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(fake.name_len, NULL), inline_size); ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); de = &fake; pos = EXT4_INLINE_DOTDOT_SIZE; } else { de = (struct ext4_dir_entry_2 *)(dir_buf + pos); pos += ext4_rec_len_from_disk(de->rec_len, inline_size); if (ext4_check_dir_entry(inode, dir_file, de, iloc.bh, dir_buf, inline_size, pos)) { ret = count; goto out; } } if (ext4_hash_in_dirent(dir)) { hinfo->hash = EXT4_DIRENT_HASH(de); hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); } else { ext4fs_dirhash(dir, de->name, de->name_len, hinfo); } if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && (hinfo->minor_hash < start_minor_hash))) continue; if (de->inode == 0) continue; tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &tmp_str); if (err) { ret = err; goto out; } count++; } ret = count; out: kfree(dir_buf); brelse(iloc.bh); return ret; } /* * So this function is called when the volume is mkfsed with * dir_index disabled. In order to keep f_pos persistent * after we convert from an inlined dir to a blocked based, * we just pretend that we are a normal dir and return the * offset as if '.' and '..' really take place. * */ int ext4_read_inline_dir(struct file *file, struct dir_context *ctx, int *has_inline_data) { unsigned int offset, parent_ino; int i; struct ext4_dir_entry_2 *de; struct super_block *sb; struct inode *inode = file_inode(file); int ret, inline_size = 0; struct ext4_iloc iloc; void *dir_buf = NULL; int dotdot_offset, dotdot_size, extra_offset, extra_size; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { up_read(&EXT4_I(inode)->xattr_sem); *has_inline_data = 0; goto out; } inline_size = ext4_get_inline_size(inode); dir_buf = kmalloc(inline_size, GFP_NOFS); if (!dir_buf) { ret = -ENOMEM; up_read(&EXT4_I(inode)->xattr_sem); goto out; } ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); up_read(&EXT4_I(inode)->xattr_sem); if (ret < 0) goto out; ret = 0; sb = inode->i_sb; parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); offset = ctx->pos; /* * dotdot_offset and dotdot_size is the real offset and * size for ".." and "." if the dir is block based while * the real size for them are only EXT4_INLINE_DOTDOT_SIZE. * So we will use extra_offset and extra_size to indicate them * during the inline dir iteration. */ dotdot_offset = ext4_dir_rec_len(1, NULL); dotdot_size = dotdot_offset + ext4_dir_rec_len(2, NULL); extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; extra_size = extra_offset + inline_size; /* * If the version has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the inline * dir to make sure. */ if (!inode_eq_iversion(inode, file->f_version)) { for (i = 0; i < extra_size && i < offset;) { /* * "." is with offset 0 and * ".." is dotdot_offset. */ if (!i) { i = dotdot_offset; continue; } else if (i == dotdot_offset) { i = dotdot_size; continue; } /* for other entry, the real offset in * the buf has to be tuned accordingly. */ de = (struct ext4_dir_entry_2 *) (dir_buf + i - extra_offset); /* It's too expensive to do a full * dirent test each time round this * loop, but we do have to test at * least that it is non-zero. A * failure will be detected in the * dirent test below. */ if (ext4_rec_len_from_disk(de->rec_len, extra_size) < ext4_dir_rec_len(1, NULL)) break; i += ext4_rec_len_from_disk(de->rec_len, extra_size); } offset = i; ctx->pos = offset; file->f_version = inode_query_iversion(inode); } while (ctx->pos < extra_size) { if (ctx->pos == 0) { if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR)) goto out; ctx->pos = dotdot_offset; continue; } if (ctx->pos == dotdot_offset) { if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR)) goto out; ctx->pos = dotdot_size; continue; } de = (struct ext4_dir_entry_2 *) (dir_buf + ctx->pos - extra_offset); if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf, extra_size, ctx->pos)) goto out; if (le32_to_cpu(de->inode)) { if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto out; } ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size); } out: kfree(dir_buf); brelse(iloc.bh); return ret; } void *ext4_read_inline_link(struct inode *inode) { struct ext4_iloc iloc; int ret, inline_size; void *link; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ERR_PTR(ret); ret = -ENOMEM; inline_size = ext4_get_inline_size(inode); link = kmalloc(inline_size + 1, GFP_NOFS); if (!link) goto out; ret = ext4_read_inline_data(inode, link, inline_size, &iloc); if (ret < 0) { kfree(link); goto out; } nd_terminate_link(link, inode->i_size, ret); out: if (ret < 0) link = ERR_PTR(ret); brelse(iloc.bh); return link; } struct buffer_head *ext4_get_first_inline_block(struct inode *inode, struct ext4_dir_entry_2 **parent_de, int *retval) { struct ext4_iloc iloc; *retval = ext4_get_inode_loc(inode, &iloc); if (*retval) return NULL; *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; return iloc.bh; } /* * Try to create the inline data for the new dir. * If it succeeds, return 0, otherwise return the error. * In case of ENOSPC, the caller should create the normal disk layout dir. */ int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, struct inode *inode) { int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE; struct ext4_iloc iloc; struct ext4_dir_entry_2 *de; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; ret = ext4_prepare_inline_data(handle, inode, inline_size); if (ret) goto out; /* * For inline dir, we only save the inode information for the ".." * and create a fake dentry to cover the left space. */ de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; de->inode = cpu_to_le32(parent->i_ino); de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE); de->inode = 0; de->rec_len = ext4_rec_len_to_disk( inline_size - EXT4_INLINE_DOTDOT_SIZE, inline_size); set_nlink(inode, 2); inode->i_size = EXT4_I(inode)->i_disksize = inline_size; out: brelse(iloc.bh); return ret; } struct buffer_head *ext4_find_inline_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir, int *has_inline_data) { int ret; struct ext4_iloc iloc; void *inline_start; int inline_size; if (ext4_get_inode_loc(dir, &iloc)) return NULL; down_read(&EXT4_I(dir)->xattr_sem); if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; goto out; } inline_start = (void *)ext4_raw_inode(&iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; ret = ext4_search_dir(iloc.bh, inline_start, inline_size, dir, fname, 0, res_dir); if (ret == 1) goto out_find; if (ret < 0) goto out; if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE) goto out; inline_start = ext4_get_inline_xattr_pos(dir, &iloc); inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; ret = ext4_search_dir(iloc.bh, inline_start, inline_size, dir, fname, 0, res_dir); if (ret == 1) goto out_find; out: brelse(iloc.bh); iloc.bh = NULL; out_find: up_read(&EXT4_I(dir)->xattr_sem); return iloc.bh; } int ext4_delete_inline_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, int *has_inline_data) { int err, inline_size, no_expand; struct ext4_iloc iloc; void *inline_start; err = ext4_get_inode_loc(dir, &iloc); if (err) return err; ext4_write_lock_xattr(dir, &no_expand); if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; goto out; } if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) < EXT4_MIN_INLINE_DATA_SIZE) { inline_start = (void *)ext4_raw_inode(&iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; } else { inline_start = ext4_get_inline_xattr_pos(dir, &iloc); inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; } BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, bh, EXT4_JTR_NONE); if (err) goto out; err = ext4_generic_delete_entry(dir, de_del, bh, inline_start, inline_size, 0); if (err) goto out; ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size); out: ext4_write_unlock_xattr(dir, &no_expand); if (likely(err == 0)) err = ext4_mark_inode_dirty(handle, dir); brelse(iloc.bh); if (err != -ENOENT) ext4_std_error(dir->i_sb, err); return err; } /* * Get the inline dentry at offset. */ static inline struct ext4_dir_entry_2 * ext4_get_inline_entry(struct inode *inode, struct ext4_iloc *iloc, unsigned int offset, void **inline_start, int *inline_size) { void *inline_pos; BUG_ON(offset > ext4_get_inline_size(inode)); if (offset < EXT4_MIN_INLINE_DATA_SIZE) { inline_pos = (void *)ext4_raw_inode(iloc)->i_block; *inline_size = EXT4_MIN_INLINE_DATA_SIZE; } else { inline_pos = ext4_get_inline_xattr_pos(inode, iloc); offset -= EXT4_MIN_INLINE_DATA_SIZE; *inline_size = ext4_get_inline_size(inode) - EXT4_MIN_INLINE_DATA_SIZE; } if (inline_start) *inline_start = inline_pos; return (struct ext4_dir_entry_2 *)(inline_pos + offset); } bool empty_inline_dir(struct inode *dir, int *has_inline_data) { int err, inline_size; struct ext4_iloc iloc; size_t inline_len; void *inline_pos; unsigned int offset; struct ext4_dir_entry_2 *de; bool ret = false; err = ext4_get_inode_loc(dir, &iloc); if (err) { EXT4_ERROR_INODE_ERR(dir, -err, "error %d getting inode %lu block", err, dir->i_ino); return false; } down_read(&EXT4_I(dir)->xattr_sem); if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; ret = true; goto out; } de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; if (!le32_to_cpu(de->inode)) { ext4_warning(dir->i_sb, "bad inline directory (dir #%lu) - no `..'", dir->i_ino); goto out; } inline_len = ext4_get_inline_size(dir); offset = EXT4_INLINE_DOTDOT_SIZE; while (offset < inline_len) { de = ext4_get_inline_entry(dir, &iloc, offset, &inline_pos, &inline_size); if (ext4_check_dir_entry(dir, NULL, de, iloc.bh, inline_pos, inline_size, offset)) { ext4_warning(dir->i_sb, "bad inline directory (dir #%lu) - " "inode %u, rec_len %u, name_len %d" "inline size %d", dir->i_ino, le32_to_cpu(de->inode), le16_to_cpu(de->rec_len), de->name_len, inline_size); goto out; } if (le32_to_cpu(de->inode)) { goto out; } offset += ext4_rec_len_from_disk(de->rec_len, inline_size); } ret = true; out: up_read(&EXT4_I(dir)->xattr_sem); brelse(iloc.bh); return ret; } int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { int ret, no_expand; ext4_write_lock_xattr(inode, &no_expand); ret = ext4_destroy_inline_data_nolock(handle, inode); ext4_write_unlock_xattr(inode, &no_expand); return ret; } int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap) { __u64 addr; int error = -EAGAIN; struct ext4_iloc iloc; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) goto out; error = ext4_get_inode_loc(inode, &iloc); if (error) goto out; addr = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; addr += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; addr += offsetof(struct ext4_inode, i_block); brelse(iloc.bh); iomap->addr = addr; iomap->offset = 0; iomap->length = min_t(loff_t, ext4_get_inline_size(inode), i_size_read(inode)); iomap->type = IOMAP_INLINE; iomap->flags = 0; out: up_read(&EXT4_I(inode)->xattr_sem); return error; } int ext4_inline_data_truncate(struct inode *inode, int *has_inline) { handle_t *handle; int inline_size, value_len, needed_blocks, no_expand, err = 0; size_t i_size; void *value = NULL; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; needed_blocks = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_write_lock_xattr(inode, &no_expand); if (!ext4_has_inline_data(inode)) { ext4_write_unlock_xattr(inode, &no_expand); *has_inline = 0; ext4_journal_stop(handle); return 0; } if ((err = ext4_orphan_add(handle, inode)) != 0) goto out; if ((err = ext4_get_inode_loc(inode, &is.iloc)) != 0) goto out; down_write(&EXT4_I(inode)->i_data_sem); i_size = inode->i_size; inline_size = ext4_get_inline_size(inode); EXT4_I(inode)->i_disksize = i_size; if (i_size < inline_size) { /* * if there's inline data to truncate and this file was * converted to extents after that inline data was written, * the extent status cache must be cleared to avoid leaving * behind stale delayed allocated extent entries */ if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); /* Clear the content in the xattr space. */ if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) { if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0) goto out_error; BUG_ON(is.s.not_found); value_len = le32_to_cpu(is.s.here->e_value_size); value = kmalloc(value_len, GFP_NOFS); if (!value) { err = -ENOMEM; goto out_error; } err = ext4_xattr_ibody_get(inode, i.name_index, i.name, value, value_len); if (err <= 0) goto out_error; i.value = value; i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ? i_size - EXT4_MIN_INLINE_DATA_SIZE : 0; err = ext4_xattr_ibody_set(handle, inode, &i, &is); if (err) goto out_error; } /* Clear the content within i_blocks. */ if (i_size < EXT4_MIN_INLINE_DATA_SIZE) { void *p = (void *) ext4_raw_inode(&is.iloc)->i_block; memset(p + i_size, 0, EXT4_MIN_INLINE_DATA_SIZE - i_size); } EXT4_I(inode)->i_inline_size = i_size < EXT4_MIN_INLINE_DATA_SIZE ? EXT4_MIN_INLINE_DATA_SIZE : i_size; } out_error: up_write(&EXT4_I(inode)->i_data_sem); out: brelse(is.iloc.bh); ext4_write_unlock_xattr(inode, &no_expand); kfree(value); if (inode->i_nlink) ext4_orphan_del(handle, inode); if (err == 0) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err = ext4_mark_inode_dirty(handle, inode); if (IS_SYNC(inode)) ext4_handle_sync(handle); } ext4_journal_stop(handle); return err; } int ext4_convert_inline_data(struct inode *inode) { int error, needed_blocks, no_expand; handle_t *handle; struct ext4_iloc iloc; if (!ext4_has_inline_data(inode)) { ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return 0; } else if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { /* * Inode has inline data but EXT4_STATE_MAY_INLINE_DATA is * cleared. This means we are in the middle of moving of * inline data to delay allocated block. Just force writeout * here to finish conversion. */ error = filemap_flush(inode->i_mapping); if (error) return error; if (!ext4_has_inline_data(inode)) return 0; } needed_blocks = ext4_writepage_trans_blocks(inode); iloc.bh = NULL; error = ext4_get_inode_loc(inode, &iloc); if (error) return error; handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto out_free; } ext4_write_lock_xattr(inode, &no_expand); if (ext4_has_inline_data(inode)) error = ext4_convert_inline_data_nolock(handle, inode, &iloc); ext4_write_unlock_xattr(inode, &no_expand); ext4_journal_stop(handle); out_free: brelse(iloc.bh); return error; } |
| 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * CLC (connection layer control) handshake over initial TCP socket to * prepare for RDMA traffic * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #ifndef _SMC_CLC_H #define _SMC_CLC_H #include <rdma/ib_verbs.h> #include <linux/smc.h> #include "smc.h" #include "smc_netlink.h" #define SMC_CLC_PROPOSAL 0x01 #define SMC_CLC_ACCEPT 0x02 #define SMC_CLC_CONFIRM 0x03 #define SMC_CLC_DECLINE 0x04 #define SMC_TYPE_R 0 /* SMC-R only */ #define SMC_TYPE_D 1 /* SMC-D only */ #define SMC_TYPE_N 2 /* neither SMC-R nor SMC-D */ #define SMC_TYPE_B 3 /* SMC-R and SMC-D */ #define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */ #define CLC_WAIT_TIME_SHORT HZ /* short wait time on clcsock */ #define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */ #define SMC_CLC_DECL_TIMEOUT_CL 0x02010000 /* timeout w4 QP confirm link */ #define SMC_CLC_DECL_TIMEOUT_AL 0x02020000 /* timeout w4 QP add link */ #define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */ #define SMC_CLC_DECL_PEERNOSMC 0x03010000 /* peer did not indicate SMC */ #define SMC_CLC_DECL_IPSEC 0x03020000 /* IPsec usage */ #define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found (R or D) */ #define SMC_CLC_DECL_NOSMCDDEV 0x03030001 /* no SMC-D device found */ #define SMC_CLC_DECL_NOSMCRDEV 0x03030002 /* no SMC-R device found */ #define SMC_CLC_DECL_NOISM2SUPP 0x03030003 /* hardware has no ISMv2 support */ #define SMC_CLC_DECL_NOV2EXT 0x03030004 /* peer sent no clc v2 extension */ #define SMC_CLC_DECL_NOV2DEXT 0x03030005 /* peer sent no clc SMC-Dv2 ext. */ #define SMC_CLC_DECL_NOSEID 0x03030006 /* peer sent no SEID */ #define SMC_CLC_DECL_NOSMCD2DEV 0x03030007 /* no SMC-Dv2 device found */ #define SMC_CLC_DECL_NOUEID 0x03030008 /* peer sent no UEID */ #define SMC_CLC_DECL_RELEASEERR 0x03030009 /* release version negotiate failed */ #define SMC_CLC_DECL_MAXCONNERR 0x0303000a /* max connections negotiate failed */ #define SMC_CLC_DECL_MAXLINKERR 0x0303000b /* max links negotiate failed */ #define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/ #define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */ #define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */ #define SMC_CLC_DECL_DIFFPREFIX 0x03070000 /* IP prefix / subnet mismatch */ #define SMC_CLC_DECL_GETVLANERR 0x03080000 /* err to get vlan id of ip device*/ #define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */ #define SMC_CLC_DECL_NOACTLINK 0x030a0000 /* no active smc-r link in lgr */ #define SMC_CLC_DECL_NOSRVLINK 0x030b0000 /* SMC-R link from srv not found */ #define SMC_CLC_DECL_VERSMISMAT 0x030c0000 /* SMC version mismatch */ #define SMC_CLC_DECL_MAX_DMB 0x030d0000 /* SMC-D DMB limit exceeded */ #define SMC_CLC_DECL_NOROUTE 0x030e0000 /* SMC-Rv2 conn. no route to peer */ #define SMC_CLC_DECL_NOINDIRECT 0x030f0000 /* SMC-Rv2 conn. indirect mismatch*/ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ #define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */ #define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ #define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */ #define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */ #define SMC_CLC_DECL_ERR_REGBUF 0x09990003 /* reg rdma bufs failed */ #define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */ struct smc_clc_msg_hdr { /* header1 of clc messages */ u8 eyecatcher[4]; /* eye catcher */ u8 type; /* proposal / accept / confirm / decline */ __be16 length; #if defined(__BIG_ENDIAN_BITFIELD) u8 version : 4, typev2 : 2, typev1 : 2; #elif defined(__LITTLE_ENDIAN_BITFIELD) u8 typev1 : 2, typev2 : 2, version : 4; #endif } __packed; /* format defined in RFC7609 */ struct smc_clc_msg_trail { /* trailer of clc messages */ u8 eyecatcher[4]; }; struct smc_clc_msg_local { /* header2 of clc messages */ u8 id_for_peer[SMC_SYSTEMID_LEN]; /* unique system id */ u8 gid[16]; /* gid of ib_device port */ u8 mac[6]; /* mac of ib_device port */ }; /* Struct would be 4 byte aligned, but it is used in an array that is sent * to peers and must conform to RFC7609, hence we need to use packed here. */ struct smc_clc_ipv6_prefix { struct in6_addr prefix; u8 prefix_len; } __packed; /* format defined in RFC7609 */ #if defined(__BIG_ENDIAN_BITFIELD) struct smc_clc_v2_flag { u8 release : 4, rsvd : 3, seid : 1; }; #elif defined(__LITTLE_ENDIAN_BITFIELD) struct smc_clc_v2_flag { u8 seid : 1, rsvd : 3, release : 4; }; #endif struct smc_clnt_opts_area_hdr { u8 eid_cnt; /* number of user defined EIDs */ u8 ism_gid_cnt; /* number of ISMv2 GIDs */ u8 reserved1; struct smc_clc_v2_flag flag; u8 reserved2[2]; __be16 smcd_v2_ext_offset; /* SMC-Dv2 Extension Offset */ }; struct smc_clc_smcd_gid_chid { __be64 gid; /* ISM GID */ __be16 chid; /* ISMv2 CHID */ } __packed; /* format defined in * IBM Shared Memory Communications Version 2 * (https://www.ibm.com/support/pages/node/6326337) */ struct smc_clc_v2_extension { /* New members must be added within the struct_group() macro below. */ struct_group_tagged(smc_clc_v2_extension_fixed, fixed, struct smc_clnt_opts_area_hdr hdr; u8 roce[16]; /* RoCEv2 GID */ u8 max_conns; u8 max_links; __be16 feature_mask; u8 reserved[12]; ); u8 user_eids[][SMC_MAX_EID_LEN]; }; struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/ __be32 outgoing_subnet; /* subnet mask */ u8 prefix_len; /* number of significant bits in mask */ u8 reserved[2]; u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */ } __aligned(4); struct smc_clc_msg_smcd { /* SMC-D GID information */ struct smc_clc_smcd_gid_chid ism; /* ISM native GID+CHID of requestor */ __be16 v2_ext_offset; /* SMC Version 2 Extension Offset */ u8 vendor_oui[3]; /* vendor organizationally unique identifier */ u8 vendor_exp_options[5]; u8 reserved[20]; }; struct smc_clc_smcd_v2_extension { /* New members must be added within the struct_group() macro below. */ struct_group_tagged(smc_clc_smcd_v2_extension_fixed, fixed, u8 system_eid[SMC_MAX_EID_LEN]; u8 reserved[16]; ); struct smc_clc_smcd_gid_chid gidchid[]; }; struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ struct smc_clc_msg_hdr hdr; struct smc_clc_msg_local lcl; __be16 iparea_offset; /* offset to IP address information area */ } __aligned(4); #define SMC_CLC_MAX_V6_PREFIX 8 #define SMC_CLC_MAX_UEID 8 #define SMCD_CLC_MAX_V2_GID_ENTRIES 8 /* max # of CHID-GID entries in CLC * proposal SMC-Dv2 extension. * each ISM device takes one entry and * each Emulated-ISM takes two entries */ struct smc_clc_msg_proposal_area { struct smc_clc_msg_proposal pclc_base; struct smc_clc_msg_smcd pclc_smcd; struct smc_clc_msg_proposal_prefix pclc_prfx; struct smc_clc_ipv6_prefix pclc_prfx_ipv6[SMC_CLC_MAX_V6_PREFIX]; struct smc_clc_v2_extension_fixed pclc_v2_ext; u8 user_eids[SMC_CLC_MAX_UEID][SMC_MAX_EID_LEN]; struct smc_clc_smcd_v2_extension_fixed pclc_smcd_v2_ext; struct smc_clc_smcd_gid_chid pclc_gidchids[SMCD_CLC_MAX_V2_GID_ENTRIES]; struct smc_clc_msg_trail pclc_trl; }; struct smcr_clc_msg_accept_confirm { /* SMCR accept/confirm */ struct smc_clc_msg_local lcl; u8 qpn[3]; /* QP number */ __be32 rmb_rkey; /* RMB rkey */ u8 rmbe_idx; /* Index of RMBE in RMB */ __be32 rmbe_alert_token; /* unique connection id */ #if defined(__BIG_ENDIAN_BITFIELD) u8 rmbe_size : 4, /* buf size (compressed) */ qp_mtu : 4; /* QP mtu */ #elif defined(__LITTLE_ENDIAN_BITFIELD) u8 qp_mtu : 4, rmbe_size : 4; #endif u8 reserved; __be64 rmb_dma_addr; /* RMB virtual address */ u8 reserved2; u8 psn[3]; /* packet sequence number */ } __packed; struct smcd_clc_msg_accept_confirm_common { /* SMCD accept/confirm */ __be64 gid; /* Sender GID */ __be64 token; /* DMB token */ u8 dmbe_idx; /* DMBE index */ #if defined(__BIG_ENDIAN_BITFIELD) u8 dmbe_size : 4, /* buf size (compressed) */ reserved3 : 4; #elif defined(__LITTLE_ENDIAN_BITFIELD) u8 reserved3 : 4, dmbe_size : 4; #endif u16 reserved4; __be32 linkid; /* Link identifier */ } __packed; #define SMC_CLC_OS_ZOS 1 #define SMC_CLC_OS_LINUX 2 #define SMC_CLC_OS_AIX 3 struct smc_clc_first_contact_ext { #if defined(__BIG_ENDIAN_BITFIELD) u8 v2_direct : 1, reserved : 7; u8 os_type : 4, release : 4; #elif defined(__LITTLE_ENDIAN_BITFIELD) u8 reserved : 7, v2_direct : 1; u8 release : 4, os_type : 4; #endif u8 reserved2[2]; u8 hostname[SMC_MAX_HOSTNAME_LEN]; }; struct smc_clc_first_contact_ext_v2x { struct smc_clc_first_contact_ext fce_v2_base; union { struct { u8 max_conns; /* for SMC-R only */ u8 max_links; /* for SMC-R only */ }; u8 reserved3[2]; /* for SMC-D only */ }; __be16 feature_mask; __be32 vendor_exp_options; u8 reserved4[8]; } __packed; /* format defined in * IBM Shared Memory Communications Version 2 (Third Edition) * (https://www.ibm.com/support/pages/node/7009315) */ struct smc_clc_fce_gid_ext { u8 gid_cnt; u8 reserved2[3]; u8 gid[][SMC_GID_SIZE]; }; struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ struct smc_clc_msg_hdr hdr; union { struct { /* SMC-R */ struct smcr_clc_msg_accept_confirm r0; struct { /* v2 only */ u8 eid[SMC_MAX_EID_LEN]; u8 reserved6[8]; } __packed r1; }; struct { /* SMC-D */ struct smcd_clc_msg_accept_confirm_common d0; struct { /* v2 only, but 12 bytes reserved in v1 */ __be16 chid; u8 eid[SMC_MAX_EID_LEN]; __be64 gid_ext; } __packed d1; }; }; }; struct smc_clc_msg_decline { /* clc decline message */ struct smc_clc_msg_hdr hdr; u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */ __be32 peer_diagnosis; /* diagnosis information */ #if defined(__BIG_ENDIAN_BITFIELD) u8 os_type : 4, reserved : 4; #elif defined(__LITTLE_ENDIAN_BITFIELD) u8 reserved : 4, os_type : 4; #endif u8 reserved2[3]; struct smc_clc_msg_trail trl; /* eye catcher "SMCD" or "SMCR" EBCDIC */ } __aligned(4); #define SMC_DECL_DIAG_COUNT_V2 4 /* no. of additional peer diagnosis codes */ struct smc_clc_msg_decline_v2 { /* clc decline message */ struct smc_clc_msg_hdr hdr; u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */ __be32 peer_diagnosis; /* diagnosis information */ #if defined(__BIG_ENDIAN_BITFIELD) u8 os_type : 4, reserved : 4; #elif defined(__LITTLE_ENDIAN_BITFIELD) u8 reserved : 4, os_type : 4; #endif u8 reserved2[3]; __be32 peer_diagnosis_v2[SMC_DECL_DIAG_COUNT_V2]; struct smc_clc_msg_trail trl; /* eye catcher "SMCD" or "SMCR" EBCDIC */ } __aligned(4); /* determine start of the prefix area within the proposal message */ static inline struct smc_clc_msg_proposal_prefix * smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc) { return (struct smc_clc_msg_proposal_prefix *) ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset)); } static inline bool smcr_indicated(int smc_type) { return smc_type == SMC_TYPE_R || smc_type == SMC_TYPE_B; } static inline bool smcd_indicated(int smc_type) { return smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B; } static inline u8 smc_indicated_type(int is_smcd, int is_smcr) { if (is_smcd && is_smcr) return SMC_TYPE_B; if (is_smcd) return SMC_TYPE_D; if (is_smcr) return SMC_TYPE_R; return SMC_TYPE_N; } /* get SMC-D info from proposal message */ static inline struct smc_clc_msg_smcd * smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop) { if (smcd_indicated(prop->hdr.typev1) && ntohs(prop->iparea_offset) != sizeof(struct smc_clc_msg_smcd)) return NULL; return (struct smc_clc_msg_smcd *)(prop + 1); } static inline struct smc_clc_v2_extension * smc_get_clc_v2_ext(struct smc_clc_msg_proposal *prop) { struct smc_clc_msg_smcd *prop_smcd = smc_get_clc_msg_smcd(prop); if (!prop_smcd || !ntohs(prop_smcd->v2_ext_offset)) return NULL; return (struct smc_clc_v2_extension *) ((u8 *)prop_smcd + offsetof(struct smc_clc_msg_smcd, v2_ext_offset) + sizeof(prop_smcd->v2_ext_offset) + ntohs(prop_smcd->v2_ext_offset)); } static inline struct smc_clc_smcd_v2_extension * smc_get_clc_smcd_v2_ext(struct smc_clc_v2_extension *prop_v2ext) { if (!prop_v2ext) return NULL; if (!ntohs(prop_v2ext->hdr.smcd_v2_ext_offset)) return NULL; return (struct smc_clc_smcd_v2_extension *) ((u8 *)prop_v2ext + offsetof(struct smc_clc_v2_extension, hdr) + offsetof(struct smc_clnt_opts_area_hdr, smcd_v2_ext_offset) + sizeof(prop_v2ext->hdr.smcd_v2_ext_offset) + ntohs(prop_v2ext->hdr.smcd_v2_ext_offset)); } static inline struct smc_clc_first_contact_ext * smc_get_clc_first_contact_ext(struct smc_clc_msg_accept_confirm *clc, bool is_smcd) { int clc_v2_len; if (clc->hdr.version == SMC_V1 || !(clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) return NULL; if (is_smcd) clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm, d1); else clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm, r1); return (struct smc_clc_first_contact_ext *)(((u8 *)clc) + clc_v2_len); } struct smcd_dev; struct smc_init_info; int smc_clc_prfx_match(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop); int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type, unsigned long timeout); int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version); int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini); int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, u8 version, u8 *eid, struct smc_init_info *ini); int smc_clc_send_accept(struct smc_sock *smc, bool srv_first_contact, u8 version, u8 *negotiated_eid, struct smc_init_info *ini); int smc_clc_srv_v2x_features_validate(struct smc_sock *smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini); int smc_clc_clnt_v2x_features_validate(struct smc_clc_first_contact_ext *fce, struct smc_init_info *ini); int smc_clc_v2x_features_confirm_check(struct smc_clc_msg_accept_confirm *cclc, struct smc_init_info *ini); void smc_clc_init(void) __init; void smc_clc_exit(void); void smc_clc_get_hostname(u8 **host); bool smc_clc_match_eid(u8 *negotiated_eid, struct smc_clc_v2_extension *smc_v2_ext, u8 *peer_eid, u8 *local_eid); int smc_clc_ueid_count(void); int smc_nl_dump_ueid(struct sk_buff *skb, struct netlink_callback *cb); int smc_nl_add_ueid(struct sk_buff *skb, struct genl_info *info); int smc_nl_remove_ueid(struct sk_buff *skb, struct genl_info *info); int smc_nl_flush_ueid(struct sk_buff *skb, struct genl_info *info); int smc_nl_dump_seid(struct sk_buff *skb, struct netlink_callback *cb); int smc_nl_enable_seid(struct sk_buff *skb, struct genl_info *info); int smc_nl_disable_seid(struct sk_buff *skb, struct genl_info *info); #endif |
| 2 2 2 2 2 2 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 | // SPDX-License-Identifier: (GPL-2.0-only OR Apache-2.0) /* * Generic implementation of the BLAKE2b digest algorithm. Based on the BLAKE2b * reference implementation, but it has been heavily modified for use in the * kernel. The reference implementation was: * * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under * the terms of the CC0, the OpenSSL Licence, or the Apache Public License * 2.0, at your option. The terms of these licenses can be found at: * * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 * - OpenSSL license : https://www.openssl.org/source/license.html * - Apache 2.0 : https://www.apache.org/licenses/LICENSE-2.0 * * More information about BLAKE2 can be found at https://blake2.net. */ #include <asm/unaligned.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/bitops.h> #include <crypto/internal/blake2b.h> #include <crypto/internal/hash.h> static const u8 blake2b_sigma[12][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } }; static void blake2b_increment_counter(struct blake2b_state *S, const u64 inc) { S->t[0] += inc; S->t[1] += (S->t[0] < inc); } #define G(r,i,a,b,c,d) \ do { \ a = a + b + m[blake2b_sigma[r][2*i+0]]; \ d = ror64(d ^ a, 32); \ c = c + d; \ b = ror64(b ^ c, 24); \ a = a + b + m[blake2b_sigma[r][2*i+1]]; \ d = ror64(d ^ a, 16); \ c = c + d; \ b = ror64(b ^ c, 63); \ } while (0) #define ROUND(r) \ do { \ G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ G(r,2,v[ 2],v[ 6],v[10],v[14]); \ G(r,3,v[ 3],v[ 7],v[11],v[15]); \ G(r,4,v[ 0],v[ 5],v[10],v[15]); \ G(r,5,v[ 1],v[ 6],v[11],v[12]); \ G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ } while (0) static void blake2b_compress_one_generic(struct blake2b_state *S, const u8 block[BLAKE2B_BLOCK_SIZE]) { u64 m[16]; u64 v[16]; size_t i; for (i = 0; i < 16; ++i) m[i] = get_unaligned_le64(block + i * sizeof(m[i])); for (i = 0; i < 8; ++i) v[i] = S->h[i]; v[ 8] = BLAKE2B_IV0; v[ 9] = BLAKE2B_IV1; v[10] = BLAKE2B_IV2; v[11] = BLAKE2B_IV3; v[12] = BLAKE2B_IV4 ^ S->t[0]; v[13] = BLAKE2B_IV5 ^ S->t[1]; v[14] = BLAKE2B_IV6 ^ S->f[0]; v[15] = BLAKE2B_IV7 ^ S->f[1]; ROUND(0); ROUND(1); ROUND(2); ROUND(3); ROUND(4); ROUND(5); ROUND(6); ROUND(7); ROUND(8); ROUND(9); ROUND(10); ROUND(11); #ifdef CONFIG_CC_IS_CLANG #pragma nounroll /* https://llvm.org/pr45803 */ #endif for (i = 0; i < 8; ++i) S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; } #undef G #undef ROUND void blake2b_compress_generic(struct blake2b_state *state, const u8 *block, size_t nblocks, u32 inc) { do { blake2b_increment_counter(state, inc); blake2b_compress_one_generic(state, block); block += BLAKE2B_BLOCK_SIZE; } while (--nblocks); } EXPORT_SYMBOL(blake2b_compress_generic); static int crypto_blake2b_update_generic(struct shash_desc *desc, const u8 *in, unsigned int inlen) { return crypto_blake2b_update(desc, in, inlen, blake2b_compress_generic); } static int crypto_blake2b_final_generic(struct shash_desc *desc, u8 *out) { return crypto_blake2b_final(desc, out, blake2b_compress_generic); } #define BLAKE2B_ALG(name, driver_name, digest_size) \ { \ .base.cra_name = name, \ .base.cra_driver_name = driver_name, \ .base.cra_priority = 100, \ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \ .base.cra_blocksize = BLAKE2B_BLOCK_SIZE, \ .base.cra_ctxsize = sizeof(struct blake2b_tfm_ctx), \ .base.cra_module = THIS_MODULE, \ .digestsize = digest_size, \ .setkey = crypto_blake2b_setkey, \ .init = crypto_blake2b_init, \ .update = crypto_blake2b_update_generic, \ .final = crypto_blake2b_final_generic, \ .descsize = sizeof(struct blake2b_state), \ } static struct shash_alg blake2b_algs[] = { BLAKE2B_ALG("blake2b-160", "blake2b-160-generic", BLAKE2B_160_HASH_SIZE), BLAKE2B_ALG("blake2b-256", "blake2b-256-generic", BLAKE2B_256_HASH_SIZE), BLAKE2B_ALG("blake2b-384", "blake2b-384-generic", BLAKE2B_384_HASH_SIZE), BLAKE2B_ALG("blake2b-512", "blake2b-512-generic", BLAKE2B_512_HASH_SIZE), }; static int __init blake2b_mod_init(void) { return crypto_register_shashes(blake2b_algs, ARRAY_SIZE(blake2b_algs)); } static void __exit blake2b_mod_fini(void) { crypto_unregister_shashes(blake2b_algs, ARRAY_SIZE(blake2b_algs)); } subsys_initcall(blake2b_mod_init); module_exit(blake2b_mod_fini); MODULE_AUTHOR("David Sterba <kdave@kernel.org>"); MODULE_DESCRIPTION("BLAKE2b generic implementation"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CRYPTO("blake2b-160"); MODULE_ALIAS_CRYPTO("blake2b-160-generic"); MODULE_ALIAS_CRYPTO("blake2b-256"); MODULE_ALIAS_CRYPTO("blake2b-256-generic"); MODULE_ALIAS_CRYPTO("blake2b-384"); MODULE_ALIAS_CRYPTO("blake2b-384-generic"); MODULE_ALIAS_CRYPTO("blake2b-512"); MODULE_ALIAS_CRYPTO("blake2b-512-generic"); |
| 1 1 1 2 2 2 2 1 1 2 1 2 2 2 1 1 1 1 1 5 5 4 5 4 4 4 3 3 3 2 2 2 1 1 1 1 1 1 3 3 1 4 5 3 3 5 2 2 2 2 6 3 5 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 | // SPDX-License-Identifier: GPL-2.0 #include <linux/cred.h> #include <linux/device.h> #include <linux/dma-buf.h> #include <linux/dma-resv.h> #include <linux/highmem.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/memfd.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/shmem_fs.h> #include <linux/slab.h> #include <linux/udmabuf.h> #include <linux/vmalloc.h> #include <linux/iosys-map.h> static int list_limit = 1024; module_param(list_limit, int, 0644); MODULE_PARM_DESC(list_limit, "udmabuf_create_list->count limit. Default is 1024."); static int size_limit_mb = 64; module_param(size_limit_mb, int, 0644); MODULE_PARM_DESC(size_limit_mb, "Max size of a dmabuf, in megabytes. Default is 64."); struct udmabuf { pgoff_t pagecount; struct page **pages; struct sg_table *sg; struct miscdevice *device; }; static vm_fault_t udmabuf_vm_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct udmabuf *ubuf = vma->vm_private_data; pgoff_t pgoff = vmf->pgoff; if (pgoff >= ubuf->pagecount) return VM_FAULT_SIGBUS; vmf->page = ubuf->pages[pgoff]; get_page(vmf->page); return 0; } static const struct vm_operations_struct udmabuf_vm_ops = { .fault = udmabuf_vm_fault, }; static int mmap_udmabuf(struct dma_buf *buf, struct vm_area_struct *vma) { struct udmabuf *ubuf = buf->priv; if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) return -EINVAL; vma->vm_ops = &udmabuf_vm_ops; vma->vm_private_data = ubuf; return 0; } static int vmap_udmabuf(struct dma_buf *buf, struct iosys_map *map) { struct udmabuf *ubuf = buf->priv; void *vaddr; dma_resv_assert_held(buf->resv); vaddr = vm_map_ram(ubuf->pages, ubuf->pagecount, -1); if (!vaddr) return -EINVAL; iosys_map_set_vaddr(map, vaddr); return 0; } static void vunmap_udmabuf(struct dma_buf *buf, struct iosys_map *map) { struct udmabuf *ubuf = buf->priv; dma_resv_assert_held(buf->resv); vm_unmap_ram(map->vaddr, ubuf->pagecount); } static struct sg_table *get_sg_table(struct device *dev, struct dma_buf *buf, enum dma_data_direction direction) { struct udmabuf *ubuf = buf->priv; struct sg_table *sg; int ret; sg = kzalloc(sizeof(*sg), GFP_KERNEL); if (!sg) return ERR_PTR(-ENOMEM); ret = sg_alloc_table_from_pages(sg, ubuf->pages, ubuf->pagecount, 0, ubuf->pagecount << PAGE_SHIFT, GFP_KERNEL); if (ret < 0) goto err; ret = dma_map_sgtable(dev, sg, direction, 0); if (ret < 0) goto err; return sg; err: sg_free_table(sg); kfree(sg); return ERR_PTR(ret); } static void put_sg_table(struct device *dev, struct sg_table *sg, enum dma_data_direction direction) { dma_unmap_sgtable(dev, sg, direction, 0); sg_free_table(sg); kfree(sg); } static struct sg_table *map_udmabuf(struct dma_buf_attachment *at, enum dma_data_direction direction) { return get_sg_table(at->dev, at->dmabuf, direction); } static void unmap_udmabuf(struct dma_buf_attachment *at, struct sg_table *sg, enum dma_data_direction direction) { return put_sg_table(at->dev, sg, direction); } static void release_udmabuf(struct dma_buf *buf) { struct udmabuf *ubuf = buf->priv; struct device *dev = ubuf->device->this_device; pgoff_t pg; if (ubuf->sg) put_sg_table(dev, ubuf->sg, DMA_BIDIRECTIONAL); for (pg = 0; pg < ubuf->pagecount; pg++) put_page(ubuf->pages[pg]); kfree(ubuf->pages); kfree(ubuf); } static int begin_cpu_udmabuf(struct dma_buf *buf, enum dma_data_direction direction) { struct udmabuf *ubuf = buf->priv; struct device *dev = ubuf->device->this_device; int ret = 0; if (!ubuf->sg) { ubuf->sg = get_sg_table(dev, buf, direction); if (IS_ERR(ubuf->sg)) { ret = PTR_ERR(ubuf->sg); ubuf->sg = NULL; } } else { dma_sync_sg_for_cpu(dev, ubuf->sg->sgl, ubuf->sg->nents, direction); } return ret; } static int end_cpu_udmabuf(struct dma_buf *buf, enum dma_data_direction direction) { struct udmabuf *ubuf = buf->priv; struct device *dev = ubuf->device->this_device; if (!ubuf->sg) return -EINVAL; dma_sync_sg_for_device(dev, ubuf->sg->sgl, ubuf->sg->nents, direction); return 0; } static const struct dma_buf_ops udmabuf_ops = { .cache_sgt_mapping = true, .map_dma_buf = map_udmabuf, .unmap_dma_buf = unmap_udmabuf, .release = release_udmabuf, .mmap = mmap_udmabuf, .vmap = vmap_udmabuf, .vunmap = vunmap_udmabuf, .begin_cpu_access = begin_cpu_udmabuf, .end_cpu_access = end_cpu_udmabuf, }; #define SEALS_WANTED (F_SEAL_SHRINK) #define SEALS_DENIED (F_SEAL_WRITE) static long udmabuf_create(struct miscdevice *device, struct udmabuf_create_list *head, struct udmabuf_create_item *list) { DEFINE_DMA_BUF_EXPORT_INFO(exp_info); struct file *memfd = NULL; struct address_space *mapping = NULL; struct udmabuf *ubuf; struct dma_buf *buf; pgoff_t pgoff, pgcnt, pgidx, pgbuf = 0, pglimit; struct page *page; int seals, ret = -EINVAL; u32 i, flags; ubuf = kzalloc(sizeof(*ubuf), GFP_KERNEL); if (!ubuf) return -ENOMEM; pglimit = (size_limit_mb * 1024 * 1024) >> PAGE_SHIFT; for (i = 0; i < head->count; i++) { if (!IS_ALIGNED(list[i].offset, PAGE_SIZE)) goto err; if (!IS_ALIGNED(list[i].size, PAGE_SIZE)) goto err; ubuf->pagecount += list[i].size >> PAGE_SHIFT; if (ubuf->pagecount > pglimit) goto err; } if (!ubuf->pagecount) goto err; ubuf->pages = kmalloc_array(ubuf->pagecount, sizeof(*ubuf->pages), GFP_KERNEL); if (!ubuf->pages) { ret = -ENOMEM; goto err; } pgbuf = 0; for (i = 0; i < head->count; i++) { ret = -EBADFD; memfd = fget(list[i].memfd); if (!memfd) goto err; mapping = memfd->f_mapping; if (!shmem_mapping(mapping)) goto err; seals = memfd_fcntl(memfd, F_GET_SEALS, 0); if (seals == -EINVAL) goto err; ret = -EINVAL; if ((seals & SEALS_WANTED) != SEALS_WANTED || (seals & SEALS_DENIED) != 0) goto err; pgoff = list[i].offset >> PAGE_SHIFT; pgcnt = list[i].size >> PAGE_SHIFT; for (pgidx = 0; pgidx < pgcnt; pgidx++) { page = shmem_read_mapping_page(mapping, pgoff + pgidx); if (IS_ERR(page)) { ret = PTR_ERR(page); goto err; } ubuf->pages[pgbuf++] = page; } fput(memfd); memfd = NULL; } exp_info.ops = &udmabuf_ops; exp_info.size = ubuf->pagecount << PAGE_SHIFT; exp_info.priv = ubuf; exp_info.flags = O_RDWR; ubuf->device = device; buf = dma_buf_export(&exp_info); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto err; } flags = 0; if (head->flags & UDMABUF_FLAGS_CLOEXEC) flags |= O_CLOEXEC; return dma_buf_fd(buf, flags); err: while (pgbuf > 0) put_page(ubuf->pages[--pgbuf]); if (memfd) fput(memfd); kfree(ubuf->pages); kfree(ubuf); return ret; } static long udmabuf_ioctl_create(struct file *filp, unsigned long arg) { struct udmabuf_create create; struct udmabuf_create_list head; struct udmabuf_create_item list; if (copy_from_user(&create, (void __user *)arg, sizeof(create))) return -EFAULT; head.flags = create.flags; head.count = 1; list.memfd = create.memfd; list.offset = create.offset; list.size = create.size; return udmabuf_create(filp->private_data, &head, &list); } static long udmabuf_ioctl_create_list(struct file *filp, unsigned long arg) { struct udmabuf_create_list head; struct udmabuf_create_item *list; int ret = -EINVAL; u32 lsize; if (copy_from_user(&head, (void __user *)arg, sizeof(head))) return -EFAULT; if (head.count > list_limit) return -EINVAL; lsize = sizeof(struct udmabuf_create_item) * head.count; list = memdup_user((void __user *)(arg + sizeof(head)), lsize); if (IS_ERR(list)) return PTR_ERR(list); ret = udmabuf_create(filp->private_data, &head, list); kfree(list); return ret; } static long udmabuf_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { long ret; switch (ioctl) { case UDMABUF_CREATE: ret = udmabuf_ioctl_create(filp, arg); break; case UDMABUF_CREATE_LIST: ret = udmabuf_ioctl_create_list(filp, arg); break; default: ret = -ENOTTY; break; } return ret; } static const struct file_operations udmabuf_fops = { .owner = THIS_MODULE, .unlocked_ioctl = udmabuf_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = udmabuf_ioctl, #endif }; static struct miscdevice udmabuf_misc = { .minor = MISC_DYNAMIC_MINOR, .name = "udmabuf", .fops = &udmabuf_fops, }; static int __init udmabuf_dev_init(void) { int ret; ret = misc_register(&udmabuf_misc); if (ret < 0) { pr_err("Could not initialize udmabuf device\n"); return ret; } ret = dma_coerce_mask_and_coherent(udmabuf_misc.this_device, DMA_BIT_MASK(64)); if (ret < 0) { pr_err("Could not setup DMA mask for udmabuf device\n"); misc_deregister(&udmabuf_misc); return ret; } return 0; } static void __exit udmabuf_dev_exit(void) { misc_deregister(&udmabuf_misc); } module_init(udmabuf_dev_init) module_exit(udmabuf_dev_exit) MODULE_AUTHOR("Gerd Hoffmann <kraxel@redhat.com>"); |
| 7 2 2 2 1 1 2 1 1 1 1 1 5 5 1 5 4 5 1 1 5 5 4 2 4 2 5 1 1 1 1 1 1 1 1 25 25 13 13 13 13 13 7 13 21 20 21 1 1 12 12 12 9 8 1 2 1 2 1 2 2 2 12 12 12 12 2 2 2 3 9 4 5 5 5 5 2 5 2 2 2 12 2 2 2 2 2 2 2 2 2 2 1 2 11 11 11 12 1 1 1 1 1 1 1 1 1 1 1 5 5 21 21 12 12 1 1 1 1 21 12 12 12 5 12 12 12 11 12 1 11 8 8 2 2 12 9 9 9 3 3 3 4 3 3 2 2 3 4 2 4 4 2 4 4 4 3 4 4 7 7 7 7 25 25 14 14 14 14 25 13 13 9 4 1 1 1 1 1 1 1 1 1 4 4 4 4 25 25 21 14 2 2 2 1 2 2 25 25 4 4 4 3 2 2 3 4 4 4 4 3 3 3 4 24 21 2 2 1 20 21 21 2 1 1 1 1 1 4 1 1 1 3 2 2 1 2 2 2 2 2 1 1 1 1 1 1 1 1 6 1 1 25 25 25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 | // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <crypto/sha2.h> #include <net/tcp.h> #include <net/mptcp.h> #include "protocol.h" #include "mib.h" #include <trace/events/mptcp.h> static bool mptcp_cap_flag_sha256(u8 flags) { return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256; } static void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, int opsize, struct mptcp_options_received *mp_opt) { u8 subtype = *ptr >> 4; int expected_opsize; u16 subopt; u8 version; u8 flags; u8 i; switch (subtype) { case MPTCPOPT_MP_CAPABLE: /* strict size checking */ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { if (skb->len > tcp_hdr(skb)->doff << 2) expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA; else expected_opsize = TCPOLEN_MPTCP_MPC_ACK; subopt = OPTION_MPTCP_MPC_ACK; } else { if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) { expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK; subopt = OPTION_MPTCP_MPC_SYNACK; } else { expected_opsize = TCPOLEN_MPTCP_MPC_SYN; subopt = OPTION_MPTCP_MPC_SYN; } } /* Cfr RFC 8684 Section 3.3.0: * If a checksum is present but its use had * not been negotiated in the MP_CAPABLE handshake, the receiver MUST * close the subflow with a RST, as it is not behaving as negotiated. * If a checksum is not present when its use has been negotiated, the * receiver MUST close the subflow with a RST, as it is considered * broken * We parse even option with mismatching csum presence, so that * later in subflow_data_ready we can trigger the reset. */ if (opsize != expected_opsize && (expected_opsize != TCPOLEN_MPTCP_MPC_ACK_DATA || opsize != TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM)) break; /* try to be gentle vs future versions on the initial syn */ version = *ptr++ & MPTCP_VERSION_MASK; if (opsize != TCPOLEN_MPTCP_MPC_SYN) { if (version != MPTCP_SUPPORTED_VERSION) break; } else if (version < MPTCP_SUPPORTED_VERSION) { break; } flags = *ptr++; if (!mptcp_cap_flag_sha256(flags) || (flags & MPTCP_CAP_EXTENSIBILITY)) break; /* RFC 6824, Section 3.1: * "For the Checksum Required bit (labeled "A"), if either * host requires the use of checksums, checksums MUST be used. * In other words, the only way for checksums not to be used * is if both hosts in their SYNs set A=0." */ if (flags & MPTCP_CAP_CHECKSUM_REQD) mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; mp_opt->deny_join_id0 = !!(flags & MPTCP_CAP_DENY_JOIN_ID0); mp_opt->suboptions |= subopt; if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { mp_opt->sndr_key = get_unaligned_be64(ptr); ptr += 8; } if (opsize >= TCPOLEN_MPTCP_MPC_ACK) { mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; } if (opsize >= TCPOLEN_MPTCP_MPC_ACK_DATA) { /* Section 3.1.: * "the data parameters in a MP_CAPABLE are semantically * equivalent to those in a DSS option and can be used * interchangeably." */ mp_opt->suboptions |= OPTION_MPTCP_DSS; mp_opt->use_map = 1; mp_opt->mpc_map = 1; mp_opt->use_ack = 0; mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; } if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM) { mp_opt->csum = get_unaligned((__force __sum16 *)ptr); mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; ptr += 2; } pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u", version, flags, opsize, mp_opt->sndr_key, mp_opt->rcvr_key, mp_opt->data_len, mp_opt->csum); break; case MPTCPOPT_MP_JOIN: if (opsize == TCPOLEN_MPTCP_MPJ_SYN) { mp_opt->suboptions |= OPTION_MPTCP_MPJ_SYN; mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; mp_opt->join_id = *ptr++; mp_opt->token = get_unaligned_be32(ptr); ptr += 4; mp_opt->nonce = get_unaligned_be32(ptr); ptr += 4; pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u", mp_opt->backup, mp_opt->join_id, mp_opt->token, mp_opt->nonce); } else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) { mp_opt->suboptions |= OPTION_MPTCP_MPJ_SYNACK; mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; mp_opt->join_id = *ptr++; mp_opt->thmac = get_unaligned_be64(ptr); ptr += 8; mp_opt->nonce = get_unaligned_be32(ptr); ptr += 4; pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u", mp_opt->backup, mp_opt->join_id, mp_opt->thmac, mp_opt->nonce); } else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) { mp_opt->suboptions |= OPTION_MPTCP_MPJ_ACK; ptr += 2; memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN); pr_debug("MP_JOIN hmac"); } break; case MPTCPOPT_DSS: pr_debug("DSS"); ptr++; /* we must clear 'mpc_map' be able to detect MP_CAPABLE * map vs DSS map in mptcp_incoming_options(), and reconstruct * map info accordingly */ mp_opt->mpc_map = 0; flags = (*ptr++) & MPTCP_DSS_FLAG_MASK; mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0; mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0; mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0; mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0; mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK); pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d", mp_opt->data_fin, mp_opt->dsn64, mp_opt->use_map, mp_opt->ack64, mp_opt->use_ack); expected_opsize = TCPOLEN_MPTCP_DSS_BASE; if (mp_opt->use_ack) { if (mp_opt->ack64) expected_opsize += TCPOLEN_MPTCP_DSS_ACK64; else expected_opsize += TCPOLEN_MPTCP_DSS_ACK32; } if (mp_opt->use_map) { if (mp_opt->dsn64) expected_opsize += TCPOLEN_MPTCP_DSS_MAP64; else expected_opsize += TCPOLEN_MPTCP_DSS_MAP32; } /* Always parse any csum presence combination, we will enforce * RFC 8684 Section 3.3.0 checks later in subflow_data_ready */ if (opsize != expected_opsize && opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) break; mp_opt->suboptions |= OPTION_MPTCP_DSS; if (mp_opt->use_ack) { if (mp_opt->ack64) { mp_opt->data_ack = get_unaligned_be64(ptr); ptr += 8; } else { mp_opt->data_ack = get_unaligned_be32(ptr); ptr += 4; } pr_debug("data_ack=%llu", mp_opt->data_ack); } if (mp_opt->use_map) { if (mp_opt->dsn64) { mp_opt->data_seq = get_unaligned_be64(ptr); ptr += 8; } else { mp_opt->data_seq = get_unaligned_be32(ptr); ptr += 4; } mp_opt->subflow_seq = get_unaligned_be32(ptr); ptr += 4; mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; if (opsize == expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) { mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; mp_opt->csum = get_unaligned((__force __sum16 *)ptr); ptr += 2; } pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u", mp_opt->data_seq, mp_opt->subflow_seq, mp_opt->data_len, !!(mp_opt->suboptions & OPTION_MPTCP_CSUMREQD), mp_opt->csum); } break; case MPTCPOPT_ADD_ADDR: mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO; if (!mp_opt->echo) { if (opsize == TCPOLEN_MPTCP_ADD_ADDR || opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT) mp_opt->addr.family = AF_INET; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 || opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT) mp_opt->addr.family = AF_INET6; #endif else break; } else { if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE || opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) mp_opt->addr.family = AF_INET; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE || opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) mp_opt->addr.family = AF_INET6; #endif else break; } mp_opt->suboptions |= OPTION_MPTCP_ADD_ADDR; mp_opt->addr.id = *ptr++; mp_opt->addr.port = 0; mp_opt->ahmac = 0; if (mp_opt->addr.family == AF_INET) { memcpy((u8 *)&mp_opt->addr.addr.s_addr, (u8 *)ptr, 4); ptr += 4; if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT || opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) { mp_opt->addr.port = htons(get_unaligned_be16(ptr)); ptr += 2; } } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else { memcpy(mp_opt->addr.addr6.s6_addr, (u8 *)ptr, 16); ptr += 16; if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT || opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) { mp_opt->addr.port = htons(get_unaligned_be16(ptr)); ptr += 2; } } #endif if (!mp_opt->echo) { mp_opt->ahmac = get_unaligned_be64(ptr); ptr += 8; } pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d", (mp_opt->addr.family == AF_INET6) ? "6" : "", mp_opt->addr.id, mp_opt->ahmac, mp_opt->echo, ntohs(mp_opt->addr.port)); break; case MPTCPOPT_RM_ADDR: if (opsize < TCPOLEN_MPTCP_RM_ADDR_BASE + 1 || opsize > TCPOLEN_MPTCP_RM_ADDR_BASE + MPTCP_RM_IDS_MAX) break; ptr++; mp_opt->suboptions |= OPTION_MPTCP_RM_ADDR; mp_opt->rm_list.nr = opsize - TCPOLEN_MPTCP_RM_ADDR_BASE; for (i = 0; i < mp_opt->rm_list.nr; i++) mp_opt->rm_list.ids[i] = *ptr++; pr_debug("RM_ADDR: rm_list_nr=%d", mp_opt->rm_list.nr); break; case MPTCPOPT_MP_PRIO: if (opsize != TCPOLEN_MPTCP_PRIO) break; mp_opt->suboptions |= OPTION_MPTCP_PRIO; mp_opt->backup = *ptr++ & MPTCP_PRIO_BKUP; pr_debug("MP_PRIO: prio=%d", mp_opt->backup); break; case MPTCPOPT_MP_FASTCLOSE: if (opsize != TCPOLEN_MPTCP_FASTCLOSE) break; ptr += 2; mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; mp_opt->suboptions |= OPTION_MPTCP_FASTCLOSE; pr_debug("MP_FASTCLOSE: recv_key=%llu", mp_opt->rcvr_key); break; case MPTCPOPT_RST: if (opsize != TCPOLEN_MPTCP_RST) break; if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) break; mp_opt->suboptions |= OPTION_MPTCP_RST; flags = *ptr++; mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT; mp_opt->reset_reason = *ptr; pr_debug("MP_RST: transient=%u reason=%u", mp_opt->reset_transient, mp_opt->reset_reason); break; case MPTCPOPT_MP_FAIL: if (opsize != TCPOLEN_MPTCP_FAIL) break; ptr += 2; mp_opt->suboptions |= OPTION_MPTCP_FAIL; mp_opt->fail_seq = get_unaligned_be64(ptr); pr_debug("MP_FAIL: data_seq=%llu", mp_opt->fail_seq); break; default: break; } } void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt) { const struct tcphdr *th = tcp_hdr(skb); const unsigned char *ptr; int length; /* initialize option status */ mp_opt->suboptions = 0; length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); while (length > 0) { int opcode = *ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: if (length < 2) return; opsize = *ptr++; if (opsize < 2) /* "silly options" */ return; if (opsize > length) return; /* don't parse partial options */ if (opcode == TCPOPT_MPTCP) mptcp_parse_option(skb, ptr, opsize, mp_opt); ptr += opsize - 2; length -= opsize; } } } bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); /* we will use snd_isn to detect first pkt [re]transmission * in mptcp_established_options_mp() */ subflow->snd_isn = TCP_SKB_CB(skb)->end_seq; if (subflow->request_mptcp) { opts->suboptions = OPTION_MPTCP_MPC_SYN; opts->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk)); opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk)); *size = TCPOLEN_MPTCP_MPC_SYN; return true; } else if (subflow->request_join) { pr_debug("remote_token=%u, nonce=%u", subflow->remote_token, subflow->local_nonce); opts->suboptions = OPTION_MPTCP_MPJ_SYN; opts->join_id = subflow->local_id; opts->token = subflow->remote_token; opts->nonce = subflow->local_nonce; opts->backup = subflow->request_bkup; *size = TCPOLEN_MPTCP_MPJ_SYN; return true; } return false; } static void clear_3rdack_retransmission(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); sk_stop_timer(sk, &icsk->icsk_delack_timer); icsk->icsk_ack.timeout = 0; icsk->icsk_ack.ato = 0; icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER); } static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, bool snd_data_fin_enable, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_ext *mpext; unsigned int data_len; u8 len; /* When skb is not available, we better over-estimate the emitted * options len. A full DSS option (28 bytes) is longer than * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so * tell the caller to defer the estimate to * mptcp_established_options_dss(), which will reserve enough space. */ if (!skb) return false; /* MPC/MPJ needed only on 3rd ack packet, DATA_FIN and TCP shutdown take precedence */ if (subflow->fully_established || snd_data_fin_enable || subflow->snd_isn != TCP_SKB_CB(skb)->seq || sk->sk_state != TCP_ESTABLISHED) return false; if (subflow->mp_capable) { mpext = mptcp_get_ext(skb); data_len = mpext ? mpext->data_len : 0; /* we will check ops->data_len in mptcp_write_options() to * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and * TCPOLEN_MPTCP_MPC_ACK */ opts->data_len = data_len; opts->suboptions = OPTION_MPTCP_MPC_ACK; opts->sndr_key = subflow->local_key; opts->rcvr_key = subflow->remote_key; opts->csum_reqd = READ_ONCE(msk->csum_enabled); opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk)); /* Section 3.1. * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK * packets that start the first subflow of an MPTCP connection, * as well as the first packet that carries data */ if (data_len > 0) { len = TCPOLEN_MPTCP_MPC_ACK_DATA; if (opts->csum_reqd) { /* we need to propagate more info to csum the pseudo hdr */ opts->data_seq = mpext->data_seq; opts->subflow_seq = mpext->subflow_seq; opts->csum = mpext->csum; len += TCPOLEN_MPTCP_DSS_CHECKSUM; } *size = ALIGN(len, 4); } else { *size = TCPOLEN_MPTCP_MPC_ACK; } pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d", subflow, subflow->local_key, subflow->remote_key, data_len); return true; } else if (subflow->mp_join) { opts->suboptions = OPTION_MPTCP_MPJ_ACK; memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN); *size = TCPOLEN_MPTCP_MPJ_ACK; pr_debug("subflow=%p", subflow); /* we can use the full delegate action helper only from BH context * If we are in process context - sk is flushing the backlog at * socket lock release time - just set the appropriate flag, will * be handled by the release callback */ if (sock_owned_by_user(sk)) set_bit(MPTCP_DELEGATE_ACK, &subflow->delegated_status); else mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_ACK); return true; } return false; } static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, struct sk_buff *skb, struct mptcp_ext *ext) { /* The write_seq value has already been incremented, so the actual * sequence number for the DATA_FIN is one less. */ u64 data_fin_tx_seq = READ_ONCE(mptcp_sk(subflow->conn)->write_seq) - 1; if (!ext->use_map || !skb->len) { /* RFC6824 requires a DSS mapping with specific values * if DATA_FIN is set but no data payload is mapped */ ext->data_fin = 1; ext->use_map = 1; ext->dsn64 = 1; ext->data_seq = data_fin_tx_seq; ext->subflow_seq = 0; ext->data_len = 1; } else if (ext->data_seq + ext->data_len == data_fin_tx_seq) { /* If there's an existing DSS mapping and it is the * final mapping, DATA_FIN consumes 1 additional byte of * mapping space. */ ext->data_fin = 1; ext->data_len++; } } static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, bool snd_data_fin_enable, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); unsigned int dss_size = 0; struct mptcp_ext *mpext; unsigned int ack_size; bool ret = false; u64 ack_seq; opts->csum_reqd = READ_ONCE(msk->csum_enabled); mpext = skb ? mptcp_get_ext(skb) : NULL; if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) { unsigned int map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; if (mpext) { if (opts->csum_reqd) map_size += TCPOLEN_MPTCP_DSS_CHECKSUM; opts->ext_copy = *mpext; } dss_size = map_size; if (skb && snd_data_fin_enable) mptcp_write_data_fin(subflow, skb, &opts->ext_copy); opts->suboptions = OPTION_MPTCP_DSS; ret = true; } /* passive sockets msk will set the 'can_ack' after accept(), even * if the first subflow may have the already the remote key handy */ opts->ext_copy.use_ack = 0; if (!READ_ONCE(msk->can_ack)) { *size = ALIGN(dss_size, 4); return ret; } ack_seq = READ_ONCE(msk->ack_seq); if (READ_ONCE(msk->use_64bit_ack)) { ack_size = TCPOLEN_MPTCP_DSS_ACK64; opts->ext_copy.data_ack = ack_seq; opts->ext_copy.ack64 = 1; } else { ack_size = TCPOLEN_MPTCP_DSS_ACK32; opts->ext_copy.data_ack32 = (uint32_t)ack_seq; opts->ext_copy.ack64 = 0; } opts->ext_copy.use_ack = 1; opts->suboptions = OPTION_MPTCP_DSS; WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk)); /* Add kind/length/subtype/flag overhead if mapping is not populated */ if (dss_size == 0) ack_size += TCPOLEN_MPTCP_DSS_BASE; dss_size += ack_size; *size = ALIGN(dss_size, 4); return true; } static u64 add_addr_generate_hmac(u64 key1, u64 key2, struct mptcp_addr_info *addr) { u16 port = ntohs(addr->port); u8 hmac[SHA256_DIGEST_SIZE]; u8 msg[19]; int i = 0; msg[i++] = addr->id; if (addr->family == AF_INET) { memcpy(&msg[i], &addr->addr.s_addr, 4); i += 4; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->family == AF_INET6) { memcpy(&msg[i], &addr->addr6.s6_addr, 16); i += 16; } #endif msg[i++] = port >> 8; msg[i++] = port & 0xFF; mptcp_crypto_hmac_sha(key1, key2, msg, i, hmac); return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]); } static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); bool drop_other_suboptions = false; unsigned int opt_size = *size; bool echo; int len; /* add addr will strip the existing options, be sure to avoid breaking * MPC/MPJ handshakes */ if (!mptcp_pm_should_add_signal(msk) || (opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) || !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &opts->addr, &echo, &drop_other_suboptions)) return false; if (drop_other_suboptions) remaining += opt_size; len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port); if (remaining < len) return false; *size = len; if (drop_other_suboptions) { pr_debug("drop other suboptions"); opts->suboptions = 0; /* note that e.g. DSS could have written into the memory * aliased by ahmac, we must reset the field here * to avoid appending the hmac even for ADD_ADDR echo * options */ opts->ahmac = 0; *size -= opt_size; } opts->suboptions |= OPTION_MPTCP_ADD_ADDR; if (!echo) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRTX); opts->ahmac = add_addr_generate_hmac(READ_ONCE(msk->local_key), READ_ONCE(msk->remote_key), &opts->addr); } else { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADDTX); } pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d", opts->addr.id, opts->ahmac, echo, ntohs(opts->addr.port)); return true; } static bool mptcp_established_options_rm_addr(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_rm_list rm_list; int i, len; if (!mptcp_pm_should_rm_signal(msk) || !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_list))) return false; len = mptcp_rm_addr_len(&rm_list); if (len < 0) return false; if (remaining < len) return false; *size = len; opts->suboptions |= OPTION_MPTCP_RM_ADDR; opts->rm_list = rm_list; for (i = 0; i < opts->rm_list.nr; i++) pr_debug("rm_list_ids[%d]=%d", i, opts->rm_list.ids[i]); MPTCP_ADD_STATS(sock_net(sk), MPTCP_MIB_RMADDRTX, opts->rm_list.nr); return true; } static bool mptcp_established_options_mp_prio(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); /* can't send MP_PRIO with MPC, as they share the same option space: * 'backup'. Also it makes no sense at all */ if (!subflow->send_mp_prio || (opts->suboptions & OPTIONS_MPTCP_MPC)) return false; /* account for the trailing 'nop' option */ if (remaining < TCPOLEN_MPTCP_PRIO_ALIGN) return false; *size = TCPOLEN_MPTCP_PRIO_ALIGN; opts->suboptions |= OPTION_MPTCP_PRIO; opts->backup = subflow->request_bkup; pr_debug("prio=%d", opts->backup); return true; } static noinline bool mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); if (remaining < TCPOLEN_MPTCP_RST) return false; *size = TCPOLEN_MPTCP_RST; opts->suboptions |= OPTION_MPTCP_RST; opts->reset_transient = subflow->reset_transient; opts->reset_reason = subflow->reset_reason; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTTX); return true; } static bool mptcp_established_options_fastclose(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); if (likely(!subflow->send_fastclose)) return false; if (remaining < TCPOLEN_MPTCP_FASTCLOSE) return false; *size = TCPOLEN_MPTCP_FASTCLOSE; opts->suboptions |= OPTION_MPTCP_FASTCLOSE; opts->rcvr_key = READ_ONCE(msk->remote_key); pr_debug("FASTCLOSE key=%llu", opts->rcvr_key); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX); return true; } static bool mptcp_established_options_mp_fail(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); if (likely(!subflow->send_mp_fail)) return false; if (remaining < TCPOLEN_MPTCP_FAIL) return false; *size = TCPOLEN_MPTCP_FAIL; opts->suboptions |= OPTION_MPTCP_FAIL; opts->fail_seq = subflow->map_seq; pr_debug("MP_FAIL fail_seq=%llu", opts->fail_seq); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX); return true; } bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); unsigned int opt_size = 0; bool snd_data_fin; bool ret = false; opts->suboptions = 0; if (unlikely(__mptcp_check_fallback(msk) && !mptcp_check_infinite_map(skb))) return false; if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) { if (mptcp_established_options_fastclose(sk, &opt_size, remaining, opts) || mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; } /* MP_RST can be used with MP_FASTCLOSE and MP_FAIL if there is room */ if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; } return true; } snd_data_fin = mptcp_data_fin_enabled(msk); if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, opts)) ret = true; else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, opts)) { unsigned int mp_fail_size; ret = true; if (mptcp_established_options_mp_fail(sk, &mp_fail_size, remaining - opt_size, opts)) { *size += opt_size + mp_fail_size; remaining -= opt_size - mp_fail_size; return true; } } /* we reserved enough space for the above options, and exceeding the * TCP option space would be fatal */ if (WARN_ON_ONCE(opt_size > remaining)) return false; *size += opt_size; remaining -= opt_size; if (mptcp_established_options_add_addr(sk, skb, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; ret = true; } else if (mptcp_established_options_rm_addr(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; ret = true; } if (mptcp_established_options_mp_prio(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; ret = true; } return ret; } bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); if (subflow_req->mp_capable) { opts->suboptions = OPTION_MPTCP_MPC_SYNACK; opts->sndr_key = subflow_req->local_key; opts->csum_reqd = subflow_req->csum_reqd; opts->allow_join_id0 = subflow_req->allow_join_id0; *size = TCPOLEN_MPTCP_MPC_SYNACK; pr_debug("subflow_req=%p, local_key=%llu", subflow_req, subflow_req->local_key); return true; } else if (subflow_req->mp_join) { opts->suboptions = OPTION_MPTCP_MPJ_SYNACK; opts->backup = subflow_req->backup; opts->join_id = subflow_req->local_id; opts->thmac = subflow_req->thmac; opts->nonce = subflow_req->local_nonce; pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u", subflow_req, opts->backup, opts->join_id, opts->thmac, opts->nonce); *size = TCPOLEN_MPTCP_MPJ_SYNACK; return true; } return false; } static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_subflow_context *subflow, struct sk_buff *skb, struct mptcp_options_received *mp_opt) { /* here we can process OoO, in-window pkts, only in-sequence 4th ack * will make the subflow fully established */ if (likely(subflow->fully_established)) { /* on passive sockets, check for 3rd ack retransmission * note that msk is always set by subflow_syn_recv_sock() * for mp_join subflows */ if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 && TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq && subflow->mp_join && (mp_opt->suboptions & OPTIONS_MPTCP_MPJ) && !subflow->request_join) tcp_send_ack(ssk); goto check_notify; } /* we must process OoO packets before the first subflow is fully * established. OoO packets are instead a protocol violation * for MP_JOIN subflows as the peer must not send any data * before receiving the forth ack - cfr. RFC 8684 section 3.2. */ if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) { if (subflow->mp_join) goto reset; if (subflow->is_mptfo && mp_opt->suboptions & OPTION_MPTCP_MPC_ACK) goto set_fully_established; return subflow->mp_capable; } if (subflow->remote_key_valid && (((mp_opt->suboptions & OPTION_MPTCP_DSS) && mp_opt->use_ack) || ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && !mp_opt->echo))) { /* subflows are fully established as soon as we get any * additional ack, including ADD_ADDR. */ goto set_fully_established; } /* If the first established packet does not contain MP_CAPABLE + data * then fallback to TCP. Fallback scenarios requires a reset for * MP_JOIN subflows. */ if (!(mp_opt->suboptions & OPTIONS_MPTCP_MPC)) { if (subflow->mp_join) goto reset; subflow->mp_capable = 0; pr_fallback(msk); mptcp_do_fallback(ssk); return false; } if (mp_opt->deny_join_id0) WRITE_ONCE(msk->pm.remote_deny_join_id0, true); if (unlikely(!READ_ONCE(msk->pm.server_side))) pr_warn_once("bogus mpc option on established client sk"); set_fully_established: mptcp_data_lock((struct sock *)msk); __mptcp_subflow_fully_established(msk, subflow, mp_opt); mptcp_data_unlock((struct sock *)msk); check_notify: /* if the subflow is not already linked into the conn_list, we can't * notify the PM: this subflow is still on the listener queue * and the PM possibly acquiring the subflow lock could race with * the listener close */ if (likely(subflow->pm_notified) || list_empty(&subflow->node)) return true; subflow->pm_notified = 1; if (subflow->mp_join) { clear_3rdack_retransmission(ssk); mptcp_pm_subflow_established(msk); } else { mptcp_pm_fully_established(msk, ssk); } return true; reset: mptcp_subflow_reset(ssk); return false; } u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq) { u32 old_seq32, cur_seq32; old_seq32 = (u32)old_seq; cur_seq32 = (u32)cur_seq; cur_seq = (old_seq & GENMASK_ULL(63, 32)) + cur_seq32; if (unlikely(cur_seq32 < old_seq32 && before(old_seq32, cur_seq32))) return cur_seq + (1LL << 32); /* reverse wrap could happen, too */ if (unlikely(cur_seq32 > old_seq32 && after(old_seq32, cur_seq32))) return cur_seq - (1LL << 32); return cur_seq; } static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una) { msk->bytes_acked += new_snd_una - msk->snd_una; WRITE_ONCE(msk->snd_una, new_snd_una); } static void ack_update_msk(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_options_received *mp_opt) { u64 new_wnd_end, new_snd_una, snd_nxt = READ_ONCE(msk->snd_nxt); struct sock *sk = (struct sock *)msk; u64 old_snd_una; mptcp_data_lock(sk); /* avoid ack expansion on update conflict, to reduce the risk of * wrongly expanding to a future ack sequence number, which is way * more dangerous than missing an ack */ old_snd_una = msk->snd_una; new_snd_una = mptcp_expand_seq(old_snd_una, mp_opt->data_ack, mp_opt->ack64); /* ACK for data not even sent yet? Ignore.*/ if (unlikely(after64(new_snd_una, snd_nxt))) new_snd_una = old_snd_una; new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd; if (after64(new_wnd_end, msk->wnd_end)) WRITE_ONCE(msk->wnd_end, new_wnd_end); /* this assumes mptcp_incoming_options() is invoked after tcp_ack() */ if (after64(msk->wnd_end, snd_nxt)) __mptcp_check_push(sk, ssk); if (after64(new_snd_una, old_snd_una)) { __mptcp_snd_una_update(msk, new_snd_una); __mptcp_data_acked(sk); } msk->last_ack_recv = tcp_jiffies32; mptcp_data_unlock(sk); trace_ack_update_msk(mp_opt->data_ack, old_snd_una, new_snd_una, new_wnd_end, READ_ONCE(msk->wnd_end)); } bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit) { /* Skip if DATA_FIN was already received. * If updating simultaneously with the recvmsg loop, values * should match. If they mismatch, the peer is misbehaving and * we will prefer the most recent information. */ if (READ_ONCE(msk->rcv_data_fin)) return false; WRITE_ONCE(msk->rcv_data_fin_seq, mptcp_expand_seq(READ_ONCE(msk->ack_seq), data_fin_seq, use_64bit)); WRITE_ONCE(msk->rcv_data_fin, 1); return true; } static bool add_addr_hmac_valid(struct mptcp_sock *msk, struct mptcp_options_received *mp_opt) { u64 hmac = 0; if (mp_opt->echo) return true; hmac = add_addr_generate_hmac(READ_ONCE(msk->remote_key), READ_ONCE(msk->local_key), &mp_opt->addr); pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n", msk, hmac, mp_opt->ahmac); return hmac == mp_opt->ahmac; } /* Return false if a subflow has been reset, else return true */ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_options_received mp_opt; struct mptcp_ext *mpext; if (__mptcp_check_fallback(msk)) { /* Keep it simple and unconditionally trigger send data cleanup and * pending queue spooling. We will need to acquire the data lock * for more accurate checks, and once the lock is acquired, such * helpers are cheap. */ mptcp_data_lock(subflow->conn); if (sk_stream_memory_free(sk)) __mptcp_check_push(subflow->conn, sk); /* on fallback we just need to ignore the msk-level snd_una, as * this is really plain TCP */ __mptcp_snd_una_update(msk, READ_ONCE(msk->snd_nxt)); __mptcp_data_acked(subflow->conn); mptcp_data_unlock(subflow->conn); return true; } mptcp_get_options(skb, &mp_opt); /* The subflow can be in close state only if check_fully_established() * just sent a reset. If so, tell the caller to ignore the current packet. */ if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) return sk->sk_state != TCP_CLOSE; if (unlikely(mp_opt.suboptions != OPTION_MPTCP_DSS)) { if ((mp_opt.suboptions & OPTION_MPTCP_FASTCLOSE) && READ_ONCE(msk->local_key) == mp_opt.rcvr_key) { WRITE_ONCE(msk->rcv_fastclose, true); mptcp_schedule_work((struct sock *)msk); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSERX); } if ((mp_opt.suboptions & OPTION_MPTCP_ADD_ADDR) && add_addr_hmac_valid(msk, &mp_opt)) { if (!mp_opt.echo) { mptcp_pm_add_addr_received(sk, &mp_opt.addr); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR); } else { mptcp_pm_add_addr_echoed(msk, &mp_opt.addr); mptcp_pm_del_add_timer(msk, &mp_opt.addr, true); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD); } if (mp_opt.addr.port) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PORTADD); } if (mp_opt.suboptions & OPTION_MPTCP_RM_ADDR) mptcp_pm_rm_addr_received(msk, &mp_opt.rm_list); if (mp_opt.suboptions & OPTION_MPTCP_PRIO) { mptcp_pm_mp_prio_received(sk, mp_opt.backup); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIORX); } if (mp_opt.suboptions & OPTION_MPTCP_FAIL) { mptcp_pm_mp_fail_received(sk, mp_opt.fail_seq); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILRX); } if (mp_opt.suboptions & OPTION_MPTCP_RST) { subflow->reset_seen = 1; subflow->reset_reason = mp_opt.reset_reason; subflow->reset_transient = mp_opt.reset_transient; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTRX); } if (!(mp_opt.suboptions & OPTION_MPTCP_DSS)) return true; } /* we can't wait for recvmsg() to update the ack_seq, otherwise * monodirectional flows will stuck */ if (mp_opt.use_ack) ack_update_msk(msk, sk, &mp_opt); /* Zero-data-length packets are dropped by the caller and not * propagated to the MPTCP layer, so the skb extension does not * need to be allocated or populated. DATA_FIN information, if * present, needs to be updated here before the skb is freed. */ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { if (mp_opt.data_fin && mp_opt.data_len == 1 && mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64)) mptcp_schedule_work((struct sock *)msk); return true; } mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (!mpext) return true; memset(mpext, 0, sizeof(*mpext)); if (likely(mp_opt.use_map)) { if (mp_opt.mpc_map) { /* this is an MP_CAPABLE carrying MPTCP data * we know this map the first chunk of data */ mptcp_crypto_key_sha(subflow->remote_key, NULL, &mpext->data_seq); mpext->data_seq++; mpext->subflow_seq = 1; mpext->dsn64 = 1; mpext->mpc_map = 1; mpext->data_fin = 0; } else { mpext->data_seq = mp_opt.data_seq; mpext->subflow_seq = mp_opt.subflow_seq; mpext->dsn64 = mp_opt.dsn64; mpext->data_fin = mp_opt.data_fin; } mpext->data_len = mp_opt.data_len; mpext->use_map = 1; mpext->csum_reqd = !!(mp_opt.suboptions & OPTION_MPTCP_CSUMREQD); if (mpext->csum_reqd) mpext->csum = mp_opt.csum; } return true; } static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th) { const struct sock *ssk = (const struct sock *)tp; struct mptcp_subflow_context *subflow; u64 ack_seq, rcv_wnd_old, rcv_wnd_new; struct mptcp_sock *msk; u32 new_win; u64 win; subflow = mptcp_subflow_ctx(ssk); msk = mptcp_sk(subflow->conn); ack_seq = READ_ONCE(msk->ack_seq); rcv_wnd_new = ack_seq + tp->rcv_wnd; rcv_wnd_old = atomic64_read(&msk->rcv_wnd_sent); if (after64(rcv_wnd_new, rcv_wnd_old)) { u64 rcv_wnd; for (;;) { rcv_wnd = atomic64_cmpxchg(&msk->rcv_wnd_sent, rcv_wnd_old, rcv_wnd_new); if (rcv_wnd == rcv_wnd_old) break; rcv_wnd_old = rcv_wnd; if (before64(rcv_wnd_new, rcv_wnd_old)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICTUPDATE); goto raise_win; } MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT); } return; } if (rcv_wnd_new != rcv_wnd_old) { raise_win: win = rcv_wnd_old - ack_seq; tp->rcv_wnd = min_t(u64, win, U32_MAX); new_win = tp->rcv_wnd; /* Make sure we do not exceed the maximum possible * scaled window. */ if (unlikely(th->syn)) new_win = min(new_win, 65535U) << tp->rx_opt.rcv_wscale; if (!tp->rx_opt.rcv_wscale && READ_ONCE(sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows)) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); /* RFC1323 scaling applied */ new_win >>= tp->rx_opt.rcv_wscale; th->window = htons(new_win); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDSHARED); } } __sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum) { struct csum_pseudo_header header; __wsum csum; /* cfr RFC 8684 3.3.1.: * the data sequence number used in the pseudo-header is * always the 64-bit value, irrespective of what length is used in the * DSS option itself. */ header.data_seq = cpu_to_be64(data_seq); header.subflow_seq = htonl(subflow_seq); header.data_len = htons(data_len); header.csum = 0; csum = csum_partial(&header, sizeof(header), sum); return csum_fold(csum); } static __sum16 mptcp_make_csum(const struct mptcp_ext *mpext) { return __mptcp_make_csum(mpext->data_seq, mpext->subflow_seq, mpext->data_len, ~csum_unfold(mpext->csum)); } static void put_len_csum(u16 len, __sum16 csum, void *data) { __sum16 *sumptr = data + 2; __be16 *ptr = data; put_unaligned_be16(len, ptr); put_unaligned(csum, sumptr); } void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, struct mptcp_out_options *opts) { const struct sock *ssk = (const struct sock *)tp; struct mptcp_subflow_context *subflow; /* Which options can be used together? * * X: mutually exclusive * O: often used together * C: can be used together in some cases * P: could be used together but we prefer not to (optimisations) * * Opt: | MPC | MPJ | DSS | ADD | RM | PRIO | FAIL | FC | * ------|------|------|------|------|------|------|------|------| * MPC |------|------|------|------|------|------|------|------| * MPJ | X |------|------|------|------|------|------|------| * DSS | X | X |------|------|------|------|------|------| * ADD | X | X | P |------|------|------|------|------| * RM | C | C | C | P |------|------|------|------| * PRIO | X | C | C | C | C |------|------|------| * FAIL | X | X | C | X | X | X |------|------| * FC | X | X | X | X | X | X | X |------| * RST | X | X | X | X | X | X | O | O | * ------|------|------|------|------|------|------|------|------| * * The same applies in mptcp_established_options() function. */ if (likely(OPTION_MPTCP_DSS & opts->suboptions)) { struct mptcp_ext *mpext = &opts->ext_copy; u8 len = TCPOLEN_MPTCP_DSS_BASE; u8 flags = 0; if (mpext->use_ack) { flags = MPTCP_DSS_HAS_ACK; if (mpext->ack64) { len += TCPOLEN_MPTCP_DSS_ACK64; flags |= MPTCP_DSS_ACK64; } else { len += TCPOLEN_MPTCP_DSS_ACK32; } } if (mpext->use_map) { len += TCPOLEN_MPTCP_DSS_MAP64; /* Use only 64-bit mapping flags for now, add * support for optional 32-bit mappings later. */ flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64; if (mpext->data_fin) flags |= MPTCP_DSS_DATA_FIN; if (opts->csum_reqd) len += TCPOLEN_MPTCP_DSS_CHECKSUM; } *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); if (mpext->use_ack) { if (mpext->ack64) { put_unaligned_be64(mpext->data_ack, ptr); ptr += 2; } else { put_unaligned_be32(mpext->data_ack32, ptr); ptr += 1; } } if (mpext->use_map) { put_unaligned_be64(mpext->data_seq, ptr); ptr += 2; put_unaligned_be32(mpext->subflow_seq, ptr); ptr += 1; if (opts->csum_reqd) { /* data_len == 0 is reserved for the infinite mapping, * the checksum will also be set to 0. */ put_len_csum(mpext->data_len, (mpext->data_len ? mptcp_make_csum(mpext) : 0), ptr); } else { put_unaligned_be32(mpext->data_len << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } ptr += 1; } /* We might need to add MP_FAIL options in rare cases */ if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) goto mp_fail; } else if (OPTIONS_MPTCP_MPC & opts->suboptions) { u8 len, flag = MPTCP_CAP_HMAC_SHA256; if (OPTION_MPTCP_MPC_SYN & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYN; } else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYNACK; } else if (opts->data_len) { len = TCPOLEN_MPTCP_MPC_ACK_DATA; if (opts->csum_reqd) len += TCPOLEN_MPTCP_DSS_CHECKSUM; } else { len = TCPOLEN_MPTCP_MPC_ACK; } if (opts->csum_reqd) flag |= MPTCP_CAP_CHECKSUM_REQD; if (!opts->allow_join_id0) flag |= MPTCP_CAP_DENY_JOIN_ID0; *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len, MPTCP_SUPPORTED_VERSION, flag); if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions)) goto mp_capable_done; put_unaligned_be64(opts->sndr_key, ptr); ptr += 2; if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions)) goto mp_capable_done; put_unaligned_be64(opts->rcvr_key, ptr); ptr += 2; if (!opts->data_len) goto mp_capable_done; if (opts->csum_reqd) { put_len_csum(opts->data_len, __mptcp_make_csum(opts->data_seq, opts->subflow_seq, opts->data_len, ~csum_unfold(opts->csum)), ptr); } else { put_unaligned_be32(opts->data_len << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } ptr += 1; /* MPC is additionally mutually exclusive with MP_PRIO */ goto mp_capable_done; } else if (OPTIONS_MPTCP_MPJ & opts->suboptions) { if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) { *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, TCPOLEN_MPTCP_MPJ_SYN, opts->backup, opts->join_id); put_unaligned_be32(opts->token, ptr); ptr += 1; put_unaligned_be32(opts->nonce, ptr); ptr += 1; } else if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) { *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, TCPOLEN_MPTCP_MPJ_SYNACK, opts->backup, opts->join_id); put_unaligned_be64(opts->thmac, ptr); ptr += 2; put_unaligned_be32(opts->nonce, ptr); ptr += 1; } else { *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, TCPOLEN_MPTCP_MPJ_ACK, 0, 0); memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN); ptr += 5; } } else if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; u8 echo = MPTCP_ADDR_ECHO; #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (opts->addr.family == AF_INET6) len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; #endif if (opts->addr.port) len += TCPOLEN_MPTCP_PORT_LEN; if (opts->ahmac) { len += sizeof(opts->ahmac); echo = 0; } *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, len, echo, opts->addr.id); if (opts->addr.family == AF_INET) { memcpy((u8 *)ptr, (u8 *)&opts->addr.addr.s_addr, 4); ptr += 1; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (opts->addr.family == AF_INET6) { memcpy((u8 *)ptr, opts->addr.addr6.s6_addr, 16); ptr += 4; } #endif if (!opts->addr.port) { if (opts->ahmac) { put_unaligned_be64(opts->ahmac, ptr); ptr += 2; } } else { u16 port = ntohs(opts->addr.port); if (opts->ahmac) { u8 *bptr = (u8 *)ptr; put_unaligned_be16(port, bptr); bptr += 2; put_unaligned_be64(opts->ahmac, bptr); bptr += 8; put_unaligned_be16(TCPOPT_NOP << 8 | TCPOPT_NOP, bptr); ptr += 3; } else { put_unaligned_be32(port << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); ptr += 1; } } } else if (unlikely(OPTION_MPTCP_FASTCLOSE & opts->suboptions)) { /* FASTCLOSE is mutually exclusive with others except RST */ *ptr++ = mptcp_option(MPTCPOPT_MP_FASTCLOSE, TCPOLEN_MPTCP_FASTCLOSE, 0, 0); put_unaligned_be64(opts->rcvr_key, ptr); ptr += 2; if (OPTION_MPTCP_RST & opts->suboptions) goto mp_rst; return; } else if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) { mp_fail: /* MP_FAIL is mutually exclusive with others except RST */ subflow = mptcp_subflow_ctx(ssk); subflow->send_mp_fail = 0; *ptr++ = mptcp_option(MPTCPOPT_MP_FAIL, TCPOLEN_MPTCP_FAIL, 0, 0); put_unaligned_be64(opts->fail_seq, ptr); ptr += 2; if (OPTION_MPTCP_RST & opts->suboptions) goto mp_rst; return; } else if (unlikely(OPTION_MPTCP_RST & opts->suboptions)) { mp_rst: *ptr++ = mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST, opts->reset_transient, opts->reset_reason); return; } if (OPTION_MPTCP_PRIO & opts->suboptions) { subflow = mptcp_subflow_ctx(ssk); subflow->send_mp_prio = 0; *ptr++ = mptcp_option(MPTCPOPT_MP_PRIO, TCPOLEN_MPTCP_PRIO, opts->backup, TCPOPT_NOP); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPPRIOTX); } mp_capable_done: if (OPTION_MPTCP_RM_ADDR & opts->suboptions) { u8 i = 1; *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR, TCPOLEN_MPTCP_RM_ADDR_BASE + opts->rm_list.nr, 0, opts->rm_list.ids[0]); while (i < opts->rm_list.nr) { u8 id1, id2, id3, id4; id1 = opts->rm_list.ids[i]; id2 = i + 1 < opts->rm_list.nr ? opts->rm_list.ids[i + 1] : TCPOPT_NOP; id3 = i + 2 < opts->rm_list.nr ? opts->rm_list.ids[i + 2] : TCPOPT_NOP; id4 = i + 3 < opts->rm_list.nr ? opts->rm_list.ids[i + 3] : TCPOPT_NOP; put_unaligned_be32(id1 << 24 | id2 << 16 | id3 << 8 | id4, ptr); ptr += 1; i += 4; } } if (tp) mptcp_set_rwin(tp, th); } __be32 mptcp_get_reset_option(const struct sk_buff *skb) { const struct mptcp_ext *ext = mptcp_get_ext(skb); u8 flags, reason; if (ext) { flags = ext->reset_transient; reason = ext->reset_reason; return mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST, flags, reason); } return htonl(0u); } EXPORT_SYMBOL_GPL(mptcp_get_reset_option); |
| 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 51 51 51 51 2 51 2 2 2 2 51 50 51 50 51 51 51 51 51 51 51 51 49 48 49 49 49 9 11 10 6 5 11 11 10 11 10 11 10 2 11 9 2 11 11 11 5 3 2 2 11 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 | // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor label definitions * * Copyright 2017 Canonical Ltd. */ #include <linux/audit.h> #include <linux/seq_file.h> #include <linux/sort.h> #include "include/apparmor.h" #include "include/cred.h" #include "include/label.h" #include "include/policy.h" #include "include/secid.h" /* * the aa_label represents the set of profiles confining an object * * Labels maintain a reference count to the set of pointers they reference * Labels are ref counted by * tasks and object via the security field/security context off the field * code - will take a ref count on a label if it needs the label * beyond what is possible with an rcu_read_lock. * profiles - each profile is a label * secids - a pinned secid will keep a refcount of the label it is * referencing * objects - inode, files, sockets, ... * * Labels are not ref counted by the label set, so they maybe removed and * freed when no longer in use. * */ #define PROXY_POISON 97 #define LABEL_POISON 100 static void free_proxy(struct aa_proxy *proxy) { if (proxy) { /* p->label will not updated any more as p is dead */ aa_put_label(rcu_dereference_protected(proxy->label, true)); memset(proxy, 0, sizeof(*proxy)); RCU_INIT_POINTER(proxy->label, (struct aa_label *)PROXY_POISON); kfree(proxy); } } void aa_proxy_kref(struct kref *kref) { struct aa_proxy *proxy = container_of(kref, struct aa_proxy, count); free_proxy(proxy); } struct aa_proxy *aa_alloc_proxy(struct aa_label *label, gfp_t gfp) { struct aa_proxy *new; new = kzalloc(sizeof(struct aa_proxy), gfp); if (new) { kref_init(&new->count); rcu_assign_pointer(new->label, aa_get_label(label)); } return new; } /* requires profile list write lock held */ void __aa_proxy_redirect(struct aa_label *orig, struct aa_label *new) { struct aa_label *tmp; AA_BUG(!orig); AA_BUG(!new); lockdep_assert_held_write(&labels_set(orig)->lock); tmp = rcu_dereference_protected(orig->proxy->label, &labels_ns(orig)->lock); rcu_assign_pointer(orig->proxy->label, aa_get_label(new)); orig->flags |= FLAG_STALE; aa_put_label(tmp); } static void __proxy_share(struct aa_label *old, struct aa_label *new) { struct aa_proxy *proxy = new->proxy; new->proxy = aa_get_proxy(old->proxy); __aa_proxy_redirect(old, new); aa_put_proxy(proxy); } /** * ns_cmp - compare ns for label set ordering * @a: ns to compare (NOT NULL) * @b: ns to compare (NOT NULL) * * Returns: <0 if a < b * ==0 if a == b * >0 if a > b */ static int ns_cmp(struct aa_ns *a, struct aa_ns *b) { int res; AA_BUG(!a); AA_BUG(!b); AA_BUG(!a->base.hname); AA_BUG(!b->base.hname); if (a == b) return 0; res = a->level - b->level; if (res) return res; return strcmp(a->base.hname, b->base.hname); } /** * profile_cmp - profile comparison for set ordering * @a: profile to compare (NOT NULL) * @b: profile to compare (NOT NULL) * * Returns: <0 if a < b * ==0 if a == b * >0 if a > b */ static int profile_cmp(struct aa_profile *a, struct aa_profile *b) { int res; AA_BUG(!a); AA_BUG(!b); AA_BUG(!a->ns); AA_BUG(!b->ns); AA_BUG(!a->base.hname); AA_BUG(!b->base.hname); if (a == b || a->base.hname == b->base.hname) return 0; res = ns_cmp(a->ns, b->ns); if (res) return res; return strcmp(a->base.hname, b->base.hname); } /** * vec_cmp - label comparison for set ordering * @a: aa_profile to compare (NOT NULL) * @an: length of @a * @b: aa_profile to compare (NOT NULL) * @bn: length of @b * * Returns: <0 if @a < @b * ==0 if @a == @b * >0 if @a > @b */ static int vec_cmp(struct aa_profile **a, int an, struct aa_profile **b, int bn) { int i; AA_BUG(!a); AA_BUG(!*a); AA_BUG(!b); AA_BUG(!*b); AA_BUG(an <= 0); AA_BUG(bn <= 0); for (i = 0; i < an && i < bn; i++) { int res = profile_cmp(a[i], b[i]); if (res != 0) return res; } return an - bn; } static bool vec_is_stale(struct aa_profile **vec, int n) { int i; AA_BUG(!vec); for (i = 0; i < n; i++) { if (profile_is_stale(vec[i])) return true; } return false; } static long accum_vec_flags(struct aa_profile **vec, int n) { long u = FLAG_UNCONFINED; int i; AA_BUG(!vec); for (i = 0; i < n; i++) { u |= vec[i]->label.flags & (FLAG_DEBUG1 | FLAG_DEBUG2 | FLAG_STALE); if (!(u & vec[i]->label.flags & FLAG_UNCONFINED)) u &= ~FLAG_UNCONFINED; } return u; } static int sort_cmp(const void *a, const void *b) { return profile_cmp(*(struct aa_profile **)a, *(struct aa_profile **)b); } /* * assumes vec is sorted * Assumes @vec has null terminator at vec[n], and will null terminate * vec[n - dups] */ static inline int unique(struct aa_profile **vec, int n) { int i, pos, dups = 0; AA_BUG(n < 1); AA_BUG(!vec); pos = 0; for (i = 1; i < n; i++) { int res = profile_cmp(vec[pos], vec[i]); AA_BUG(res > 0, "vec not sorted"); if (res == 0) { /* drop duplicate */ aa_put_profile(vec[i]); dups++; continue; } pos++; if (dups) vec[pos] = vec[i]; } AA_BUG(dups < 0); return dups; } /** * aa_vec_unique - canonical sort and unique a list of profiles * @n: number of refcounted profiles in the list (@n > 0) * @vec: list of profiles to sort and merge * @flags: null terminator flags of @vec * * Returns: the number of duplicates eliminated == references put * * If @flags & VEC_FLAG_TERMINATE @vec has null terminator at vec[n], and will * null terminate vec[n - dups] */ int aa_vec_unique(struct aa_profile **vec, int n, int flags) { int i, dups = 0; AA_BUG(n < 1); AA_BUG(!vec); /* vecs are usually small and inorder, have a fallback for larger */ if (n > 8) { sort(vec, n, sizeof(struct aa_profile *), sort_cmp, NULL); dups = unique(vec, n); goto out; } /* insertion sort + unique in one */ for (i = 1; i < n; i++) { struct aa_profile *tmp = vec[i]; int pos, j; for (pos = i - 1 - dups; pos >= 0; pos--) { int res = profile_cmp(vec[pos], tmp); if (res == 0) { /* drop duplicate entry */ aa_put_profile(tmp); dups++; goto continue_outer; } else if (res < 0) break; } /* pos is at entry < tmp, or index -1. Set to insert pos */ pos++; for (j = i - dups; j > pos; j--) vec[j] = vec[j - 1]; vec[pos] = tmp; continue_outer: ; } AA_BUG(dups < 0); out: if (flags & VEC_FLAG_TERMINATE) vec[n - dups] = NULL; return dups; } void aa_label_destroy(struct aa_label *label) { AA_BUG(!label); if (!label_isprofile(label)) { struct aa_profile *profile; struct label_it i; aa_put_str(label->hname); label_for_each(i, label, profile) { aa_put_profile(profile); label->vec[i.i] = (struct aa_profile *) (LABEL_POISON + (long) i.i); } } if (label->proxy) { if (rcu_dereference_protected(label->proxy->label, true) == label) rcu_assign_pointer(label->proxy->label, NULL); aa_put_proxy(label->proxy); } aa_free_secid(label->secid); label->proxy = (struct aa_proxy *) PROXY_POISON + 1; } void aa_label_free(struct aa_label *label) { if (!label) return; aa_label_destroy(label); kfree(label); } static void label_free_switch(struct aa_label *label) { if (label->flags & FLAG_NS_COUNT) aa_free_ns(labels_ns(label)); else if (label_isprofile(label)) aa_free_profile(labels_profile(label)); else aa_label_free(label); } static void label_free_rcu(struct rcu_head *head) { struct aa_label *label = container_of(head, struct aa_label, rcu); if (label->flags & FLAG_IN_TREE) (void) aa_label_remove(label); label_free_switch(label); } void aa_label_kref(struct kref *kref) { struct aa_label *label = container_of(kref, struct aa_label, count); struct aa_ns *ns = labels_ns(label); if (!ns) { /* never live, no rcu callback needed, just using the fn */ label_free_switch(label); return; } /* TODO: update labels_profile macro so it works here */ AA_BUG(label_isprofile(label) && on_list_rcu(&label->vec[0]->base.profiles)); AA_BUG(label_isprofile(label) && on_list_rcu(&label->vec[0]->base.list)); /* TODO: if compound label and not stale add to reclaim cache */ call_rcu(&label->rcu, label_free_rcu); } static void label_free_or_put_new(struct aa_label *label, struct aa_label *new) { if (label != new) /* need to free directly to break circular ref with proxy */ aa_label_free(new); else aa_put_label(new); } bool aa_label_init(struct aa_label *label, int size, gfp_t gfp) { AA_BUG(!label); AA_BUG(size < 1); if (aa_alloc_secid(label, gfp) < 0) return false; label->size = size; /* doesn't include null */ label->vec[size] = NULL; /* null terminate */ kref_init(&label->count); RB_CLEAR_NODE(&label->node); return true; } /** * aa_label_alloc - allocate a label with a profile vector of @size length * @size: size of profile vector in the label * @proxy: proxy to use OR null if to allocate a new one * @gfp: memory allocation type * * Returns: new label * else NULL if failed */ struct aa_label *aa_label_alloc(int size, struct aa_proxy *proxy, gfp_t gfp) { struct aa_label *new; AA_BUG(size < 1); /* + 1 for null terminator entry on vec */ new = kzalloc(struct_size(new, vec, size + 1), gfp); AA_DEBUG("%s (%p)\n", __func__, new); if (!new) goto fail; if (!aa_label_init(new, size, gfp)) goto fail; if (!proxy) { proxy = aa_alloc_proxy(new, gfp); if (!proxy) goto fail; } else aa_get_proxy(proxy); /* just set new's proxy, don't redirect proxy here if it was passed in*/ new->proxy = proxy; return new; fail: kfree(new); return NULL; } /** * label_cmp - label comparison for set ordering * @a: label to compare (NOT NULL) * @b: label to compare (NOT NULL) * * Returns: <0 if a < b * ==0 if a == b * >0 if a > b */ static int label_cmp(struct aa_label *a, struct aa_label *b) { AA_BUG(!b); if (a == b) return 0; return vec_cmp(a->vec, a->size, b->vec, b->size); } /* helper fn for label_for_each_confined */ int aa_label_next_confined(struct aa_label *label, int i) { AA_BUG(!label); AA_BUG(i < 0); for (; i < label->size; i++) { if (!profile_unconfined(label->vec[i])) return i; } return i; } /** * __aa_label_next_not_in_set - return the next profile of @sub not in @set * @I: label iterator * @set: label to test against * @sub: label to if is subset of @set * * Returns: profile in @sub that is not in @set, with iterator set pos after * else NULL if @sub is a subset of @set */ struct aa_profile *__aa_label_next_not_in_set(struct label_it *I, struct aa_label *set, struct aa_label *sub) { AA_BUG(!set); AA_BUG(!I); AA_BUG(I->i < 0); AA_BUG(I->i > set->size); AA_BUG(!sub); AA_BUG(I->j < 0); AA_BUG(I->j > sub->size); while (I->j < sub->size && I->i < set->size) { int res = profile_cmp(sub->vec[I->j], set->vec[I->i]); if (res == 0) { (I->j)++; (I->i)++; } else if (res > 0) (I->i)++; else return sub->vec[(I->j)++]; } if (I->j < sub->size) return sub->vec[(I->j)++]; return NULL; } /** * aa_label_is_subset - test if @sub is a subset of @set * @set: label to test against * @sub: label to test if is subset of @set * * Returns: true if @sub is subset of @set * else false */ bool aa_label_is_subset(struct aa_label *set, struct aa_label *sub) { struct label_it i = { }; AA_BUG(!set); AA_BUG(!sub); if (sub == set) return true; return __aa_label_next_not_in_set(&i, set, sub) == NULL; } /** * aa_label_is_unconfined_subset - test if @sub is a subset of @set * @set: label to test against * @sub: label to test if is subset of @set * * This checks for subset but taking into account unconfined. IF * @sub contains an unconfined profile that does not have a matching * unconfined in @set then this will not cause the test to fail. * Conversely we don't care about an unconfined in @set that is not in * @sub * * Returns: true if @sub is special_subset of @set * else false */ bool aa_label_is_unconfined_subset(struct aa_label *set, struct aa_label *sub) { struct label_it i = { }; struct aa_profile *p; AA_BUG(!set); AA_BUG(!sub); if (sub == set) return true; do { p = __aa_label_next_not_in_set(&i, set, sub); if (p && !profile_unconfined(p)) break; } while (p); return p == NULL; } /** * __label_remove - remove @label from the label set * @label: label to remove * @new: label to redirect to * * Requires: labels_set(@label)->lock write_lock * Returns: true if the label was in the tree and removed */ static bool __label_remove(struct aa_label *label, struct aa_label *new) { struct aa_labelset *ls = labels_set(label); AA_BUG(!ls); AA_BUG(!label); lockdep_assert_held_write(&ls->lock); if (new) __aa_proxy_redirect(label, new); if (!label_is_stale(label)) __label_make_stale(label); if (label->flags & FLAG_IN_TREE) { rb_erase(&label->node, &ls->root); label->flags &= ~FLAG_IN_TREE; return true; } return false; } /** * __label_replace - replace @old with @new in label set * @old: label to remove from label set * @new: label to replace @old with * * Requires: labels_set(@old)->lock write_lock * valid ref count be held on @new * Returns: true if @old was in set and replaced by @new * * Note: current implementation requires label set be order in such a way * that @new directly replaces @old position in the set (ie. * using pointer comparison of the label address would not work) */ static bool __label_replace(struct aa_label *old, struct aa_label *new) { struct aa_labelset *ls = labels_set(old); AA_BUG(!ls); AA_BUG(!old); AA_BUG(!new); lockdep_assert_held_write(&ls->lock); AA_BUG(new->flags & FLAG_IN_TREE); if (!label_is_stale(old)) __label_make_stale(old); if (old->flags & FLAG_IN_TREE) { rb_replace_node(&old->node, &new->node, &ls->root); old->flags &= ~FLAG_IN_TREE; new->flags |= FLAG_IN_TREE; return true; } return false; } /** * __label_insert - attempt to insert @l into a label set * @ls: set of labels to insert @l into (NOT NULL) * @label: new label to insert (NOT NULL) * @replace: whether insertion should replace existing entry that is not stale * * Requires: @ls->lock * caller to hold a valid ref on l * if @replace is true l has a preallocated proxy associated * Returns: @l if successful in inserting @l - with additional refcount * else ref counted equivalent label that is already in the set, * the else condition only happens if @replace is false */ static struct aa_label *__label_insert(struct aa_labelset *ls, struct aa_label *label, bool replace) { struct rb_node **new, *parent = NULL; AA_BUG(!ls); AA_BUG(!label); AA_BUG(labels_set(label) != ls); lockdep_assert_held_write(&ls->lock); AA_BUG(label->flags & FLAG_IN_TREE); /* Figure out where to put new node */ new = &ls->root.rb_node; while (*new) { struct aa_label *this = rb_entry(*new, struct aa_label, node); int result = label_cmp(label, this); parent = *new; if (result == 0) { /* !__aa_get_label means queued for destruction, * so replace in place, however the label has * died before the replacement so do not share * the proxy */ if (!replace && !label_is_stale(this)) { if (__aa_get_label(this)) return this; } else __proxy_share(this, label); AA_BUG(!__label_replace(this, label)); return aa_get_label(label); } else if (result < 0) new = &((*new)->rb_left); else /* (result > 0) */ new = &((*new)->rb_right); } /* Add new node and rebalance tree. */ rb_link_node(&label->node, parent, new); rb_insert_color(&label->node, &ls->root); label->flags |= FLAG_IN_TREE; return aa_get_label(label); } /** * __vec_find - find label that matches @vec in label set * @vec: vec of profiles to find matching label for (NOT NULL) * @n: length of @vec * * Requires: @vec_labelset(vec) lock held * caller to hold a valid ref on l * * Returns: ref counted @label if matching label is in tree * ref counted label that is equiv to @l in tree * else NULL if @vec equiv is not in tree */ static struct aa_label *__vec_find(struct aa_profile **vec, int n) { struct rb_node *node; AA_BUG(!vec); AA_BUG(!*vec); AA_BUG(n <= 0); node = vec_labelset(vec, n)->root.rb_node; while (node) { struct aa_label *this = rb_entry(node, struct aa_label, node); int result = vec_cmp(this->vec, this->size, vec, n); if (result > 0) node = node->rb_left; else if (result < 0) node = node->rb_right; else return __aa_get_label(this); } return NULL; } /** * __label_find - find label @label in label set * @label: label to find (NOT NULL) * * Requires: labels_set(@label)->lock held * caller to hold a valid ref on l * * Returns: ref counted @label if @label is in tree OR * ref counted label that is equiv to @label in tree * else NULL if @label or equiv is not in tree */ static struct aa_label *__label_find(struct aa_label *label) { AA_BUG(!label); return __vec_find(label->vec, label->size); } /** * aa_label_remove - remove a label from the labelset * @label: label to remove * * Returns: true if @label was removed from the tree * else @label was not in tree so it could not be removed */ bool aa_label_remove(struct aa_label *label) { struct aa_labelset *ls = labels_set(label); unsigned long flags; bool res; AA_BUG(!ls); write_lock_irqsave(&ls->lock, flags); res = __label_remove(label, ns_unconfined(labels_ns(label))); write_unlock_irqrestore(&ls->lock, flags); return res; } /** * aa_label_replace - replace a label @old with a new version @new * @old: label to replace * @new: label replacing @old * * Returns: true if @old was in tree and replaced * else @old was not in tree, and @new was not inserted */ bool aa_label_replace(struct aa_label *old, struct aa_label *new) { unsigned long flags; bool res; if (name_is_shared(old, new) && labels_ns(old) == labels_ns(new)) { write_lock_irqsave(&labels_set(old)->lock, flags); if (old->proxy != new->proxy) __proxy_share(old, new); else __aa_proxy_redirect(old, new); res = __label_replace(old, new); write_unlock_irqrestore(&labels_set(old)->lock, flags); } else { struct aa_label *l; struct aa_labelset *ls = labels_set(old); write_lock_irqsave(&ls->lock, flags); res = __label_remove(old, new); if (labels_ns(old) != labels_ns(new)) { write_unlock_irqrestore(&ls->lock, flags); ls = labels_set(new); write_lock_irqsave(&ls->lock, flags); } l = __label_insert(ls, new, true); res = (l == new); write_unlock_irqrestore(&ls->lock, flags); aa_put_label(l); } return res; } /** * vec_find - find label @l in label set * @vec: array of profiles to find equiv label for (NOT NULL) * @n: length of @vec * * Returns: refcounted label if @vec equiv is in tree * else NULL if @vec equiv is not in tree */ static struct aa_label *vec_find(struct aa_profile **vec, int n) { struct aa_labelset *ls; struct aa_label *label; unsigned long flags; AA_BUG(!vec); AA_BUG(!*vec); AA_BUG(n <= 0); ls = vec_labelset(vec, n); read_lock_irqsave(&ls->lock, flags); label = __vec_find(vec, n); read_unlock_irqrestore(&ls->lock, flags); return label; } /* requires sort and merge done first */ static struct aa_label *vec_create_and_insert_label(struct aa_profile **vec, int len, gfp_t gfp) { struct aa_label *label = NULL; struct aa_labelset *ls; unsigned long flags; struct aa_label *new; int i; AA_BUG(!vec); if (len == 1) return aa_get_label(&vec[0]->label); ls = labels_set(&vec[len - 1]->label); /* TODO: enable when read side is lockless * check if label exists before taking locks */ new = aa_label_alloc(len, NULL, gfp); if (!new) return NULL; for (i = 0; i < len; i++) new->vec[i] = aa_get_profile(vec[i]); write_lock_irqsave(&ls->lock, flags); label = __label_insert(ls, new, false); write_unlock_irqrestore(&ls->lock, flags); label_free_or_put_new(label, new); return label; } struct aa_label *aa_vec_find_or_create_label(struct aa_profile **vec, int len, gfp_t gfp) { struct aa_label *label = vec_find(vec, len); if (label) return label; return vec_create_and_insert_label(vec, len, gfp); } /** * aa_label_find - find label @label in label set * @label: label to find (NOT NULL) * * Requires: caller to hold a valid ref on l * * Returns: refcounted @label if @label is in tree * refcounted label that is equiv to @label in tree * else NULL if @label or equiv is not in tree */ struct aa_label *aa_label_find(struct aa_label *label) { AA_BUG(!label); return vec_find(label->vec, label->size); } /** * aa_label_insert - insert label @label into @ls or return existing label * @ls: labelset to insert @label into * @label: label to insert * * Requires: caller to hold a valid ref on @label * * Returns: ref counted @label if successful in inserting @label * else ref counted equivalent label that is already in the set */ struct aa_label *aa_label_insert(struct aa_labelset *ls, struct aa_label *label) { struct aa_label *l; unsigned long flags; AA_BUG(!ls); AA_BUG(!label); /* check if label exists before taking lock */ if (!label_is_stale(label)) { read_lock_irqsave(&ls->lock, flags); l = __label_find(label); read_unlock_irqrestore(&ls->lock, flags); if (l) return l; } write_lock_irqsave(&ls->lock, flags); l = __label_insert(ls, label, false); write_unlock_irqrestore(&ls->lock, flags); return l; } /** * aa_label_next_in_merge - find the next profile when merging @a and @b * @I: label iterator * @a: label to merge * @b: label to merge * * Returns: next profile * else null if no more profiles */ struct aa_profile *aa_label_next_in_merge(struct label_it *I, struct aa_label *a, struct aa_label *b) { AA_BUG(!a); AA_BUG(!b); AA_BUG(!I); AA_BUG(I->i < 0); AA_BUG(I->i > a->size); AA_BUG(I->j < 0); AA_BUG(I->j > b->size); if (I->i < a->size) { if (I->j < b->size) { int res = profile_cmp(a->vec[I->i], b->vec[I->j]); if (res > 0) return b->vec[(I->j)++]; if (res == 0) (I->j)++; } return a->vec[(I->i)++]; } if (I->j < b->size) return b->vec[(I->j)++]; return NULL; } /** * label_merge_cmp - cmp of @a merging with @b against @z for set ordering * @a: label to merge then compare (NOT NULL) * @b: label to merge then compare (NOT NULL) * @z: label to compare merge against (NOT NULL) * * Assumes: using the most recent versions of @a, @b, and @z * * Returns: <0 if a < b * ==0 if a == b * >0 if a > b */ static int label_merge_cmp(struct aa_label *a, struct aa_label *b, struct aa_label *z) { struct aa_profile *p = NULL; struct label_it i = { }; int k; AA_BUG(!a); AA_BUG(!b); AA_BUG(!z); for (k = 0; k < z->size && (p = aa_label_next_in_merge(&i, a, b)); k++) { int res = profile_cmp(p, z->vec[k]); if (res != 0) return res; } if (p) return 1; else if (k < z->size) return -1; return 0; } /** * label_merge_insert - create a new label by merging @a and @b * @new: preallocated label to merge into (NOT NULL) * @a: label to merge with @b (NOT NULL) * @b: label to merge with @a (NOT NULL) * * Requires: preallocated proxy * * Returns: ref counted label either @new if merge is unique * @a if @b is a subset of @a * @b if @a is a subset of @b * * NOTE: will not use @new if the merge results in @new == @a or @b * * Must be used within labelset write lock to avoid racing with * setting labels stale. */ static struct aa_label *label_merge_insert(struct aa_label *new, struct aa_label *a, struct aa_label *b) { struct aa_label *label; struct aa_labelset *ls; struct aa_profile *next; struct label_it i; unsigned long flags; int k = 0, invcount = 0; bool stale = false; AA_BUG(!a); AA_BUG(a->size < 0); AA_BUG(!b); AA_BUG(b->size < 0); AA_BUG(!new); AA_BUG(new->size < a->size + b->size); label_for_each_in_merge(i, a, b, next) { AA_BUG(!next); if (profile_is_stale(next)) { new->vec[k] = aa_get_newest_profile(next); AA_BUG(!new->vec[k]->label.proxy); AA_BUG(!new->vec[k]->label.proxy->label); if (next->label.proxy != new->vec[k]->label.proxy) invcount++; k++; stale = true; } else new->vec[k++] = aa_get_profile(next); } /* set to actual size which is <= allocated len */ new->size = k; new->vec[k] = NULL; if (invcount) { new->size -= aa_vec_unique(&new->vec[0], new->size, VEC_FLAG_TERMINATE); /* TODO: deal with reference labels */ if (new->size == 1) { label = aa_get_label(&new->vec[0]->label); return label; } } else if (!stale) { /* * merge could be same as a || b, note: it is not possible * for new->size == a->size == b->size unless a == b */ if (k == a->size) return aa_get_label(a); else if (k == b->size) return aa_get_label(b); } new->flags |= accum_vec_flags(new->vec, new->size); ls = labels_set(new); write_lock_irqsave(&ls->lock, flags); label = __label_insert(labels_set(new), new, false); write_unlock_irqrestore(&ls->lock, flags); return label; } /** * labelset_of_merge - find which labelset a merged label should be inserted * @a: label to merge and insert * @b: label to merge and insert * * Returns: labelset that the merged label should be inserted into */ static struct aa_labelset *labelset_of_merge(struct aa_label *a, struct aa_label *b) { struct aa_ns *nsa = labels_ns(a); struct aa_ns *nsb = labels_ns(b); if (ns_cmp(nsa, nsb) <= 0) return &nsa->labels; return &nsb->labels; } /** * __label_find_merge - find label that is equiv to merge of @a and @b * @ls: set of labels to search (NOT NULL) * @a: label to merge with @b (NOT NULL) * @b: label to merge with @a (NOT NULL) * * Requires: ls->lock read_lock held * * Returns: ref counted label that is equiv to merge of @a and @b * else NULL if merge of @a and @b is not in set */ static struct aa_label *__label_find_merge(struct aa_labelset *ls, struct aa_label *a, struct aa_label *b) { struct rb_node *node; AA_BUG(!ls); AA_BUG(!a); AA_BUG(!b); if (a == b) return __label_find(a); node = ls->root.rb_node; while (node) { struct aa_label *this = container_of(node, struct aa_label, node); int result = label_merge_cmp(a, b, this); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return __aa_get_label(this); } return NULL; } /** * aa_label_find_merge - find label that is equiv to merge of @a and @b * @a: label to merge with @b (NOT NULL) * @b: label to merge with @a (NOT NULL) * * Requires: labels be fully constructed with a valid ns * * Returns: ref counted label that is equiv to merge of @a and @b * else NULL if merge of @a and @b is not in set */ struct aa_label *aa_label_find_merge(struct aa_label *a, struct aa_label *b) { struct aa_labelset *ls; struct aa_label *label, *ar = NULL, *br = NULL; unsigned long flags; AA_BUG(!a); AA_BUG(!b); if (label_is_stale(a)) a = ar = aa_get_newest_label(a); if (label_is_stale(b)) b = br = aa_get_newest_label(b); ls = labelset_of_merge(a, b); read_lock_irqsave(&ls->lock, flags); label = __label_find_merge(ls, a, b); read_unlock_irqrestore(&ls->lock, flags); aa_put_label(ar); aa_put_label(br); return label; } /** * aa_label_merge - attempt to insert new merged label of @a and @b * @a: label to merge with @b (NOT NULL) * @b: label to merge with @a (NOT NULL) * @gfp: memory allocation type * * Requires: caller to hold valid refs on @a and @b * labels be fully constructed with a valid ns * * Returns: ref counted new label if successful in inserting merge of a & b * else ref counted equivalent label that is already in the set. * else NULL if could not create label (-ENOMEM) */ struct aa_label *aa_label_merge(struct aa_label *a, struct aa_label *b, gfp_t gfp) { struct aa_label *label = NULL; AA_BUG(!a); AA_BUG(!b); if (a == b) return aa_get_newest_label(a); /* TODO: enable when read side is lockless * check if label exists before taking locks if (!label_is_stale(a) && !label_is_stale(b)) label = aa_label_find_merge(a, b); */ if (!label) { struct aa_label *new; a = aa_get_newest_label(a); b = aa_get_newest_label(b); /* could use label_merge_len(a, b), but requires double * comparison for small savings */ new = aa_label_alloc(a->size + b->size, NULL, gfp); if (!new) goto out; label = label_merge_insert(new, a, b); label_free_or_put_new(label, new); out: aa_put_label(a); aa_put_label(b); } return label; } /* match a profile and its associated ns component if needed * Assumes visibility test has already been done. * If a subns profile is not to be matched should be prescreened with * visibility test. */ static inline aa_state_t match_component(struct aa_profile *profile, struct aa_ruleset *rules, struct aa_profile *tp, aa_state_t state) { const char *ns_name; if (profile->ns == tp->ns) return aa_dfa_match(rules->policy->dfa, state, tp->base.hname); /* try matching with namespace name and then profile */ ns_name = aa_ns_name(profile->ns, tp->ns, true); state = aa_dfa_match_len(rules->policy->dfa, state, ":", 1); state = aa_dfa_match(rules->policy->dfa, state, ns_name); state = aa_dfa_match_len(rules->policy->dfa, state, ":", 1); return aa_dfa_match(rules->policy->dfa, state, tp->base.hname); } /** * label_compound_match - find perms for full compound label * @profile: profile to find perms for * @rules: ruleset to search * @label: label to check access permissions for * @state: state to start match in * @subns: whether to do permission checks on components in a subns * @request: permissions to request * @perms: perms struct to set * * Returns: 0 on success else ERROR * * For the label A//&B//&C this does the perm match for A//&B//&C * @perms should be preinitialized with allperms OR a previous permission * check to be stacked. */ static int label_compound_match(struct aa_profile *profile, struct aa_ruleset *rules, struct aa_label *label, aa_state_t state, bool subns, u32 request, struct aa_perms *perms) { struct aa_profile *tp; struct label_it i; /* find first subcomponent that is visible */ label_for_each(i, label, tp) { if (!aa_ns_visible(profile->ns, tp->ns, subns)) continue; state = match_component(profile, rules, tp, state); if (!state) goto fail; goto next; } /* no component visible */ *perms = allperms; return 0; next: label_for_each_cont(i, label, tp) { if (!aa_ns_visible(profile->ns, tp->ns, subns)) continue; state = aa_dfa_match(rules->policy->dfa, state, "//&"); state = match_component(profile, rules, tp, state); if (!state) goto fail; } *perms = *aa_lookup_perms(rules->policy, state); aa_apply_modes_to_perms(profile, perms); if ((perms->allow & request) != request) return -EACCES; return 0; fail: *perms = nullperms; return state; } /** * label_components_match - find perms for all subcomponents of a label * @profile: profile to find perms for * @rules: ruleset to search * @label: label to check access permissions for * @start: state to start match in * @subns: whether to do permission checks on components in a subns * @request: permissions to request * @perms: an initialized perms struct to add accumulation to * * Returns: 0 on success else ERROR * * For the label A//&B//&C this does the perm match for each of A and B and C * @perms should be preinitialized with allperms OR a previous permission * check to be stacked. */ static int label_components_match(struct aa_profile *profile, struct aa_ruleset *rules, struct aa_label *label, aa_state_t start, bool subns, u32 request, struct aa_perms *perms) { struct aa_profile *tp; struct label_it i; struct aa_perms tmp; aa_state_t state = 0; /* find first subcomponent to test */ label_for_each(i, label, tp) { if (!aa_ns_visible(profile->ns, tp->ns, subns)) continue; state = match_component(profile, rules, tp, start); if (!state) goto fail; goto next; } /* no subcomponents visible - no change in perms */ return 0; next: tmp = *aa_lookup_perms(rules->policy, state); aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum(perms, &tmp); label_for_each_cont(i, label, tp) { if (!aa_ns_visible(profile->ns, tp->ns, subns)) continue; state = match_component(profile, rules, tp, start); if (!state) goto fail; tmp = *aa_lookup_perms(rules->policy, state); aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum(perms, &tmp); } if ((perms->allow & request) != request) return -EACCES; return 0; fail: *perms = nullperms; return -EACCES; } /** * aa_label_match - do a multi-component label match * @profile: profile to match against (NOT NULL) * @rules: ruleset to search * @label: label to match (NOT NULL) * @state: state to start in * @subns: whether to match subns components * @request: permission request * @perms: Returns computed perms (NOT NULL) * * Returns: the state the match finished in, may be the none matching state */ int aa_label_match(struct aa_profile *profile, struct aa_ruleset *rules, struct aa_label *label, aa_state_t state, bool subns, u32 request, struct aa_perms *perms) { int error = label_compound_match(profile, rules, label, state, subns, request, perms); if (!error) return error; *perms = allperms; return label_components_match(profile, rules, label, state, subns, request, perms); } /** * aa_update_label_name - update a label to have a stored name * @ns: ns being viewed from (NOT NULL) * @label: label to update (NOT NULL) * @gfp: type of memory allocation * * Requires: labels_set(label) not locked in caller * * note: only updates the label name if it does not have a name already * and if it is in the labelset */ bool aa_update_label_name(struct aa_ns *ns, struct aa_label *label, gfp_t gfp) { struct aa_labelset *ls; unsigned long flags; char __counted *name; bool res = false; AA_BUG(!ns); AA_BUG(!label); if (label->hname || labels_ns(label) != ns) return res; if (aa_label_acntsxprint(&name, ns, label, FLAGS_NONE, gfp) < 0) return res; ls = labels_set(label); write_lock_irqsave(&ls->lock, flags); if (!label->hname && label->flags & FLAG_IN_TREE) { label->hname = name; res = true; } else aa_put_str(name); write_unlock_irqrestore(&ls->lock, flags); return res; } /* * cached label name is present and visible * @label->hname only exists if label is namespace hierachical */ static inline bool use_label_hname(struct aa_ns *ns, struct aa_label *label, int flags) { if (label->hname && (!ns || labels_ns(label) == ns) && !(flags & ~FLAG_SHOW_MODE)) return true; return false; } /* helper macro for snprint routines */ #define update_for_len(total, len, size, str) \ do { \ size_t ulen = len; \ \ AA_BUG(len < 0); \ total += ulen; \ ulen = min(ulen, size); \ size -= ulen; \ str += ulen; \ } while (0) /** * aa_profile_snxprint - print a profile name to a buffer * @str: buffer to write to. (MAY BE NULL if @size == 0) * @size: size of buffer * @view: namespace profile is being viewed from * @profile: profile to view (NOT NULL) * @flags: whether to include the mode string * @prev_ns: last ns printed when used in compound print * * Returns: size of name written or would be written if larger than * available buffer * * Note: will not print anything if the profile is not visible */ static int aa_profile_snxprint(char *str, size_t size, struct aa_ns *view, struct aa_profile *profile, int flags, struct aa_ns **prev_ns) { const char *ns_name = NULL; AA_BUG(!str && size != 0); AA_BUG(!profile); if (!view) view = profiles_ns(profile); if (view != profile->ns && (!prev_ns || (*prev_ns != profile->ns))) { if (prev_ns) *prev_ns = profile->ns; ns_name = aa_ns_name(view, profile->ns, flags & FLAG_VIEW_SUBNS); if (ns_name == aa_hidden_ns_name) { if (flags & FLAG_HIDDEN_UNCONFINED) return snprintf(str, size, "%s", "unconfined"); return snprintf(str, size, "%s", ns_name); } } if ((flags & FLAG_SHOW_MODE) && profile != profile->ns->unconfined) { const char *modestr = aa_profile_mode_names[profile->mode]; if (ns_name) return snprintf(str, size, ":%s:%s (%s)", ns_name, profile->base.hname, modestr); return snprintf(str, size, "%s (%s)", profile->base.hname, modestr); } if (ns_name) return snprintf(str, size, ":%s:%s", ns_name, profile->base.hname); return snprintf(str, size, "%s", profile->base.hname); } static const char *label_modename(struct aa_ns *ns, struct aa_label *label, int flags) { struct aa_profile *profile; struct label_it i; int mode = -1, count = 0; label_for_each(i, label, profile) { if (aa_ns_visible(ns, profile->ns, flags & FLAG_VIEW_SUBNS)) { count++; if (profile == profile->ns->unconfined) /* special case unconfined so stacks with * unconfined don't report as mixed. ie. * profile_foo//&:ns1:unconfined (mixed) */ continue; if (mode == -1) mode = profile->mode; else if (mode != profile->mode) return "mixed"; } } if (count == 0) return "-"; if (mode == -1) /* everything was unconfined */ mode = APPARMOR_UNCONFINED; return aa_profile_mode_names[mode]; } /* if any visible label is not unconfined the display_mode returns true */ static inline bool display_mode(struct aa_ns *ns, struct aa_label *label, int flags) { if ((flags & FLAG_SHOW_MODE)) { struct aa_profile *profile; struct label_it i; label_for_each(i, label, profile) { if (aa_ns_visible(ns, profile->ns, flags & FLAG_VIEW_SUBNS) && profile != profile->ns->unconfined) return true; } /* only ns->unconfined in set of profiles in ns */ return false; } return false; } /** * aa_label_snxprint - print a label name to a string buffer * @str: buffer to write to. (MAY BE NULL if @size == 0) * @size: size of buffer * @ns: namespace profile is being viewed from * @label: label to view (NOT NULL) * @flags: whether to include the mode string * * Returns: size of name written or would be written if larger than * available buffer * * Note: labels do not have to be strictly hierarchical to the ns as * objects may be shared across different namespaces and thus * pickup labeling from each ns. If a particular part of the * label is not visible it will just be excluded. And if none * of the label is visible "---" will be used. */ int aa_label_snxprint(char *str, size_t size, struct aa_ns *ns, struct aa_label *label, int flags) { struct aa_profile *profile; struct aa_ns *prev_ns = NULL; struct label_it i; int count = 0, total = 0; ssize_t len; AA_BUG(!str && size != 0); AA_BUG(!label); if (AA_DEBUG_LABEL && (flags & FLAG_ABS_ROOT)) { ns = root_ns; len = snprintf(str, size, "_"); update_for_len(total, len, size, str); } else if (!ns) { ns = labels_ns(label); } label_for_each(i, label, profile) { if (aa_ns_visible(ns, profile->ns, flags & FLAG_VIEW_SUBNS)) { if (count > 0) { len = snprintf(str, size, "//&"); update_for_len(total, len, size, str); } len = aa_profile_snxprint(str, size, ns, profile, flags & FLAG_VIEW_SUBNS, &prev_ns); update_for_len(total, len, size, str); count++; } } if (count == 0) { if (flags & FLAG_HIDDEN_UNCONFINED) return snprintf(str, size, "%s", "unconfined"); return snprintf(str, size, "%s", aa_hidden_ns_name); } /* count == 1 && ... is for backwards compat where the mode * is not displayed for 'unconfined' in the current ns */ if (display_mode(ns, label, flags)) { len = snprintf(str, size, " (%s)", label_modename(ns, label, flags)); update_for_len(total, len, size, str); } return total; } #undef update_for_len /** * aa_label_asxprint - allocate a string buffer and print label into it * @strp: Returns - the allocated buffer with the label name. (NOT NULL) * @ns: namespace profile is being viewed from * @label: label to view (NOT NULL) * @flags: flags controlling what label info is printed * @gfp: kernel memory allocation type * * Returns: size of name written or would be written if larger than * available buffer */ int aa_label_asxprint(char **strp, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp) { int size; AA_BUG(!strp); AA_BUG(!label); size = aa_label_snxprint(NULL, 0, ns, label, flags); if (size < 0) return size; *strp = kmalloc(size + 1, gfp); if (!*strp) return -ENOMEM; return aa_label_snxprint(*strp, size + 1, ns, label, flags); } /** * aa_label_acntsxprint - allocate a __counted string buffer and print label * @strp: buffer to write to. * @ns: namespace profile is being viewed from * @label: label to view (NOT NULL) * @flags: flags controlling what label info is printed * @gfp: kernel memory allocation type * * Returns: size of name written or would be written if larger than * available buffer */ int aa_label_acntsxprint(char __counted **strp, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp) { int size; AA_BUG(!strp); AA_BUG(!label); size = aa_label_snxprint(NULL, 0, ns, label, flags); if (size < 0) return size; *strp = aa_str_alloc(size + 1, gfp); if (!*strp) return -ENOMEM; return aa_label_snxprint(*strp, size + 1, ns, label, flags); } void aa_label_xaudit(struct audit_buffer *ab, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp) { const char *str; char *name = NULL; int len; AA_BUG(!ab); AA_BUG(!label); if (!use_label_hname(ns, label, flags) || display_mode(ns, label, flags)) { len = aa_label_asxprint(&name, ns, label, flags, gfp); if (len < 0) { AA_DEBUG("label print error"); return; } str = name; } else { str = (char *) label->hname; len = strlen(str); } if (audit_string_contains_control(str, len)) audit_log_n_hex(ab, str, len); else audit_log_n_string(ab, str, len); kfree(name); } void aa_label_seq_xprint(struct seq_file *f, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp) { AA_BUG(!f); AA_BUG(!label); if (!use_label_hname(ns, label, flags)) { char *str; int len; len = aa_label_asxprint(&str, ns, label, flags, gfp); if (len < 0) { AA_DEBUG("label print error"); return; } seq_puts(f, str); kfree(str); } else if (display_mode(ns, label, flags)) seq_printf(f, "%s (%s)", label->hname, label_modename(ns, label, flags)); else seq_puts(f, label->hname); } void aa_label_xprintk(struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp) { AA_BUG(!label); if (!use_label_hname(ns, label, flags)) { char *str; int len; len = aa_label_asxprint(&str, ns, label, flags, gfp); if (len < 0) { AA_DEBUG("label print error"); return; } pr_info("%s", str); kfree(str); } else if (display_mode(ns, label, flags)) pr_info("%s (%s)", label->hname, label_modename(ns, label, flags)); else pr_info("%s", label->hname); } void aa_label_audit(struct audit_buffer *ab, struct aa_label *label, gfp_t gfp) { struct aa_ns *ns = aa_get_current_ns(); aa_label_xaudit(ab, ns, label, FLAG_VIEW_SUBNS, gfp); aa_put_ns(ns); } void aa_label_seq_print(struct seq_file *f, struct aa_label *label, gfp_t gfp) { struct aa_ns *ns = aa_get_current_ns(); aa_label_seq_xprint(f, ns, label, FLAG_VIEW_SUBNS, gfp); aa_put_ns(ns); } void aa_label_printk(struct aa_label *label, gfp_t gfp) { struct aa_ns *ns = aa_get_current_ns(); aa_label_xprintk(ns, label, FLAG_VIEW_SUBNS, gfp); aa_put_ns(ns); } static int label_count_strn_entries(const char *str, size_t n) { const char *end = str + n; const char *split; int count = 1; AA_BUG(!str); for (split = aa_label_strn_split(str, end - str); split; split = aa_label_strn_split(str, end - str)) { count++; str = split + 3; } return count; } /* * ensure stacks with components like * :ns:A//&B * have :ns: applied to both 'A' and 'B' by making the lookup relative * to the base if the lookup specifies an ns, else making the stacked lookup * relative to the last embedded ns in the string. */ static struct aa_profile *fqlookupn_profile(struct aa_label *base, struct aa_label *currentbase, const char *str, size_t n) { const char *first = skipn_spaces(str, n); if (first && *first == ':') return aa_fqlookupn_profile(base, str, n); return aa_fqlookupn_profile(currentbase, str, n); } /** * aa_label_strn_parse - parse, validate and convert a text string to a label * @base: base label to use for lookups (NOT NULL) * @str: null terminated text string (NOT NULL) * @n: length of str to parse, will stop at \0 if encountered before n * @gfp: allocation type * @create: true if should create compound labels if they don't exist * @force_stack: true if should stack even if no leading & * * Returns: the matching refcounted label if present * else ERRPTR */ struct aa_label *aa_label_strn_parse(struct aa_label *base, const char *str, size_t n, gfp_t gfp, bool create, bool force_stack) { DEFINE_VEC(profile, vec); struct aa_label *label, *currbase = base; int i, len, stack = 0, error; const char *end = str + n; const char *split; AA_BUG(!base); AA_BUG(!str); str = skipn_spaces(str, n); if (str == NULL || (AA_DEBUG_LABEL && *str == '_' && base != &root_ns->unconfined->label)) return ERR_PTR(-EINVAL); len = label_count_strn_entries(str, end - str); if (*str == '&' || force_stack) { /* stack on top of base */ stack = base->size; len += stack; if (*str == '&') str++; } error = vec_setup(profile, vec, len, gfp); if (error) return ERR_PTR(error); for (i = 0; i < stack; i++) vec[i] = aa_get_profile(base->vec[i]); for (split = aa_label_strn_split(str, end - str), i = stack; split && i < len; i++) { vec[i] = fqlookupn_profile(base, currbase, str, split - str); if (!vec[i]) goto fail; /* * if component specified a new ns it becomes the new base * so that subsequent lookups are relative to it */ if (vec[i]->ns != labels_ns(currbase)) currbase = &vec[i]->label; str = split + 3; split = aa_label_strn_split(str, end - str); } /* last element doesn't have a split */ if (i < len) { vec[i] = fqlookupn_profile(base, currbase, str, end - str); if (!vec[i]) goto fail; } if (len == 1) /* no need to free vec as len < LOCAL_VEC_ENTRIES */ return &vec[0]->label; len -= aa_vec_unique(vec, len, VEC_FLAG_TERMINATE); /* TODO: deal with reference labels */ if (len == 1) { label = aa_get_label(&vec[0]->label); goto out; } if (create) label = aa_vec_find_or_create_label(vec, len, gfp); else label = vec_find(vec, len); if (!label) goto fail; out: /* use adjusted len from after vec_unique, not original */ vec_cleanup(profile, vec, len); return label; fail: label = ERR_PTR(-ENOENT); goto out; } struct aa_label *aa_label_parse(struct aa_label *base, const char *str, gfp_t gfp, bool create, bool force_stack) { return aa_label_strn_parse(base, str, strlen(str), gfp, create, force_stack); } /** * aa_labelset_destroy - remove all labels from the label set * @ls: label set to cleanup (NOT NULL) * * Labels that are removed from the set may still exist beyond the set * being destroyed depending on their reference counting */ void aa_labelset_destroy(struct aa_labelset *ls) { struct rb_node *node; unsigned long flags; AA_BUG(!ls); write_lock_irqsave(&ls->lock, flags); for (node = rb_first(&ls->root); node; node = rb_first(&ls->root)) { struct aa_label *this = rb_entry(node, struct aa_label, node); if (labels_ns(this) != root_ns) __label_remove(this, ns_unconfined(labels_ns(this)->parent)); else __label_remove(this, NULL); } write_unlock_irqrestore(&ls->lock, flags); } /* * @ls: labelset to init (NOT NULL) */ void aa_labelset_init(struct aa_labelset *ls) { AA_BUG(!ls); rwlock_init(&ls->lock); ls->root = RB_ROOT; } static struct aa_label *labelset_next_stale(struct aa_labelset *ls) { struct aa_label *label; struct rb_node *node; unsigned long flags; AA_BUG(!ls); read_lock_irqsave(&ls->lock, flags); __labelset_for_each(ls, node) { label = rb_entry(node, struct aa_label, node); if ((label_is_stale(label) || vec_is_stale(label->vec, label->size)) && __aa_get_label(label)) goto out; } label = NULL; out: read_unlock_irqrestore(&ls->lock, flags); return label; } /** * __label_update - insert updated version of @label into labelset * @label: the label to update/replace * * Returns: new label that is up to date * else NULL on failure * * Requires: @ns lock be held * * Note: worst case is the stale @label does not get updated and has * to be updated at a later time. */ static struct aa_label *__label_update(struct aa_label *label) { struct aa_label *new, *tmp; struct aa_labelset *ls; unsigned long flags; int i, invcount = 0; AA_BUG(!label); AA_BUG(!mutex_is_locked(&labels_ns(label)->lock)); new = aa_label_alloc(label->size, label->proxy, GFP_KERNEL); if (!new) return NULL; /* * while holding the ns_lock will stop profile replacement, removal, * and label updates, label merging and removal can be occurring */ ls = labels_set(label); write_lock_irqsave(&ls->lock, flags); for (i = 0; i < label->size; i++) { AA_BUG(!label->vec[i]); new->vec[i] = aa_get_newest_profile(label->vec[i]); AA_BUG(!new->vec[i]); AA_BUG(!new->vec[i]->label.proxy); AA_BUG(!new->vec[i]->label.proxy->label); if (new->vec[i]->label.proxy != label->vec[i]->label.proxy) invcount++; } /* updated stale label by being removed/renamed from labelset */ if (invcount) { new->size -= aa_vec_unique(&new->vec[0], new->size, VEC_FLAG_TERMINATE); /* TODO: deal with reference labels */ if (new->size == 1) { tmp = aa_get_label(&new->vec[0]->label); AA_BUG(tmp == label); goto remove; } if (labels_set(label) != labels_set(new)) { write_unlock_irqrestore(&ls->lock, flags); tmp = aa_label_insert(labels_set(new), new); write_lock_irqsave(&ls->lock, flags); goto remove; } } else AA_BUG(labels_ns(label) != labels_ns(new)); tmp = __label_insert(labels_set(label), new, true); remove: /* ensure label is removed, and redirected correctly */ __label_remove(label, tmp); write_unlock_irqrestore(&ls->lock, flags); label_free_or_put_new(tmp, new); return tmp; } /** * __labelset_update - update labels in @ns * @ns: namespace to update labels in (NOT NULL) * * Requires: @ns lock be held * * Walk the labelset ensuring that all labels are up to date and valid * Any label that has a stale component is marked stale and replaced and * by an updated version. * * If failures happen due to memory pressures then stale labels will * be left in place until the next pass. */ static void __labelset_update(struct aa_ns *ns) { struct aa_label *label; AA_BUG(!ns); AA_BUG(!mutex_is_locked(&ns->lock)); do { label = labelset_next_stale(&ns->labels); if (label) { struct aa_label *l = __label_update(label); aa_put_label(l); aa_put_label(label); } } while (label); } /** * __aa_labelset_update_subtree - update all labels with a stale component * @ns: ns to start update at (NOT NULL) * * Requires: @ns lock be held * * Invalidates labels based on @p in @ns and any children namespaces. */ void __aa_labelset_update_subtree(struct aa_ns *ns) { struct aa_ns *child; AA_BUG(!ns); AA_BUG(!mutex_is_locked(&ns->lock)); __labelset_update(ns); list_for_each_entry(child, &ns->sub_ns, base.list) { mutex_lock_nested(&child->lock, child->level); __aa_labelset_update_subtree(child); mutex_unlock(&child->lock); } } |
| 14 15 16 14 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright 2019 Google LLC */ /* * Refer to Documentation/block/inline-encryption.rst for detailed explanation. */ #define pr_fmt(fmt) "blk-crypto: " fmt #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-crypto-profile.h> #include <linux/module.h> #include <linux/ratelimit.h> #include <linux/slab.h> #include "blk-crypto-internal.h" const struct blk_crypto_mode blk_crypto_modes[] = { [BLK_ENCRYPTION_MODE_AES_256_XTS] = { .name = "AES-256-XTS", .cipher_str = "xts(aes)", .keysize = 64, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = { .name = "AES-128-CBC-ESSIV", .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_ADIANTUM] = { .name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", .keysize = 32, .ivsize = 32, }, [BLK_ENCRYPTION_MODE_SM4_XTS] = { .name = "SM4-XTS", .cipher_str = "xts(sm4)", .keysize = 32, .ivsize = 16, }, }; /* * This number needs to be at least (the number of threads doing IO * concurrently) * (maximum recursive depth of a bio), so that we don't * deadlock on crypt_ctx allocations. The default is chosen to be the same * as the default number of post read contexts in both EXT4 and F2FS. */ static int num_prealloc_crypt_ctxs = 128; module_param(num_prealloc_crypt_ctxs, int, 0444); MODULE_PARM_DESC(num_prealloc_crypt_ctxs, "Number of bio crypto contexts to preallocate"); static struct kmem_cache *bio_crypt_ctx_cache; static mempool_t *bio_crypt_ctx_pool; static int __init bio_crypt_ctx_init(void) { size_t i; bio_crypt_ctx_cache = KMEM_CACHE(bio_crypt_ctx, 0); if (!bio_crypt_ctx_cache) goto out_no_mem; bio_crypt_ctx_pool = mempool_create_slab_pool(num_prealloc_crypt_ctxs, bio_crypt_ctx_cache); if (!bio_crypt_ctx_pool) goto out_no_mem; /* This is assumed in various places. */ BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0); /* Sanity check that no algorithm exceeds the defined limits. */ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) { BUG_ON(blk_crypto_modes[i].keysize > BLK_CRYPTO_MAX_KEY_SIZE); BUG_ON(blk_crypto_modes[i].ivsize > BLK_CRYPTO_MAX_IV_SIZE); } return 0; out_no_mem: panic("Failed to allocate mem for bio crypt ctxs\n"); } subsys_initcall(bio_crypt_ctx_init); void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key, const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], gfp_t gfp_mask) { struct bio_crypt_ctx *bc; /* * The caller must use a gfp_mask that contains __GFP_DIRECT_RECLAIM so * that the mempool_alloc() can't fail. */ WARN_ON_ONCE(!(gfp_mask & __GFP_DIRECT_RECLAIM)); bc = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); bc->bc_key = key; memcpy(bc->bc_dun, dun, sizeof(bc->bc_dun)); bio->bi_crypt_context = bc; } void __bio_crypt_free_ctx(struct bio *bio) { mempool_free(bio->bi_crypt_context, bio_crypt_ctx_pool); bio->bi_crypt_context = NULL; } int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask) { dst->bi_crypt_context = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); if (!dst->bi_crypt_context) return -ENOMEM; *dst->bi_crypt_context = *src->bi_crypt_context; return 0; } /* Increments @dun by @inc, treating @dun as a multi-limb integer. */ void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], unsigned int inc) { int i; for (i = 0; inc && i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) { dun[i] += inc; /* * If the addition in this limb overflowed, then we need to * carry 1 into the next limb. Else the carry is 0. */ if (dun[i] < inc) inc = 1; else inc = 0; } } void __bio_crypt_advance(struct bio *bio, unsigned int bytes) { struct bio_crypt_ctx *bc = bio->bi_crypt_context; bio_crypt_dun_increment(bc->bc_dun, bytes >> bc->bc_key->data_unit_size_bits); } /* * Returns true if @bc->bc_dun plus @bytes converted to data units is equal to * @next_dun, treating the DUNs as multi-limb integers. */ bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc, unsigned int bytes, const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]) { int i; unsigned int carry = bytes >> bc->bc_key->data_unit_size_bits; for (i = 0; i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) { if (bc->bc_dun[i] + carry != next_dun[i]) return false; /* * If the addition in this limb overflowed, then we need to * carry 1 into the next limb. Else the carry is 0. */ if ((bc->bc_dun[i] + carry) < carry) carry = 1; else carry = 0; } /* If the DUN wrapped through 0, don't treat it as contiguous. */ return carry == 0; } /* * Checks that two bio crypt contexts are compatible - i.e. that * they are mergeable except for data_unit_num continuity. */ static bool bio_crypt_ctx_compatible(struct bio_crypt_ctx *bc1, struct bio_crypt_ctx *bc2) { if (!bc1) return !bc2; return bc2 && bc1->bc_key == bc2->bc_key; } bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio) { return bio_crypt_ctx_compatible(rq->crypt_ctx, bio->bi_crypt_context); } /* * Checks that two bio crypt contexts are compatible, and also * that their data_unit_nums are continuous (and can hence be merged) * in the order @bc1 followed by @bc2. */ bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes, struct bio_crypt_ctx *bc2) { if (!bio_crypt_ctx_compatible(bc1, bc2)) return false; return !bc1 || bio_crypt_dun_is_contiguous(bc1, bc1_bytes, bc2->bc_dun); } /* Check that all I/O segments are data unit aligned. */ static bool bio_crypt_check_alignment(struct bio *bio) { const unsigned int data_unit_size = bio->bi_crypt_context->bc_key->crypto_cfg.data_unit_size; struct bvec_iter iter; struct bio_vec bv; bio_for_each_segment(bv, bio, iter) { if (!IS_ALIGNED(bv.bv_len | bv.bv_offset, data_unit_size)) return false; } return true; } blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq) { return blk_crypto_get_keyslot(rq->q->crypto_profile, rq->crypt_ctx->bc_key, &rq->crypt_keyslot); } void __blk_crypto_rq_put_keyslot(struct request *rq) { blk_crypto_put_keyslot(rq->crypt_keyslot); rq->crypt_keyslot = NULL; } void __blk_crypto_free_request(struct request *rq) { /* The keyslot, if one was needed, should have been released earlier. */ if (WARN_ON_ONCE(rq->crypt_keyslot)) __blk_crypto_rq_put_keyslot(rq); mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool); rq->crypt_ctx = NULL; } /** * __blk_crypto_bio_prep - Prepare bio for inline encryption * * @bio_ptr: pointer to original bio pointer * * If the bio crypt context provided for the bio is supported by the underlying * device's inline encryption hardware, do nothing. * * Otherwise, try to perform en/decryption for this bio by falling back to the * kernel crypto API. When the crypto API fallback is used for encryption, * blk-crypto may choose to split the bio into 2 - the first one that will * continue to be processed and the second one that will be resubmitted via * submit_bio_noacct. A bounce bio will be allocated to encrypt the contents * of the aforementioned "first one", and *bio_ptr will be updated to this * bounce bio. * * Caller must ensure bio has bio_crypt_ctx. * * Return: true on success; false on error (and bio->bi_status will be set * appropriately, and bio_endio() will have been called so bio * submission should abort). */ bool __blk_crypto_bio_prep(struct bio **bio_ptr) { struct bio *bio = *bio_ptr; const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key; /* Error if bio has no data. */ if (WARN_ON_ONCE(!bio_has_data(bio))) { bio->bi_status = BLK_STS_IOERR; goto fail; } if (!bio_crypt_check_alignment(bio)) { bio->bi_status = BLK_STS_IOERR; goto fail; } /* * Success if device supports the encryption context, or if we succeeded * in falling back to the crypto API. */ if (blk_crypto_config_supported_natively(bio->bi_bdev, &bc_key->crypto_cfg)) return true; if (blk_crypto_fallback_bio_prep(bio_ptr)) return true; fail: bio_endio(*bio_ptr); return false; } int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, gfp_t gfp_mask) { if (!rq->crypt_ctx) { rq->crypt_ctx = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); if (!rq->crypt_ctx) return -ENOMEM; } *rq->crypt_ctx = *bio->bi_crypt_context; return 0; } /** * blk_crypto_init_key() - Prepare a key for use with blk-crypto * @blk_key: Pointer to the blk_crypto_key to initialize. * @raw_key: Pointer to the raw key. Must be the correct length for the chosen * @crypto_mode; see blk_crypto_modes[]. * @crypto_mode: identifier for the encryption algorithm to use * @dun_bytes: number of bytes that will be used to specify the DUN when this * key is used * @data_unit_size: the data unit size to use for en/decryption * * Return: 0 on success, -errno on failure. The caller is responsible for * zeroizing both blk_key and raw_key when done with them. */ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, enum blk_crypto_mode_num crypto_mode, unsigned int dun_bytes, unsigned int data_unit_size) { const struct blk_crypto_mode *mode; memset(blk_key, 0, sizeof(*blk_key)); if (crypto_mode >= ARRAY_SIZE(blk_crypto_modes)) return -EINVAL; mode = &blk_crypto_modes[crypto_mode]; if (mode->keysize == 0) return -EINVAL; if (dun_bytes == 0 || dun_bytes > mode->ivsize) return -EINVAL; if (!is_power_of_2(data_unit_size)) return -EINVAL; blk_key->crypto_cfg.crypto_mode = crypto_mode; blk_key->crypto_cfg.dun_bytes = dun_bytes; blk_key->crypto_cfg.data_unit_size = data_unit_size; blk_key->data_unit_size_bits = ilog2(data_unit_size); blk_key->size = mode->keysize; memcpy(blk_key->raw, raw_key, mode->keysize); return 0; } bool blk_crypto_config_supported_natively(struct block_device *bdev, const struct blk_crypto_config *cfg) { return __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile, cfg); } /* * Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the * block_device it's submitted to supports inline crypto, or the * blk-crypto-fallback is enabled and supports the cfg). */ bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg) { return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || blk_crypto_config_supported_natively(bdev, cfg); } /** * blk_crypto_start_using_key() - Start using a blk_crypto_key on a device * @bdev: block device to operate on * @key: A key to use on the device * * Upper layers must call this function to ensure that either the hardware * supports the key's crypto settings, or the crypto API fallback has transforms * for the needed mode allocated and ready to go. This function may allocate * an skcipher, and *should not* be called from the data path, since that might * cause a deadlock * * Return: 0 on success; -ENOPKG if the hardware doesn't support the key and * blk-crypto-fallback is either disabled or the needed algorithm * is disabled in the crypto API; or another -errno code. */ int blk_crypto_start_using_key(struct block_device *bdev, const struct blk_crypto_key *key) { if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) return 0; return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode); } /** * blk_crypto_evict_key() - Evict a blk_crypto_key from a block_device * @bdev: a block_device on which I/O using the key may have been done * @key: the key to evict * * For a given block_device, this function removes the given blk_crypto_key from * the keyslot management structures and evicts it from any underlying hardware * keyslot(s) or blk-crypto-fallback keyslot it may have been programmed into. * * Upper layers must call this before freeing the blk_crypto_key. It must be * called for every block_device the key may have been used on. The key must no * longer be in use by any I/O when this function is called. * * Context: May sleep. */ void blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key) { struct request_queue *q = bdev_get_queue(bdev); int err; if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) err = __blk_crypto_evict_key(q->crypto_profile, key); else err = blk_crypto_fallback_evict_key(key); /* * An error can only occur here if the key failed to be evicted from a * keyslot (due to a hardware or driver issue) or is allegedly still in * use by I/O (due to a kernel bug). Even in these cases, the key is * still unlinked from the keyslot management structures, and the caller * is allowed and expected to free it right away. There's nothing * callers can do to handle errors, so just log them and return void. */ if (err) pr_warn_ratelimited("%pg: error %d evicting key\n", bdev, err); } EXPORT_SYMBOL_GPL(blk_crypto_evict_key); |
| 3 2 2 2 2 2 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 | // SPDX-License-Identifier: GPL-2.0-only /* * net/sched/sch_choke.c CHOKE scheduler * * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com> * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com> */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/inet_ecn.h> #include <net/red.h> #include <net/flow_dissector.h> /* CHOKe stateless AQM for fair bandwidth allocation ================================================= CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for unresponsive flows) is a variant of RED that penalizes misbehaving flows but maintains no flow state. The difference from RED is an additional step during the enqueuing process. If average queue size is over the low threshold (qmin), a packet is chosen at random from the queue. If both the new and chosen packet are from the same flow, both are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it needs to access packets in queue randomly. It has a minimal class interface to allow overriding the builtin flow classifier with filters. Source: R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless Active Queue Management Scheme for Approximating Fair Bandwidth Allocation", IEEE INFOCOM, 2000. A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial Characteristics", IEEE/ACM Transactions on Networking, 2004 */ /* Upper bound on size of sk_buff table (packets) */ #define CHOKE_MAX_QUEUE (128*1024 - 1) struct choke_sched_data { /* Parameters */ u32 limit; unsigned char flags; struct red_parms parms; /* Variables */ struct red_vars vars; struct { u32 prob_drop; /* Early probability drops */ u32 prob_mark; /* Early probability marks */ u32 forced_drop; /* Forced drops, qavg > max_thresh */ u32 forced_mark; /* Forced marks, qavg > max_thresh */ u32 pdrop; /* Drops due to queue limits */ u32 matched; /* Drops to flow match */ } stats; unsigned int head; unsigned int tail; unsigned int tab_mask; /* size - 1 */ struct sk_buff **tab; }; /* number of elements in queue including holes */ static unsigned int choke_len(const struct choke_sched_data *q) { return (q->tail - q->head) & q->tab_mask; } /* Is ECN parameter configured */ static int use_ecn(const struct choke_sched_data *q) { return q->flags & TC_RED_ECN; } /* Should packets over max just be dropped (versus marked) */ static int use_harddrop(const struct choke_sched_data *q) { return q->flags & TC_RED_HARDDROP; } /* Move head pointer forward to skip over holes */ static void choke_zap_head_holes(struct choke_sched_data *q) { do { q->head = (q->head + 1) & q->tab_mask; if (q->head == q->tail) break; } while (q->tab[q->head] == NULL); } /* Move tail pointer backwards to reuse holes */ static void choke_zap_tail_holes(struct choke_sched_data *q) { do { q->tail = (q->tail - 1) & q->tab_mask; if (q->head == q->tail) break; } while (q->tab[q->tail] == NULL); } /* Drop packet from queue array by creating a "hole" */ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx, struct sk_buff **to_free) { struct choke_sched_data *q = qdisc_priv(sch); struct sk_buff *skb = q->tab[idx]; q->tab[idx] = NULL; if (idx == q->head) choke_zap_head_holes(q); if (idx == q->tail) choke_zap_tail_holes(q); qdisc_qstats_backlog_dec(sch, skb); qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_drop(skb, sch, to_free); --sch->q.qlen; } struct choke_skb_cb { u8 keys_valid; struct flow_keys_digest keys; }; static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) { qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb)); return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data; } /* * Compare flow of two packets * Returns true only if source and destination address and port match. * false for special cases */ static bool choke_match_flow(struct sk_buff *skb1, struct sk_buff *skb2) { struct flow_keys temp; if (skb1->protocol != skb2->protocol) return false; if (!choke_skb_cb(skb1)->keys_valid) { choke_skb_cb(skb1)->keys_valid = 1; skb_flow_dissect_flow_keys(skb1, &temp, 0); make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp); } if (!choke_skb_cb(skb2)->keys_valid) { choke_skb_cb(skb2)->keys_valid = 1; skb_flow_dissect_flow_keys(skb2, &temp, 0); make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp); } return !memcmp(&choke_skb_cb(skb1)->keys, &choke_skb_cb(skb2)->keys, sizeof(choke_skb_cb(skb1)->keys)); } /* * Select a packet at random from queue * HACK: since queue can have holes from previous deletion; retry several * times to find a random skb but then just give up and return the head * Will return NULL if queue is empty (q->head == q->tail) */ static struct sk_buff *choke_peek_random(const struct choke_sched_data *q, unsigned int *pidx) { struct sk_buff *skb; int retrys = 3; do { *pidx = (q->head + get_random_u32_below(choke_len(q))) & q->tab_mask; skb = q->tab[*pidx]; if (skb) return skb; } while (--retrys > 0); return q->tab[*pidx = q->head]; } /* * Compare new packet with random packet in queue * returns true if matched and sets *pidx */ static bool choke_match_random(const struct choke_sched_data *q, struct sk_buff *nskb, unsigned int *pidx) { struct sk_buff *oskb; if (q->head == q->tail) return false; oskb = choke_peek_random(q, pidx); return choke_match_flow(oskb, nskb); } static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct choke_sched_data *q = qdisc_priv(sch); const struct red_parms *p = &q->parms; choke_skb_cb(skb)->keys_valid = 0; /* Compute average queue usage (see RED) */ q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen); if (red_is_idling(&q->vars)) red_end_of_idle_period(&q->vars); /* Is queue small? */ if (q->vars.qavg <= p->qth_min) q->vars.qcount = -1; else { unsigned int idx; /* Draw a packet at random from queue and compare flow */ if (choke_match_random(q, skb, &idx)) { q->stats.matched++; choke_drop_by_idx(sch, idx, to_free); goto congestion_drop; } /* Queue is large, always mark/drop */ if (q->vars.qavg > p->qth_max) { q->vars.qcount = -1; qdisc_qstats_overlimit(sch); if (use_harddrop(q) || !use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.forced_drop++; goto congestion_drop; } q->stats.forced_mark++; } else if (++q->vars.qcount) { if (red_mark_probability(p, &q->vars, q->vars.qavg)) { q->vars.qcount = 0; q->vars.qR = red_random(p); qdisc_qstats_overlimit(sch); if (!use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.prob_drop++; goto congestion_drop; } q->stats.prob_mark++; } } else q->vars.qR = red_random(p); } /* Admit new packet */ if (sch->q.qlen < q->limit) { q->tab[q->tail] = skb; q->tail = (q->tail + 1) & q->tab_mask; ++sch->q.qlen; qdisc_qstats_backlog_inc(sch, skb); return NET_XMIT_SUCCESS; } q->stats.pdrop++; return qdisc_drop(skb, sch, to_free); congestion_drop: qdisc_drop(skb, sch, to_free); return NET_XMIT_CN; } static struct sk_buff *choke_dequeue(struct Qdisc *sch) { struct choke_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; if (q->head == q->tail) { if (!red_is_idling(&q->vars)) red_start_of_idle_period(&q->vars); return NULL; } skb = q->tab[q->head]; q->tab[q->head] = NULL; choke_zap_head_holes(q); --sch->q.qlen; qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); return skb; } static void choke_reset(struct Qdisc *sch) { struct choke_sched_data *q = qdisc_priv(sch); while (q->head != q->tail) { struct sk_buff *skb = q->tab[q->head]; q->head = (q->head + 1) & q->tab_mask; if (!skb) continue; rtnl_qdisc_drop(skb, sch); } if (q->tab) memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *)); q->head = q->tail = 0; red_restart(&q->vars); } static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = { [TCA_CHOKE_PARMS] = { .len = sizeof(struct tc_red_qopt) }, [TCA_CHOKE_STAB] = { .len = RED_STAB_SIZE }, [TCA_CHOKE_MAX_P] = { .type = NLA_U32 }, }; static void choke_free(void *addr) { kvfree(addr); } static int choke_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct choke_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_CHOKE_MAX + 1]; const struct tc_red_qopt *ctl; int err; struct sk_buff **old = NULL; unsigned int mask; u32 max_P; u8 *stab; if (opt == NULL) return -EINVAL; err = nla_parse_nested_deprecated(tb, TCA_CHOKE_MAX, opt, choke_policy, NULL); if (err < 0) return err; if (tb[TCA_CHOKE_PARMS] == NULL || tb[TCA_CHOKE_STAB] == NULL) return -EINVAL; max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0; ctl = nla_data(tb[TCA_CHOKE_PARMS]); stab = nla_data(tb[TCA_CHOKE_STAB]); if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log, stab)) return -EINVAL; if (ctl->limit > CHOKE_MAX_QUEUE) return -EINVAL; mask = roundup_pow_of_two(ctl->limit + 1) - 1; if (mask != q->tab_mask) { struct sk_buff **ntab; ntab = kvcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL); if (!ntab) return -ENOMEM; sch_tree_lock(sch); old = q->tab; if (old) { unsigned int oqlen = sch->q.qlen, tail = 0; unsigned dropped = 0; while (q->head != q->tail) { struct sk_buff *skb = q->tab[q->head]; q->head = (q->head + 1) & q->tab_mask; if (!skb) continue; if (tail < mask) { ntab[tail++] = skb; continue; } dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); --sch->q.qlen; rtnl_qdisc_drop(skb, sch); } qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped); q->head = 0; q->tail = tail; } q->tab_mask = mask; q->tab = ntab; } else sch_tree_lock(sch); WRITE_ONCE(q->flags, ctl->flags); WRITE_ONCE(q->limit, ctl->limit); red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog, ctl->Scell_log, stab, max_P); red_set_vars(&q->vars); if (q->head == q->tail) red_end_of_idle_period(&q->vars); sch_tree_unlock(sch); choke_free(old); return 0; } static int choke_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { return choke_change(sch, opt, extack); } static int choke_dump(struct Qdisc *sch, struct sk_buff *skb) { struct choke_sched_data *q = qdisc_priv(sch); u8 Wlog = READ_ONCE(q->parms.Wlog); struct nlattr *opts = NULL; struct tc_red_qopt opt = { .limit = READ_ONCE(q->limit), .flags = READ_ONCE(q->flags), .qth_min = READ_ONCE(q->parms.qth_min) >> Wlog, .qth_max = READ_ONCE(q->parms.qth_max) >> Wlog, .Wlog = Wlog, .Plog = READ_ONCE(q->parms.Plog), .Scell_log = READ_ONCE(q->parms.Scell_log), }; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) || nla_put_u32(skb, TCA_CHOKE_MAX_P, READ_ONCE(q->parms.max_P))) goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: nla_nest_cancel(skb, opts); return -EMSGSIZE; } static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct choke_sched_data *q = qdisc_priv(sch); struct tc_choke_xstats st = { .early = q->stats.prob_drop + q->stats.forced_drop, .marked = q->stats.prob_mark + q->stats.forced_mark, .pdrop = q->stats.pdrop, .matched = q->stats.matched, }; return gnet_stats_copy_app(d, &st, sizeof(st)); } static void choke_destroy(struct Qdisc *sch) { struct choke_sched_data *q = qdisc_priv(sch); choke_free(q->tab); } static struct sk_buff *choke_peek_head(struct Qdisc *sch) { struct choke_sched_data *q = qdisc_priv(sch); return (q->head != q->tail) ? q->tab[q->head] : NULL; } static struct Qdisc_ops choke_qdisc_ops __read_mostly = { .id = "choke", .priv_size = sizeof(struct choke_sched_data), .enqueue = choke_enqueue, .dequeue = choke_dequeue, .peek = choke_peek_head, .init = choke_init, .destroy = choke_destroy, .reset = choke_reset, .change = choke_change, .dump = choke_dump, .dump_stats = choke_dump_stats, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("choke"); static int __init choke_module_init(void) { return register_qdisc(&choke_qdisc_ops); } static void __exit choke_module_exit(void) { unregister_qdisc(&choke_qdisc_ops); } module_init(choke_module_init) module_exit(choke_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Choose and keep responsive flows scheduler"); |
| 6 6 14 2 14 14 2 2 14 14 14 14 14 1 3 1 3 22 22 22 22 22 22 22 8 8 2 8 2 2 2 2 2 8 6 6 6 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 | // SPDX-License-Identifier: GPL-2.0-only /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> */ #include <net/tcp.h> #include <net/xfrm.h> #include <net/busy_poll.h> #include <net/rstreason.h> static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { if (seq == s_win) return true; if (after(end_seq, s_win) && before(seq, e_win)) return true; return seq == e_win && seq == end_seq; } static enum tcp_tw_status tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw, const struct sk_buff *skb, int mib_idx) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx, &tcptw->tw_last_oow_ack_time)) { /* Send ACK. Note, we do not put the bucket, * it will be released by caller. */ return TCP_TW_ACK; } /* We are rate-limiting, so just release the tw sock and drop skb. */ inet_twsk_put(tw); return TCP_TW_SUCCESS; } static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq) { #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao; ao = rcu_dereference(tcptw->ao_info); if (unlikely(ao && seq < tcptw->tw_rcv_nxt)) WRITE_ONCE(ao->rcv_sne, ao->rcv_sne + 1); #endif tcptw->tw_rcv_nxt = seq; } /* * * Main purpose of TIME-WAIT state is to close connection gracefully, * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN * (and, probably, tail of data) and one or more our ACKs are lost. * * What is TIME-WAIT timeout? It is associated with maximal packet * lifetime in the internet, which results in wrong conclusion, that * it is set to catch "old duplicate segments" wandering out of their path. * It is not quite correct. This timeout is calculated so that it exceeds * maximal retransmission timeout enough to allow to lose one (or more) * segments sent by peer and our ACKs. This time may be calculated from RTO. * * When TIME-WAIT socket receives RST, it means that another end * finally closed and we are allowed to kill TIME-WAIT too. * * Second purpose of TIME-WAIT is catching old duplicate segments. * Well, certainly it is pure paranoia, but if we load TIME-WAIT * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs. * * If we invented some more clever way to catch duplicates * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs. * * The algorithm below is based on FORMAL INTERPRETATION of RFCs. * When you compare it to RFCs, please, read section SEGMENT ARRIVES * from the very beginning. * * NOTE. With recycling (and later with fin-wait-2) TW bucket * is _not_ stateless. It means, that strictly speaking we must * spinlock it. I do not want! Well, probability of misbehaviour * is ridiculously low and, seems, we could use some mb() tricks * to avoid misread sequence numbers, states etc. --ANK * * We don't need to initialize tmp_out.sack_ok as we don't use the results */ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th, u32 *tw_isn) { struct tcp_options_received tmp_opt; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); bool paws_reject = false; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { if (tmp_opt.rcv_tsecr) tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; tmp_opt.ts_recent = tcptw->tw_ts_recent; tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } if (tw->tw_substate == TCP_FIN_WAIT2) { /* Just repeat all the checks of tcp_rcv_state_process() */ /* Out of window, send ACK */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt, tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2); if (th->rst) goto kill; if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt)) return TCP_TW_RST; /* Dup ACK? */ if (!th->ack || !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { inet_twsk_put(tw); return TCP_TW_SUCCESS; } /* New data or FIN. If new data arrive after half-duplex close, * reset. */ if (!th->fin || TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) return TCP_TW_RST; /* FIN arrived, enter true time-wait state. */ tw->tw_substate = TCP_TIME_WAIT; twsk_rcv_nxt_update(tcptw, TCP_SKB_CB(skb)->end_seq); if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent_stamp = ktime_get_seconds(); tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } /* * Now real TIME-WAIT state. * * RFC 1122: * "When a connection is [...] on TIME-WAIT state [...] * [a TCP] MAY accept a new SYN from the remote TCP to * reopen the connection directly, if it: * * (1) assigns its initial sequence number for the new * connection to be larger than the largest sequence * number it used on the previous connection incarnation, * and * * (2) returns to TIME-WAIT state if the SYN turns out * to be an old duplicate". */ if (!paws_reject && (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { /* In window segment, it may be only reset or bare ack. */ if (th->rst) { /* This is TIME_WAIT assassination, in two flavors. * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ if (!READ_ONCE(twsk_net(tw)->ipv4.sysctl_tcp_rfc1337)) { kill: inet_twsk_deschedule_put(tw); return TCP_TW_SUCCESS; } } else { inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); } if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; tcptw->tw_ts_recent_stamp = ktime_get_seconds(); } inet_twsk_put(tw); return TCP_TW_SUCCESS; } /* Out of window segment. All the segments are ACKed immediately. The only exception is new SYN. We accept it, if it is not old duplicate and we are not in danger to be killed by delayed old duplicates. RFC check is that it has newer sequence number works at rates <40Mbit/sec. However, if paws works, it is reliable AND even more, we even may relax silly seq space cutoff. RED-PEN: we violate main RFC requirement, if this SYN will appear old duplicate (i.e. we receive RST in reply to SYN-ACK), we must return socket to time-wait state. It is not good, but not fatal yet. */ if (th->syn && !th->rst && !th->ack && !paws_reject && (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) || (tmp_opt.saw_tstamp && (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; *tw_isn = isn; return TCP_TW_SYN; } if (paws_reject) __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); if (!th->rst) { /* In this case we must reset the TIMEWAIT timer. * * If it is ACKless SYN it may be both old duplicate * and new good SYN with random sequence number <rcv_nxt. * Do not reschedule in the last case. */ if (paws_reject || th->ack) inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); } inet_twsk_put(tw); return TCP_TW_SUCCESS; } EXPORT_SYMBOL(tcp_timewait_state_process); static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw) { #ifdef CONFIG_TCP_MD5SIG const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; /* * The timewait bucket does not have the key DB from the * sock structure. We just make a quick copy of the * md5 key being used (if indeed we are using one) * so the timewait ack generating code has the key. */ tcptw->tw_md5_key = NULL; if (!static_branch_unlikely(&tcp_md5_needed.key)) return; key = tp->af_specific->md5_lookup(sk, sk); if (key) { tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); if (!tcptw->tw_md5_key) return; if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) goto out_free; tcp_md5_add_sigpool(); } return; out_free: WARN_ON_ONCE(1); kfree(tcptw->tw_md5_key); tcptw->tw_md5_key = NULL; #endif } /* * Move a socket to time-wait or dead fin-wait-2 state. */ void tcp_time_wait(struct sock *sk, int state, int timeo) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct inet_timewait_sock *tw; tw = inet_twsk_alloc(sk, &net->ipv4.tcp_death_row, state); if (tw) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); tw->tw_transparent = inet_test_bit(TRANSPARENT, sk); tw->tw_mark = sk->sk_mark; tw->tw_priority = READ_ONCE(sk->sk_priority); tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; tcptw->tw_rcv_wnd = tcp_receive_window(tp); tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; tcptw->tw_ts_offset = tp->tsoffset; tw->tw_usec_ts = tp->tcp_usec_ts; tcptw->tw_last_oow_ack_time = 0; tcptw->tw_tx_delay = tp->tcp_tx_delay; tw->tw_txhash = sk->sk_txhash; #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); tw->tw_v6_daddr = sk->sk_v6_daddr; tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK); tw->tw_ipv6only = sk->sk_ipv6only; } #endif tcp_time_wait_init(sk, tcptw); tcp_ao_time_wait(tcptw, tp); /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) timeo = rto; if (state == TCP_TIME_WAIT) timeo = TCP_TIMEWAIT_LEN; /* tw_timer is pinned, so we need to make sure BH are disabled * in following section, otherwise timer handler could run before * we complete the initialization. */ local_bh_disable(); inet_twsk_schedule(tw, timeo); /* Linkage updates. * Note that access to tw after this point is illegal. */ inet_twsk_hashdance(tw, sk, net->ipv4.tcp_death_row.hashinfo); local_bh_enable(); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than * non-graceful socket closings. */ NET_INC_STATS(net, LINUX_MIB_TCPTIMEWAITOVERFLOW); } tcp_update_metrics(sk); tcp_done(sk); } EXPORT_SYMBOL(tcp_time_wait); #ifdef CONFIG_TCP_MD5SIG static void tcp_md5_twsk_free_rcu(struct rcu_head *head) { struct tcp_md5sig_key *key; key = container_of(head, struct tcp_md5sig_key, rcu); kfree(key); static_branch_slow_dec_deferred(&tcp_md5_needed); tcp_md5_release_sigpool(); } #endif void tcp_twsk_destructor(struct sock *sk) { #ifdef CONFIG_TCP_MD5SIG if (static_branch_unlikely(&tcp_md5_needed.key)) { struct tcp_timewait_sock *twsk = tcp_twsk(sk); if (twsk->tw_md5_key) call_rcu(&twsk->tw_md5_key->rcu, tcp_md5_twsk_free_rcu); } #endif tcp_ao_destroy_sock(sk, true); } EXPORT_SYMBOL_GPL(tcp_twsk_destructor); void tcp_twsk_purge(struct list_head *net_exit_list) { bool purged_once = false; struct net *net; list_for_each_entry(net, net_exit_list, exit_list) { if (net->ipv4.tcp_death_row.hashinfo->pernet) { /* Even if tw_refcount == 1, we must clean up kernel reqsk */ inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo); } else if (!purged_once) { inet_twsk_purge(&tcp_hashinfo); purged_once = true; } } } /* Warning : This function is called without sk_listener being locked. * Be sure to read socket fields once, as their value could change under us. */ void tcp_openreq_init_rwin(struct request_sock *req, const struct sock *sk_listener, const struct dst_entry *dst) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk_listener); int full_space = tcp_full_space(sk_listener); u32 window_clamp; __u8 rcv_wscale; u32 rcv_wnd; int mss; mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); window_clamp = READ_ONCE(tp->window_clamp); /* Set this up on the first call only */ req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK && (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) req->rsk_window_clamp = full_space; rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req); if (rcv_wnd == 0) rcv_wnd = dst_metric(dst, RTAX_INITRWND); else if (full_space < rcv_wnd * mss) full_space = rcv_wnd * mss; /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(sk_listener, full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, rcv_wnd); ireq->rcv_wscale = rcv_wscale; } EXPORT_SYMBOL(tcp_openreq_init_rwin); static void tcp_ecn_openreq_child(struct tcp_sock *tp, const struct request_sock *req) { tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; } void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); bool ca_got_dst = false; if (ca_key != TCP_CA_UNSPEC) { const struct tcp_congestion_ops *ca; rcu_read_lock(); ca = tcp_ca_find_key(ca_key); if (likely(ca && bpf_try_module_get(ca, ca->owner))) { icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; ca_got_dst = true; } rcu_read_unlock(); } /* If no valid choice made yet, assign current system default ca. */ if (!ca_got_dst && (!icsk->icsk_ca_setsockopt || !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner))) tcp_assign_congestion_control(sk); tcp_set_ca_state(sk, TCP_CA_Open); } EXPORT_SYMBOL_GPL(tcp_ca_openreq_child); static void smc_check_reset_syn_req(const struct tcp_sock *oldtp, struct request_sock *req, struct tcp_sock *newtp) { #if IS_ENABLED(CONFIG_SMC) struct inet_request_sock *ireq; if (static_branch_unlikely(&tcp_have_smc)) { ireq = inet_rsk(req); if (oldtp->syn_smc && !ireq->smc_ok) newtp->syn_smc = 0; } #endif } /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM * * Actually, we could lots of memory writes here. tp of listening * socket contains all necessary default parameters. */ struct sock *tcp_create_openreq_child(const struct sock *sk, struct request_sock *req, struct sk_buff *skb) { struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); const struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk; const struct tcp_sock *oldtp; struct tcp_sock *newtp; u32 seq; #ifdef CONFIG_TCP_AO struct tcp_ao_key *ao_key; #endif if (!newsk) return NULL; newicsk = inet_csk(newsk); newtp = tcp_sk(newsk); oldtp = tcp_sk(sk); smc_check_reset_syn_req(oldtp, req, newtp); /* Now setup tcp_sock */ newtp->pred_flags = 0; seq = treq->rcv_isn + 1; newtp->rcv_wup = seq; WRITE_ONCE(newtp->copied_seq, seq); WRITE_ONCE(newtp->rcv_nxt, seq); newtp->segs_in = 1; seq = treq->snt_isn + 1; newtp->snd_sml = newtp->snd_una = seq; WRITE_ONCE(newtp->snd_nxt, seq); newtp->snd_up = seq; INIT_LIST_HEAD(&newtp->tsq_node); INIT_LIST_HEAD(&newtp->tsorted_sent_queue); tcp_init_wl(newtp, treq->rcv_isn); minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); newicsk->icsk_ack.lrcvtime = tcp_jiffies32; newtp->lsndtime = tcp_jiffies32; newsk->sk_txhash = READ_ONCE(treq->txhash); newtp->total_retrans = req->num_retrans; tcp_init_xmit_timers(newsk); WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1); if (sock_flag(newsk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; newtp->rx_opt.sack_ok = ireq->sack_ok; newtp->window_clamp = req->rsk_window_clamp; newtp->rcv_ssthresh = req->rsk_rcv_wnd; newtp->rcv_wnd = req->rsk_rcv_wnd; newtp->rx_opt.wscale_ok = ireq->wscale_ok; if (newtp->rx_opt.wscale_ok) { newtp->rx_opt.snd_wscale = ireq->snd_wscale; newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; } else { newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; newtp->window_clamp = min(newtp->window_clamp, 65535U); } newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale; newtp->max_window = newtp->snd_wnd; if (newtp->rx_opt.tstamp_ok) { newtp->tcp_usec_ts = treq->req_usec_ts; newtp->rx_opt.ts_recent = READ_ONCE(req->ts_recent); newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else { newtp->tcp_usec_ts = 0; newtp->rx_opt.ts_recent_stamp = 0; newtp->tcp_header_len = sizeof(struct tcphdr); } if (req->num_timeout) { newtp->total_rto = req->num_timeout; newtp->undo_marker = treq->snt_isn; if (newtp->tcp_usec_ts) { newtp->retrans_stamp = treq->snt_synack; newtp->total_rto_time = (u32)(tcp_clock_us() - newtp->retrans_stamp) / USEC_PER_MSEC; } else { newtp->retrans_stamp = div_u64(treq->snt_synack, USEC_PER_SEC / TCP_TS_HZ); newtp->total_rto_time = tcp_clock_ms() - newtp->retrans_stamp; } newtp->total_rto_recoveries = 1; } newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG newtp->md5sig_info = NULL; /*XXX*/ #endif #ifdef CONFIG_TCP_AO newtp->ao_info = NULL; ao_key = treq->af_specific->ao_lookup(sk, req, tcp_rsk(req)->ao_keyid, -1); if (ao_key) newtp->tcp_header_len += tcp_ao_len_aligned(ao_key); #endif if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; tcp_ecn_openreq_child(newtp, req); newtp->fastopen_req = NULL; RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); newtp->bpf_chg_cc_inprogress = 0; tcp_bpf_clone(sk, newsk); __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); return newsk; } EXPORT_SYMBOL(tcp_create_openreq_child); /* * Process an incoming packet for SYN_RECV sockets represented as a * request_sock. Normally sk is the listener socket but for TFO it * points to the child socket. * * XXX (TFO) - The current impl contains a special check for ack * validation and inside tcp_v4_reqsk_send_ack(). Can we do better? * * We don't need to initialize tmp_opt.sack_ok as we don't use the results * * Note: If @fastopen is true, this can be called from process context. * Otherwise, this is from BH context. */ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *req_stolen) { struct tcp_options_received tmp_opt; struct sock *child; const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); bool paws_reject = false; bool own_req; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = READ_ONCE(req->ts_recent); if (tmp_opt.rcv_tsecr) tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off; /* We do not store true stamp, but it is not required, * it can be estimated (approximately) * from another data. */ tmp_opt.ts_recent_stamp = ktime_get_seconds() - reqsk_timeout(req, TCP_RTO_MAX) / HZ; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } /* Check for pure retransmitted SYN. */ if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn && flg == TCP_FLAG_SYN && !paws_reject) { /* * RFC793 draws (Incorrectly! It was fixed in RFC1122) * this case on figure 6 and figure 8, but formal * protocol description says NOTHING. * To be more exact, it says that we should send ACK, * because this segment (at least, if it has no data) * is out of window. * * CONCLUSION: RFC793 (even with RFC1122) DOES NOT * describe SYN-RECV state. All the description * is wrong, we cannot believe to it and should * rely only on common sense and implementation * experience. * * Enforce "SYN-ACK" according to figure 8, figure 6 * of RFC793, fixed by RFC1122. * * Note that even if there is new data in the SYN packet * they will be thrown away too. * * Reset timer after retransmitting SYNACK, similar to * the idea of fast retransmit in recovery. */ if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time) && !inet_rtx_syn_ack(sk, req)) { unsigned long expires = jiffies; expires += reqsk_timeout(req, TCP_RTO_MAX); if (!fastopen) mod_timer_pending(&req->rsk_timer, expires); else req->rsk_timer.expires = expires; } return NULL; } /* Further reproduces section "SEGMENT ARRIVES" for state SYN-RECEIVED of RFC793. It is broken, however, it does not work only when SYNs are crossed. You would think that SYN crossing is impossible here, since we should have a SYN_SENT socket (from connect()) on our end, but this is not true if the crossed SYNs were sent to both ends by a malicious third party. We must defend against this, and to do that we first verify the ACK (as per RFC793, page 36) and reset if it is invalid. Is this a true full defense? To convince ourselves, let us consider a way in which the ACK test can still pass in this 'malicious crossed SYNs' case. Malicious sender sends identical SYNs (and thus identical sequence numbers) to both A and B: A: gets SYN, seq=7 B: gets SYN, seq=7 By our good fortune, both A and B select the same initial send sequence number of seven :-) A: sends SYN|ACK, seq=7, ack_seq=8 B: sends SYN|ACK, seq=7, ack_seq=8 So we are now A eating this SYN|ACK, ACK test passes. So does sequence test, SYN is truncated, and thus we consider it a bare ACK. If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this bare ACK. Otherwise, we create an established connection. Both ends (listening sockets) accept the new incoming connection and try to talk to each other. 8-) Note: This case is both harmless, and rare. Possibility is about the same as us discovering intelligent life on another plant tomorrow. But generally, we should (RFC lies!) to accept ACK from SYNACK both here and in tcp_rcv_state_process(). tcp_rcv_state_process() does not, hence, we do not too. Note that the case is absolutely generic: we cannot optimize anything here without violating protocol. All the checks must be made before attempt to create socket. */ /* RFC793 page 36: "If the connection is in any non-synchronized state ... * and the incoming segment acknowledges something not yet * sent (the segment carries an unacceptable ACK) ... * a reset is sent." * * Invalid ACK: reset will be sent by listening socket. * Note that the ACK validity check for a Fast Open socket is done * elsewhere and is checked directly against the child socket rather * than req because user data may have been sent out. */ if ((flg & TCP_FLAG_ACK) && !fastopen && (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1)) return sk; /* Also, it would be not so bad idea to check rcv_tsecr, which * is essentially ACK extension and too early or too late values * should cause reset in unsynchronized states. */ /* RFC793: "first check sequence number". */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + tcp_synack_window(req))) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST) && !tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time)) req->rsk_ops->send_ack(sk, skb, req); if (paws_reject) NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); return NULL; } /* In sequence, PAWS is OK. */ /* TODO: We probably should defer ts_recent change once * we take ownership of @req. */ if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) WRITE_ONCE(req->ts_recent, tmp_opt.rcv_tsval); if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { /* Truncate SYN, it is out of window starting at tcp_rsk(req)->rcv_isn + 1. */ flg &= ~TCP_FLAG_SYN; } /* RFC793: "second check the RST bit" and * "fourth, check the SYN bit" */ if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); goto embryonic_reset; } /* ACK sequence verified above, just make sure ACK is * set. If ACK not set, just silently drop the packet. * * XXX (TFO) - if we ever allow "data after SYN", the * following check needs to be removed. */ if (!(flg & TCP_FLAG_ACK)) return NULL; /* For Fast Open no more processing is needed (sk is the * child socket). */ if (fastopen) return sk; /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { inet_rsk(req)->acked = 1; __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); return NULL; } /* OK, ACK is valid, create big socket and * feed this segment to it. It will repeat all * the tests. THIS SEGMENT MUST MOVE SOCKET TO * ESTABLISHED STATE. If it will be dropped after * socket is created, wait for troubles. */ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, req, &own_req); if (!child) goto listen_overflow; if (own_req && rsk_drop_req(req)) { reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); return child; } sock_rps_save_rxhash(child, skb); tcp_synack_rtt_meas(child, req); *req_stolen = !own_req; return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: if (sk != req->rsk_listener) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow)) { inet_rsk(req)->acked = 1; return NULL; } embryonic_reset: if (!(flg & TCP_FLAG_RST)) { /* Received a bad SYN pkt - for TFO We try not to reset * the local connection unless it's really necessary to * avoid becoming vulnerable to outside attack aiming at * resetting legit local connections. */ req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_INVALID_SYN); } else if (fastopen) { /* received a valid RST pkt */ reqsk_fastopen_remove(sk, req, true); tcp_reset(sk, skb); } if (!fastopen) { bool unlinked = inet_csk_reqsk_queue_drop(sk, req); if (unlinked) __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); *req_stolen = !unlinked; } return NULL; } EXPORT_SYMBOL(tcp_check_req); /* * Queue segment on the new socket if the new socket is active, * otherwise we just shortcircuit this and continue with * the new socket. * * For the vast majority of cases child->sk_state will be TCP_SYN_RECV * when entering. But other states are possible due to a race condition * where after __inet_lookup_established() fails but before the listener * locked is obtained, other packets cause the same connection to * be created. */ enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb) __releases(&((child)->sk_lock.slock)) { enum skb_drop_reason reason = SKB_NOT_DROPPED_YET; int state = child->sk_state; /* record sk_napi_id and sk_rx_queue_mapping of child. */ sk_mark_napi_id_set(child, skb); tcp_segs_in(tcp_sk(child), skb); if (!sock_owned_by_user(child)) { reason = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) parent->sk_data_ready(parent); } else { /* Alas, it is possible again, because we do lookup * in main socket hash table and lock on listening * socket does not protect us more. */ __sk_add_backlog(child, skb); } bh_unlock_sock(child); sock_put(child); return reason; } EXPORT_SYMBOL(tcp_child_process); |
| 23 28 1 23 24 6 36 106 106 36 1 3 1 2 11 6 4 4 2 2 1 1 1 10 10 6 2 5 2 9 3 2 5 9 3 1 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _INET_ECN_H_ #define _INET_ECN_H_ #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/if_vlan.h> #include <net/inet_sock.h> #include <net/dsfield.h> #include <net/checksum.h> enum { INET_ECN_NOT_ECT = 0, INET_ECN_ECT_1 = 1, INET_ECN_ECT_0 = 2, INET_ECN_CE = 3, INET_ECN_MASK = 3, }; extern int sysctl_tunnel_ecn_log; static inline int INET_ECN_is_ce(__u8 dsfield) { return (dsfield & INET_ECN_MASK) == INET_ECN_CE; } static inline int INET_ECN_is_not_ect(__u8 dsfield) { return (dsfield & INET_ECN_MASK) == INET_ECN_NOT_ECT; } static inline int INET_ECN_is_capable(__u8 dsfield) { return dsfield & INET_ECN_ECT_0; } /* * RFC 3168 9.1.1 * The full-functionality option for ECN encapsulation is to copy the * ECN codepoint of the inside header to the outside header on * encapsulation if the inside header is not-ECT or ECT, and to set the * ECN codepoint of the outside header to ECT(0) if the ECN codepoint of * the inside header is CE. */ static inline __u8 INET_ECN_encapsulate(__u8 outer, __u8 inner) { outer &= ~INET_ECN_MASK; outer |= !INET_ECN_is_ce(inner) ? (inner & INET_ECN_MASK) : INET_ECN_ECT_0; return outer; } static inline void INET_ECN_xmit(struct sock *sk) { inet_sk(sk)->tos |= INET_ECN_ECT_0; if (inet6_sk(sk) != NULL) inet6_sk(sk)->tclass |= INET_ECN_ECT_0; } static inline void INET_ECN_dontxmit(struct sock *sk) { inet_sk(sk)->tos &= ~INET_ECN_MASK; if (inet6_sk(sk) != NULL) inet6_sk(sk)->tclass &= ~INET_ECN_MASK; } #define IP6_ECN_flow_init(label) do { \ (label) &= ~htonl(INET_ECN_MASK << 20); \ } while (0) #define IP6_ECN_flow_xmit(sk, label) do { \ if (INET_ECN_is_capable(inet6_sk(sk)->tclass)) \ (label) |= htonl(INET_ECN_ECT_0 << 20); \ } while (0) static inline int IP_ECN_set_ce(struct iphdr *iph) { u32 ecn = (iph->tos + 1) & INET_ECN_MASK; __be16 check_add; /* * After the last operation we have (in binary): * INET_ECN_NOT_ECT => 01 * INET_ECN_ECT_1 => 10 * INET_ECN_ECT_0 => 11 * INET_ECN_CE => 00 */ if (!(ecn & 2)) return !ecn; /* * The following gives us: * INET_ECN_ECT_1 => check += htons(0xFFFD) * INET_ECN_ECT_0 => check += htons(0xFFFE) */ check_add = (__force __be16)((__force u16)htons(0xFFFB) + (__force u16)htons(ecn)); iph->check = csum16_add(iph->check, check_add); iph->tos |= INET_ECN_CE; return 1; } static inline int IP_ECN_set_ect1(struct iphdr *iph) { if ((iph->tos & INET_ECN_MASK) != INET_ECN_ECT_0) return 0; iph->check = csum16_add(iph->check, htons(0x1)); iph->tos ^= INET_ECN_MASK; return 1; } static inline void IP_ECN_clear(struct iphdr *iph) { iph->tos &= ~INET_ECN_MASK; } static inline void ipv4_copy_dscp(unsigned int dscp, struct iphdr *inner) { dscp &= ~INET_ECN_MASK; ipv4_change_dsfield(inner, INET_ECN_MASK, dscp); } struct ipv6hdr; /* Note: * IP_ECN_set_ce() has to tweak IPV4 checksum when setting CE, * meaning both changes have no effect on skb->csum if/when CHECKSUM_COMPLETE * In IPv6 case, no checksum compensates the change in IPv6 header, * so we have to update skb->csum. */ static inline int IP6_ECN_set_ce(struct sk_buff *skb, struct ipv6hdr *iph) { __be32 from, to; if (INET_ECN_is_not_ect(ipv6_get_dsfield(iph))) return 0; from = *(__be32 *)iph; to = from | htonl(INET_ECN_CE << 20); *(__be32 *)iph = to; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)from), (__force __wsum)to); return 1; } static inline int IP6_ECN_set_ect1(struct sk_buff *skb, struct ipv6hdr *iph) { __be32 from, to; if ((ipv6_get_dsfield(iph) & INET_ECN_MASK) != INET_ECN_ECT_0) return 0; from = *(__be32 *)iph; to = from ^ htonl(INET_ECN_MASK << 20); *(__be32 *)iph = to; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)from), (__force __wsum)to); return 1; } static inline void ipv6_copy_dscp(unsigned int dscp, struct ipv6hdr *inner) { dscp &= ~INET_ECN_MASK; ipv6_change_dsfield(inner, INET_ECN_MASK, dscp); } static inline int INET_ECN_set_ce(struct sk_buff *skb) { switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (skb_network_header(skb) + sizeof(struct iphdr) <= skb_tail_pointer(skb)) return IP_ECN_set_ce(ip_hdr(skb)); break; case cpu_to_be16(ETH_P_IPV6): if (skb_network_header(skb) + sizeof(struct ipv6hdr) <= skb_tail_pointer(skb)) return IP6_ECN_set_ce(skb, ipv6_hdr(skb)); break; } return 0; } static inline int skb_get_dsfield(struct sk_buff *skb) { switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) break; return ipv4_get_dsfield(ip_hdr(skb)); case cpu_to_be16(ETH_P_IPV6): if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) break; return ipv6_get_dsfield(ipv6_hdr(skb)); } return -1; } static inline int INET_ECN_set_ect1(struct sk_buff *skb) { switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (skb_network_header(skb) + sizeof(struct iphdr) <= skb_tail_pointer(skb)) return IP_ECN_set_ect1(ip_hdr(skb)); break; case cpu_to_be16(ETH_P_IPV6): if (skb_network_header(skb) + sizeof(struct ipv6hdr) <= skb_tail_pointer(skb)) return IP6_ECN_set_ect1(skb, ipv6_hdr(skb)); break; } return 0; } /* * RFC 6040 4.2 * To decapsulate the inner header at the tunnel egress, a compliant * tunnel egress MUST set the outgoing ECN field to the codepoint at the * intersection of the appropriate arriving inner header (row) and outer * header (column) in Figure 4 * * +---------+------------------------------------------------+ * |Arriving | Arriving Outer Header | * | Inner +---------+------------+------------+------------+ * | Header | Not-ECT | ECT(0) | ECT(1) | CE | * +---------+---------+------------+------------+------------+ * | Not-ECT | Not-ECT |Not-ECT(!!!)|Not-ECT(!!!)| <drop>(!!!)| * | ECT(0) | ECT(0) | ECT(0) | ECT(1) | CE | * | ECT(1) | ECT(1) | ECT(1) (!) | ECT(1) | CE | * | CE | CE | CE | CE(!!!)| CE | * +---------+---------+------------+------------+------------+ * * Figure 4: New IP in IP Decapsulation Behaviour * * returns 0 on success * 1 if something is broken and should be logged (!!! above) * 2 if packet should be dropped */ static inline int __INET_ECN_decapsulate(__u8 outer, __u8 inner, bool *set_ce) { if (INET_ECN_is_not_ect(inner)) { switch (outer & INET_ECN_MASK) { case INET_ECN_NOT_ECT: return 0; case INET_ECN_ECT_0: case INET_ECN_ECT_1: return 1; case INET_ECN_CE: return 2; } } *set_ce = INET_ECN_is_ce(outer); return 0; } static inline int INET_ECN_decapsulate(struct sk_buff *skb, __u8 outer, __u8 inner) { bool set_ce = false; int rc; rc = __INET_ECN_decapsulate(outer, inner, &set_ce); if (!rc) { if (set_ce) INET_ECN_set_ce(skb); else if ((outer & INET_ECN_MASK) == INET_ECN_ECT_1) INET_ECN_set_ect1(skb); } return rc; } static inline int IP_ECN_decapsulate(const struct iphdr *oiph, struct sk_buff *skb) { __u8 inner; switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): inner = ip_hdr(skb)->tos; break; case htons(ETH_P_IPV6): inner = ipv6_get_dsfield(ipv6_hdr(skb)); break; default: return 0; } return INET_ECN_decapsulate(skb, oiph->tos, inner); } static inline int IP6_ECN_decapsulate(const struct ipv6hdr *oipv6h, struct sk_buff *skb) { __u8 inner; switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): inner = ip_hdr(skb)->tos; break; case htons(ETH_P_IPV6): inner = ipv6_get_dsfield(ipv6_hdr(skb)); break; default: return 0; } return INET_ECN_decapsulate(skb, ipv6_get_dsfield(oipv6h), inner); } #endif |
| 4 4 4 4 4 3 4 4 1 1 1 3 1 1 1 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2008, Intel Corporation. * * Author: Alexander Duyck <alexander.h.duyck@intel.com> */ #ifndef __NET_TC_SKBEDIT_H #define __NET_TC_SKBEDIT_H #include <net/act_api.h> #include <linux/tc_act/tc_skbedit.h> struct tcf_skbedit_params { u32 flags; u32 priority; u32 mark; u32 mask; u16 queue_mapping; u16 mapping_mod; u16 ptype; struct rcu_head rcu; }; struct tcf_skbedit { struct tc_action common; struct tcf_skbedit_params __rcu *params; }; #define to_skbedit(a) ((struct tcf_skbedit *)a) /* Return true iff action is the one identified by FLAG. */ static inline bool is_tcf_skbedit_with_flag(const struct tc_action *a, u32 flag) { #ifdef CONFIG_NET_CLS_ACT u32 flags; if (a->ops && a->ops->id == TCA_ID_SKBEDIT) { rcu_read_lock(); flags = rcu_dereference(to_skbedit(a)->params)->flags; rcu_read_unlock(); return flags == flag; } #endif return false; } /* Return true iff action is mark */ static inline bool is_tcf_skbedit_mark(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_MARK); } static inline u32 tcf_skbedit_mark(const struct tc_action *a) { u32 mark; rcu_read_lock(); mark = rcu_dereference(to_skbedit(a)->params)->mark; rcu_read_unlock(); return mark; } /* Return true iff action is ptype */ static inline bool is_tcf_skbedit_ptype(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_PTYPE); } static inline u32 tcf_skbedit_ptype(const struct tc_action *a) { u16 ptype; rcu_read_lock(); ptype = rcu_dereference(to_skbedit(a)->params)->ptype; rcu_read_unlock(); return ptype; } /* Return true iff action is priority */ static inline bool is_tcf_skbedit_priority(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_PRIORITY); } static inline u32 tcf_skbedit_priority(const struct tc_action *a) { u32 priority; rcu_read_lock(); priority = rcu_dereference(to_skbedit(a)->params)->priority; rcu_read_unlock(); return priority; } static inline u16 tcf_skbedit_rx_queue_mapping(const struct tc_action *a) { u16 rx_queue; rcu_read_lock(); rx_queue = rcu_dereference(to_skbedit(a)->params)->queue_mapping; rcu_read_unlock(); return rx_queue; } /* Return true iff action is queue_mapping */ static inline bool is_tcf_skbedit_queue_mapping(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_QUEUE_MAPPING); } /* Return true if action is on ingress traffic */ static inline bool is_tcf_skbedit_ingress(u32 flags) { return flags & TCA_ACT_FLAGS_AT_INGRESS; } static inline bool is_tcf_skbedit_tx_queue_mapping(const struct tc_action *a) { return is_tcf_skbedit_queue_mapping(a) && !is_tcf_skbedit_ingress(a->tcfa_flags); } static inline bool is_tcf_skbedit_rx_queue_mapping(const struct tc_action *a) { return is_tcf_skbedit_queue_mapping(a) && is_tcf_skbedit_ingress(a->tcfa_flags); } /* Return true iff action is inheritdsfield */ static inline bool is_tcf_skbedit_inheritdsfield(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_INHERITDSFIELD); } #endif /* __NET_TC_SKBEDIT_H */ |
| 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 2 1 2 1 2 2 2 2 9 9 7 1 1 1 1 1 1 1 1 1 1 2 3 4 4 2 9 5 4 4 4 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 | // SPDX-License-Identifier: GPL-2.0-or-later /* * OSS emulation layer for the mixer interface * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #include <linux/init.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/string.h> #include <linux/module.h> #include <linux/compat.h> #include <sound/core.h> #include <sound/minors.h> #include <sound/control.h> #include <sound/info.h> #include <sound/mixer_oss.h> #include <linux/soundcard.h> #define OSS_ALSAEMULVER _SIOR ('M', 249, int) MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>"); MODULE_DESCRIPTION("Mixer OSS emulation for ALSA."); MODULE_LICENSE("GPL"); MODULE_ALIAS_SNDRV_MINOR(SNDRV_MINOR_OSS_MIXER); static int snd_mixer_oss_open(struct inode *inode, struct file *file) { struct snd_card *card; struct snd_mixer_oss_file *fmixer; int err; err = nonseekable_open(inode, file); if (err < 0) return err; card = snd_lookup_oss_minor_data(iminor(inode), SNDRV_OSS_DEVICE_TYPE_MIXER); if (card == NULL) return -ENODEV; if (card->mixer_oss == NULL) { snd_card_unref(card); return -ENODEV; } err = snd_card_file_add(card, file); if (err < 0) { snd_card_unref(card); return err; } fmixer = kzalloc(sizeof(*fmixer), GFP_KERNEL); if (fmixer == NULL) { snd_card_file_remove(card, file); snd_card_unref(card); return -ENOMEM; } fmixer->card = card; fmixer->mixer = card->mixer_oss; file->private_data = fmixer; if (!try_module_get(card->module)) { kfree(fmixer); snd_card_file_remove(card, file); snd_card_unref(card); return -EFAULT; } snd_card_unref(card); return 0; } static int snd_mixer_oss_release(struct inode *inode, struct file *file) { struct snd_mixer_oss_file *fmixer; if (file->private_data) { fmixer = file->private_data; module_put(fmixer->card->module); snd_card_file_remove(fmixer->card, file); kfree(fmixer); } return 0; } static int snd_mixer_oss_info(struct snd_mixer_oss_file *fmixer, mixer_info __user *_info) { struct snd_card *card = fmixer->card; struct snd_mixer_oss *mixer = fmixer->mixer; struct mixer_info info; memset(&info, 0, sizeof(info)); strscpy(info.id, mixer && mixer->id[0] ? mixer->id : card->driver, sizeof(info.id)); strscpy(info.name, mixer && mixer->name[0] ? mixer->name : card->mixername, sizeof(info.name)); info.modify_counter = card->mixer_oss_change_count; if (copy_to_user(_info, &info, sizeof(info))) return -EFAULT; return 0; } static int snd_mixer_oss_info_obsolete(struct snd_mixer_oss_file *fmixer, _old_mixer_info __user *_info) { struct snd_card *card = fmixer->card; struct snd_mixer_oss *mixer = fmixer->mixer; _old_mixer_info info; memset(&info, 0, sizeof(info)); strscpy(info.id, mixer && mixer->id[0] ? mixer->id : card->driver, sizeof(info.id)); strscpy(info.name, mixer && mixer->name[0] ? mixer->name : card->mixername, sizeof(info.name)); if (copy_to_user(_info, &info, sizeof(info))) return -EFAULT; return 0; } static int snd_mixer_oss_caps(struct snd_mixer_oss_file *fmixer) { struct snd_mixer_oss *mixer = fmixer->mixer; int result = 0; if (mixer == NULL) return -EIO; if (mixer->get_recsrc && mixer->put_recsrc) result |= SOUND_CAP_EXCL_INPUT; return result; } static int snd_mixer_oss_devmask(struct snd_mixer_oss_file *fmixer) { struct snd_mixer_oss *mixer = fmixer->mixer; struct snd_mixer_oss_slot *pslot; int result = 0, chn; if (mixer == NULL) return -EIO; guard(mutex)(&mixer->reg_mutex); for (chn = 0; chn < 31; chn++) { pslot = &mixer->slots[chn]; if (pslot->put_volume || pslot->put_recsrc) result |= 1 << chn; } return result; } static int snd_mixer_oss_stereodevs(struct snd_mixer_oss_file *fmixer) { struct snd_mixer_oss *mixer = fmixer->mixer; struct snd_mixer_oss_slot *pslot; int result = 0, chn; if (mixer == NULL) return -EIO; guard(mutex)(&mixer->reg_mutex); for (chn = 0; chn < 31; chn++) { pslot = &mixer->slots[chn]; if (pslot->put_volume && pslot->stereo) result |= 1 << chn; } return result; } static int snd_mixer_oss_recmask(struct snd_mixer_oss_file *fmixer) { struct snd_mixer_oss *mixer = fmixer->mixer; int result = 0; if (mixer == NULL) return -EIO; guard(mutex)(&mixer->reg_mutex); if (mixer->put_recsrc && mixer->get_recsrc) { /* exclusive */ result = mixer->mask_recsrc; } else { struct snd_mixer_oss_slot *pslot; int chn; for (chn = 0; chn < 31; chn++) { pslot = &mixer->slots[chn]; if (pslot->put_recsrc) result |= 1 << chn; } } return result; } static int snd_mixer_oss_get_recsrc(struct snd_mixer_oss_file *fmixer) { struct snd_mixer_oss *mixer = fmixer->mixer; int result = 0; if (mixer == NULL) return -EIO; guard(mutex)(&mixer->reg_mutex); if (mixer->put_recsrc && mixer->get_recsrc) { /* exclusive */ unsigned int index; result = mixer->get_recsrc(fmixer, &index); if (result < 0) return result; result = 1 << index; } else { struct snd_mixer_oss_slot *pslot; int chn; for (chn = 0; chn < 31; chn++) { pslot = &mixer->slots[chn]; if (pslot->get_recsrc) { int active = 0; pslot->get_recsrc(fmixer, pslot, &active); if (active) result |= 1 << chn; } } } mixer->oss_recsrc = result; return result; } static int snd_mixer_oss_set_recsrc(struct snd_mixer_oss_file *fmixer, int recsrc) { struct snd_mixer_oss *mixer = fmixer->mixer; struct snd_mixer_oss_slot *pslot; int chn, active; unsigned int index; int result = 0; if (mixer == NULL) return -EIO; guard(mutex)(&mixer->reg_mutex); if (mixer->get_recsrc && mixer->put_recsrc) { /* exclusive input */ if (recsrc & ~mixer->oss_recsrc) recsrc &= ~mixer->oss_recsrc; mixer->put_recsrc(fmixer, ffz(~recsrc)); mixer->get_recsrc(fmixer, &index); result = 1 << index; } for (chn = 0; chn < 31; chn++) { pslot = &mixer->slots[chn]; if (pslot->put_recsrc) { active = (recsrc & (1 << chn)) ? 1 : 0; pslot->put_recsrc(fmixer, pslot, active); } } if (! result) { for (chn = 0; chn < 31; chn++) { pslot = &mixer->slots[chn]; if (pslot->get_recsrc) { active = 0; pslot->get_recsrc(fmixer, pslot, &active); if (active) result |= 1 << chn; } } } return result; } static int snd_mixer_oss_get_volume(struct snd_mixer_oss_file *fmixer, int slot) { struct snd_mixer_oss *mixer = fmixer->mixer; struct snd_mixer_oss_slot *pslot; int result = 0, left, right; if (mixer == NULL || slot > 30) return -EIO; guard(mutex)(&mixer->reg_mutex); pslot = &mixer->slots[slot]; left = pslot->volume[0]; right = pslot->volume[1]; if (pslot->get_volume) result = pslot->get_volume(fmixer, pslot, &left, &right); if (!pslot->stereo) right = left; if (snd_BUG_ON(left < 0 || left > 100)) return -EIO; if (snd_BUG_ON(right < 0 || right > 100)) return -EIO; if (result >= 0) { pslot->volume[0] = left; pslot->volume[1] = right; result = (left & 0xff) | ((right & 0xff) << 8); } return result; } static int snd_mixer_oss_set_volume(struct snd_mixer_oss_file *fmixer, int slot, int volume) { struct snd_mixer_oss *mixer = fmixer->mixer; struct snd_mixer_oss_slot *pslot; int result = 0, left = volume & 0xff, right = (volume >> 8) & 0xff; if (mixer == NULL || slot > 30) return -EIO; guard(mutex)(&mixer->reg_mutex); pslot = &mixer->slots[slot]; if (left > 100) left = 100; if (right > 100) right = 100; if (!pslot->stereo) right = left; if (pslot->put_volume) result = pslot->put_volume(fmixer, pslot, left, right); if (result < 0) return result; pslot->volume[0] = left; pslot->volume[1] = right; result = (left & 0xff) | ((right & 0xff) << 8); return result; } static int snd_mixer_oss_ioctl1(struct snd_mixer_oss_file *fmixer, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; int __user *p = argp; int tmp; if (snd_BUG_ON(!fmixer)) return -ENXIO; if (((cmd >> 8) & 0xff) == 'M') { switch (cmd) { case SOUND_MIXER_INFO: return snd_mixer_oss_info(fmixer, argp); case SOUND_OLD_MIXER_INFO: return snd_mixer_oss_info_obsolete(fmixer, argp); case SOUND_MIXER_WRITE_RECSRC: if (get_user(tmp, p)) return -EFAULT; tmp = snd_mixer_oss_set_recsrc(fmixer, tmp); if (tmp < 0) return tmp; return put_user(tmp, p); case OSS_GETVERSION: return put_user(SNDRV_OSS_VERSION, p); case OSS_ALSAEMULVER: return put_user(1, p); case SOUND_MIXER_READ_DEVMASK: tmp = snd_mixer_oss_devmask(fmixer); if (tmp < 0) return tmp; return put_user(tmp, p); case SOUND_MIXER_READ_STEREODEVS: tmp = snd_mixer_oss_stereodevs(fmixer); if (tmp < 0) return tmp; return put_user(tmp, p); case SOUND_MIXER_READ_RECMASK: tmp = snd_mixer_oss_recmask(fmixer); if (tmp < 0) return tmp; return put_user(tmp, p); case SOUND_MIXER_READ_CAPS: tmp = snd_mixer_oss_caps(fmixer); if (tmp < 0) return tmp; return put_user(tmp, p); case SOUND_MIXER_READ_RECSRC: tmp = snd_mixer_oss_get_recsrc(fmixer); if (tmp < 0) return tmp; return put_user(tmp, p); } } if (cmd & SIOC_IN) { if (get_user(tmp, p)) return -EFAULT; tmp = snd_mixer_oss_set_volume(fmixer, cmd & 0xff, tmp); if (tmp < 0) return tmp; return put_user(tmp, p); } else if (cmd & SIOC_OUT) { tmp = snd_mixer_oss_get_volume(fmixer, cmd & 0xff); if (tmp < 0) return tmp; return put_user(tmp, p); } return -ENXIO; } static long snd_mixer_oss_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return snd_mixer_oss_ioctl1(file->private_data, cmd, arg); } int snd_mixer_oss_ioctl_card(struct snd_card *card, unsigned int cmd, unsigned long arg) { struct snd_mixer_oss_file fmixer; if (snd_BUG_ON(!card)) return -ENXIO; if (card->mixer_oss == NULL) return -ENXIO; memset(&fmixer, 0, sizeof(fmixer)); fmixer.card = card; fmixer.mixer = card->mixer_oss; return snd_mixer_oss_ioctl1(&fmixer, cmd, arg); } EXPORT_SYMBOL(snd_mixer_oss_ioctl_card); #ifdef CONFIG_COMPAT /* all compatible */ static long snd_mixer_oss_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) { return snd_mixer_oss_ioctl1(file->private_data, cmd, (unsigned long)compat_ptr(arg)); } #else #define snd_mixer_oss_ioctl_compat NULL #endif /* * REGISTRATION PART */ static const struct file_operations snd_mixer_oss_f_ops = { .owner = THIS_MODULE, .open = snd_mixer_oss_open, .release = snd_mixer_oss_release, .llseek = no_llseek, .unlocked_ioctl = snd_mixer_oss_ioctl, .compat_ioctl = snd_mixer_oss_ioctl_compat, }; /* * utilities */ static long snd_mixer_oss_conv(long val, long omin, long omax, long nmin, long nmax) { long orange = omax - omin, nrange = nmax - nmin; if (orange == 0) return 0; return DIV_ROUND_CLOSEST(nrange * (val - omin), orange) + nmin; } /* convert from alsa native to oss values (0-100) */ static long snd_mixer_oss_conv1(long val, long min, long max, int *old) { if (val == snd_mixer_oss_conv(*old, 0, 100, min, max)) return *old; return snd_mixer_oss_conv(val, min, max, 0, 100); } /* convert from oss to alsa native values */ static long snd_mixer_oss_conv2(long val, long min, long max) { return snd_mixer_oss_conv(val, 0, 100, min, max); } #if 0 static void snd_mixer_oss_recsrce_set(struct snd_card *card, int slot) { struct snd_mixer_oss *mixer = card->mixer_oss; if (mixer) mixer->mask_recsrc |= 1 << slot; } static int snd_mixer_oss_recsrce_get(struct snd_card *card, int slot) { struct snd_mixer_oss *mixer = card->mixer_oss; if (mixer && (mixer->mask_recsrc & (1 << slot))) return 1; return 0; } #endif #define SNDRV_MIXER_OSS_SIGNATURE 0x65999250 #define SNDRV_MIXER_OSS_ITEM_GLOBAL 0 #define SNDRV_MIXER_OSS_ITEM_GSWITCH 1 #define SNDRV_MIXER_OSS_ITEM_GROUTE 2 #define SNDRV_MIXER_OSS_ITEM_GVOLUME 3 #define SNDRV_MIXER_OSS_ITEM_PSWITCH 4 #define SNDRV_MIXER_OSS_ITEM_PROUTE 5 #define SNDRV_MIXER_OSS_ITEM_PVOLUME 6 #define SNDRV_MIXER_OSS_ITEM_CSWITCH 7 #define SNDRV_MIXER_OSS_ITEM_CROUTE 8 #define SNDRV_MIXER_OSS_ITEM_CVOLUME 9 #define SNDRV_MIXER_OSS_ITEM_CAPTURE 10 #define SNDRV_MIXER_OSS_ITEM_COUNT 11 #define SNDRV_MIXER_OSS_PRESENT_GLOBAL (1<<0) #define SNDRV_MIXER_OSS_PRESENT_GSWITCH (1<<1) #define SNDRV_MIXER_OSS_PRESENT_GROUTE (1<<2) #define SNDRV_MIXER_OSS_PRESENT_GVOLUME (1<<3) #define SNDRV_MIXER_OSS_PRESENT_PSWITCH (1<<4) #define SNDRV_MIXER_OSS_PRESENT_PROUTE (1<<5) #define SNDRV_MIXER_OSS_PRESENT_PVOLUME (1<<6) #define SNDRV_MIXER_OSS_PRESENT_CSWITCH (1<<7) #define SNDRV_MIXER_OSS_PRESENT_CROUTE (1<<8) #define SNDRV_MIXER_OSS_PRESENT_CVOLUME (1<<9) #define SNDRV_MIXER_OSS_PRESENT_CAPTURE (1<<10) struct slot { unsigned int signature; unsigned int present; unsigned int channels; unsigned int numid[SNDRV_MIXER_OSS_ITEM_COUNT]; unsigned int capture_item; const struct snd_mixer_oss_assign_table *assigned; unsigned int allocated: 1; }; #define ID_UNKNOWN ((unsigned int)-1) static struct snd_kcontrol *snd_mixer_oss_test_id(struct snd_mixer_oss *mixer, const char *name, int index) { struct snd_card *card = mixer->card; struct snd_ctl_elem_id id; memset(&id, 0, sizeof(id)); id.iface = SNDRV_CTL_ELEM_IFACE_MIXER; strscpy(id.name, name, sizeof(id.name)); id.index = index; return snd_ctl_find_id_locked(card, &id); } static void snd_mixer_oss_get_volume1_vol(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, unsigned int numid, int *left, int *right) { struct snd_ctl_elem_info *uinfo __free(kfree) = NULL; struct snd_ctl_elem_value *uctl __free(kfree) = NULL; struct snd_kcontrol *kctl; struct snd_card *card = fmixer->card; if (numid == ID_UNKNOWN) return; guard(rwsem_read)(&card->controls_rwsem); kctl = snd_ctl_find_numid_locked(card, numid); if (!kctl) return; uinfo = kzalloc(sizeof(*uinfo), GFP_KERNEL); uctl = kzalloc(sizeof(*uctl), GFP_KERNEL); if (uinfo == NULL || uctl == NULL) return; if (kctl->info(kctl, uinfo)) return; if (kctl->get(kctl, uctl)) return; if (uinfo->type == SNDRV_CTL_ELEM_TYPE_BOOLEAN && uinfo->value.integer.min == 0 && uinfo->value.integer.max == 1) return; *left = snd_mixer_oss_conv1(uctl->value.integer.value[0], uinfo->value.integer.min, uinfo->value.integer.max, &pslot->volume[0]); if (uinfo->count > 1) *right = snd_mixer_oss_conv1(uctl->value.integer.value[1], uinfo->value.integer.min, uinfo->value.integer.max, &pslot->volume[1]); } static void snd_mixer_oss_get_volume1_sw(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, unsigned int numid, int *left, int *right, int route) { struct snd_ctl_elem_info *uinfo __free(kfree) = NULL; struct snd_ctl_elem_value *uctl __free(kfree) = NULL; struct snd_kcontrol *kctl; struct snd_card *card = fmixer->card; if (numid == ID_UNKNOWN) return; guard(rwsem_read)(&card->controls_rwsem); kctl = snd_ctl_find_numid_locked(card, numid); if (!kctl) return; uinfo = kzalloc(sizeof(*uinfo), GFP_KERNEL); uctl = kzalloc(sizeof(*uctl), GFP_KERNEL); if (uinfo == NULL || uctl == NULL) return; if (kctl->info(kctl, uinfo)) return; if (kctl->get(kctl, uctl)) return; if (!uctl->value.integer.value[0]) { *left = 0; if (uinfo->count == 1) *right = 0; } if (uinfo->count > 1 && !uctl->value.integer.value[route ? 3 : 1]) *right = 0; } static int snd_mixer_oss_get_volume1(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, int *left, int *right) { struct slot *slot = pslot->private_data; *left = *right = 100; if (slot->present & SNDRV_MIXER_OSS_PRESENT_PVOLUME) { snd_mixer_oss_get_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PVOLUME], left, right); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GVOLUME) { snd_mixer_oss_get_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GVOLUME], left, right); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GLOBAL) { snd_mixer_oss_get_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GLOBAL], left, right); } if (slot->present & SNDRV_MIXER_OSS_PRESENT_PSWITCH) { snd_mixer_oss_get_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PSWITCH], left, right, 0); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GSWITCH) { snd_mixer_oss_get_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GSWITCH], left, right, 0); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_PROUTE) { snd_mixer_oss_get_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PROUTE], left, right, 1); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GROUTE) { snd_mixer_oss_get_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GROUTE], left, right, 1); } return 0; } static void snd_mixer_oss_put_volume1_vol(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, unsigned int numid, int left, int right) { struct snd_ctl_elem_info *uinfo __free(kfree) = NULL; struct snd_ctl_elem_value *uctl __free(kfree) = NULL; struct snd_kcontrol *kctl; struct snd_card *card = fmixer->card; int res; if (numid == ID_UNKNOWN) return; guard(rwsem_read)(&card->controls_rwsem); kctl = snd_ctl_find_numid_locked(card, numid); if (!kctl) return; uinfo = kzalloc(sizeof(*uinfo), GFP_KERNEL); uctl = kzalloc(sizeof(*uctl), GFP_KERNEL); if (uinfo == NULL || uctl == NULL) return; if (kctl->info(kctl, uinfo)) return; if (uinfo->type == SNDRV_CTL_ELEM_TYPE_BOOLEAN && uinfo->value.integer.min == 0 && uinfo->value.integer.max == 1) return; uctl->value.integer.value[0] = snd_mixer_oss_conv2(left, uinfo->value.integer.min, uinfo->value.integer.max); if (uinfo->count > 1) uctl->value.integer.value[1] = snd_mixer_oss_conv2(right, uinfo->value.integer.min, uinfo->value.integer.max); res = kctl->put(kctl, uctl); if (res < 0) return; if (res > 0) snd_ctl_notify(card, SNDRV_CTL_EVENT_MASK_VALUE, &kctl->id); } static void snd_mixer_oss_put_volume1_sw(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, unsigned int numid, int left, int right, int route) { struct snd_ctl_elem_info *uinfo __free(kfree) = NULL; struct snd_ctl_elem_value *uctl __free(kfree) = NULL; struct snd_kcontrol *kctl; struct snd_card *card = fmixer->card; int res; if (numid == ID_UNKNOWN) return; guard(rwsem_read)(&card->controls_rwsem); kctl = snd_ctl_find_numid_locked(card, numid); if (!kctl) return; uinfo = kzalloc(sizeof(*uinfo), GFP_KERNEL); uctl = kzalloc(sizeof(*uctl), GFP_KERNEL); if (uinfo == NULL || uctl == NULL) return; if (kctl->info(kctl, uinfo)) return; if (uinfo->count > 1) { uctl->value.integer.value[0] = left > 0 ? 1 : 0; uctl->value.integer.value[route ? 3 : 1] = right > 0 ? 1 : 0; if (route) { uctl->value.integer.value[1] = uctl->value.integer.value[2] = 0; } } else { uctl->value.integer.value[0] = (left > 0 || right > 0) ? 1 : 0; } res = kctl->put(kctl, uctl); if (res < 0) return; if (res > 0) snd_ctl_notify(card, SNDRV_CTL_EVENT_MASK_VALUE, &kctl->id); } static int snd_mixer_oss_put_volume1(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, int left, int right) { struct slot *slot = pslot->private_data; if (slot->present & SNDRV_MIXER_OSS_PRESENT_PVOLUME) { snd_mixer_oss_put_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PVOLUME], left, right); if (slot->present & SNDRV_MIXER_OSS_PRESENT_CVOLUME) snd_mixer_oss_put_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CVOLUME], left, right); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_CVOLUME) { snd_mixer_oss_put_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CVOLUME], left, right); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GVOLUME) { snd_mixer_oss_put_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GVOLUME], left, right); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GLOBAL) { snd_mixer_oss_put_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GLOBAL], left, right); } if (left || right) { if (slot->present & SNDRV_MIXER_OSS_PRESENT_PSWITCH) snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PSWITCH], left, right, 0); if (slot->present & SNDRV_MIXER_OSS_PRESENT_CSWITCH) snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CSWITCH], left, right, 0); if (slot->present & SNDRV_MIXER_OSS_PRESENT_GSWITCH) snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GSWITCH], left, right, 0); if (slot->present & SNDRV_MIXER_OSS_PRESENT_PROUTE) snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PROUTE], left, right, 1); if (slot->present & SNDRV_MIXER_OSS_PRESENT_CROUTE) snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CROUTE], left, right, 1); if (slot->present & SNDRV_MIXER_OSS_PRESENT_GROUTE) snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GROUTE], left, right, 1); } else { if (slot->present & SNDRV_MIXER_OSS_PRESENT_PSWITCH) { snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PSWITCH], left, right, 0); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_CSWITCH) { snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CSWITCH], left, right, 0); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GSWITCH) { snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GSWITCH], left, right, 0); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_PROUTE) { snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PROUTE], left, right, 1); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_CROUTE) { snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CROUTE], left, right, 1); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GROUTE) { snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GROUTE], left, right, 1); } } return 0; } static int snd_mixer_oss_get_recsrc1_sw(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, int *active) { struct slot *slot = pslot->private_data; int left, right; left = right = 1; snd_mixer_oss_get_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CSWITCH], &left, &right, 0); *active = (left || right) ? 1 : 0; return 0; } static int snd_mixer_oss_get_recsrc1_route(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, int *active) { struct slot *slot = pslot->private_data; int left, right; left = right = 1; snd_mixer_oss_get_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CROUTE], &left, &right, 1); *active = (left || right) ? 1 : 0; return 0; } static int snd_mixer_oss_put_recsrc1_sw(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, int active) { struct slot *slot = pslot->private_data; snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CSWITCH], active, active, 0); return 0; } static int snd_mixer_oss_put_recsrc1_route(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, int active) { struct slot *slot = pslot->private_data; snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CROUTE], active, active, 1); return 0; } static int snd_mixer_oss_get_recsrc2(struct snd_mixer_oss_file *fmixer, unsigned int *active_index) { struct snd_card *card = fmixer->card; struct snd_mixer_oss *mixer = fmixer->mixer; struct snd_kcontrol *kctl; struct snd_mixer_oss_slot *pslot; struct slot *slot; struct snd_ctl_elem_info *uinfo __free(kfree) = NULL; struct snd_ctl_elem_value *uctl __free(kfree) = NULL; int err, idx; uinfo = kzalloc(sizeof(*uinfo), GFP_KERNEL); uctl = kzalloc(sizeof(*uctl), GFP_KERNEL); if (uinfo == NULL || uctl == NULL) return -ENOMEM; guard(rwsem_read)(&card->controls_rwsem); kctl = snd_mixer_oss_test_id(mixer, "Capture Source", 0); if (!kctl) return -ENOENT; err = kctl->info(kctl, uinfo); if (err < 0) return err; err = kctl->get(kctl, uctl); if (err < 0) return err; for (idx = 0; idx < 32; idx++) { if (!(mixer->mask_recsrc & (1 << idx))) continue; pslot = &mixer->slots[idx]; slot = pslot->private_data; if (slot->signature != SNDRV_MIXER_OSS_SIGNATURE) continue; if (!(slot->present & SNDRV_MIXER_OSS_PRESENT_CAPTURE)) continue; if (slot->capture_item == uctl->value.enumerated.item[0]) { *active_index = idx; break; } } return 0; } static int snd_mixer_oss_put_recsrc2(struct snd_mixer_oss_file *fmixer, unsigned int active_index) { struct snd_card *card = fmixer->card; struct snd_mixer_oss *mixer = fmixer->mixer; struct snd_kcontrol *kctl; struct snd_mixer_oss_slot *pslot; struct slot *slot = NULL; struct snd_ctl_elem_info *uinfo __free(kfree) = NULL; struct snd_ctl_elem_value *uctl __free(kfree) = NULL; int err; unsigned int idx; uinfo = kzalloc(sizeof(*uinfo), GFP_KERNEL); uctl = kzalloc(sizeof(*uctl), GFP_KERNEL); if (uinfo == NULL || uctl == NULL) return -ENOMEM; guard(rwsem_read)(&card->controls_rwsem); kctl = snd_mixer_oss_test_id(mixer, "Capture Source", 0); if (!kctl) return -ENOENT; err = kctl->info(kctl, uinfo); if (err < 0) return err; for (idx = 0; idx < 32; idx++) { if (!(mixer->mask_recsrc & (1 << idx))) continue; pslot = &mixer->slots[idx]; slot = pslot->private_data; if (slot->signature != SNDRV_MIXER_OSS_SIGNATURE) continue; if (!(slot->present & SNDRV_MIXER_OSS_PRESENT_CAPTURE)) continue; if (idx == active_index) break; slot = NULL; } if (!slot) return 0; for (idx = 0; idx < uinfo->count; idx++) uctl->value.enumerated.item[idx] = slot->capture_item; err = kctl->put(kctl, uctl); if (err > 0) snd_ctl_notify(fmixer->card, SNDRV_CTL_EVENT_MASK_VALUE, &kctl->id); return 0; } struct snd_mixer_oss_assign_table { int oss_id; const char *name; int index; }; static int snd_mixer_oss_build_test(struct snd_mixer_oss *mixer, struct slot *slot, const char *name, int index, int item) { struct snd_ctl_elem_info *info __free(kfree) = NULL; struct snd_kcontrol *kcontrol; struct snd_card *card = mixer->card; int err; scoped_guard(rwsem_read, &card->controls_rwsem) { kcontrol = snd_mixer_oss_test_id(mixer, name, index); if (kcontrol == NULL) return 0; info = kmalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; err = kcontrol->info(kcontrol, info); if (err < 0) return err; slot->numid[item] = kcontrol->id.numid; } if (info->count > slot->channels) slot->channels = info->count; slot->present |= 1 << item; return 0; } static void snd_mixer_oss_slot_free(struct snd_mixer_oss_slot *chn) { struct slot *p = chn->private_data; if (p) { if (p->allocated && p->assigned) { kfree_const(p->assigned->name); kfree_const(p->assigned); } kfree(p); } } static void mixer_slot_clear(struct snd_mixer_oss_slot *rslot) { int idx = rslot->number; /* remember this */ if (rslot->private_free) rslot->private_free(rslot); memset(rslot, 0, sizeof(*rslot)); rslot->number = idx; } /* In a separate function to keep gcc 3.2 happy - do NOT merge this in snd_mixer_oss_build_input! */ static int snd_mixer_oss_build_test_all(struct snd_mixer_oss *mixer, const struct snd_mixer_oss_assign_table *ptr, struct slot *slot) { char str[64]; int err; err = snd_mixer_oss_build_test(mixer, slot, ptr->name, ptr->index, SNDRV_MIXER_OSS_ITEM_GLOBAL); if (err) return err; sprintf(str, "%s Switch", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_GSWITCH); if (err) return err; sprintf(str, "%s Route", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_GROUTE); if (err) return err; sprintf(str, "%s Volume", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_GVOLUME); if (err) return err; sprintf(str, "%s Playback Switch", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_PSWITCH); if (err) return err; sprintf(str, "%s Playback Route", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_PROUTE); if (err) return err; sprintf(str, "%s Playback Volume", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_PVOLUME); if (err) return err; sprintf(str, "%s Capture Switch", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_CSWITCH); if (err) return err; sprintf(str, "%s Capture Route", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_CROUTE); if (err) return err; sprintf(str, "%s Capture Volume", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_CVOLUME); if (err) return err; return 0; } /* * build an OSS mixer element. * ptr_allocated means the entry is dynamically allocated (change via proc file). * when replace_old = 1, the old entry is replaced with the new one. */ static int snd_mixer_oss_build_input(struct snd_mixer_oss *mixer, const struct snd_mixer_oss_assign_table *ptr, int ptr_allocated, int replace_old) { struct slot slot; struct slot *pslot; struct snd_kcontrol *kctl; struct snd_mixer_oss_slot *rslot; char str[64]; /* check if already assigned */ if (mixer->slots[ptr->oss_id].get_volume && ! replace_old) return 0; memset(&slot, 0, sizeof(slot)); memset(slot.numid, 0xff, sizeof(slot.numid)); /* ID_UNKNOWN */ if (snd_mixer_oss_build_test_all(mixer, ptr, &slot)) return 0; guard(rwsem_read)(&mixer->card->controls_rwsem); kctl = NULL; if (!ptr->index) kctl = snd_mixer_oss_test_id(mixer, "Capture Source", 0); if (kctl) { struct snd_ctl_elem_info *uinfo __free(kfree) = NULL; uinfo = kzalloc(sizeof(*uinfo), GFP_KERNEL); if (!uinfo) return -ENOMEM; if (kctl->info(kctl, uinfo)) return 0; strcpy(str, ptr->name); if (!strcmp(str, "Master")) strcpy(str, "Mix"); if (!strcmp(str, "Master Mono")) strcpy(str, "Mix Mono"); slot.capture_item = 0; if (!strcmp(uinfo->value.enumerated.name, str)) { slot.present |= SNDRV_MIXER_OSS_PRESENT_CAPTURE; } else { for (slot.capture_item = 1; slot.capture_item < uinfo->value.enumerated.items; slot.capture_item++) { uinfo->value.enumerated.item = slot.capture_item; if (kctl->info(kctl, uinfo)) return 0; if (!strcmp(uinfo->value.enumerated.name, str)) { slot.present |= SNDRV_MIXER_OSS_PRESENT_CAPTURE; break; } } } } if (slot.present != 0) { pslot = kmalloc(sizeof(slot), GFP_KERNEL); if (! pslot) return -ENOMEM; *pslot = slot; pslot->signature = SNDRV_MIXER_OSS_SIGNATURE; pslot->assigned = ptr; pslot->allocated = ptr_allocated; rslot = &mixer->slots[ptr->oss_id]; mixer_slot_clear(rslot); rslot->stereo = slot.channels > 1 ? 1 : 0; rslot->get_volume = snd_mixer_oss_get_volume1; rslot->put_volume = snd_mixer_oss_put_volume1; /* note: ES18xx have both Capture Source and XX Capture Volume !!! */ if (slot.present & SNDRV_MIXER_OSS_PRESENT_CSWITCH) { rslot->get_recsrc = snd_mixer_oss_get_recsrc1_sw; rslot->put_recsrc = snd_mixer_oss_put_recsrc1_sw; } else if (slot.present & SNDRV_MIXER_OSS_PRESENT_CROUTE) { rslot->get_recsrc = snd_mixer_oss_get_recsrc1_route; rslot->put_recsrc = snd_mixer_oss_put_recsrc1_route; } else if (slot.present & SNDRV_MIXER_OSS_PRESENT_CAPTURE) { mixer->mask_recsrc |= 1 << ptr->oss_id; } rslot->private_data = pslot; rslot->private_free = snd_mixer_oss_slot_free; return 1; } return 0; } #ifdef CONFIG_SND_PROC_FS /* */ #define MIXER_VOL(name) [SOUND_MIXER_##name] = #name static const char * const oss_mixer_names[SNDRV_OSS_MAX_MIXERS] = { MIXER_VOL(VOLUME), MIXER_VOL(BASS), MIXER_VOL(TREBLE), MIXER_VOL(SYNTH), MIXER_VOL(PCM), MIXER_VOL(SPEAKER), MIXER_VOL(LINE), MIXER_VOL(MIC), MIXER_VOL(CD), MIXER_VOL(IMIX), MIXER_VOL(ALTPCM), MIXER_VOL(RECLEV), MIXER_VOL(IGAIN), MIXER_VOL(OGAIN), MIXER_VOL(LINE1), MIXER_VOL(LINE2), MIXER_VOL(LINE3), MIXER_VOL(DIGITAL1), MIXER_VOL(DIGITAL2), MIXER_VOL(DIGITAL3), MIXER_VOL(PHONEIN), MIXER_VOL(PHONEOUT), MIXER_VOL(VIDEO), MIXER_VOL(RADIO), MIXER_VOL(MONITOR), }; /* * /proc interface */ static void snd_mixer_oss_proc_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct snd_mixer_oss *mixer = entry->private_data; int i; guard(mutex)(&mixer->reg_mutex); for (i = 0; i < SNDRV_OSS_MAX_MIXERS; i++) { struct slot *p; if (! oss_mixer_names[i]) continue; p = (struct slot *)mixer->slots[i].private_data; snd_iprintf(buffer, "%s ", oss_mixer_names[i]); if (p && p->assigned) snd_iprintf(buffer, "\"%s\" %d\n", p->assigned->name, p->assigned->index); else snd_iprintf(buffer, "\"\" 0\n"); } } static void snd_mixer_oss_proc_write(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct snd_mixer_oss *mixer = entry->private_data; char line[128], str[32], idxstr[16]; const char *cptr; unsigned int idx; int ch; struct snd_mixer_oss_assign_table *tbl; struct slot *slot; while (!snd_info_get_line(buffer, line, sizeof(line))) { cptr = snd_info_get_str(str, line, sizeof(str)); for (ch = 0; ch < SNDRV_OSS_MAX_MIXERS; ch++) if (oss_mixer_names[ch] && strcmp(oss_mixer_names[ch], str) == 0) break; if (ch >= SNDRV_OSS_MAX_MIXERS) { pr_err("ALSA: mixer_oss: invalid OSS volume '%s'\n", str); continue; } cptr = snd_info_get_str(str, cptr, sizeof(str)); if (! *str) { /* remove the entry */ scoped_guard(mutex, &mixer->reg_mutex) mixer_slot_clear(&mixer->slots[ch]); continue; } snd_info_get_str(idxstr, cptr, sizeof(idxstr)); idx = simple_strtoul(idxstr, NULL, 10); if (idx >= 0x4000) { /* too big */ pr_err("ALSA: mixer_oss: invalid index %d\n", idx); continue; } scoped_guard(mutex, &mixer->reg_mutex) { slot = (struct slot *)mixer->slots[ch].private_data; if (slot && slot->assigned && slot->assigned->index == idx && !strcmp(slot->assigned->name, str)) /* not changed */ break; tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); if (!tbl) break; tbl->oss_id = ch; tbl->name = kstrdup(str, GFP_KERNEL); if (!tbl->name) { kfree(tbl); break; } tbl->index = idx; if (snd_mixer_oss_build_input(mixer, tbl, 1, 1) <= 0) { kfree(tbl->name); kfree(tbl); } } } } static void snd_mixer_oss_proc_init(struct snd_mixer_oss *mixer) { struct snd_info_entry *entry; entry = snd_info_create_card_entry(mixer->card, "oss_mixer", mixer->card->proc_root); if (! entry) return; entry->content = SNDRV_INFO_CONTENT_TEXT; entry->mode = S_IFREG | 0644; entry->c.text.read = snd_mixer_oss_proc_read; entry->c.text.write = snd_mixer_oss_proc_write; entry->private_data = mixer; if (snd_info_register(entry) < 0) { snd_info_free_entry(entry); entry = NULL; } mixer->proc_entry = entry; } static void snd_mixer_oss_proc_done(struct snd_mixer_oss *mixer) { snd_info_free_entry(mixer->proc_entry); mixer->proc_entry = NULL; } #else /* !CONFIG_SND_PROC_FS */ #define snd_mixer_oss_proc_init(mix) #define snd_mixer_oss_proc_done(mix) #endif /* CONFIG_SND_PROC_FS */ static void snd_mixer_oss_build(struct snd_mixer_oss *mixer) { static const struct snd_mixer_oss_assign_table table[] = { { SOUND_MIXER_VOLUME, "Master", 0 }, { SOUND_MIXER_VOLUME, "Front", 0 }, /* fallback */ { SOUND_MIXER_BASS, "Tone Control - Bass", 0 }, { SOUND_MIXER_TREBLE, "Tone Control - Treble", 0 }, { SOUND_MIXER_SYNTH, "Synth", 0 }, { SOUND_MIXER_SYNTH, "FM", 0 }, /* fallback */ { SOUND_MIXER_SYNTH, "Music", 0 }, /* fallback */ { SOUND_MIXER_PCM, "PCM", 0 }, { SOUND_MIXER_SPEAKER, "Beep", 0 }, { SOUND_MIXER_SPEAKER, "PC Speaker", 0 }, /* fallback */ { SOUND_MIXER_SPEAKER, "Speaker", 0 }, /* fallback */ { SOUND_MIXER_LINE, "Line", 0 }, { SOUND_MIXER_MIC, "Mic", 0 }, { SOUND_MIXER_CD, "CD", 0 }, { SOUND_MIXER_IMIX, "Monitor Mix", 0 }, { SOUND_MIXER_ALTPCM, "PCM", 1 }, { SOUND_MIXER_ALTPCM, "Headphone", 0 }, /* fallback */ { SOUND_MIXER_ALTPCM, "Wave", 0 }, /* fallback */ { SOUND_MIXER_RECLEV, "-- nothing --", 0 }, { SOUND_MIXER_IGAIN, "Capture", 0 }, { SOUND_MIXER_OGAIN, "Playback", 0 }, { SOUND_MIXER_LINE1, "Aux", 0 }, { SOUND_MIXER_LINE2, "Aux", 1 }, { SOUND_MIXER_LINE3, "Aux", 2 }, { SOUND_MIXER_DIGITAL1, "Digital", 0 }, { SOUND_MIXER_DIGITAL1, "IEC958", 0 }, /* fallback */ { SOUND_MIXER_DIGITAL1, "IEC958 Optical", 0 }, /* fallback */ { SOUND_MIXER_DIGITAL1, "IEC958 Coaxial", 0 }, /* fallback */ { SOUND_MIXER_DIGITAL2, "Digital", 1 }, { SOUND_MIXER_DIGITAL3, "Digital", 2 }, { SOUND_MIXER_PHONEIN, "Phone", 0 }, { SOUND_MIXER_PHONEOUT, "Master Mono", 0 }, { SOUND_MIXER_PHONEOUT, "Speaker", 0 }, /*fallback*/ { SOUND_MIXER_PHONEOUT, "Mono", 0 }, /*fallback*/ { SOUND_MIXER_PHONEOUT, "Phone", 0 }, /* fallback */ { SOUND_MIXER_VIDEO, "Video", 0 }, { SOUND_MIXER_RADIO, "Radio", 0 }, { SOUND_MIXER_MONITOR, "Monitor", 0 } }; unsigned int idx; for (idx = 0; idx < ARRAY_SIZE(table); idx++) snd_mixer_oss_build_input(mixer, &table[idx], 0, 0); if (mixer->mask_recsrc) { mixer->get_recsrc = snd_mixer_oss_get_recsrc2; mixer->put_recsrc = snd_mixer_oss_put_recsrc2; } } /* * */ static int snd_mixer_oss_free1(void *private) { struct snd_mixer_oss *mixer = private; struct snd_card *card; int idx; if (!mixer) return 0; card = mixer->card; if (snd_BUG_ON(mixer != card->mixer_oss)) return -ENXIO; card->mixer_oss = NULL; for (idx = 0; idx < SNDRV_OSS_MAX_MIXERS; idx++) { struct snd_mixer_oss_slot *chn = &mixer->slots[idx]; if (chn->private_free) chn->private_free(chn); } kfree(mixer); return 0; } static int snd_mixer_oss_notify_handler(struct snd_card *card, int cmd) { struct snd_mixer_oss *mixer; if (cmd == SND_MIXER_OSS_NOTIFY_REGISTER) { int idx, err; mixer = kcalloc(2, sizeof(*mixer), GFP_KERNEL); if (mixer == NULL) return -ENOMEM; mutex_init(&mixer->reg_mutex); err = snd_register_oss_device(SNDRV_OSS_DEVICE_TYPE_MIXER, card, 0, &snd_mixer_oss_f_ops, card); if (err < 0) { dev_err(card->dev, "unable to register OSS mixer device %i:%i\n", card->number, 0); kfree(mixer); return err; } mixer->oss_dev_alloc = 1; mixer->card = card; if (*card->mixername) strscpy(mixer->name, card->mixername, sizeof(mixer->name)); else snprintf(mixer->name, sizeof(mixer->name), "mixer%i", card->number); #ifdef SNDRV_OSS_INFO_DEV_MIXERS snd_oss_info_register(SNDRV_OSS_INFO_DEV_MIXERS, card->number, mixer->name); #endif for (idx = 0; idx < SNDRV_OSS_MAX_MIXERS; idx++) mixer->slots[idx].number = idx; card->mixer_oss = mixer; snd_mixer_oss_build(mixer); snd_mixer_oss_proc_init(mixer); } else { mixer = card->mixer_oss; if (mixer == NULL) return 0; if (mixer->oss_dev_alloc) { #ifdef SNDRV_OSS_INFO_DEV_MIXERS snd_oss_info_unregister(SNDRV_OSS_INFO_DEV_MIXERS, mixer->card->number); #endif snd_unregister_oss_device(SNDRV_OSS_DEVICE_TYPE_MIXER, mixer->card, 0); mixer->oss_dev_alloc = 0; } if (cmd == SND_MIXER_OSS_NOTIFY_DISCONNECT) return 0; snd_mixer_oss_proc_done(mixer); return snd_mixer_oss_free1(mixer); } return 0; } static int __init alsa_mixer_oss_init(void) { struct snd_card *card; int idx; snd_mixer_oss_notify_callback = snd_mixer_oss_notify_handler; for (idx = 0; idx < SNDRV_CARDS; idx++) { card = snd_card_ref(idx); if (card) { snd_mixer_oss_notify_handler(card, SND_MIXER_OSS_NOTIFY_REGISTER); snd_card_unref(card); } } return 0; } static void __exit alsa_mixer_oss_exit(void) { struct snd_card *card; int idx; snd_mixer_oss_notify_callback = NULL; for (idx = 0; idx < SNDRV_CARDS; idx++) { card = snd_card_ref(idx); if (card) { snd_mixer_oss_notify_handler(card, SND_MIXER_OSS_NOTIFY_FREE); snd_card_unref(card); } } } module_init(alsa_mixer_oss_init) module_exit(alsa_mixer_oss_exit) |
| 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RATELIMIT_H #define _LINUX_RATELIMIT_H #include <linux/ratelimit_types.h> #include <linux/sched.h> #include <linux/spinlock.h> static inline void ratelimit_state_init(struct ratelimit_state *rs, int interval, int burst) { memset(rs, 0, sizeof(*rs)); raw_spin_lock_init(&rs->lock); rs->interval = interval; rs->burst = burst; } static inline void ratelimit_default_init(struct ratelimit_state *rs) { return ratelimit_state_init(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); } static inline void ratelimit_state_exit(struct ratelimit_state *rs) { if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) return; if (rs->missed) { pr_warn("%s: %d output lines suppressed due to ratelimiting\n", current->comm, rs->missed); rs->missed = 0; } } static inline void ratelimit_set_flags(struct ratelimit_state *rs, unsigned long flags) { rs->flags = flags; } extern struct ratelimit_state printk_ratelimit_state; #ifdef CONFIG_PRINTK #define WARN_ON_RATELIMIT(condition, state) ({ \ bool __rtn_cond = !!(condition); \ WARN_ON(__rtn_cond && __ratelimit(state)); \ __rtn_cond; \ }) #define WARN_RATELIMIT(condition, format, ...) \ ({ \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ int rtn = !!(condition); \ \ if (unlikely(rtn && __ratelimit(&_rs))) \ WARN(rtn, format, ##__VA_ARGS__); \ \ rtn; \ }) #else #define WARN_ON_RATELIMIT(condition, state) \ WARN_ON(condition) #define WARN_RATELIMIT(condition, format, ...) \ ({ \ int rtn = WARN(condition, format, ##__VA_ARGS__); \ rtn; \ }) #endif #endif /* _LINUX_RATELIMIT_H */ |
| 58 67 16 25 2 24 24 28 28 26 25 9 28 24 25 15 1 15 2 13 14 10 10 10 6 10 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 41 42 31 1 17 22 21 22 17 22 1 1 1 1 1 1 1 1 1 1 24 24 24 7 7 7 4 4 1 1 1 39 2 2 2 2 1 1 1 1 1 2 1 29 39 39 15 15 30 38 13 1 1 1 12 12 10 11 6 6 5 11 61 60 60 59 59 7 60 60 61 1 1 1 58 60 1 1 1 59 61 50 7 59 61 61 57 53 54 51 53 53 53 1 53 51 50 51 51 22 10 9 10 6 10 10 11 10 11 6 12 4 16 11 16 11 16 6 6 16 16 16 16 4 16 14 13 15 6 6 11 14 11 7 2 438 439 10 441 13 13 13 13 13 26 26 15 26 26 26 26 4 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 195 198 196 196 196 94 95 197 86 91 42 10 10 10 10 43 8 35 4 92 3 81 32 12 33 2 33 32 31 31 33 15 33 32 32 3 31 41 23 54 54 41 41 20 20 20 2 20 2 2 2 29 29 29 22 22 22 21 21 22 22 22 3 29 22 22 22 7 32 32 32 21 19 2 31 32 30 32 13 14 14 11 11 11 11 11 11 11 8 14 8 8 8 7 7 2 1 1 6 1 6 6 5 5 4 1 1 1 1 1 1 7 1 1 1 5 14 14 14 12 10 12 1 7 1 11 5 11 6 1 6 1 13 1 2 2 5 10 10 10 10 10 2 10 10 5 5 5 5 5 5 5 5 1 5 5 5 5 5 3 8 8 8 12 5 12 7 9 1 8 9 8 7 2 2 2 1 6 5 28 28 28 28 28 28 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 2 1 1 1 2 2 4 3 3 3 1 2 2 2 1 1 1 1 1 1 3 3 3 4 12 2 2 2 2 10 9 8 1 1 7 7 7 7 10 4 4 4 4 2 4 2 2 10 1 12 12 6 4 6 10 10 10 10 1 10 10 12 12 1 1 12 12 3 9 4 4 4 12 3 1 11 11 11 10 11 11 11 30 232 30 251 250 250 238 238 238 235 236 238 234 234 235 5 11 12 234 227 233 229 251 251 250 233 234 235 233 234 235 233 234 233 234 235 251 248 2 250 6 6 6 6 6 6 6 6 4 4 4 4 8 8 8 3 3 8 2 2 2 3 2 2 1 6 8 6 2 2 3 1 3 6 5 5 1 1 4 5 46 47 47 5 45 45 43 44 43 44 44 44 2 44 43 39 42 1 8 6 8 8 1 1 1 6 9 8 1 5 7 14 14 8 8 14 8 1 1 1 1 4 4 2 2 1 1 1 1 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/filemap.c * * Copyright (C) 1994-1999 Linus Torvalds */ /* * This file handles the generic file mmap semantics used by * most "normal" filesystems (but you don't /have/ to use this: * the NFS filesystem used to do this differently, for example) */ #include <linux/export.h> #include <linux/compiler.h> #include <linux/dax.h> #include <linux/fs.h> #include <linux/sched/signal.h> #include <linux/uaccess.h> #include <linux/capability.h> #include <linux/kernel_stat.h> #include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/syscalls.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/uio.h> #include <linux/error-injection.h> #include <linux/hash.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> #include <linux/security.h> #include <linux/cpuset.h> #include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/shmem_fs.h> #include <linux/rmap.h> #include <linux/delayacct.h> #include <linux/psi.h> #include <linux/ramfs.h> #include <linux/page_idle.h> #include <linux/migrate.h> #include <linux/pipe_fs_i.h> #include <linux/splice.h> #include <linux/rcupdate_wait.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/filemap.h> /* * FIXME: remove all knowledge of the buffer layer from the core VM */ #include <linux/buffer_head.h> /* for try_to_free_buffers */ #include <asm/mman.h> #include "swap.h" /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. * * Shared mappings now work. 15.8.1995 Bruno. * * finished 'unifying' the page and buffer cache and SMP-threaded the * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com> * * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de> */ /* * Lock ordering: * * ->i_mmap_rwsem (truncate_pagecache) * ->private_lock (__free_pte->block_dirty_folio) * ->swap_lock (exclusive_swap_page, others) * ->i_pages lock * * ->i_rwsem * ->invalidate_lock (acquired by fs in truncate path) * ->i_mmap_rwsem (truncate->unmap_mapping_range) * * ->mmap_lock * ->i_mmap_rwsem * ->page_table_lock or pte_lock (various, mainly in memory.c) * ->i_pages lock (arch-dependent flush_dcache_mmap_lock) * * ->mmap_lock * ->invalidate_lock (filemap_fault) * ->lock_page (filemap_fault, access_process_vm) * * ->i_rwsem (generic_perform_write) * ->mmap_lock (fault_in_readable->do_page_fault) * * bdi->wb.list_lock * sb_lock (fs/fs-writeback.c) * ->i_pages lock (__sync_single_inode) * * ->i_mmap_rwsem * ->anon_vma.lock (vma_merge) * * ->anon_vma.lock * ->page_table_lock or pte_lock (anon_vma_prepare and various) * * ->page_table_lock or pte_lock * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->i_pages lock (try_to_unmap_one) * ->lruvec->lru_lock (follow_page->mark_page_accessed) * ->lruvec->lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (folio_remove_rmap_pte->set_page_dirty) * ->i_pages lock (folio_remove_rmap_pte->set_page_dirty) * bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty) * ->inode->i_lock (folio_remove_rmap_pte->set_page_dirty) * ->memcg->move_lock (folio_remove_rmap_pte->folio_memcg_lock) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->block_dirty_folio) */ static void mapping_set_update(struct xa_state *xas, struct address_space *mapping) { if (dax_mapping(mapping) || shmem_mapping(mapping)) return; xas_set_update(xas, workingset_update_node); xas_set_lru(xas, &shadow_nodes); } static void page_cache_delete(struct address_space *mapping, struct folio *folio, void *shadow) { XA_STATE(xas, &mapping->i_pages, folio->index); long nr = 1; mapping_set_update(&xas, mapping); xas_set_order(&xas, folio->index, folio_order(folio)); nr = folio_nr_pages(folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); xas_store(&xas, shadow); xas_init_marks(&xas); folio->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ mapping->nrpages -= nr; } static void filemap_unaccount_folio(struct address_space *mapping, struct folio *folio) { long nr; VM_BUG_ON_FOLIO(folio_mapped(folio), folio); if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) { pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n", current->comm, folio_pfn(folio)); dump_page(&folio->page, "still mapped when deleted"); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); if (mapping_exiting(mapping) && !folio_test_large(folio)) { int mapcount = folio_mapcount(folio); if (folio_ref_count(folio) >= mapcount + 2) { /* * All vmas have already been torn down, so it's * a good bet that actually the page is unmapped * and we'd rather not leak it: if we're wrong, * another bad page check should catch it later. */ page_mapcount_reset(&folio->page); folio_ref_sub(folio, mapcount); } } } /* hugetlb folios do not participate in page cache accounting. */ if (folio_test_hugetlb(folio)) return; nr = folio_nr_pages(folio); __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); if (folio_test_swapbacked(folio)) { __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); if (folio_test_pmd_mappable(folio)) __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); } else if (folio_test_pmd_mappable(folio)) { __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } /* * At this point folio must be either written or cleaned by * truncate. Dirty folio here signals a bug and loss of * unwritten data - on ordinary filesystems. * * But it's harmless on in-memory filesystems like tmpfs; and can * occur when a driver which did get_user_pages() sets page dirty * before putting it, while the inode is being finally evicted. * * Below fixes dirty accounting after removing the folio entirely * but leaves the dirty flag set: it has no effect for truncated * folio and anyway will be cleared before returning folio to * buddy allocator. */ if (WARN_ON_ONCE(folio_test_dirty(folio) && mapping_can_writeback(mapping))) folio_account_cleaned(folio, inode_to_wb(mapping->host)); } /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold the i_pages lock. */ void __filemap_remove_folio(struct folio *folio, void *shadow) { struct address_space *mapping = folio->mapping; trace_mm_filemap_delete_from_page_cache(folio); filemap_unaccount_folio(mapping, folio); page_cache_delete(mapping, folio, shadow); } void filemap_free_folio(struct address_space *mapping, struct folio *folio) { void (*free_folio)(struct folio *); int refs = 1; free_folio = mapping->a_ops->free_folio; if (free_folio) free_folio(folio); if (folio_test_large(folio)) refs = folio_nr_pages(folio); folio_put_refs(folio, refs); } /** * filemap_remove_folio - Remove folio from page cache. * @folio: The folio. * * This must be called only on folios that are locked and have been * verified to be in the page cache. It will never put the folio into * the free list because the caller has a reference on the page. */ void filemap_remove_folio(struct folio *folio) { struct address_space *mapping = folio->mapping; BUG_ON(!folio_test_locked(folio)); spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); filemap_free_folio(mapping, folio); } /* * page_cache_delete_batch - delete several folios from page cache * @mapping: the mapping to which folios belong * @fbatch: batch of folios to delete * * The function walks over mapping->i_pages and removes folios passed in * @fbatch from the mapping. The function expects @fbatch to be sorted * by page index and is optimised for it to be dense. * It tolerates holes in @fbatch (mapping entries at those indices are not * modified). * * The function expects the i_pages lock to be held. */ static void page_cache_delete_batch(struct address_space *mapping, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index); long total_pages = 0; int i = 0; struct folio *folio; mapping_set_update(&xas, mapping); xas_for_each(&xas, folio, ULONG_MAX) { if (i >= folio_batch_count(fbatch)) break; /* A swap/dax/shadow entry got inserted? Skip it. */ if (xa_is_value(folio)) continue; /* * A page got inserted in our range? Skip it. We have our * pages locked so they are protected from being removed. * If we see a page whose index is higher than ours, it * means our page has been removed, which shouldn't be * possible because we're holding the PageLock. */ if (folio != fbatch->folios[i]) { VM_BUG_ON_FOLIO(folio->index > fbatch->folios[i]->index, folio); continue; } WARN_ON_ONCE(!folio_test_locked(folio)); folio->mapping = NULL; /* Leave folio->index set: truncation lookup relies on it */ i++; xas_store(&xas, NULL); total_pages += folio_nr_pages(folio); } mapping->nrpages -= total_pages; } void delete_from_page_cache_batch(struct address_space *mapping, struct folio_batch *fbatch) { int i; if (!folio_batch_count(fbatch)) return; spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; trace_mm_filemap_delete_from_page_cache(folio); filemap_unaccount_folio(mapping, folio); } page_cache_delete_batch(mapping, fbatch); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); for (i = 0; i < folio_batch_count(fbatch); i++) filemap_free_folio(mapping, fbatch->folios[i]); } int filemap_check_errors(struct address_space *mapping) { int ret = 0; /* Check for outstanding write errors */ if (test_bit(AS_ENOSPC, &mapping->flags) && test_and_clear_bit(AS_ENOSPC, &mapping->flags)) ret = -ENOSPC; if (test_bit(AS_EIO, &mapping->flags) && test_and_clear_bit(AS_EIO, &mapping->flags)) ret = -EIO; return ret; } EXPORT_SYMBOL(filemap_check_errors); static int filemap_check_and_keep_errors(struct address_space *mapping) { /* Check for outstanding write errors */ if (test_bit(AS_EIO, &mapping->flags)) return -EIO; if (test_bit(AS_ENOSPC, &mapping->flags)) return -ENOSPC; return 0; } /** * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range * @mapping: address space structure to write * @wbc: the writeback_control controlling the writeout * * Call writepages on the mapping using the provided wbc to control the * writeout. * * Return: %0 on success, negative error code otherwise. */ int filemap_fdatawrite_wbc(struct address_space *mapping, struct writeback_control *wbc) { int ret; if (!mapping_can_writeback(mapping) || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; wbc_attach_fdatawrite_inode(wbc, mapping->host); ret = do_writepages(mapping, wbc); wbc_detach_inode(wbc); return ret; } EXPORT_SYMBOL(filemap_fdatawrite_wbc); /** * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write * @start: offset in bytes where the range starts * @end: offset in bytes where the range ends (inclusive) * @sync_mode: enable synchronous operation * * Start writeback against all of a mapping's dirty pages that lie * within the byte offsets <start, end> inclusive. * * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as * opposed to a regular memory cleansing writeback. The difference between * these two operations is that if a dirty page/buffer is encountered, it must * be waited upon, and not just skipped over. * * Return: %0 on success, negative error code otherwise. */ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode) { struct writeback_control wbc = { .sync_mode = sync_mode, .nr_to_write = LONG_MAX, .range_start = start, .range_end = end, }; return filemap_fdatawrite_wbc(mapping, &wbc); } static inline int __filemap_fdatawrite(struct address_space *mapping, int sync_mode) { return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); } int filemap_fdatawrite(struct address_space *mapping) { return __filemap_fdatawrite(mapping, WB_SYNC_ALL); } EXPORT_SYMBOL(filemap_fdatawrite); int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end) { return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); } EXPORT_SYMBOL(filemap_fdatawrite_range); /** * filemap_flush - mostly a non-blocking flush * @mapping: target address_space * * This is a mostly non-blocking flush. Not suitable for data-integrity * purposes - I/O may not be started against all dirty pages. * * Return: %0 on success, negative error code otherwise. */ int filemap_flush(struct address_space *mapping) { return __filemap_fdatawrite(mapping, WB_SYNC_NONE); } EXPORT_SYMBOL(filemap_flush); /** * filemap_range_has_page - check if a page exists in range. * @mapping: address space within which to check * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Find at least one page in the range supplied, usually used to check if * direct writing in this range will trigger a writeback. * * Return: %true if at least one page exists in the specified range, * %false otherwise. */ bool filemap_range_has_page(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { struct folio *folio; XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; if (end_byte < start_byte) return false; rcu_read_lock(); for (;;) { folio = xas_find(&xas, max); if (xas_retry(&xas, folio)) continue; /* Shadow entries don't count */ if (xa_is_value(folio)) continue; /* * We don't need to try to pin this page; we're about to * release the RCU lock anyway. It is enough to know that * there was a page here recently. */ break; } rcu_read_unlock(); return folio != NULL; } EXPORT_SYMBOL(filemap_range_has_page); static void __filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { pgoff_t index = start_byte >> PAGE_SHIFT; pgoff_t end = end_byte >> PAGE_SHIFT; struct folio_batch fbatch; unsigned nr_folios; folio_batch_init(&fbatch); while (index <= end) { unsigned i; nr_folios = filemap_get_folios_tag(mapping, &index, end, PAGECACHE_TAG_WRITEBACK, &fbatch); if (!nr_folios) break; for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; folio_wait_writeback(folio); folio_clear_error(folio); } folio_batch_release(&fbatch); cond_resched(); } } /** * filemap_fdatawait_range - wait for writeback to complete * @mapping: address space structure to wait for * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Walk the list of under-writeback pages of the given address space * in the given range and wait for all of them. Check error status of * the address space and return it. * * Since the error status of the address space is cleared by this function, * callers are responsible for checking the return value and handling and/or * reporting the error. * * Return: error status of the address space. */ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { __filemap_fdatawait_range(mapping, start_byte, end_byte); return filemap_check_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_range); /** * filemap_fdatawait_range_keep_errors - wait for writeback to complete * @mapping: address space structure to wait for * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Walk the list of under-writeback pages of the given address space in the * given range and wait for all of them. Unlike filemap_fdatawait_range(), * this function does not clear error status of the address space. * * Use this function if callers don't handle errors themselves. Expected * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), * fsfreeze(8) */ int filemap_fdatawait_range_keep_errors(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { __filemap_fdatawait_range(mapping, start_byte, end_byte); return filemap_check_and_keep_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors); /** * file_fdatawait_range - wait for writeback to complete * @file: file pointing to address space structure to wait for * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Walk the list of under-writeback pages of the address space that file * refers to, in the given range and wait for all of them. Check error * status of the address space vs. the file->f_wb_err cursor and return it. * * Since the error status of the file is advanced by this function, * callers are responsible for checking the return value and handling and/or * reporting the error. * * Return: error status of the address space vs. the file->f_wb_err cursor. */ int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte) { struct address_space *mapping = file->f_mapping; __filemap_fdatawait_range(mapping, start_byte, end_byte); return file_check_and_advance_wb_err(file); } EXPORT_SYMBOL(file_fdatawait_range); /** * filemap_fdatawait_keep_errors - wait for writeback without clearing errors * @mapping: address space structure to wait for * * Walk the list of under-writeback pages of the given address space * and wait for all of them. Unlike filemap_fdatawait(), this function * does not clear error status of the address space. * * Use this function if callers don't handle errors themselves. Expected * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), * fsfreeze(8) * * Return: error status of the address space. */ int filemap_fdatawait_keep_errors(struct address_space *mapping) { __filemap_fdatawait_range(mapping, 0, LLONG_MAX); return filemap_check_and_keep_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_keep_errors); /* Returns true if writeback might be needed or already in progress. */ static bool mapping_needs_writeback(struct address_space *mapping) { return mapping->nrpages; } bool filemap_range_has_writeback(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; struct folio *folio; if (end_byte < start_byte) return false; rcu_read_lock(); xas_for_each(&xas, folio, max) { if (xas_retry(&xas, folio)) continue; if (xa_is_value(folio)) continue; if (folio_test_dirty(folio) || folio_test_locked(folio) || folio_test_writeback(folio)) break; } rcu_read_unlock(); return folio != NULL; } EXPORT_SYMBOL_GPL(filemap_range_has_writeback); /** * filemap_write_and_wait_range - write out & wait on a file range * @mapping: the address_space for the pages * @lstart: offset in bytes where the range starts * @lend: offset in bytes where the range ends (inclusive) * * Write out and wait upon file offsets lstart->lend, inclusive. * * Note that @lend is inclusive (describes the last byte to be written) so * that this function can be used to write to the very end-of-file (end = -1). * * Return: error status of the address space. */ int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) { int err = 0, err2; if (lend < lstart) return 0; if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); /* * Even if the above returned error, the pages may be * written partially (e.g. -ENOSPC), so we wait for it. * But the -EIO is special case, it may indicate the worst * thing (e.g. bug) happened, so we avoid waiting for it. */ if (err != -EIO) __filemap_fdatawait_range(mapping, lstart, lend); } err2 = filemap_check_errors(mapping); if (!err) err = err2; return err; } EXPORT_SYMBOL(filemap_write_and_wait_range); void __filemap_set_wb_err(struct address_space *mapping, int err) { errseq_t eseq = errseq_set(&mapping->wb_err, err); trace_filemap_set_wb_err(mapping, eseq); } EXPORT_SYMBOL(__filemap_set_wb_err); /** * file_check_and_advance_wb_err - report wb error (if any) that was previously * and advance wb_err to current one * @file: struct file on which the error is being reported * * When userland calls fsync (or something like nfsd does the equivalent), we * want to report any writeback errors that occurred since the last fsync (or * since the file was opened if there haven't been any). * * Grab the wb_err from the mapping. If it matches what we have in the file, * then just quickly return 0. The file is all caught up. * * If it doesn't match, then take the mapping value, set the "seen" flag in * it and try to swap it into place. If it works, or another task beat us * to it with the new value, then update the f_wb_err and return the error * portion. The error at this point must be reported via proper channels * (a'la fsync, or NFS COMMIT operation, etc.). * * While we handle mapping->wb_err with atomic operations, the f_wb_err * value is protected by the f_lock since we must ensure that it reflects * the latest value swapped in for this file descriptor. * * Return: %0 on success, negative error code otherwise. */ int file_check_and_advance_wb_err(struct file *file) { int err = 0; errseq_t old = READ_ONCE(file->f_wb_err); struct address_space *mapping = file->f_mapping; /* Locklessly handle the common case where nothing has changed */ if (errseq_check(&mapping->wb_err, old)) { /* Something changed, must use slow path */ spin_lock(&file->f_lock); old = file->f_wb_err; err = errseq_check_and_advance(&mapping->wb_err, &file->f_wb_err); trace_file_check_and_advance_wb_err(file, old); spin_unlock(&file->f_lock); } /* * We're mostly using this function as a drop in replacement for * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect * that the legacy code would have had on these flags. */ clear_bit(AS_EIO, &mapping->flags); clear_bit(AS_ENOSPC, &mapping->flags); return err; } EXPORT_SYMBOL(file_check_and_advance_wb_err); /** * file_write_and_wait_range - write out & wait on a file range * @file: file pointing to address_space with pages * @lstart: offset in bytes where the range starts * @lend: offset in bytes where the range ends (inclusive) * * Write out and wait upon file offsets lstart->lend, inclusive. * * Note that @lend is inclusive (describes the last byte to be written) so * that this function can be used to write to the very end-of-file (end = -1). * * After writing out and waiting on the data, we check and advance the * f_wb_err cursor to the latest value, and return any errors detected there. * * Return: %0 on success, negative error code otherwise. */ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) { int err = 0, err2; struct address_space *mapping = file->f_mapping; if (lend < lstart) return 0; if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); /* See comment of filemap_write_and_wait() */ if (err != -EIO) __filemap_fdatawait_range(mapping, lstart, lend); } err2 = file_check_and_advance_wb_err(file); if (!err) err = err2; return err; } EXPORT_SYMBOL(file_write_and_wait_range); /** * replace_page_cache_folio - replace a pagecache folio with a new one * @old: folio to be replaced * @new: folio to replace with * * This function replaces a folio in the pagecache with a new one. On * success it acquires the pagecache reference for the new folio and * drops it for the old folio. Both the old and new folios must be * locked. This function does not add the new folio to the LRU, the * caller must do that. * * The remove + add is atomic. This function cannot fail. */ void replace_page_cache_folio(struct folio *old, struct folio *new) { struct address_space *mapping = old->mapping; void (*free_folio)(struct folio *) = mapping->a_ops->free_folio; pgoff_t offset = old->index; XA_STATE(xas, &mapping->i_pages, offset); VM_BUG_ON_FOLIO(!folio_test_locked(old), old); VM_BUG_ON_FOLIO(!folio_test_locked(new), new); VM_BUG_ON_FOLIO(new->mapping, new); folio_get(new); new->mapping = mapping; new->index = offset; mem_cgroup_replace_folio(old, new); xas_lock_irq(&xas); xas_store(&xas, new); old->mapping = NULL; /* hugetlb pages do not participate in page cache accounting. */ if (!folio_test_hugetlb(old)) __lruvec_stat_sub_folio(old, NR_FILE_PAGES); if (!folio_test_hugetlb(new)) __lruvec_stat_add_folio(new, NR_FILE_PAGES); if (folio_test_swapbacked(old)) __lruvec_stat_sub_folio(old, NR_SHMEM); if (folio_test_swapbacked(new)) __lruvec_stat_add_folio(new, NR_SHMEM); xas_unlock_irq(&xas); if (free_folio) free_folio(old); folio_put(old); } EXPORT_SYMBOL_GPL(replace_page_cache_folio); noinline int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) { XA_STATE(xas, &mapping->i_pages, index); void *alloced_shadow = NULL; int alloced_order = 0; bool huge; long nr; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio); mapping_set_update(&xas, mapping); VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); xas_set_order(&xas, index, folio_order(folio)); huge = folio_test_hugetlb(folio); nr = folio_nr_pages(folio); gfp &= GFP_RECLAIM_MASK; folio_ref_add(folio, nr); folio->mapping = mapping; folio->index = xas.xa_index; for (;;) { int order = -1, split_order = 0; void *entry, *old = NULL; xas_lock_irq(&xas); xas_for_each_conflict(&xas, entry) { old = entry; if (!xa_is_value(entry)) { xas_set_err(&xas, -EEXIST); goto unlock; } /* * If a larger entry exists, * it will be the first and only entry iterated. */ if (order == -1) order = xas_get_order(&xas); } /* entry may have changed before we re-acquire the lock */ if (alloced_order && (old != alloced_shadow || order != alloced_order)) { xas_destroy(&xas); alloced_order = 0; } if (old) { if (order > 0 && order > folio_order(folio)) { /* How to handle large swap entries? */ BUG_ON(shmem_mapping(mapping)); if (!alloced_order) { split_order = order; goto unlock; } xas_split(&xas, old, order); xas_reset(&xas); } if (shadowp) *shadowp = old; } xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; mapping->nrpages += nr; /* hugetlb pages do not participate in page cache accounting */ if (!huge) { __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); if (folio_test_pmd_mappable(folio)) __lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr); } unlock: xas_unlock_irq(&xas); /* split needed, alloc here and retry. */ if (split_order) { xas_split_alloc(&xas, old, split_order, gfp); if (xas_error(&xas)) goto error; alloced_shadow = old; alloced_order = split_order; xas_reset(&xas); continue; } if (!xas_nomem(&xas, gfp)) break; } if (xas_error(&xas)) goto error; trace_mm_filemap_add_to_page_cache(folio); return 0; error: folio->mapping = NULL; /* Leave page->index set: truncation relies upon it */ folio_put_refs(folio, nr); return xas_error(&xas); } ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO); int filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp) { void *shadow = NULL; int ret; ret = mem_cgroup_charge(folio, NULL, gfp); if (ret) return ret; __folio_set_locked(folio); ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow); if (unlikely(ret)) { mem_cgroup_uncharge(folio); __folio_clear_locked(folio); } else { /* * The folio might have been evicted from cache only * recently, in which case it should be activated like * any other repeatedly accessed folio. * The exception is folios getting rewritten; evicting other * data from the working set, only to cache data that will * get overwritten with something else, is a waste of memory. */ WARN_ON_ONCE(folio_test_active(folio)); if (!(gfp & __GFP_WRITE) && shadow) workingset_refault(folio, shadow); folio_add_lru(folio); } return ret; } EXPORT_SYMBOL_GPL(filemap_add_folio); #ifdef CONFIG_NUMA struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) { int n; struct folio *folio; if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { cpuset_mems_cookie = read_mems_allowed_begin(); n = cpuset_mem_spread_node(); folio = __folio_alloc_node_noprof(gfp, order, n); } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie)); return folio; } return folio_alloc_noprof(gfp, order); } EXPORT_SYMBOL(filemap_alloc_folio_noprof); #endif /* * filemap_invalidate_lock_two - lock invalidate_lock for two mappings * * Lock exclusively invalidate_lock of any passed mapping that is not NULL. * * @mapping1: the first mapping to lock * @mapping2: the second mapping to lock */ void filemap_invalidate_lock_two(struct address_space *mapping1, struct address_space *mapping2) { if (mapping1 > mapping2) swap(mapping1, mapping2); if (mapping1) down_write(&mapping1->invalidate_lock); if (mapping2 && mapping1 != mapping2) down_write_nested(&mapping2->invalidate_lock, 1); } EXPORT_SYMBOL(filemap_invalidate_lock_two); /* * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings * * Unlock exclusive invalidate_lock of any passed mapping that is not NULL. * * @mapping1: the first mapping to unlock * @mapping2: the second mapping to unlock */ void filemap_invalidate_unlock_two(struct address_space *mapping1, struct address_space *mapping2) { if (mapping1) up_write(&mapping1->invalidate_lock); if (mapping2 && mapping1 != mapping2) up_write(&mapping2->invalidate_lock); } EXPORT_SYMBOL(filemap_invalidate_unlock_two); /* * In order to wait for pages to become available there must be * waitqueues associated with pages. By using a hash table of * waitqueues where the bucket discipline is to maintain all * waiters on the same queue and wake all when any of the pages * become available, and for the woken contexts to check to be * sure the appropriate page became available, this saves space * at a cost of "thundering herd" phenomena during rare hash * collisions. */ #define PAGE_WAIT_TABLE_BITS 8 #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS) static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned; static wait_queue_head_t *folio_waitqueue(struct folio *folio) { return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)]; } void __init pagecache_init(void) { int i; for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++) init_waitqueue_head(&folio_wait_table[i]); page_writeback_init(); } /* * The page wait code treats the "wait->flags" somewhat unusually, because * we have multiple different kinds of waits, not just the usual "exclusive" * one. * * We have: * * (a) no special bits set: * * We're just waiting for the bit to be released, and when a waker * calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up, * and remove it from the wait queue. * * Simple and straightforward. * * (b) WQ_FLAG_EXCLUSIVE: * * The waiter is waiting to get the lock, and only one waiter should * be woken up to avoid any thundering herd behavior. We'll set the * WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue. * * This is the traditional exclusive wait. * * (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM: * * The waiter is waiting to get the bit, and additionally wants the * lock to be transferred to it for fair lock behavior. If the lock * cannot be taken, we stop walking the wait queue without waking * the waiter. * * This is the "fair lock handoff" case, and in addition to setting * WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see * that it now has the lock. */ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { unsigned int flags; struct wait_page_key *key = arg; struct wait_page_queue *wait_page = container_of(wait, struct wait_page_queue, wait); if (!wake_page_match(wait_page, key)) return 0; /* * If it's a lock handoff wait, we get the bit for it, and * stop walking (and do not wake it up) if we can't. */ flags = wait->flags; if (flags & WQ_FLAG_EXCLUSIVE) { if (test_bit(key->bit_nr, &key->folio->flags)) return -1; if (flags & WQ_FLAG_CUSTOM) { if (test_and_set_bit(key->bit_nr, &key->folio->flags)) return -1; flags |= WQ_FLAG_DONE; } } /* * We are holding the wait-queue lock, but the waiter that * is waiting for this will be checking the flags without * any locking. * * So update the flags atomically, and wake up the waiter * afterwards to avoid any races. This store-release pairs * with the load-acquire in folio_wait_bit_common(). */ smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN); wake_up_state(wait->private, mode); /* * Ok, we have successfully done what we're waiting for, * and we can unconditionally remove the wait entry. * * Note that this pairs with the "finish_wait()" in the * waiter, and has to be the absolute last thing we do. * After this list_del_init(&wait->entry) the wait entry * might be de-allocated and the process might even have * exited. */ list_del_init_careful(&wait->entry); return (flags & WQ_FLAG_EXCLUSIVE) != 0; } static void folio_wake_bit(struct folio *folio, int bit_nr) { wait_queue_head_t *q = folio_waitqueue(folio); struct wait_page_key key; unsigned long flags; key.folio = folio; key.bit_nr = bit_nr; key.page_match = 0; spin_lock_irqsave(&q->lock, flags); __wake_up_locked_key(q, TASK_NORMAL, &key); /* * It's possible to miss clearing waiters here, when we woke our page * waiters, but the hashed waitqueue has waiters for other pages on it. * That's okay, it's a rare case. The next waker will clear it. * * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE, * other), the flag may be cleared in the course of freeing the page; * but that is not required for correctness. */ if (!waitqueue_active(q) || !key.page_match) folio_clear_waiters(folio); spin_unlock_irqrestore(&q->lock, flags); } /* * A choice of three behaviors for folio_wait_bit_common(): */ enum behavior { EXCLUSIVE, /* Hold ref to page and take the bit when woken, like * __folio_lock() waiting on then setting PG_locked. */ SHARED, /* Hold ref to page and check the bit when woken, like * folio_wait_writeback() waiting on PG_writeback. */ DROP, /* Drop ref to page before wait, no check when woken, * like folio_put_wait_locked() on PG_locked. */ }; /* * Attempt to check (or get) the folio flag, and mark us done * if successful. */ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr, struct wait_queue_entry *wait) { if (wait->flags & WQ_FLAG_EXCLUSIVE) { if (test_and_set_bit(bit_nr, &folio->flags)) return false; } else if (test_bit(bit_nr, &folio->flags)) return false; wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE; return true; } /* How many times do we accept lock stealing from under a waiter? */ int sysctl_page_lock_unfairness = 5; static inline int folio_wait_bit_common(struct folio *folio, int bit_nr, int state, enum behavior behavior) { wait_queue_head_t *q = folio_waitqueue(folio); int unfairness = sysctl_page_lock_unfairness; struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; unsigned long pflags; bool in_thrashing; if (bit_nr == PG_locked && !folio_test_uptodate(folio) && folio_test_workingset(folio)) { delayacct_thrashing_start(&in_thrashing); psi_memstall_enter(&pflags); thrashing = true; } init_wait(wait); wait->func = wake_page_function; wait_page.folio = folio; wait_page.bit_nr = bit_nr; repeat: wait->flags = 0; if (behavior == EXCLUSIVE) { wait->flags = WQ_FLAG_EXCLUSIVE; if (--unfairness < 0) wait->flags |= WQ_FLAG_CUSTOM; } /* * Do one last check whether we can get the * page bit synchronously. * * Do the folio_set_waiters() marking before that * to let any waker we _just_ missed know they * need to wake us up (otherwise they'll never * even go to the slow case that looks at the * page queue), and add ourselves to the wait * queue if we need to sleep. * * This part needs to be done under the queue * lock to avoid races. */ spin_lock_irq(&q->lock); folio_set_waiters(folio); if (!folio_trylock_flag(folio, bit_nr, wait)) __add_wait_queue_entry_tail(q, wait); spin_unlock_irq(&q->lock); /* * From now on, all the logic will be based on * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to * see whether the page bit testing has already * been done by the wake function. * * We can drop our reference to the folio. */ if (behavior == DROP) folio_put(folio); /* * Note that until the "finish_wait()", or until * we see the WQ_FLAG_WOKEN flag, we need to * be very careful with the 'wait->flags', because * we may race with a waker that sets them. */ for (;;) { unsigned int flags; set_current_state(state); /* Loop until we've been woken or interrupted */ flags = smp_load_acquire(&wait->flags); if (!(flags & WQ_FLAG_WOKEN)) { if (signal_pending_state(state, current)) break; io_schedule(); continue; } /* If we were non-exclusive, we're done */ if (behavior != EXCLUSIVE) break; /* If the waker got the lock for us, we're done */ if (flags & WQ_FLAG_DONE) break; /* * Otherwise, if we're getting the lock, we need to * try to get it ourselves. * * And if that fails, we'll have to retry this all. */ if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0)))) goto repeat; wait->flags |= WQ_FLAG_DONE; break; } /* * If a signal happened, this 'finish_wait()' may remove the last * waiter from the wait-queues, but the folio waiters bit will remain * set. That's ok. The next wakeup will take care of it, and trying * to do it here would be difficult and prone to races. */ finish_wait(q, wait); if (thrashing) { delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); } /* * NOTE! The wait->flags weren't stable until we've done the * 'finish_wait()', and we could have exited the loop above due * to a signal, and had a wakeup event happen after the signal * test but before the 'finish_wait()'. * * So only after the finish_wait() can we reliably determine * if we got woken up or not, so we can now figure out the final * return value based on that state without races. * * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive * waiter, but an exclusive one requires WQ_FLAG_DONE. */ if (behavior == EXCLUSIVE) return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR; return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR; } #ifdef CONFIG_MIGRATION /** * migration_entry_wait_on_locked - Wait for a migration entry to be removed * @entry: migration swap entry. * @ptl: already locked ptl. This function will drop the lock. * * Wait for a migration entry referencing the given page to be removed. This is * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except * this can be called without taking a reference on the page. Instead this * should be called while holding the ptl for the migration entry referencing * the page. * * Returns after unlocking the ptl. * * This follows the same logic as folio_wait_bit_common() so see the comments * there. */ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) __releases(ptl) { struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; unsigned long pflags; bool in_thrashing; wait_queue_head_t *q; struct folio *folio = pfn_swap_entry_folio(entry); q = folio_waitqueue(folio); if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) { delayacct_thrashing_start(&in_thrashing); psi_memstall_enter(&pflags); thrashing = true; } init_wait(wait); wait->func = wake_page_function; wait_page.folio = folio; wait_page.bit_nr = PG_locked; wait->flags = 0; spin_lock_irq(&q->lock); folio_set_waiters(folio); if (!folio_trylock_flag(folio, PG_locked, wait)) __add_wait_queue_entry_tail(q, wait); spin_unlock_irq(&q->lock); /* * If a migration entry exists for the page the migration path must hold * a valid reference to the page, and it must take the ptl to remove the * migration entry. So the page is valid until the ptl is dropped. */ spin_unlock(ptl); for (;;) { unsigned int flags; set_current_state(TASK_UNINTERRUPTIBLE); /* Loop until we've been woken or interrupted */ flags = smp_load_acquire(&wait->flags); if (!(flags & WQ_FLAG_WOKEN)) { if (signal_pending_state(TASK_UNINTERRUPTIBLE, current)) break; io_schedule(); continue; } break; } finish_wait(q, wait); if (thrashing) { delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); } } #endif void folio_wait_bit(struct folio *folio, int bit_nr) { folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED); } EXPORT_SYMBOL(folio_wait_bit); int folio_wait_bit_killable(struct folio *folio, int bit_nr) { return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED); } EXPORT_SYMBOL(folio_wait_bit_killable); /** * folio_put_wait_locked - Drop a reference and wait for it to be unlocked * @folio: The folio to wait for. * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc). * * The caller should hold a reference on @folio. They expect the page to * become unlocked relatively soon, but do not wish to hold up migration * (for example) by holding the reference while waiting for the folio to * come unlocked. After this function returns, the caller should not * dereference @folio. * * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal. */ static int folio_put_wait_locked(struct folio *folio, int state) { return folio_wait_bit_common(folio, PG_locked, state, DROP); } /** * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue * @folio: Folio defining the wait queue of interest * @waiter: Waiter to add to the queue * * Add an arbitrary @waiter to the wait queue for the nominated @folio. */ void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter) { wait_queue_head_t *q = folio_waitqueue(folio); unsigned long flags; spin_lock_irqsave(&q->lock, flags); __add_wait_queue_entry_tail(q, waiter); folio_set_waiters(folio); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL_GPL(folio_add_wait_queue); /** * folio_unlock - Unlock a locked folio. * @folio: The folio. * * Unlocks the folio and wakes up any thread sleeping on the page lock. * * Context: May be called from interrupt or process context. May not be * called from NMI context. */ void folio_unlock(struct folio *folio) { /* Bit 7 allows x86 to check the byte's sign bit */ BUILD_BUG_ON(PG_waiters != 7); BUILD_BUG_ON(PG_locked > 7); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (folio_xor_flags_has_waiters(folio, 1 << PG_locked)) folio_wake_bit(folio, PG_locked); } EXPORT_SYMBOL(folio_unlock); /** * folio_end_read - End read on a folio. * @folio: The folio. * @success: True if all reads completed successfully. * * When all reads against a folio have completed, filesystems should * call this function to let the pagecache know that no more reads * are outstanding. This will unlock the folio and wake up any thread * sleeping on the lock. The folio will also be marked uptodate if all * reads succeeded. * * Context: May be called from interrupt or process context. May not be * called from NMI context. */ void folio_end_read(struct folio *folio, bool success) { unsigned long mask = 1 << PG_locked; /* Must be in bottom byte for x86 to work */ BUILD_BUG_ON(PG_uptodate > 7); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio); if (likely(success)) mask |= 1 << PG_uptodate; if (folio_xor_flags_has_waiters(folio, mask)) folio_wake_bit(folio, PG_locked); } EXPORT_SYMBOL(folio_end_read); /** * folio_end_private_2 - Clear PG_private_2 and wake any waiters. * @folio: The folio. * * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for * it. The folio reference held for PG_private_2 being set is released. * * This is, for example, used when a netfs folio is being written to a local * disk cache, thereby allowing writes to the cache for the same folio to be * serialised. */ void folio_end_private_2(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio); clear_bit_unlock(PG_private_2, folio_flags(folio, 0)); folio_wake_bit(folio, PG_private_2); folio_put(folio); } EXPORT_SYMBOL(folio_end_private_2); /** * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * * Wait for PG_private_2 to be cleared on a folio. */ void folio_wait_private_2(struct folio *folio) { while (folio_test_private_2(folio)) folio_wait_bit(folio, PG_private_2); } EXPORT_SYMBOL(folio_wait_private_2); /** * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * * Wait for PG_private_2 to be cleared on a folio or until a fatal signal is * received by the calling task. * * Return: * - 0 if successful. * - -EINTR if a fatal signal was encountered. */ int folio_wait_private_2_killable(struct folio *folio) { int ret = 0; while (folio_test_private_2(folio)) { ret = folio_wait_bit_killable(folio, PG_private_2); if (ret < 0) break; } return ret; } EXPORT_SYMBOL(folio_wait_private_2_killable); /** * folio_end_writeback - End writeback against a folio. * @folio: The folio. * * The folio must actually be under writeback. * * Context: May be called from process or interrupt context. */ void folio_end_writeback(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); /* * folio_test_clear_reclaim() could be used here but it is an * atomic operation and overkill in this particular case. Failing * to shuffle a folio marked for immediate reclaim is too mild * a gain to justify taking an atomic operation penalty at the * end of every folio writeback. */ if (folio_test_reclaim(folio)) { folio_clear_reclaim(folio); folio_rotate_reclaimable(folio); } /* * Writeback does not hold a folio reference of its own, relying * on truncation to wait for the clearing of PG_writeback. * But here we must make sure that the folio is not freed and * reused before the folio_wake_bit(). */ folio_get(folio); if (__folio_end_writeback(folio)) folio_wake_bit(folio, PG_writeback); acct_reclaim_writeback(folio); folio_put(folio); } EXPORT_SYMBOL(folio_end_writeback); /** * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it. * @folio: The folio to lock */ void __folio_lock(struct folio *folio) { folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE, EXCLUSIVE); } EXPORT_SYMBOL(__folio_lock); int __folio_lock_killable(struct folio *folio) { return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE, EXCLUSIVE); } EXPORT_SYMBOL_GPL(__folio_lock_killable); static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait) { struct wait_queue_head *q = folio_waitqueue(folio); int ret; wait->folio = folio; wait->bit_nr = PG_locked; spin_lock_irq(&q->lock); __add_wait_queue_entry_tail(q, &wait->wait); folio_set_waiters(folio); ret = !folio_trylock(folio); /* * If we were successful now, we know we're still on the * waitqueue as we're still under the lock. This means it's * safe to remove and return success, we know the callback * isn't going to trigger. */ if (!ret) __remove_wait_queue(q, &wait->wait); else ret = -EIOCBQUEUED; spin_unlock_irq(&q->lock); return ret; } /* * Return values: * 0 - folio is locked. * non-zero - folio is not locked. * mmap_lock or per-VMA lock has been released (mmap_read_unlock() or * vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and * FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held. * * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0 * with the folio locked and the mmap_lock/per-VMA lock is left unperturbed. */ vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf) { unsigned int flags = vmf->flags; if (fault_flag_allow_retry_first(flags)) { /* * CAUTION! In this case, mmap_lock/per-VMA lock is not * released even though returning VM_FAULT_RETRY. */ if (flags & FAULT_FLAG_RETRY_NOWAIT) return VM_FAULT_RETRY; release_fault_lock(vmf); if (flags & FAULT_FLAG_KILLABLE) folio_wait_locked_killable(folio); else folio_wait_locked(folio); return VM_FAULT_RETRY; } if (flags & FAULT_FLAG_KILLABLE) { bool ret; ret = __folio_lock_killable(folio); if (ret) { release_fault_lock(vmf); return VM_FAULT_RETRY; } } else { __folio_lock(folio); } return 0; } /** * page_cache_next_miss() - Find the next gap in the page cache. * @mapping: Mapping. * @index: Index. * @max_scan: Maximum range to search. * * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the * gap with the lowest index. * * This function may be called under the rcu_read_lock. However, this will * not atomically search a snapshot of the cache at a single point in time. * For example, if a gap is created at index 5, then subsequently a gap is * created at index 10, page_cache_next_miss covering both indices may * return 10 if called under the rcu_read_lock. * * Return: The index of the gap if found, otherwise an index outside the * range specified (in which case 'return - index >= max_scan' will be true). * In the rare case of index wrap-around, 0 will be returned. */ pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { XA_STATE(xas, &mapping->i_pages, index); while (max_scan--) { void *entry = xas_next(&xas); if (!entry || xa_is_value(entry)) break; if (xas.xa_index == 0) break; } return xas.xa_index; } EXPORT_SYMBOL(page_cache_next_miss); /** * page_cache_prev_miss() - Find the previous gap in the page cache. * @mapping: Mapping. * @index: Index. * @max_scan: Maximum range to search. * * Search the range [max(index - max_scan + 1, 0), index] for the * gap with the highest index. * * This function may be called under the rcu_read_lock. However, this will * not atomically search a snapshot of the cache at a single point in time. * For example, if a gap is created at index 10, then subsequently a gap is * created at index 5, page_cache_prev_miss() covering both indices may * return 5 if called under the rcu_read_lock. * * Return: The index of the gap if found, otherwise an index outside the * range specified (in which case 'index - return >= max_scan' will be true). * In the rare case of wrap-around, ULONG_MAX will be returned. */ pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { XA_STATE(xas, &mapping->i_pages, index); while (max_scan--) { void *entry = xas_prev(&xas); if (!entry || xa_is_value(entry)) break; if (xas.xa_index == ULONG_MAX) break; } return xas.xa_index; } EXPORT_SYMBOL(page_cache_prev_miss); /* * Lockless page cache protocol: * On the lookup side: * 1. Load the folio from i_pages * 2. Increment the refcount if it's not zero * 3. If the folio is not found by xas_reload(), put the refcount and retry * * On the removal side: * A. Freeze the page (by zeroing the refcount if nobody else has a reference) * B. Remove the page from i_pages * C. Return the page to the page allocator * * This means that any page may have its reference count temporarily * increased by a speculative page cache (or GUP-fast) lookup as it can * be allocated by another user before the RCU grace period expires. * Because the refcount temporarily acquired here may end up being the * last refcount on the page, any page allocation must be freeable by * folio_put(). */ /* * filemap_get_entry - Get a page cache entry. * @mapping: the address_space to search * @index: The page cache index. * * Looks up the page cache entry at @mapping & @index. If it is a folio, * it is returned with an increased refcount. If it is a shadow entry * of a previously evicted folio, or a swap entry from shmem/tmpfs, * it is returned without further action. * * Return: The folio, swap or shadow entry, %NULL if nothing is found. */ void *filemap_get_entry(struct address_space *mapping, pgoff_t index) { XA_STATE(xas, &mapping->i_pages, index); struct folio *folio; rcu_read_lock(); repeat: xas_reset(&xas); folio = xas_load(&xas); if (xas_retry(&xas, folio)) goto repeat; /* * A shadow entry of a recently evicted page, or a swap entry from * shmem/tmpfs. Return it without attempting to raise page count. */ if (!folio || xa_is_value(folio)) goto out; if (!folio_try_get_rcu(folio)) goto repeat; if (unlikely(folio != xas_reload(&xas))) { folio_put(folio); goto repeat; } out: rcu_read_unlock(); return folio; } /** * __filemap_get_folio - Find and get a reference to a folio. * @mapping: The address_space to search. * @index: The page index. * @fgp_flags: %FGP flags modify how the folio is returned. * @gfp: Memory allocation flags to use if %FGP_CREAT is specified. * * Looks up the page cache entry at @mapping & @index. * * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even * if the %GFP flags specified for %FGP_CREAT are atomic. * * If this function returns a folio, it is returned with an increased refcount. * * Return: The found folio or an ERR_PTR() otherwise. */ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp) { struct folio *folio; repeat: folio = filemap_get_entry(mapping, index); if (xa_is_value(folio)) folio = NULL; if (!folio) goto no_page; if (fgp_flags & FGP_LOCK) { if (fgp_flags & FGP_NOWAIT) { if (!folio_trylock(folio)) { folio_put(folio); return ERR_PTR(-EAGAIN); } } else { folio_lock(folio); } /* Has the page been truncated? */ if (unlikely(folio->mapping != mapping)) { folio_unlock(folio); folio_put(folio); goto repeat; } VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); } if (fgp_flags & FGP_ACCESSED) folio_mark_accessed(folio); else if (fgp_flags & FGP_WRITE) { /* Clear idle flag for buffer write */ if (folio_test_idle(folio)) folio_clear_idle(folio); } if (fgp_flags & FGP_STABLE) folio_wait_stable(folio); no_page: if (!folio && (fgp_flags & FGP_CREAT)) { unsigned order = FGF_GET_ORDER(fgp_flags); int err; if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping)) gfp |= __GFP_WRITE; if (fgp_flags & FGP_NOFS) gfp &= ~__GFP_FS; if (fgp_flags & FGP_NOWAIT) { gfp &= ~GFP_KERNEL; gfp |= GFP_NOWAIT | __GFP_NOWARN; } if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) fgp_flags |= FGP_LOCK; if (!mapping_large_folio_support(mapping)) order = 0; if (order > MAX_PAGECACHE_ORDER) order = MAX_PAGECACHE_ORDER; /* If we're not aligned, allocate a smaller folio */ if (index & ((1UL << order) - 1)) order = __ffs(index); do { gfp_t alloc_gfp = gfp; err = -ENOMEM; if (order > 0) alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; folio = filemap_alloc_folio(alloc_gfp, order); if (!folio) continue; /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) __folio_set_referenced(folio); err = filemap_add_folio(mapping, folio, index, gfp); if (!err) break; folio_put(folio); folio = NULL; } while (order-- > 0); if (err == -EEXIST) goto repeat; if (err) return ERR_PTR(err); /* * filemap_add_folio locks the page, and for mmap * we expect an unlocked page. */ if (folio && (fgp_flags & FGP_FOR_MMAP)) folio_unlock(folio); } if (!folio) return ERR_PTR(-ENOENT); return folio; } EXPORT_SYMBOL(__filemap_get_folio); static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max, xa_mark_t mark) { struct folio *folio; retry: if (mark == XA_PRESENT) folio = xas_find(xas, max); else folio = xas_find_marked(xas, max, mark); if (xas_retry(xas, folio)) goto retry; /* * A shadow entry of a recently evicted page, a swap * entry from shmem/tmpfs or a DAX entry. Return it * without attempting to raise page count. */ if (!folio || xa_is_value(folio)) return folio; if (!folio_try_get_rcu(folio)) goto reset; if (unlikely(folio != xas_reload(xas))) { folio_put(folio); goto reset; } return folio; reset: xas_reset(xas); goto retry; } /** * find_get_entries - gang pagecache lookup * @mapping: The address_space to search * @start: The starting page cache index * @end: The final page index (inclusive). * @fbatch: Where the resulting entries are placed. * @indices: The cache indices corresponding to the entries in @entries * * find_get_entries() will search for and return a batch of entries in * the mapping. The entries are placed in @fbatch. find_get_entries() * takes a reference on any actual folios it returns. * * The entries have ascending indexes. The indices may not be consecutive * due to not-present entries or large folios. * * Any shadow entries of evicted folios, or swap entries from * shmem/tmpfs, are included in the returned array. * * Return: The number of entries which were found. */ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { indices[fbatch->nr] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; } rcu_read_unlock(); if (folio_batch_count(fbatch)) { unsigned long nr = 1; int idx = folio_batch_count(fbatch) - 1; folio = fbatch->folios[idx]; if (!xa_is_value(folio)) nr = folio_nr_pages(folio); *start = indices[idx] + nr; } return folio_batch_count(fbatch); } /** * find_lock_entries - Find a batch of pagecache entries. * @mapping: The address_space to search. * @start: The starting page cache index. * @end: The final page index (inclusive). * @fbatch: Where the resulting entries are placed. * @indices: The cache indices of the entries in @fbatch. * * find_lock_entries() will return a batch of entries from @mapping. * Swap, shadow and DAX entries are included. Folios are returned * locked and with an incremented refcount. Folios which are locked * by somebody else or under writeback are skipped. Folios which are * partially outside the range are not returned. * * The entries have ascending indexes. The indices may not be consecutive * due to not-present entries, large folios, folios which could not be * locked or folios under writeback. * * Return: The number of entries which were found. */ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { if (!xa_is_value(folio)) { if (folio->index < *start) goto put; if (folio_next_index(folio) - 1 > end) goto put; if (!folio_trylock(folio)) goto put; if (folio->mapping != mapping || folio_test_writeback(folio)) goto unlock; VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index), folio); } indices[fbatch->nr] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; continue; unlock: folio_unlock(folio); put: folio_put(folio); } rcu_read_unlock(); if (folio_batch_count(fbatch)) { unsigned long nr = 1; int idx = folio_batch_count(fbatch) - 1; folio = fbatch->folios[idx]; if (!xa_is_value(folio)) nr = folio_nr_pages(folio); *start = indices[idx] + nr; } return folio_batch_count(fbatch); } /** * filemap_get_folios - Get a batch of folios * @mapping: The address_space to search * @start: The starting page index * @end: The final page index (inclusive) * @fbatch: The batch to fill. * * Search for and return a batch of folios in the mapping starting at * index @start and up to index @end (inclusive). The folios are returned * in @fbatch with an elevated reference count. * * Return: The number of folios which were found. * We also update @start to index the next folio for the traversal. */ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch) { return filemap_get_folios_tag(mapping, start, end, XA_PRESENT, fbatch); } EXPORT_SYMBOL(filemap_get_folios); /** * filemap_get_folios_contig - Get a batch of contiguous folios * @mapping: The address_space to search * @start: The starting page index * @end: The final page index (inclusive) * @fbatch: The batch to fill * * filemap_get_folios_contig() works exactly like filemap_get_folios(), * except the returned folios are guaranteed to be contiguous. This may * not return all contiguous folios if the batch gets filled up. * * Return: The number of folios found. * Also update @start to be positioned for traversal of the next folio. */ unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, *start); unsigned long nr; struct folio *folio; rcu_read_lock(); for (folio = xas_load(&xas); folio && xas.xa_index <= end; folio = xas_next(&xas)) { if (xas_retry(&xas, folio)) continue; /* * If the entry has been swapped out, we can stop looking. * No current caller is looking for DAX entries. */ if (xa_is_value(folio)) goto update_start; if (!folio_try_get_rcu(folio)) goto retry; if (unlikely(folio != xas_reload(&xas))) goto put_folio; if (!folio_batch_add(fbatch, folio)) { nr = folio_nr_pages(folio); *start = folio->index + nr; goto out; } continue; put_folio: folio_put(folio); retry: xas_reset(&xas); } update_start: nr = folio_batch_count(fbatch); if (nr) { folio = fbatch->folios[nr - 1]; *start = folio_next_index(folio); } out: rcu_read_unlock(); return folio_batch_count(fbatch); } EXPORT_SYMBOL(filemap_get_folios_contig); /** * filemap_get_folios_tag - Get a batch of folios matching @tag * @mapping: The address_space to search * @start: The starting page index * @end: The final page index (inclusive) * @tag: The tag index * @fbatch: The batch to fill * * The first folio may start before @start; if it does, it will contain * @start. The final folio may extend beyond @end; if it does, it will * contain @end. The folios have ascending indices. There may be gaps * between the folios if there are indices which have no folio in the * page cache. If folios are added to or removed from the page cache * while this is running, they may or may not be found by this call. * Only returns folios that are tagged with @tag. * * Return: The number of folios found. * Also update @start to index the next folio for traversal. */ unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, tag)) != NULL) { /* * Shadow entries should never be tagged, but this iteration * is lockless so there is a window for page reclaim to evict * a page we saw tagged. Skip over it. */ if (xa_is_value(folio)) continue; if (!folio_batch_add(fbatch, folio)) { unsigned long nr = folio_nr_pages(folio); *start = folio->index + nr; goto out; } } /* * We come here when there is no page beyond @end. We take care to not * overflow the index @start as it confuses some of the callers. This * breaks the iteration when there is a page at index -1 but that is * already broke anyway. */ if (end == (pgoff_t)-1) *start = (pgoff_t)-1; else *start = end + 1; out: rcu_read_unlock(); return folio_batch_count(fbatch); } EXPORT_SYMBOL(filemap_get_folios_tag); /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: * * ---R__________________________________________B__________ * ^ reading here ^ bad block(assume 4k) * * read(R) => miss => readahead(R...B) => media error => frustrating retries * => failing the whole request => read(R) => read(R+1) => * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... * * It is going insane. Fix it by quickly scaling down the readahead size. */ static void shrink_readahead_size_eio(struct file_ra_state *ra) { ra->ra_pages /= 4; } /* * filemap_get_read_batch - Get a batch of folios for read * * Get a batch of folios which represent a contiguous range of bytes in * the file. No exceptional entries will be returned. If @index is in * the middle of a folio, the entire folio will be returned. The last * folio in the batch may have the readahead flag set or the uptodate flag * clear so that the caller can take the appropriate action. */ static void filemap_get_read_batch(struct address_space *mapping, pgoff_t index, pgoff_t max, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, index); struct folio *folio; rcu_read_lock(); for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { if (xas_retry(&xas, folio)) continue; if (xas.xa_index > max || xa_is_value(folio)) break; if (xa_is_sibling(folio)) break; if (!folio_try_get_rcu(folio)) goto retry; if (unlikely(folio != xas_reload(&xas))) goto put_folio; if (!folio_batch_add(fbatch, folio)) break; if (!folio_test_uptodate(folio)) break; if (folio_test_readahead(folio)) break; xas_advance(&xas, folio_next_index(folio) - 1); continue; put_folio: folio_put(folio); retry: xas_reset(&xas); } rcu_read_unlock(); } static int filemap_read_folio(struct file *file, filler_t filler, struct folio *folio) { bool workingset = folio_test_workingset(folio); unsigned long pflags; int error; /* * A previous I/O error may have been due to temporary failures, * eg. multipath errors. PG_error will be set again if read_folio * fails. */ folio_clear_error(folio); /* Start the actual read. The read will unlock the page. */ if (unlikely(workingset)) psi_memstall_enter(&pflags); error = filler(file, folio); if (unlikely(workingset)) psi_memstall_leave(&pflags); if (error) return error; error = folio_wait_locked_killable(folio); if (error) return error; if (folio_test_uptodate(folio)) return 0; if (file) shrink_readahead_size_eio(&file->f_ra); return -EIO; } static bool filemap_range_uptodate(struct address_space *mapping, loff_t pos, size_t count, struct folio *folio, bool need_uptodate) { if (folio_test_uptodate(folio)) return true; /* pipes can't handle partially uptodate pages */ if (need_uptodate) return false; if (!mapping->a_ops->is_partially_uptodate) return false; if (mapping->host->i_blkbits >= folio_shift(folio)) return false; if (folio_pos(folio) > pos) { count -= folio_pos(folio) - pos; pos = 0; } else { pos -= folio_pos(folio); } return mapping->a_ops->is_partially_uptodate(folio, pos, count); } static int filemap_update_page(struct kiocb *iocb, struct address_space *mapping, size_t count, struct folio *folio, bool need_uptodate) { int error; if (iocb->ki_flags & IOCB_NOWAIT) { if (!filemap_invalidate_trylock_shared(mapping)) return -EAGAIN; } else { filemap_invalidate_lock_shared(mapping); } if (!folio_trylock(folio)) { error = -EAGAIN; if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) goto unlock_mapping; if (!(iocb->ki_flags & IOCB_WAITQ)) { filemap_invalidate_unlock_shared(mapping); /* * This is where we usually end up waiting for a * previously submitted readahead to finish. */ folio_put_wait_locked(folio, TASK_KILLABLE); return AOP_TRUNCATED_PAGE; } error = __folio_lock_async(folio, iocb->ki_waitq); if (error) goto unlock_mapping; } error = AOP_TRUNCATED_PAGE; if (!folio->mapping) goto unlock; error = 0; if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio, need_uptodate)) goto unlock; error = -EAGAIN; if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) goto unlock; error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio, folio); goto unlock_mapping; unlock: folio_unlock(folio); unlock_mapping: filemap_invalidate_unlock_shared(mapping); if (error == AOP_TRUNCATED_PAGE) folio_put(folio); return error; } static int filemap_create_folio(struct file *file, struct address_space *mapping, pgoff_t index, struct folio_batch *fbatch) { struct folio *folio; int error; folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0); if (!folio) return -ENOMEM; /* * Protect against truncate / hole punch. Grabbing invalidate_lock * here assures we cannot instantiate and bring uptodate new * pagecache folios after evicting page cache during truncate * and before actually freeing blocks. Note that we could * release invalidate_lock after inserting the folio into * the page cache as the locked folio would then be enough to * synchronize with hole punching. But there are code paths * such as filemap_update_page() filling in partially uptodate * pages or ->readahead() that need to hold invalidate_lock * while mapping blocks for IO so let's hold the lock here as * well to keep locking rules simple. */ filemap_invalidate_lock_shared(mapping); error = filemap_add_folio(mapping, folio, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) error = AOP_TRUNCATED_PAGE; if (error) goto error; error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); if (error) goto error; filemap_invalidate_unlock_shared(mapping); folio_batch_add(fbatch, folio); return 0; error: filemap_invalidate_unlock_shared(mapping); folio_put(folio); return error; } static int filemap_readahead(struct kiocb *iocb, struct file *file, struct address_space *mapping, struct folio *folio, pgoff_t last_index) { DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index); if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; page_cache_async_ra(&ractl, folio, last_index - folio->index); return 0; } static int filemap_get_pages(struct kiocb *iocb, size_t count, struct folio_batch *fbatch, bool need_uptodate) { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index; struct folio *folio; int err = 0; /* "last_index" is the index of the page beyond the end of the read */ last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE); retry: if (fatal_signal_pending(current)) return -EINTR; filemap_get_read_batch(mapping, index, last_index - 1, fbatch); if (!folio_batch_count(fbatch)) { if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); filemap_get_read_batch(mapping, index, last_index - 1, fbatch); } if (!folio_batch_count(fbatch)) { if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) return -EAGAIN; err = filemap_create_folio(filp, mapping, iocb->ki_pos >> PAGE_SHIFT, fbatch); if (err == AOP_TRUNCATED_PAGE) goto retry; return err; } folio = fbatch->folios[folio_batch_count(fbatch) - 1]; if (folio_test_readahead(folio)) { err = filemap_readahead(iocb, filp, mapping, folio, last_index); if (err) goto err; } if (!folio_test_uptodate(folio)) { if ((iocb->ki_flags & IOCB_WAITQ) && folio_batch_count(fbatch) > 1) iocb->ki_flags |= IOCB_NOWAIT; err = filemap_update_page(iocb, mapping, count, folio, need_uptodate); if (err) goto err; } return 0; err: if (err < 0) folio_put(folio); if (likely(--fbatch->nr)) return 0; if (err == AOP_TRUNCATED_PAGE) goto retry; return err; } static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio) { unsigned int shift = folio_shift(folio); return (pos1 >> shift == pos2 >> shift); } /** * filemap_read - Read data from the page cache. * @iocb: The iocb to read. * @iter: Destination for the data. * @already_read: Number of bytes already read by the caller. * * Copies data from the page cache. If the data is not currently present, * uses the readahead and read_folio address_space operations to fetch it. * * Return: Total number of bytes copied, including those already read by * the caller. If an error happens before any bytes are copied, returns * a negative error number. */ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t already_read) { struct file *filp = iocb->ki_filp; struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; struct folio_batch fbatch; int i, error = 0; bool writably_mapped; loff_t isize, end_offset; loff_t last_pos = ra->prev_pos; if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) return 0; if (unlikely(!iov_iter_count(iter))) return 0; iov_iter_truncate(iter, inode->i_sb->s_maxbytes); folio_batch_init(&fbatch); do { cond_resched(); /* * If we've already successfully copied some data, then we * can no longer safely return -EIOCBQUEUED. Hence mark * an async read NOWAIT at that point. */ if ((iocb->ki_flags & IOCB_WAITQ) && already_read) iocb->ki_flags |= IOCB_NOWAIT; if (unlikely(iocb->ki_pos >= i_size_read(inode))) break; error = filemap_get_pages(iocb, iter->count, &fbatch, false); if (error < 0) break; /* * i_size must be checked after we know the pages are Uptodate. * * Checking i_size after the check allows us to calculate * the correct value for "nr", which means the zero-filled * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). */ isize = i_size_read(inode); if (unlikely(iocb->ki_pos >= isize)) goto put_folios; end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); /* * Once we start copying data, we don't want to be touching any * cachelines that might be contended: */ writably_mapped = mapping_writably_mapped(mapping); /* * When a read accesses the same folio several times, only * mark it as accessed the first time. */ if (!pos_same_folio(iocb->ki_pos, last_pos - 1, fbatch.folios[0])) folio_mark_accessed(fbatch.folios[0]); for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; size_t fsize = folio_size(folio); size_t offset = iocb->ki_pos & (fsize - 1); size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset); size_t copied; if (end_offset < folio_pos(folio)) break; if (i > 0) folio_mark_accessed(folio); /* * If users can be writing to this folio using arbitrary * virtual addresses, take care of potential aliasing * before reading the folio on the kernel side. */ if (writably_mapped) flush_dcache_folio(folio); copied = copy_folio_to_iter(folio, offset, bytes, iter); already_read += copied; iocb->ki_pos += copied; last_pos = iocb->ki_pos; if (copied < bytes) { error = -EFAULT; break; } } put_folios: for (i = 0; i < folio_batch_count(&fbatch); i++) folio_put(fbatch.folios[i]); folio_batch_init(&fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); ra->prev_pos = last_pos; return already_read ? already_read : error; } EXPORT_SYMBOL_GPL(filemap_read); int kiocb_write_and_wait(struct kiocb *iocb, size_t count) { struct address_space *mapping = iocb->ki_filp->f_mapping; loff_t pos = iocb->ki_pos; loff_t end = pos + count - 1; if (iocb->ki_flags & IOCB_NOWAIT) { if (filemap_range_needs_writeback(mapping, pos, end)) return -EAGAIN; return 0; } return filemap_write_and_wait_range(mapping, pos, end); } EXPORT_SYMBOL_GPL(kiocb_write_and_wait); int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) { struct address_space *mapping = iocb->ki_filp->f_mapping; loff_t pos = iocb->ki_pos; loff_t end = pos + count - 1; int ret; if (iocb->ki_flags & IOCB_NOWAIT) { /* we could block if there are any pages in the range */ if (filemap_range_has_page(mapping, pos, end)) return -EAGAIN; } else { ret = filemap_write_and_wait_range(mapping, pos, end); if (ret) return ret; } /* * After a write we want buffered reads to be sure to go to disk to get * the new data. We invalidate clean cached page from the region we're * about to write. We do this *before* the write so that we can return * without clobbering -EIOCBQUEUED from ->direct_IO(). */ return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); } EXPORT_SYMBOL_GPL(kiocb_invalidate_pages); /** * generic_file_read_iter - generic filesystem read routine * @iocb: kernel I/O control block * @iter: destination for the data read * * This is the "read_iter()" routine for all filesystems * that can use the page cache directly. * * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall * be returned when no data can be read without waiting for I/O requests * to complete; it doesn't prevent readahead. * * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O * requests shall be made for the read or for readahead. When no data * can be read, -EAGAIN shall be returned. When readahead would be * triggered, a partial, possibly empty read shall be returned. * * Return: * * number of bytes copied, even for partial reads * * negative error code (or 0 if IOCB_NOIO) if nothing was read */ ssize_t generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { size_t count = iov_iter_count(iter); ssize_t retval = 0; if (!count) return 0; /* skip atime */ if (iocb->ki_flags & IOCB_DIRECT) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; retval = kiocb_write_and_wait(iocb, count); if (retval < 0) return retval; file_accessed(file); retval = mapping->a_ops->direct_IO(iocb, iter); if (retval >= 0) { iocb->ki_pos += retval; count -= retval; } if (retval != -EIOCBQUEUED) iov_iter_revert(iter, count - iov_iter_count(iter)); /* * Btrfs can have a short DIO read if we encounter * compressed extents, so if there was an error, or if * we've already read everything we wanted to, or if * there was a short read because we hit EOF, go ahead * and return. Otherwise fallthrough to buffered io for * the rest of the read. Buffered reads will not work for * DAX files, so don't bother trying. */ if (retval < 0 || !count || IS_DAX(inode)) return retval; if (iocb->ki_pos >= i_size_read(inode)) return retval; } return filemap_read(iocb, iter, retval); } EXPORT_SYMBOL(generic_file_read_iter); /* * Splice subpages from a folio into a pipe. */ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, struct folio *folio, loff_t fpos, size_t size) { struct page *page; size_t spliced = 0, offset = offset_in_folio(folio, fpos); page = folio_page(folio, offset / PAGE_SIZE); size = min(size, folio_size(folio) - offset); offset %= PAGE_SIZE; while (spliced < size && !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { struct pipe_buffer *buf = pipe_head_buf(pipe); size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced); *buf = (struct pipe_buffer) { .ops = &page_cache_pipe_buf_ops, .page = page, .offset = offset, .len = part, }; folio_get(folio); pipe->head++; page++; spliced += part; offset = 0; } return spliced; } /** * filemap_splice_read - Splice data from a file's pagecache into a pipe * @in: The file to read from * @ppos: Pointer to the file position to read from * @pipe: The pipe to splice into * @len: The amount to splice * @flags: The SPLICE_F_* flags * * This function gets folios from a file's pagecache and splices them into the * pipe. Readahead will be called as necessary to fill more folios. This may * be used for blockdevs also. * * Return: On success, the number of bytes read will be returned and *@ppos * will be updated if appropriate; 0 will be returned if there is no more data * to be read; -EAGAIN will be returned if the pipe had no space, and some * other negative error code will be returned on error. A short read may occur * if the pipe has insufficient space, we reach the end of the data or we hit a * hole. */ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct folio_batch fbatch; struct kiocb iocb; size_t total_spliced = 0, used, npages; loff_t isize, end_offset; bool writably_mapped; int i, error = 0; if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes)) return 0; init_sync_kiocb(&iocb, in); iocb.ki_pos = *ppos; /* Work out how much data we can actually add into the pipe */ used = pipe_occupancy(pipe->head, pipe->tail); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); folio_batch_init(&fbatch); do { cond_resched(); if (*ppos >= i_size_read(in->f_mapping->host)) break; iocb.ki_pos = *ppos; error = filemap_get_pages(&iocb, len, &fbatch, true); if (error < 0) break; /* * i_size must be checked after we know the pages are Uptodate. * * Checking i_size after the check allows us to calculate * the correct value for "nr", which means the zero-filled * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). */ isize = i_size_read(in->f_mapping->host); if (unlikely(*ppos >= isize)) break; end_offset = min_t(loff_t, isize, *ppos + len); /* * Once we start copying data, we don't want to be touching any * cachelines that might be contended: */ writably_mapped = mapping_writably_mapped(in->f_mapping); for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; size_t n; if (folio_pos(folio) >= end_offset) goto out; folio_mark_accessed(folio); /* * If users can be writing to this folio using arbitrary * virtual addresses, take care of potential aliasing * before reading the folio on the kernel side. */ if (writably_mapped) flush_dcache_folio(folio); n = min_t(loff_t, len, isize - *ppos); n = splice_folio_into_pipe(pipe, folio, *ppos, n); if (!n) goto out; len -= n; total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) goto out; } folio_batch_release(&fbatch); } while (len); out: folio_batch_release(&fbatch); file_accessed(in); return total_spliced ? total_spliced : error; } EXPORT_SYMBOL(filemap_splice_read); static inline loff_t folio_seek_hole_data(struct xa_state *xas, struct address_space *mapping, struct folio *folio, loff_t start, loff_t end, bool seek_data) { const struct address_space_operations *ops = mapping->a_ops; size_t offset, bsz = i_blocksize(mapping->host); if (xa_is_value(folio) || folio_test_uptodate(folio)) return seek_data ? start : end; if (!ops->is_partially_uptodate) return seek_data ? end : start; xas_pause(xas); rcu_read_unlock(); folio_lock(folio); if (unlikely(folio->mapping != mapping)) goto unlock; offset = offset_in_folio(folio, start) & ~(bsz - 1); do { if (ops->is_partially_uptodate(folio, offset, bsz) == seek_data) break; start = (start + bsz) & ~(bsz - 1); offset += bsz; } while (offset < folio_size(folio)); unlock: folio_unlock(folio); rcu_read_lock(); return start; } static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio) { if (xa_is_value(folio)) return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index); return folio_size(folio); } /** * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache. * @mapping: Address space to search. * @start: First byte to consider. * @end: Limit of search (exclusive). * @whence: Either SEEK_HOLE or SEEK_DATA. * * If the page cache knows which blocks contain holes and which blocks * contain data, your filesystem can use this function to implement * SEEK_HOLE and SEEK_DATA. This is useful for filesystems which are * entirely memory-based such as tmpfs, and filesystems which support * unwritten extents. * * Return: The requested offset on success, or -ENXIO if @whence specifies * SEEK_DATA and there is no data after @start. There is an implicit hole * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start * and @end contain data. */ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, loff_t end, int whence) { XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT); pgoff_t max = (end - 1) >> PAGE_SHIFT; bool seek_data = (whence == SEEK_DATA); struct folio *folio; if (end <= start) return -ENXIO; rcu_read_lock(); while ((folio = find_get_entry(&xas, max, XA_PRESENT))) { loff_t pos = (u64)xas.xa_index << PAGE_SHIFT; size_t seek_size; if (start < pos) { if (!seek_data) goto unlock; start = pos; } seek_size = seek_folio_size(&xas, folio); pos = round_up((u64)pos + 1, seek_size); start = folio_seek_hole_data(&xas, mapping, folio, start, pos, seek_data); if (start < pos) goto unlock; if (start >= end) break; if (seek_size > PAGE_SIZE) xas_set(&xas, pos >> PAGE_SHIFT); if (!xa_is_value(folio)) folio_put(folio); } if (seek_data) start = -ENXIO; unlock: rcu_read_unlock(); if (folio && !xa_is_value(folio)) folio_put(folio); if (start > end) return end; return start; } #ifdef CONFIG_MMU #define MMAP_LOTSAMISS (100) /* * lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock * @vmf - the vm_fault for this fault. * @folio - the folio to lock. * @fpin - the pointer to the file we may pin (or is already pinned). * * This works similar to lock_folio_or_retry in that it can drop the * mmap_lock. It differs in that it actually returns the folio locked * if it returns 1 and 0 if it couldn't lock the folio. If we did have * to drop the mmap_lock then fpin will point to the pinned file and * needs to be fput()'ed at a later point. */ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, struct file **fpin) { if (folio_trylock(folio)) return 1; /* * NOTE! This will make us return with VM_FAULT_RETRY, but with * the fault lock still held. That's how FAULT_FLAG_RETRY_NOWAIT * is supposed to work. We have way too many special cases.. */ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) return 0; *fpin = maybe_unlock_mmap_for_io(vmf, *fpin); if (vmf->flags & FAULT_FLAG_KILLABLE) { if (__folio_lock_killable(folio)) { /* * We didn't have the right flags to drop the * fault lock, but all fault_handlers only check * for fatal signals if we return VM_FAULT_RETRY, * so we need to drop the fault lock here and * return 0 if we don't have a fpin. */ if (*fpin == NULL) release_fault_lock(vmf); return 0; } } else __folio_lock(folio); return 1; } /* * Synchronous readahead happens when we don't even find a page in the page * cache at all. We don't want to perform IO under the mmap sem, so if we have * to drop the mmap sem we return the file that was pinned in order for us to do * that. If we didn't pin a file then we return NULL. The file that is * returned needs to be fput()'ed when we're done with it. */ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff); struct file *fpin = NULL; unsigned long vm_flags = vmf->vma->vm_flags; unsigned int mmap_miss; #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Use the readahead code, even if readahead is disabled */ if (vm_flags & VM_HUGEPAGE) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1); ra->size = HPAGE_PMD_NR; /* * Fetch two PMD folios, so we get the chance to actually * readahead, unless we've been told not to. */ if (!(vm_flags & VM_RAND_READ)) ra->size *= 2; ra->async_size = HPAGE_PMD_NR; page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER); return fpin; } #endif /* If we don't want any read-ahead, don't bother */ if (vm_flags & VM_RAND_READ) return fpin; if (!ra->ra_pages) return fpin; if (vm_flags & VM_SEQ_READ) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_sync_ra(&ractl, ra->ra_pages); return fpin; } /* Avoid banging the cache line if not needed */ mmap_miss = READ_ONCE(ra->mmap_miss); if (mmap_miss < MMAP_LOTSAMISS * 10) WRITE_ONCE(ra->mmap_miss, ++mmap_miss); /* * Do we miss much more than hit in this file? If so, * stop bothering with read-ahead. It will only hurt. */ if (mmap_miss > MMAP_LOTSAMISS) return fpin; /* * mmap read-around */ fpin = maybe_unlock_mmap_for_io(vmf, fpin); ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); ra->size = ra->ra_pages; ra->async_size = ra->ra_pages / 4; ractl._index = ra->start; page_cache_ra_order(&ractl, ra, 0); return fpin; } /* * Asynchronous readahead happens when we find the page and PG_readahead, * so we want to possibly extend the readahead further. We return the file that * was pinned if we have to drop the mmap_lock in order to do IO. */ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, struct folio *folio) { struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff); struct file *fpin = NULL; unsigned int mmap_miss; /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) return fpin; mmap_miss = READ_ONCE(ra->mmap_miss); if (mmap_miss) WRITE_ONCE(ra->mmap_miss, --mmap_miss); if (folio_test_readahead(folio)) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_async_ra(&ractl, folio, ra->ra_pages); } return fpin; } static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; pte_t *ptep; /* * We might have COW'ed a pagecache folio and might now have an mlocked * anon folio mapped. The original pagecache folio is not mlocked and * might have been evicted. During a read+clear/modify/write update of * the PTE, such as done in do_numa_page()/change_pte_range(), we * temporarily clear the PTE under PT lock and might detect it here as * "none" when not holding the PT lock. * * Not rechecking the PTE under PT lock could result in an unexpected * major fault in an mlock'ed region. Recheck only for this special * scenario while holding the PT lock, to not degrade non-mlocked * scenarios. Recheck the PTE without PT lock firstly, thereby reducing * the number of times we hold PT lock. */ if (!(vma->vm_flags & VM_LOCKED)) return 0; if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return 0; ptep = pte_offset_map(vmf->pmd, vmf->address); if (unlikely(!ptep)) return VM_FAULT_NOPAGE; if (unlikely(!pte_none(ptep_get_lockless(ptep)))) { ret = VM_FAULT_NOPAGE; } else { spin_lock(vmf->ptl); if (unlikely(!pte_none(ptep_get(ptep)))) ret = VM_FAULT_NOPAGE; spin_unlock(vmf->ptl); } pte_unmap(ptep); return ret; } /** * filemap_fault - read in file data for page fault handling * @vmf: struct vm_fault containing details of the fault * * filemap_fault() is invoked via the vma operations vector for a * mapped memory region to read in file data during a page fault. * * The goto's are kind of ugly, but this streamlines the normal case of having * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. * * vma->vm_mm->mmap_lock must be held on entry. * * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap(). * * If our return value does not have VM_FAULT_RETRY set, the mmap_lock * has not been released. * * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. * * Return: bitwise-OR of %VM_FAULT_ codes. */ vm_fault_t filemap_fault(struct vm_fault *vmf) { int error; struct file *file = vmf->vma->vm_file; struct file *fpin = NULL; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; pgoff_t max_idx, index = vmf->pgoff; struct folio *folio; vm_fault_t ret = 0; bool mapping_locked = false; max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(index >= max_idx)) return VM_FAULT_SIGBUS; /* * Do we have something in the page cache already? */ folio = filemap_get_folio(mapping, index); if (likely(!IS_ERR(folio))) { /* * We found the page, so try async readahead before waiting for * the lock. */ if (!(vmf->flags & FAULT_FLAG_TRIED)) fpin = do_async_mmap_readahead(vmf, folio); if (unlikely(!folio_test_uptodate(folio))) { filemap_invalidate_lock_shared(mapping); mapping_locked = true; } } else { ret = filemap_fault_recheck_pte_none(vmf); if (unlikely(ret)) return ret; /* No page in the page cache at all */ count_vm_event(PGMAJFAULT); count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; fpin = do_sync_mmap_readahead(vmf); retry_find: /* * See comment in filemap_create_folio() why we need * invalidate_lock */ if (!mapping_locked) { filemap_invalidate_lock_shared(mapping); mapping_locked = true; } folio = __filemap_get_folio(mapping, index, FGP_CREAT|FGP_FOR_MMAP, vmf->gfp_mask); if (IS_ERR(folio)) { if (fpin) goto out_retry; filemap_invalidate_unlock_shared(mapping); return VM_FAULT_OOM; } } if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin)) goto out_retry; /* Did it get truncated? */ if (unlikely(folio->mapping != mapping)) { folio_unlock(folio); folio_put(folio); goto retry_find; } VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); /* * We have a locked folio in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error, * or because readahead was otherwise unable to retrieve it. */ if (unlikely(!folio_test_uptodate(folio))) { /* * If the invalidate lock is not held, the folio was in cache * and uptodate and now it is not. Strange but possible since we * didn't hold the page lock all the time. Let's drop * everything, get the invalidate lock and try again. */ if (!mapping_locked) { folio_unlock(folio); folio_put(folio); goto retry_find; } /* * OK, the folio is really not uptodate. This can be because the * VMA has the VM_RAND_READ flag set, or because an error * arose. Let's read it in directly. */ goto page_not_uptodate; } /* * We've made it this far and we had to drop our mmap_lock, now is the * time to return to the upper layer and have it re-find the vma and * redo the fault. */ if (fpin) { folio_unlock(folio); goto out_retry; } if (mapping_locked) filemap_invalidate_unlock_shared(mapping); /* * Found the page and have a reference on it. * We must recheck i_size under page lock. */ max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(index >= max_idx)) { folio_unlock(folio); folio_put(folio); return VM_FAULT_SIGBUS; } vmf->page = folio_file_page(folio, index); return ret | VM_FAULT_LOCKED; page_not_uptodate: /* * Umm, take care of errors if the page isn't up-to-date. * Try to re-read it _once_. We do this synchronously, * because there really aren't any performance issues here * and we need to check for errors. */ fpin = maybe_unlock_mmap_for_io(vmf, fpin); error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); if (fpin) goto out_retry; folio_put(folio); if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; filemap_invalidate_unlock_shared(mapping); return VM_FAULT_SIGBUS; out_retry: /* * We dropped the mmap_lock, we need to return to the fault handler to * re-find the vma and come back and find our hopefully still populated * page. */ if (!IS_ERR(folio)) folio_put(folio); if (mapping_locked) filemap_invalidate_unlock_shared(mapping); if (fpin) fput(fpin); return ret | VM_FAULT_RETRY; } EXPORT_SYMBOL(filemap_fault); static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, pgoff_t start) { struct mm_struct *mm = vmf->vma->vm_mm; /* Huge page is mapped? No need to proceed. */ if (pmd_trans_huge(*vmf->pmd)) { folio_unlock(folio); folio_put(folio); return true; } if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) { struct page *page = folio_file_page(folio, start); vm_fault_t ret = do_set_pmd(vmf, page); if (!ret) { /* The page is mapped successfully, reference consumed. */ folio_unlock(folio); return true; } } if (pmd_none(*vmf->pmd) && vmf->prealloc_pte) pmd_install(mm, vmf->pmd, &vmf->prealloc_pte); return false; } static struct folio *next_uptodate_folio(struct xa_state *xas, struct address_space *mapping, pgoff_t end_pgoff) { struct folio *folio = xas_next_entry(xas, end_pgoff); unsigned long max_idx; do { if (!folio) return NULL; if (xas_retry(xas, folio)) continue; if (xa_is_value(folio)) continue; if (folio_test_locked(folio)) continue; if (!folio_try_get_rcu(folio)) continue; /* Has the page moved or been split? */ if (unlikely(folio != xas_reload(xas))) goto skip; if (!folio_test_uptodate(folio) || folio_test_readahead(folio)) goto skip; if (!folio_trylock(folio)) goto skip; if (folio->mapping != mapping) goto unlock; if (!folio_test_uptodate(folio)) goto unlock; max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); if (xas->xa_index >= max_idx) goto unlock; return folio; unlock: folio_unlock(folio); skip: folio_put(folio); } while ((folio = xas_next_entry(xas, end_pgoff)) != NULL); return NULL; } /* * Map page range [start_page, start_page + nr_pages) of folio. * start_page is gotten from start by folio_page(folio, start) */ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct folio *folio, unsigned long start, unsigned long addr, unsigned int nr_pages, unsigned long *rss, unsigned int *mmap_miss) { vm_fault_t ret = 0; struct page *page = folio_page(folio, start); unsigned int count = 0; pte_t *old_ptep = vmf->pte; do { if (PageHWPoison(page + count)) goto skip; /* * If there are too many folios that are recently evicted * in a file, they will probably continue to be evicted. * In such situation, read-ahead is only a waste of IO. * Don't decrease mmap_miss in this scenario to make sure * we can stop read-ahead. */ if (!folio_test_workingset(folio)) (*mmap_miss)++; /* * NOTE: If there're PTE markers, we'll leave them to be * handled in the specific fault path, and it'll prohibit the * fault-around logic. */ if (!pte_none(ptep_get(&vmf->pte[count]))) goto skip; count++; continue; skip: if (count) { set_pte_range(vmf, folio, page, count, addr); *rss += count; folio_ref_add(folio, count); if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; } count++; page += count; vmf->pte += count; addr += count * PAGE_SIZE; count = 0; } while (--nr_pages > 0); if (count) { set_pte_range(vmf, folio, page, count, addr); *rss += count; folio_ref_add(folio, count); if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; } vmf->pte = old_ptep; return ret; } static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, struct folio *folio, unsigned long addr, unsigned long *rss, unsigned int *mmap_miss) { vm_fault_t ret = 0; struct page *page = &folio->page; if (PageHWPoison(page)) return ret; /* See comment of filemap_map_folio_range() */ if (!folio_test_workingset(folio)) (*mmap_miss)++; /* * NOTE: If there're PTE markers, we'll leave them to be * handled in the specific fault path, and it'll prohibit * the fault-around logic. */ if (!pte_none(ptep_get(vmf->pte))) return ret; if (vmf->address == addr) ret = VM_FAULT_NOPAGE; set_pte_range(vmf, folio, page, 1, addr); (*rss)++; folio_ref_inc(folio); return ret; } vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { struct vm_area_struct *vma = vmf->vma; struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; unsigned long addr; XA_STATE(xas, &mapping->i_pages, start_pgoff); struct folio *folio; vm_fault_t ret = 0; unsigned long rss = 0; unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type; rcu_read_lock(); folio = next_uptodate_folio(&xas, mapping, end_pgoff); if (!folio) goto out; if (filemap_map_pmd(vmf, folio, start_pgoff)) { ret = VM_FAULT_NOPAGE; goto out; } addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) { folio_unlock(folio); folio_put(folio); goto out; } folio_type = mm_counter_file(folio); do { unsigned long end; addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT; vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; end = folio_next_index(folio) - 1; nr_pages = min(end, end_pgoff) - xas.xa_index + 1; if (!folio_test_large(folio)) ret |= filemap_map_order0_folio(vmf, folio, addr, &rss, &mmap_miss); else ret |= filemap_map_folio_range(vmf, folio, xas.xa_index - folio->index, addr, nr_pages, &rss, &mmap_miss); folio_unlock(folio); folio_put(folio); } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); add_mm_counter(vma->vm_mm, folio_type, rss); pte_unmap_unlock(vmf->pte, vmf->ptl); out: rcu_read_unlock(); mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss); if (mmap_miss >= mmap_miss_saved) WRITE_ONCE(file->f_ra.mmap_miss, 0); else WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss); return ret; } EXPORT_SYMBOL(filemap_map_pages); vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct folio *folio = page_folio(vmf->page); vm_fault_t ret = VM_FAULT_LOCKED; sb_start_pagefault(mapping->host->i_sb); file_update_time(vmf->vma->vm_file); folio_lock(folio); if (folio->mapping != mapping) { folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } /* * We mark the folio dirty already here so that when freeze is in * progress, we are guaranteed that writeback during freezing will * see the dirty folio and writeprotect it again. */ folio_mark_dirty(folio); folio_wait_stable(folio); out: sb_end_pagefault(mapping->host->i_sb); return ret; } const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, }; /* This is used for a general mmap of a disk file */ int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { struct address_space *mapping = file->f_mapping; if (!mapping->a_ops->read_folio) return -ENOEXEC; file_accessed(file); vma->vm_ops = &generic_file_vm_ops; return 0; } /* * This is for filesystems which do not implement ->writepage. */ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { if (vma_is_shared_maywrite(vma)) return -EINVAL; return generic_file_mmap(file, vma); } #else vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { return VM_FAULT_SIGBUS; } int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } #endif /* CONFIG_MMU */ EXPORT_SYMBOL(filemap_page_mkwrite); EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); static struct folio *do_read_cache_folio(struct address_space *mapping, pgoff_t index, filler_t filler, struct file *file, gfp_t gfp) { struct folio *folio; int err; if (!filler) filler = mapping->a_ops->read_folio; repeat: folio = filemap_get_folio(mapping, index); if (IS_ERR(folio)) { folio = filemap_alloc_folio(gfp, 0); if (!folio) return ERR_PTR(-ENOMEM); err = filemap_add_folio(mapping, folio, index, gfp); if (unlikely(err)) { folio_put(folio); if (err == -EEXIST) goto repeat; /* Presumably ENOMEM for xarray node */ return ERR_PTR(err); } goto filler; } if (folio_test_uptodate(folio)) goto out; if (!folio_trylock(folio)) { folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); goto repeat; } /* Folio was truncated from mapping */ if (!folio->mapping) { folio_unlock(folio); folio_put(folio); goto repeat; } /* Someone else locked and filled the page in a very small window */ if (folio_test_uptodate(folio)) { folio_unlock(folio); goto out; } filler: err = filemap_read_folio(file, filler, folio); if (err) { folio_put(folio); if (err == AOP_TRUNCATED_PAGE) goto repeat; return ERR_PTR(err); } out: folio_mark_accessed(folio); return folio; } /** * read_cache_folio - Read into page cache, fill it if needed. * @mapping: The address_space to read from. * @index: The index to read. * @filler: Function to perform the read, or NULL to use aops->read_folio(). * @file: Passed to filler function, may be NULL if not required. * * Read one page into the page cache. If it succeeds, the folio returned * will contain @index, but it may not be the first page of the folio. * * If the filler function returns an error, it will be returned to the * caller. * * Context: May sleep. Expects mapping->invalidate_lock to be held. * Return: An uptodate folio on success, ERR_PTR() on failure. */ struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index, filler_t filler, struct file *file) { return do_read_cache_folio(mapping, index, filler, file, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(read_cache_folio); /** * mapping_read_folio_gfp - Read into page cache, using specified allocation flags. * @mapping: The address_space for the folio. * @index: The index that the allocated folio will contain. * @gfp: The page allocator flags to use if allocating. * * This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with * any new memory allocations done using the specified allocation flags. * * The most likely error from this function is EIO, but ENOMEM is * possible and so is EINTR. If ->read_folio returns another error, * that will be returned to the caller. * * The function expects mapping->invalidate_lock to be already held. * * Return: Uptodate folio on success, ERR_PTR() on failure. */ struct folio *mapping_read_folio_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { return do_read_cache_folio(mapping, index, NULL, NULL, gfp); } EXPORT_SYMBOL(mapping_read_folio_gfp); static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp) { struct folio *folio; folio = do_read_cache_folio(mapping, index, filler, file, gfp); if (IS_ERR(folio)) return &folio->page; return folio_file_page(folio, index); } struct page *read_cache_page(struct address_space *mapping, pgoff_t index, filler_t *filler, struct file *file) { return do_read_cache_page(mapping, index, filler, file, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(read_cache_page); /** * read_cache_page_gfp - read into page cache, using specified page allocation flags. * @mapping: the page's address_space * @index: the page index * @gfp: the page allocator flags to use if allocating * * This is the same as "read_mapping_page(mapping, index, NULL)", but with * any new page allocations done using the specified allocation flags. * * If the page does not get brought uptodate, return -EIO. * * The function expects mapping->invalidate_lock to be already held. * * Return: up to date page on success, ERR_PTR() on failure. */ struct page *read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { return do_read_cache_page(mapping, index, NULL, NULL, gfp); } EXPORT_SYMBOL(read_cache_page_gfp); /* * Warn about a page cache invalidation failure during a direct I/O write. */ static void dio_warn_stale_pagecache(struct file *filp) { static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); char pathname[128]; char *path; errseq_set(&filp->f_mapping->wb_err, -EIO); if (__ratelimit(&_rs)) { path = file_path(filp, pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n"); pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, current->comm); } } void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count) { struct address_space *mapping = iocb->ki_filp->f_mapping; if (mapping->nrpages && invalidate_inode_pages2_range(mapping, iocb->ki_pos >> PAGE_SHIFT, (iocb->ki_pos + count - 1) >> PAGE_SHIFT)) dio_warn_stale_pagecache(iocb->ki_filp); } ssize_t generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) { struct address_space *mapping = iocb->ki_filp->f_mapping; size_t write_len = iov_iter_count(from); ssize_t written; /* * If a page can not be invalidated, return 0 to fall back * to buffered write. */ written = kiocb_invalidate_pages(iocb, write_len); if (written) { if (written == -EBUSY) return 0; return written; } written = mapping->a_ops->direct_IO(iocb, from); /* * Finally, try again to invalidate clean pages which might have been * cached by non-direct readahead, or faulted in by get_user_pages() * if the source of the write was an mmap'ed region of the file * we're writing. Either one is a pretty crazy thing to do, * so we don't support it 100%. If this invalidation * fails, tough, the write still worked... * * Most of the time we do not need this since dio_complete() will do * the invalidation for us. However there are some file systems that * do not end up with dio_complete() being called, so let's not break * them by removing it completely. * * Noticeable example is a blkdev_direct_IO(). * * Skip invalidation for async writes or if mapping has no pages. */ if (written > 0) { struct inode *inode = mapping->host; loff_t pos = iocb->ki_pos; kiocb_invalidate_post_direct_write(iocb, written); pos += written; write_len -= written; if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { i_size_write(inode, pos); mark_inode_dirty(inode); } iocb->ki_pos = pos; } if (written != -EIOCBQUEUED) iov_iter_revert(from, write_len - iov_iter_count(from)); return written; } EXPORT_SYMBOL(generic_file_direct_write); ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) { struct file *file = iocb->ki_filp; loff_t pos = iocb->ki_pos; struct address_space *mapping = file->f_mapping; const struct address_space_operations *a_ops = mapping->a_ops; long status = 0; ssize_t written = 0; do { struct page *page; unsigned long offset; /* Offset into pagecache page */ unsigned long bytes; /* Bytes to write to page */ size_t copied; /* Bytes copied from user */ void *fsdata = NULL; offset = (pos & (PAGE_SIZE - 1)); bytes = min_t(unsigned long, PAGE_SIZE - offset, iov_iter_count(i)); again: /* * Bring in the user page that we will copy from _first_. * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. */ if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { status = -EFAULT; break; } if (fatal_signal_pending(current)) { status = -EINTR; break; } status = a_ops->write_begin(file, mapping, pos, bytes, &page, &fsdata); if (unlikely(status < 0)) break; if (mapping_writably_mapped(mapping)) flush_dcache_page(page); copied = copy_page_from_iter_atomic(page, offset, bytes, i); flush_dcache_page(page); status = a_ops->write_end(file, mapping, pos, bytes, copied, page, fsdata); if (unlikely(status != copied)) { iov_iter_revert(i, copied - max(status, 0L)); if (unlikely(status < 0)) break; } cond_resched(); if (unlikely(status == 0)) { /* * A short copy made ->write_end() reject the * thing entirely. Might be memory poisoning * halfway through, might be a race with munmap, * might be severe memory pressure. */ if (copied) bytes = copied; goto again; } pos += status; written += status; balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(i)); if (!written) return status; iocb->ki_pos += written; return written; } EXPORT_SYMBOL(generic_perform_write); /** * __generic_file_write_iter - write data to a file * @iocb: IO state structure (file, offset, etc.) * @from: iov_iter with data to write * * This function does all the work needed for actually writing data to a * file. It does all basic checks, removes SUID from the file, updates * modification times and calls proper subroutines depending on whether we * do direct IO or a standard buffered write. * * It expects i_rwsem to be grabbed unless we work on a block device or similar * object which does not need locking at all. * * This function does *not* take care of syncing data in case of O_SYNC write. * A caller has to handle it. This is mainly due to the fact that we want to * avoid syncing under i_rwsem. * * Return: * * number of bytes written, even for truncated writes * * negative error code if no data has been written at all */ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t ret; ret = file_remove_privs(file); if (ret) return ret; ret = file_update_time(file); if (ret) return ret; if (iocb->ki_flags & IOCB_DIRECT) { ret = generic_file_direct_write(iocb, from); /* * If the write stopped short of completing, fall back to * buffered writes. Some filesystems do this for writes to * holes, for example. For DAX files, a buffered write will * not succeed (even if it did, DAX does not handle dirty * page-cache pages correctly). */ if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode)) return ret; return direct_write_fallback(iocb, from, ret, generic_perform_write(iocb, from)); } return generic_perform_write(iocb, from); } EXPORT_SYMBOL(__generic_file_write_iter); /** * generic_file_write_iter - write data to a file * @iocb: IO state structure * @from: iov_iter with data to write * * This is a wrapper around __generic_file_write_iter() to be used by most * filesystems. It takes care of syncing the file in case of O_SYNC file * and acquires i_rwsem as needed. * Return: * * negative error code if no data has been written at all of * vfs_fsync_range() failed for a synchronous write * * number of bytes written, even for truncated writes */ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) ret = __generic_file_write_iter(iocb, from); inode_unlock(inode); if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; } EXPORT_SYMBOL(generic_file_write_iter); /** * filemap_release_folio() - Release fs-specific metadata on a folio. * @folio: The folio which the kernel is trying to free. * @gfp: Memory allocation flags (and I/O mode). * * The address_space is trying to release any data attached to a folio * (presumably at folio->private). * * This will also be called if the private_2 flag is set on a page, * indicating that the folio has other metadata associated with it. * * The @gfp argument specifies whether I/O may be performed to release * this page (__GFP_IO), and whether the call may block * (__GFP_RECLAIM & __GFP_FS). * * Return: %true if the release was successful, otherwise %false. */ bool filemap_release_folio(struct folio *folio, gfp_t gfp) { struct address_space * const mapping = folio->mapping; BUG_ON(!folio_test_locked(folio)); if (!folio_needs_release(folio)) return true; if (folio_test_writeback(folio)) return false; if (mapping && mapping->a_ops->release_folio) return mapping->a_ops->release_folio(folio, gfp); return try_to_free_buffers(folio); } EXPORT_SYMBOL(filemap_release_folio); /** * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache * @inode: The inode to flush * @flush: Set to write back rather than simply invalidate. * @start: First byte to in range. * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start * onwards. * * Invalidate all the folios on an inode that contribute to the specified * range, possibly writing them back first. Whilst the operation is * undertaken, the invalidate lock is held to prevent new folios from being * installed. */ int filemap_invalidate_inode(struct inode *inode, bool flush, loff_t start, loff_t end) { struct address_space *mapping = inode->i_mapping; pgoff_t first = start >> PAGE_SHIFT; pgoff_t last = end >> PAGE_SHIFT; pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1; if (!mapping || !mapping->nrpages || end < start) goto out; /* Prevent new folios from being added to the inode. */ filemap_invalidate_lock(mapping); if (!mapping->nrpages) goto unlock; unmap_mapping_pages(mapping, first, nr, false); /* Write back the data if we're asked to. */ if (flush) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = start, .range_end = end, }; filemap_fdatawrite_wbc(mapping, &wbc); } /* Wait for writeback to complete on all folios and discard. */ truncate_inode_pages_range(mapping, start, end); unlock: filemap_invalidate_unlock(mapping); out: return filemap_check_errors(mapping); } EXPORT_SYMBOL_GPL(filemap_invalidate_inode); #ifdef CONFIG_CACHESTAT_SYSCALL /** * filemap_cachestat() - compute the page cache statistics of a mapping * @mapping: The mapping to compute the statistics for. * @first_index: The starting page cache index. * @last_index: The final page index (inclusive). * @cs: the cachestat struct to write the result to. * * This will query the page cache statistics of a mapping in the * page range of [first_index, last_index] (inclusive). The statistics * queried include: number of dirty pages, number of pages marked for * writeback, and the number of (recently) evicted pages. */ static void filemap_cachestat(struct address_space *mapping, pgoff_t first_index, pgoff_t last_index, struct cachestat *cs) { XA_STATE(xas, &mapping->i_pages, first_index); struct folio *folio; rcu_read_lock(); xas_for_each(&xas, folio, last_index) { int order; unsigned long nr_pages; pgoff_t folio_first_index, folio_last_index; /* * Don't deref the folio. It is not pinned, and might * get freed (and reused) underneath us. * * We *could* pin it, but that would be expensive for * what should be a fast and lightweight syscall. * * Instead, derive all information of interest from * the rcu-protected xarray. */ if (xas_retry(&xas, folio)) continue; order = xa_get_order(xas.xa, xas.xa_index); nr_pages = 1 << order; folio_first_index = round_down(xas.xa_index, 1 << order); folio_last_index = folio_first_index + nr_pages - 1; /* Folios might straddle the range boundaries, only count covered pages */ if (folio_first_index < first_index) nr_pages -= first_index - folio_first_index; if (folio_last_index > last_index) nr_pages -= folio_last_index - last_index; if (xa_is_value(folio)) { /* page is evicted */ void *shadow = (void *)folio; bool workingset; /* not used */ cs->nr_evicted += nr_pages; #ifdef CONFIG_SWAP /* implies CONFIG_MMU */ if (shmem_mapping(mapping)) { /* shmem file - in swap cache */ swp_entry_t swp = radix_to_swp_entry(folio); /* swapin error results in poisoned entry */ if (non_swap_entry(swp)) goto resched; /* * Getting a swap entry from the shmem * inode means we beat * shmem_unuse(). rcu_read_lock() * ensures swapoff waits for us before * freeing the swapper space. However, * we can race with swapping and * invalidation, so there might not be * a shadow in the swapcache (yet). */ shadow = get_shadow_from_swap_cache(swp); if (!shadow) goto resched; } #endif if (workingset_test_recent(shadow, true, &workingset)) cs->nr_recently_evicted += nr_pages; goto resched; } /* page is in cache */ cs->nr_cache += nr_pages; if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY)) cs->nr_dirty += nr_pages; if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK)) cs->nr_writeback += nr_pages; resched: if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); } /* * The cachestat(2) system call. * * cachestat() returns the page cache statistics of a file in the * bytes range specified by `off` and `len`: number of cached pages, * number of dirty pages, number of pages marked for writeback, * number of evicted pages, and number of recently evicted pages. * * An evicted page is a page that is previously in the page cache * but has been evicted since. A page is recently evicted if its last * eviction was recent enough that its reentry to the cache would * indicate that it is actively being used by the system, and that * there is memory pressure on the system. * * `off` and `len` must be non-negative integers. If `len` > 0, * the queried range is [`off`, `off` + `len`]. If `len` == 0, * we will query in the range from `off` to the end of the file. * * The `flags` argument is unused for now, but is included for future * extensibility. User should pass 0 (i.e no flag specified). * * Currently, hugetlbfs is not supported. * * Because the status of a page can change after cachestat() checks it * but before it returns to the application, the returned values may * contain stale information. * * return values: * zero - success * -EFAULT - cstat or cstat_range points to an illegal address * -EINVAL - invalid flags * -EBADF - invalid file descriptor * -EOPNOTSUPP - file descriptor is of a hugetlbfs file */ SYSCALL_DEFINE4(cachestat, unsigned int, fd, struct cachestat_range __user *, cstat_range, struct cachestat __user *, cstat, unsigned int, flags) { struct fd f = fdget(fd); struct address_space *mapping; struct cachestat_range csr; struct cachestat cs; pgoff_t first_index, last_index; if (!f.file) return -EBADF; if (copy_from_user(&csr, cstat_range, sizeof(struct cachestat_range))) { fdput(f); return -EFAULT; } /* hugetlbfs is not supported */ if (is_file_hugepages(f.file)) { fdput(f); return -EOPNOTSUPP; } if (flags != 0) { fdput(f); return -EINVAL; } first_index = csr.off >> PAGE_SHIFT; last_index = csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT; memset(&cs, 0, sizeof(struct cachestat)); mapping = f.file->f_mapping; filemap_cachestat(mapping, first_index, last_index, &cs); fdput(f); if (copy_to_user(cstat, &cs, sizeof(struct cachestat))) return -EFAULT; return 0; } #endif /* CONFIG_CACHESTAT_SYSCALL */ |
| 3 11 4 3 3 3 3 3 3 8 7 1 1 1 3 1 1 1 1 1 3 6 4 4 6 4 4 4 2 4 2 4 4 4 1 3 7 7 7 3 6 6 4 2 2 2 2 5 7 2 2 2 2 1 2 2 1 2 2 3 3 1 3 1 1 1 1 1 1 1 1 3 13 13 12 2 2 2 3 2 1 3 14 3 4 13 3 3 2 3 4 3 3 2 1 1 10 10 9 8 8 8 8 2 8 8 1 8 8 8 2 1 1 1 1 6 7 4 6 6 8 8 3 10 11 11 11 7 4 4 2 2 2 2 1 4 1 7 3 2 1 1 1 8 8 8 1 1 7 8 8 2 1 2 7 7 4 7 3 7 2 2 1 1 1 1 6 5 5 5 5 5 5 5 4 1 1 1 1 6 4 4 2 7 8 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 | // SPDX-License-Identifier: GPL-2.0 /* * linux/ipc/msg.c * Copyright (C) 1992 Krishna Balasubramanian * * Removed all the remaining kerneld mess * Catch the -EFAULT stuff properly * Use GFP_KERNEL for messages as in 1.2 * Fixed up the unchecked user space derefs * Copyright (C) 1998 Alan Cox & Andi Kleen * * /proc/sysvipc/msg support (c) 1999 Dragos Acostachioaie <dragos@iname.com> * * mostly rewritten, threaded and wake-one semantics added * MSGMAX limit removed, sysctl's added * (c) 1999 Manfred Spraul <manfred@colorfullife.com> * * support for audit of ipc object properties and permission changes * Dustin Kirkland <dustin.kirkland@us.ibm.com> * * namespaces support * OpenVZ, SWsoft Inc. * Pavel Emelianov <xemul@openvz.org> */ #include <linux/capability.h> #include <linux/msg.h> #include <linux/spinlock.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/proc_fs.h> #include <linux/list.h> #include <linux/security.h> #include <linux/sched/wake_q.h> #include <linux/syscalls.h> #include <linux/audit.h> #include <linux/seq_file.h> #include <linux/rwsem.h> #include <linux/nsproxy.h> #include <linux/ipc_namespace.h> #include <linux/rhashtable.h> #include <linux/percpu_counter.h> #include <asm/current.h> #include <linux/uaccess.h> #include "util.h" /* one msq_queue structure for each present queue on the system */ struct msg_queue { struct kern_ipc_perm q_perm; time64_t q_stime; /* last msgsnd time */ time64_t q_rtime; /* last msgrcv time */ time64_t q_ctime; /* last change time */ unsigned long q_cbytes; /* current number of bytes on queue */ unsigned long q_qnum; /* number of messages in queue */ unsigned long q_qbytes; /* max number of bytes on queue */ struct pid *q_lspid; /* pid of last msgsnd */ struct pid *q_lrpid; /* last receive pid */ struct list_head q_messages; struct list_head q_receivers; struct list_head q_senders; } __randomize_layout; /* * MSG_BARRIER Locking: * * Similar to the optimization used in ipc/mqueue.c, one syscall return path * does not acquire any locks when it sees that a message exists in * msg_receiver.r_msg. Therefore r_msg is set using smp_store_release() * and accessed using READ_ONCE()+smp_acquire__after_ctrl_dep(). In addition, * wake_q_add_safe() is used. See ipc/mqueue.c for more details */ /* one msg_receiver structure for each sleeping receiver */ struct msg_receiver { struct list_head r_list; struct task_struct *r_tsk; int r_mode; long r_msgtype; long r_maxsize; struct msg_msg *r_msg; }; /* one msg_sender for each sleeping sender */ struct msg_sender { struct list_head list; struct task_struct *tsk; size_t msgsz; }; #define SEARCH_ANY 1 #define SEARCH_EQUAL 2 #define SEARCH_NOTEQUAL 3 #define SEARCH_LESSEQUAL 4 #define SEARCH_NUMBER 5 #define msg_ids(ns) ((ns)->ids[IPC_MSG_IDS]) static inline struct msg_queue *msq_obtain_object(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&msg_ids(ns), id); if (IS_ERR(ipcp)) return ERR_CAST(ipcp); return container_of(ipcp, struct msg_queue, q_perm); } static inline struct msg_queue *msq_obtain_object_check(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&msg_ids(ns), id); if (IS_ERR(ipcp)) return ERR_CAST(ipcp); return container_of(ipcp, struct msg_queue, q_perm); } static inline void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s) { ipc_rmid(&msg_ids(ns), &s->q_perm); } static void msg_rcu_free(struct rcu_head *head) { struct kern_ipc_perm *p = container_of(head, struct kern_ipc_perm, rcu); struct msg_queue *msq = container_of(p, struct msg_queue, q_perm); security_msg_queue_free(&msq->q_perm); kfree(msq); } /** * newque - Create a new msg queue * @ns: namespace * @params: ptr to the structure that contains the key and msgflg * * Called with msg_ids.rwsem held (writer) */ static int newque(struct ipc_namespace *ns, struct ipc_params *params) { struct msg_queue *msq; int retval; key_t key = params->key; int msgflg = params->flg; msq = kmalloc(sizeof(*msq), GFP_KERNEL_ACCOUNT); if (unlikely(!msq)) return -ENOMEM; msq->q_perm.mode = msgflg & S_IRWXUGO; msq->q_perm.key = key; msq->q_perm.security = NULL; retval = security_msg_queue_alloc(&msq->q_perm); if (retval) { kfree(msq); return retval; } msq->q_stime = msq->q_rtime = 0; msq->q_ctime = ktime_get_real_seconds(); msq->q_cbytes = msq->q_qnum = 0; msq->q_qbytes = ns->msg_ctlmnb; msq->q_lspid = msq->q_lrpid = NULL; INIT_LIST_HEAD(&msq->q_messages); INIT_LIST_HEAD(&msq->q_receivers); INIT_LIST_HEAD(&msq->q_senders); /* ipc_addid() locks msq upon success. */ retval = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni); if (retval < 0) { ipc_rcu_putref(&msq->q_perm, msg_rcu_free); return retval; } ipc_unlock_object(&msq->q_perm); rcu_read_unlock(); return msq->q_perm.id; } static inline bool msg_fits_inqueue(struct msg_queue *msq, size_t msgsz) { return msgsz + msq->q_cbytes <= msq->q_qbytes && 1 + msq->q_qnum <= msq->q_qbytes; } static inline void ss_add(struct msg_queue *msq, struct msg_sender *mss, size_t msgsz) { mss->tsk = current; mss->msgsz = msgsz; /* * No memory barrier required: we did ipc_lock_object(), * and the waker obtains that lock before calling wake_q_add(). */ __set_current_state(TASK_INTERRUPTIBLE); list_add_tail(&mss->list, &msq->q_senders); } static inline void ss_del(struct msg_sender *mss) { if (mss->list.next) list_del(&mss->list); } static void ss_wakeup(struct msg_queue *msq, struct wake_q_head *wake_q, bool kill) { struct msg_sender *mss, *t; struct task_struct *stop_tsk = NULL; struct list_head *h = &msq->q_senders; list_for_each_entry_safe(mss, t, h, list) { if (kill) mss->list.next = NULL; /* * Stop at the first task we don't wakeup, * we've already iterated the original * sender queue. */ else if (stop_tsk == mss->tsk) break; /* * We are not in an EIDRM scenario here, therefore * verify that we really need to wakeup the task. * To maintain current semantics and wakeup order, * move the sender to the tail on behalf of the * blocked task. */ else if (!msg_fits_inqueue(msq, mss->msgsz)) { if (!stop_tsk) stop_tsk = mss->tsk; list_move_tail(&mss->list, &msq->q_senders); continue; } wake_q_add(wake_q, mss->tsk); } } static void expunge_all(struct msg_queue *msq, int res, struct wake_q_head *wake_q) { struct msg_receiver *msr, *t; list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) { struct task_struct *r_tsk; r_tsk = get_task_struct(msr->r_tsk); /* see MSG_BARRIER for purpose/pairing */ smp_store_release(&msr->r_msg, ERR_PTR(res)); wake_q_add_safe(wake_q, r_tsk); } } /* * freeque() wakes up waiters on the sender and receiver waiting queue, * removes the message queue from message queue ID IDR, and cleans up all the * messages associated with this queue. * * msg_ids.rwsem (writer) and the spinlock for this message queue are held * before freeque() is called. msg_ids.rwsem remains locked on exit. */ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) __releases(RCU) __releases(&msq->q_perm) { struct msg_msg *msg, *t; struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm); DEFINE_WAKE_Q(wake_q); expunge_all(msq, -EIDRM, &wake_q); ss_wakeup(msq, &wake_q, true); msg_rmid(ns, msq); ipc_unlock_object(&msq->q_perm); wake_up_q(&wake_q); rcu_read_unlock(); list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) { percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1); free_msg(msg); } percpu_counter_sub_local(&ns->percpu_msg_bytes, msq->q_cbytes); ipc_update_pid(&msq->q_lspid, NULL); ipc_update_pid(&msq->q_lrpid, NULL); ipc_rcu_putref(&msq->q_perm, msg_rcu_free); } long ksys_msgget(key_t key, int msgflg) { struct ipc_namespace *ns; static const struct ipc_ops msg_ops = { .getnew = newque, .associate = security_msg_queue_associate, }; struct ipc_params msg_params; ns = current->nsproxy->ipc_ns; msg_params.key = key; msg_params.flg = msgflg; return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params); } SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg) { return ksys_msgget(key, msgflg); } static inline unsigned long copy_msqid_to_user(void __user *buf, struct msqid64_ds *in, int version) { switch (version) { case IPC_64: return copy_to_user(buf, in, sizeof(*in)); case IPC_OLD: { struct msqid_ds out; memset(&out, 0, sizeof(out)); ipc64_perm_to_ipc_perm(&in->msg_perm, &out.msg_perm); out.msg_stime = in->msg_stime; out.msg_rtime = in->msg_rtime; out.msg_ctime = in->msg_ctime; if (in->msg_cbytes > USHRT_MAX) out.msg_cbytes = USHRT_MAX; else out.msg_cbytes = in->msg_cbytes; out.msg_lcbytes = in->msg_cbytes; if (in->msg_qnum > USHRT_MAX) out.msg_qnum = USHRT_MAX; else out.msg_qnum = in->msg_qnum; if (in->msg_qbytes > USHRT_MAX) out.msg_qbytes = USHRT_MAX; else out.msg_qbytes = in->msg_qbytes; out.msg_lqbytes = in->msg_qbytes; out.msg_lspid = in->msg_lspid; out.msg_lrpid = in->msg_lrpid; return copy_to_user(buf, &out, sizeof(out)); } default: return -EINVAL; } } static inline unsigned long copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version) { switch (version) { case IPC_64: if (copy_from_user(out, buf, sizeof(*out))) return -EFAULT; return 0; case IPC_OLD: { struct msqid_ds tbuf_old; if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old))) return -EFAULT; out->msg_perm.uid = tbuf_old.msg_perm.uid; out->msg_perm.gid = tbuf_old.msg_perm.gid; out->msg_perm.mode = tbuf_old.msg_perm.mode; if (tbuf_old.msg_qbytes == 0) out->msg_qbytes = tbuf_old.msg_lqbytes; else out->msg_qbytes = tbuf_old.msg_qbytes; return 0; } default: return -EINVAL; } } /* * This function handles some msgctl commands which require the rwsem * to be held in write mode. * NOTE: no locks must be held, the rwsem is taken inside this function. */ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, struct ipc64_perm *perm, int msg_qbytes) { struct kern_ipc_perm *ipcp; struct msg_queue *msq; int err; down_write(&msg_ids(ns).rwsem); rcu_read_lock(); ipcp = ipcctl_obtain_check(ns, &msg_ids(ns), msqid, cmd, perm, msg_qbytes); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); goto out_unlock1; } msq = container_of(ipcp, struct msg_queue, q_perm); err = security_msg_queue_msgctl(&msq->q_perm, cmd); if (err) goto out_unlock1; switch (cmd) { case IPC_RMID: ipc_lock_object(&msq->q_perm); /* freeque unlocks the ipc object and rcu */ freeque(ns, ipcp); goto out_up; case IPC_SET: { DEFINE_WAKE_Q(wake_q); if (msg_qbytes > ns->msg_ctlmnb && !capable(CAP_SYS_RESOURCE)) { err = -EPERM; goto out_unlock1; } ipc_lock_object(&msq->q_perm); err = ipc_update_perm(perm, ipcp); if (err) goto out_unlock0; msq->q_qbytes = msg_qbytes; msq->q_ctime = ktime_get_real_seconds(); /* * Sleeping receivers might be excluded by * stricter permissions. */ expunge_all(msq, -EAGAIN, &wake_q); /* * Sleeping senders might be able to send * due to a larger queue size. */ ss_wakeup(msq, &wake_q, false); ipc_unlock_object(&msq->q_perm); wake_up_q(&wake_q); goto out_unlock1; } default: err = -EINVAL; goto out_unlock1; } out_unlock0: ipc_unlock_object(&msq->q_perm); out_unlock1: rcu_read_unlock(); out_up: up_write(&msg_ids(ns).rwsem); return err; } static int msgctl_info(struct ipc_namespace *ns, int msqid, int cmd, struct msginfo *msginfo) { int err; int max_idx; /* * We must not return kernel stack data. * due to padding, it's not enough * to set all member fields. */ err = security_msg_queue_msgctl(NULL, cmd); if (err) return err; memset(msginfo, 0, sizeof(*msginfo)); msginfo->msgmni = ns->msg_ctlmni; msginfo->msgmax = ns->msg_ctlmax; msginfo->msgmnb = ns->msg_ctlmnb; msginfo->msgssz = MSGSSZ; msginfo->msgseg = MSGSEG; down_read(&msg_ids(ns).rwsem); if (cmd == MSG_INFO) msginfo->msgpool = msg_ids(ns).in_use; max_idx = ipc_get_maxidx(&msg_ids(ns)); up_read(&msg_ids(ns).rwsem); if (cmd == MSG_INFO) { msginfo->msgmap = min_t(int, percpu_counter_sum(&ns->percpu_msg_hdrs), INT_MAX); msginfo->msgtql = min_t(int, percpu_counter_sum(&ns->percpu_msg_bytes), INT_MAX); } else { msginfo->msgmap = MSGMAP; msginfo->msgpool = MSGPOOL; msginfo->msgtql = MSGTQL; } return (max_idx < 0) ? 0 : max_idx; } static int msgctl_stat(struct ipc_namespace *ns, int msqid, int cmd, struct msqid64_ds *p) { struct msg_queue *msq; int err; memset(p, 0, sizeof(*p)); rcu_read_lock(); if (cmd == MSG_STAT || cmd == MSG_STAT_ANY) { msq = msq_obtain_object(ns, msqid); if (IS_ERR(msq)) { err = PTR_ERR(msq); goto out_unlock; } } else { /* IPC_STAT */ msq = msq_obtain_object_check(ns, msqid); if (IS_ERR(msq)) { err = PTR_ERR(msq); goto out_unlock; } } /* see comment for SHM_STAT_ANY */ if (cmd == MSG_STAT_ANY) audit_ipc_obj(&msq->q_perm); else { err = -EACCES; if (ipcperms(ns, &msq->q_perm, S_IRUGO)) goto out_unlock; } err = security_msg_queue_msgctl(&msq->q_perm, cmd); if (err) goto out_unlock; ipc_lock_object(&msq->q_perm); if (!ipc_valid_object(&msq->q_perm)) { ipc_unlock_object(&msq->q_perm); err = -EIDRM; goto out_unlock; } kernel_to_ipc64_perm(&msq->q_perm, &p->msg_perm); p->msg_stime = msq->q_stime; p->msg_rtime = msq->q_rtime; p->msg_ctime = msq->q_ctime; #ifndef CONFIG_64BIT p->msg_stime_high = msq->q_stime >> 32; p->msg_rtime_high = msq->q_rtime >> 32; p->msg_ctime_high = msq->q_ctime >> 32; #endif p->msg_cbytes = msq->q_cbytes; p->msg_qnum = msq->q_qnum; p->msg_qbytes = msq->q_qbytes; p->msg_lspid = pid_vnr(msq->q_lspid); p->msg_lrpid = pid_vnr(msq->q_lrpid); if (cmd == IPC_STAT) { /* * As defined in SUS: * Return 0 on success */ err = 0; } else { /* * MSG_STAT and MSG_STAT_ANY (both Linux specific) * Return the full id, including the sequence number */ err = msq->q_perm.id; } ipc_unlock_object(&msq->q_perm); out_unlock: rcu_read_unlock(); return err; } static long ksys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf, int version) { struct ipc_namespace *ns; struct msqid64_ds msqid64; int err; if (msqid < 0 || cmd < 0) return -EINVAL; ns = current->nsproxy->ipc_ns; switch (cmd) { case IPC_INFO: case MSG_INFO: { struct msginfo msginfo; err = msgctl_info(ns, msqid, cmd, &msginfo); if (err < 0) return err; if (copy_to_user(buf, &msginfo, sizeof(struct msginfo))) err = -EFAULT; return err; } case MSG_STAT: /* msqid is an index rather than a msg queue id */ case MSG_STAT_ANY: case IPC_STAT: err = msgctl_stat(ns, msqid, cmd, &msqid64); if (err < 0) return err; if (copy_msqid_to_user(buf, &msqid64, version)) err = -EFAULT; return err; case IPC_SET: if (copy_msqid_from_user(&msqid64, buf, version)) return -EFAULT; return msgctl_down(ns, msqid, cmd, &msqid64.msg_perm, msqid64.msg_qbytes); case IPC_RMID: return msgctl_down(ns, msqid, cmd, NULL, 0); default: return -EINVAL; } } SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) { return ksys_msgctl(msqid, cmd, buf, IPC_64); } #ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION long ksys_old_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) { int version = ipc_parse_version(&cmd); return ksys_msgctl(msqid, cmd, buf, version); } SYSCALL_DEFINE3(old_msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) { return ksys_old_msgctl(msqid, cmd, buf); } #endif #ifdef CONFIG_COMPAT struct compat_msqid_ds { struct compat_ipc_perm msg_perm; compat_uptr_t msg_first; compat_uptr_t msg_last; old_time32_t msg_stime; old_time32_t msg_rtime; old_time32_t msg_ctime; compat_ulong_t msg_lcbytes; compat_ulong_t msg_lqbytes; unsigned short msg_cbytes; unsigned short msg_qnum; unsigned short msg_qbytes; compat_ipc_pid_t msg_lspid; compat_ipc_pid_t msg_lrpid; }; static int copy_compat_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version) { memset(out, 0, sizeof(*out)); if (version == IPC_64) { struct compat_msqid64_ds __user *p = buf; if (get_compat_ipc64_perm(&out->msg_perm, &p->msg_perm)) return -EFAULT; if (get_user(out->msg_qbytes, &p->msg_qbytes)) return -EFAULT; } else { struct compat_msqid_ds __user *p = buf; if (get_compat_ipc_perm(&out->msg_perm, &p->msg_perm)) return -EFAULT; if (get_user(out->msg_qbytes, &p->msg_qbytes)) return -EFAULT; } return 0; } static int copy_compat_msqid_to_user(void __user *buf, struct msqid64_ds *in, int version) { if (version == IPC_64) { struct compat_msqid64_ds v; memset(&v, 0, sizeof(v)); to_compat_ipc64_perm(&v.msg_perm, &in->msg_perm); v.msg_stime = lower_32_bits(in->msg_stime); v.msg_stime_high = upper_32_bits(in->msg_stime); v.msg_rtime = lower_32_bits(in->msg_rtime); v.msg_rtime_high = upper_32_bits(in->msg_rtime); v.msg_ctime = lower_32_bits(in->msg_ctime); v.msg_ctime_high = upper_32_bits(in->msg_ctime); v.msg_cbytes = in->msg_cbytes; v.msg_qnum = in->msg_qnum; v.msg_qbytes = in->msg_qbytes; v.msg_lspid = in->msg_lspid; v.msg_lrpid = in->msg_lrpid; return copy_to_user(buf, &v, sizeof(v)); } else { struct compat_msqid_ds v; memset(&v, 0, sizeof(v)); to_compat_ipc_perm(&v.msg_perm, &in->msg_perm); v.msg_stime = in->msg_stime; v.msg_rtime = in->msg_rtime; v.msg_ctime = in->msg_ctime; v.msg_cbytes = in->msg_cbytes; v.msg_qnum = in->msg_qnum; v.msg_qbytes = in->msg_qbytes; v.msg_lspid = in->msg_lspid; v.msg_lrpid = in->msg_lrpid; return copy_to_user(buf, &v, sizeof(v)); } } static long compat_ksys_msgctl(int msqid, int cmd, void __user *uptr, int version) { struct ipc_namespace *ns; int err; struct msqid64_ds msqid64; ns = current->nsproxy->ipc_ns; if (msqid < 0 || cmd < 0) return -EINVAL; switch (cmd & (~IPC_64)) { case IPC_INFO: case MSG_INFO: { struct msginfo msginfo; err = msgctl_info(ns, msqid, cmd, &msginfo); if (err < 0) return err; if (copy_to_user(uptr, &msginfo, sizeof(struct msginfo))) err = -EFAULT; return err; } case IPC_STAT: case MSG_STAT: case MSG_STAT_ANY: err = msgctl_stat(ns, msqid, cmd, &msqid64); if (err < 0) return err; if (copy_compat_msqid_to_user(uptr, &msqid64, version)) err = -EFAULT; return err; case IPC_SET: if (copy_compat_msqid_from_user(&msqid64, uptr, version)) return -EFAULT; return msgctl_down(ns, msqid, cmd, &msqid64.msg_perm, msqid64.msg_qbytes); case IPC_RMID: return msgctl_down(ns, msqid, cmd, NULL, 0); default: return -EINVAL; } } COMPAT_SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, void __user *, uptr) { return compat_ksys_msgctl(msqid, cmd, uptr, IPC_64); } #ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION long compat_ksys_old_msgctl(int msqid, int cmd, void __user *uptr) { int version = compat_ipc_parse_version(&cmd); return compat_ksys_msgctl(msqid, cmd, uptr, version); } COMPAT_SYSCALL_DEFINE3(old_msgctl, int, msqid, int, cmd, void __user *, uptr) { return compat_ksys_old_msgctl(msqid, cmd, uptr); } #endif #endif static int testmsg(struct msg_msg *msg, long type, int mode) { switch (mode) { case SEARCH_ANY: case SEARCH_NUMBER: return 1; case SEARCH_LESSEQUAL: if (msg->m_type <= type) return 1; break; case SEARCH_EQUAL: if (msg->m_type == type) return 1; break; case SEARCH_NOTEQUAL: if (msg->m_type != type) return 1; break; } return 0; } static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg, struct wake_q_head *wake_q) { struct msg_receiver *msr, *t; list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) { if (testmsg(msg, msr->r_msgtype, msr->r_mode) && !security_msg_queue_msgrcv(&msq->q_perm, msg, msr->r_tsk, msr->r_msgtype, msr->r_mode)) { list_del(&msr->r_list); if (msr->r_maxsize < msg->m_ts) { wake_q_add(wake_q, msr->r_tsk); /* See expunge_all regarding memory barrier */ smp_store_release(&msr->r_msg, ERR_PTR(-E2BIG)); } else { ipc_update_pid(&msq->q_lrpid, task_pid(msr->r_tsk)); msq->q_rtime = ktime_get_real_seconds(); wake_q_add(wake_q, msr->r_tsk); /* See expunge_all regarding memory barrier */ smp_store_release(&msr->r_msg, msg); return 1; } } } return 0; } static long do_msgsnd(int msqid, long mtype, void __user *mtext, size_t msgsz, int msgflg) { struct msg_queue *msq; struct msg_msg *msg; int err; struct ipc_namespace *ns; DEFINE_WAKE_Q(wake_q); ns = current->nsproxy->ipc_ns; if (msgsz > ns->msg_ctlmax || (long) msgsz < 0 || msqid < 0) return -EINVAL; if (mtype < 1) return -EINVAL; msg = load_msg(mtext, msgsz); if (IS_ERR(msg)) return PTR_ERR(msg); msg->m_type = mtype; msg->m_ts = msgsz; rcu_read_lock(); msq = msq_obtain_object_check(ns, msqid); if (IS_ERR(msq)) { err = PTR_ERR(msq); goto out_unlock1; } ipc_lock_object(&msq->q_perm); for (;;) { struct msg_sender s; err = -EACCES; if (ipcperms(ns, &msq->q_perm, S_IWUGO)) goto out_unlock0; /* raced with RMID? */ if (!ipc_valid_object(&msq->q_perm)) { err = -EIDRM; goto out_unlock0; } err = security_msg_queue_msgsnd(&msq->q_perm, msg, msgflg); if (err) goto out_unlock0; if (msg_fits_inqueue(msq, msgsz)) break; /* queue full, wait: */ if (msgflg & IPC_NOWAIT) { err = -EAGAIN; goto out_unlock0; } /* enqueue the sender and prepare to block */ ss_add(msq, &s, msgsz); if (!ipc_rcu_getref(&msq->q_perm)) { err = -EIDRM; goto out_unlock0; } ipc_unlock_object(&msq->q_perm); rcu_read_unlock(); schedule(); rcu_read_lock(); ipc_lock_object(&msq->q_perm); ipc_rcu_putref(&msq->q_perm, msg_rcu_free); /* raced with RMID? */ if (!ipc_valid_object(&msq->q_perm)) { err = -EIDRM; goto out_unlock0; } ss_del(&s); if (signal_pending(current)) { err = -ERESTARTNOHAND; goto out_unlock0; } } ipc_update_pid(&msq->q_lspid, task_tgid(current)); msq->q_stime = ktime_get_real_seconds(); if (!pipelined_send(msq, msg, &wake_q)) { /* no one is waiting for this message, enqueue it */ list_add_tail(&msg->m_list, &msq->q_messages); msq->q_cbytes += msgsz; msq->q_qnum++; percpu_counter_add_local(&ns->percpu_msg_bytes, msgsz); percpu_counter_add_local(&ns->percpu_msg_hdrs, 1); } err = 0; msg = NULL; out_unlock0: ipc_unlock_object(&msq->q_perm); wake_up_q(&wake_q); out_unlock1: rcu_read_unlock(); if (msg != NULL) free_msg(msg); return err; } long ksys_msgsnd(int msqid, struct msgbuf __user *msgp, size_t msgsz, int msgflg) { long mtype; if (get_user(mtype, &msgp->mtype)) return -EFAULT; return do_msgsnd(msqid, mtype, msgp->mtext, msgsz, msgflg); } SYSCALL_DEFINE4(msgsnd, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz, int, msgflg) { return ksys_msgsnd(msqid, msgp, msgsz, msgflg); } #ifdef CONFIG_COMPAT struct compat_msgbuf { compat_long_t mtype; char mtext[1]; }; long compat_ksys_msgsnd(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz, int msgflg) { struct compat_msgbuf __user *up = compat_ptr(msgp); compat_long_t mtype; if (get_user(mtype, &up->mtype)) return -EFAULT; return do_msgsnd(msqid, mtype, up->mtext, (ssize_t)msgsz, msgflg); } COMPAT_SYSCALL_DEFINE4(msgsnd, int, msqid, compat_uptr_t, msgp, compat_ssize_t, msgsz, int, msgflg) { return compat_ksys_msgsnd(msqid, msgp, msgsz, msgflg); } #endif static inline int convert_mode(long *msgtyp, int msgflg) { if (msgflg & MSG_COPY) return SEARCH_NUMBER; /* * find message of correct type. * msgtyp = 0 => get first. * msgtyp > 0 => get first message of matching type. * msgtyp < 0 => get message with least type must be < abs(msgtype). */ if (*msgtyp == 0) return SEARCH_ANY; if (*msgtyp < 0) { if (*msgtyp == LONG_MIN) /* -LONG_MIN is undefined */ *msgtyp = LONG_MAX; else *msgtyp = -*msgtyp; return SEARCH_LESSEQUAL; } if (msgflg & MSG_EXCEPT) return SEARCH_NOTEQUAL; return SEARCH_EQUAL; } static long do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz) { struct msgbuf __user *msgp = dest; size_t msgsz; if (put_user(msg->m_type, &msgp->mtype)) return -EFAULT; msgsz = (bufsz > msg->m_ts) ? msg->m_ts : bufsz; if (store_msg(msgp->mtext, msg, msgsz)) return -EFAULT; return msgsz; } #ifdef CONFIG_CHECKPOINT_RESTORE /* * This function creates new kernel message structure, large enough to store * bufsz message bytes. */ static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz) { struct msg_msg *copy; /* * Create dummy message to copy real message to. */ copy = load_msg(buf, bufsz); if (!IS_ERR(copy)) copy->m_ts = bufsz; return copy; } static inline void free_copy(struct msg_msg *copy) { if (copy) free_msg(copy); } #else static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz) { return ERR_PTR(-ENOSYS); } static inline void free_copy(struct msg_msg *copy) { } #endif static struct msg_msg *find_msg(struct msg_queue *msq, long *msgtyp, int mode) { struct msg_msg *msg, *found = NULL; long count = 0; list_for_each_entry(msg, &msq->q_messages, m_list) { if (testmsg(msg, *msgtyp, mode) && !security_msg_queue_msgrcv(&msq->q_perm, msg, current, *msgtyp, mode)) { if (mode == SEARCH_LESSEQUAL && msg->m_type != 1) { *msgtyp = msg->m_type - 1; found = msg; } else if (mode == SEARCH_NUMBER) { if (*msgtyp == count) return msg; } else return msg; count++; } } return found ?: ERR_PTR(-EAGAIN); } static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgflg, long (*msg_handler)(void __user *, struct msg_msg *, size_t)) { int mode; struct msg_queue *msq; struct ipc_namespace *ns; struct msg_msg *msg, *copy = NULL; DEFINE_WAKE_Q(wake_q); ns = current->nsproxy->ipc_ns; if (msqid < 0 || (long) bufsz < 0) return -EINVAL; if (msgflg & MSG_COPY) { if ((msgflg & MSG_EXCEPT) || !(msgflg & IPC_NOWAIT)) return -EINVAL; copy = prepare_copy(buf, min_t(size_t, bufsz, ns->msg_ctlmax)); if (IS_ERR(copy)) return PTR_ERR(copy); } mode = convert_mode(&msgtyp, msgflg); rcu_read_lock(); msq = msq_obtain_object_check(ns, msqid); if (IS_ERR(msq)) { rcu_read_unlock(); free_copy(copy); return PTR_ERR(msq); } for (;;) { struct msg_receiver msr_d; msg = ERR_PTR(-EACCES); if (ipcperms(ns, &msq->q_perm, S_IRUGO)) goto out_unlock1; ipc_lock_object(&msq->q_perm); /* raced with RMID? */ if (!ipc_valid_object(&msq->q_perm)) { msg = ERR_PTR(-EIDRM); goto out_unlock0; } msg = find_msg(msq, &msgtyp, mode); if (!IS_ERR(msg)) { /* * Found a suitable message. * Unlink it from the queue. */ if ((bufsz < msg->m_ts) && !(msgflg & MSG_NOERROR)) { msg = ERR_PTR(-E2BIG); goto out_unlock0; } /* * If we are copying, then do not unlink message and do * not update queue parameters. */ if (msgflg & MSG_COPY) { msg = copy_msg(msg, copy); goto out_unlock0; } list_del(&msg->m_list); msq->q_qnum--; msq->q_rtime = ktime_get_real_seconds(); ipc_update_pid(&msq->q_lrpid, task_tgid(current)); msq->q_cbytes -= msg->m_ts; percpu_counter_sub_local(&ns->percpu_msg_bytes, msg->m_ts); percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1); ss_wakeup(msq, &wake_q, false); goto out_unlock0; } /* No message waiting. Wait for a message */ if (msgflg & IPC_NOWAIT) { msg = ERR_PTR(-ENOMSG); goto out_unlock0; } list_add_tail(&msr_d.r_list, &msq->q_receivers); msr_d.r_tsk = current; msr_d.r_msgtype = msgtyp; msr_d.r_mode = mode; if (msgflg & MSG_NOERROR) msr_d.r_maxsize = INT_MAX; else msr_d.r_maxsize = bufsz; /* memory barrier not require due to ipc_lock_object() */ WRITE_ONCE(msr_d.r_msg, ERR_PTR(-EAGAIN)); /* memory barrier not required, we own ipc_lock_object() */ __set_current_state(TASK_INTERRUPTIBLE); ipc_unlock_object(&msq->q_perm); rcu_read_unlock(); schedule(); /* * Lockless receive, part 1: * We don't hold a reference to the queue and getting a * reference would defeat the idea of a lockless operation, * thus the code relies on rcu to guarantee the existence of * msq: * Prior to destruction, expunge_all(-EIRDM) changes r_msg. * Thus if r_msg is -EAGAIN, then the queue not yet destroyed. */ rcu_read_lock(); /* * Lockless receive, part 2: * The work in pipelined_send() and expunge_all(): * - Set pointer to message * - Queue the receiver task for later wakeup * - Wake up the process after the lock is dropped. * * Should the process wake up before this wakeup (due to a * signal) it will either see the message and continue ... */ msg = READ_ONCE(msr_d.r_msg); if (msg != ERR_PTR(-EAGAIN)) { /* see MSG_BARRIER for purpose/pairing */ smp_acquire__after_ctrl_dep(); goto out_unlock1; } /* * ... or see -EAGAIN, acquire the lock to check the message * again. */ ipc_lock_object(&msq->q_perm); msg = READ_ONCE(msr_d.r_msg); if (msg != ERR_PTR(-EAGAIN)) goto out_unlock0; list_del(&msr_d.r_list); if (signal_pending(current)) { msg = ERR_PTR(-ERESTARTNOHAND); goto out_unlock0; } ipc_unlock_object(&msq->q_perm); } out_unlock0: ipc_unlock_object(&msq->q_perm); wake_up_q(&wake_q); out_unlock1: rcu_read_unlock(); if (IS_ERR(msg)) { free_copy(copy); return PTR_ERR(msg); } bufsz = msg_handler(buf, msg, bufsz); free_msg(msg); return bufsz; } long ksys_msgrcv(int msqid, struct msgbuf __user *msgp, size_t msgsz, long msgtyp, int msgflg) { return do_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg, do_msg_fill); } SYSCALL_DEFINE5(msgrcv, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz, long, msgtyp, int, msgflg) { return ksys_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg); } #ifdef CONFIG_COMPAT static long compat_do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz) { struct compat_msgbuf __user *msgp = dest; size_t msgsz; if (put_user(msg->m_type, &msgp->mtype)) return -EFAULT; msgsz = (bufsz > msg->m_ts) ? msg->m_ts : bufsz; if (store_msg(msgp->mtext, msg, msgsz)) return -EFAULT; return msgsz; } long compat_ksys_msgrcv(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz, compat_long_t msgtyp, int msgflg) { return do_msgrcv(msqid, compat_ptr(msgp), (ssize_t)msgsz, (long)msgtyp, msgflg, compat_do_msg_fill); } COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp, compat_ssize_t, msgsz, compat_long_t, msgtyp, int, msgflg) { return compat_ksys_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg); } #endif int msg_init_ns(struct ipc_namespace *ns) { int ret; ns->msg_ctlmax = MSGMAX; ns->msg_ctlmnb = MSGMNB; ns->msg_ctlmni = MSGMNI; ret = percpu_counter_init(&ns->percpu_msg_bytes, 0, GFP_KERNEL); if (ret) goto fail_msg_bytes; ret = percpu_counter_init(&ns->percpu_msg_hdrs, 0, GFP_KERNEL); if (ret) goto fail_msg_hdrs; ipc_init_ids(&ns->ids[IPC_MSG_IDS]); return 0; fail_msg_hdrs: percpu_counter_destroy(&ns->percpu_msg_bytes); fail_msg_bytes: return ret; } #ifdef CONFIG_IPC_NS void msg_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &msg_ids(ns), freeque); idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht); percpu_counter_destroy(&ns->percpu_msg_bytes); percpu_counter_destroy(&ns->percpu_msg_hdrs); } #endif #ifdef CONFIG_PROC_FS static int sysvipc_msg_proc_show(struct seq_file *s, void *it) { struct pid_namespace *pid_ns = ipc_seq_pid_ns(s); struct user_namespace *user_ns = seq_user_ns(s); struct kern_ipc_perm *ipcp = it; struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm); seq_printf(s, "%10d %10d %4o %10lu %10lu %5u %5u %5u %5u %5u %5u %10llu %10llu %10llu\n", msq->q_perm.key, msq->q_perm.id, msq->q_perm.mode, msq->q_cbytes, msq->q_qnum, pid_nr_ns(msq->q_lspid, pid_ns), pid_nr_ns(msq->q_lrpid, pid_ns), from_kuid_munged(user_ns, msq->q_perm.uid), from_kgid_munged(user_ns, msq->q_perm.gid), from_kuid_munged(user_ns, msq->q_perm.cuid), from_kgid_munged(user_ns, msq->q_perm.cgid), msq->q_stime, msq->q_rtime, msq->q_ctime); return 0; } #endif void __init msg_init(void) { msg_init_ns(&init_ipc_ns); ipc_init_proc_interface("sysvipc/msg", " key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n", IPC_MSG_IDS, sysvipc_msg_proc_show); } |
| 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 | // SPDX-License-Identifier: GPL-2.0-or-later /* * ChaCha20-Poly1305 AEAD, RFC7539 * * Copyright (C) 2015 Martin Willi */ #include <crypto/internal/aead.h> #include <crypto/internal/hash.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <crypto/chacha.h> #include <crypto/poly1305.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> struct chachapoly_instance_ctx { struct crypto_skcipher_spawn chacha; struct crypto_ahash_spawn poly; unsigned int saltlen; }; struct chachapoly_ctx { struct crypto_skcipher *chacha; struct crypto_ahash *poly; /* key bytes we use for the ChaCha20 IV */ unsigned int saltlen; u8 salt[]; }; struct poly_req { /* zero byte padding for AD/ciphertext, as needed */ u8 pad[POLY1305_BLOCK_SIZE]; /* tail data with AD/ciphertext lengths */ struct { __le64 assoclen; __le64 cryptlen; } tail; struct scatterlist src[1]; struct ahash_request req; /* must be last member */ }; struct chacha_req { u8 iv[CHACHA_IV_SIZE]; struct scatterlist src[1]; struct skcipher_request req; /* must be last member */ }; struct chachapoly_req_ctx { struct scatterlist src[2]; struct scatterlist dst[2]; /* the key we generate for Poly1305 using Chacha20 */ u8 key[POLY1305_KEY_SIZE]; /* calculated Poly1305 tag */ u8 tag[POLY1305_DIGEST_SIZE]; /* length of data to en/decrypt, without ICV */ unsigned int cryptlen; /* Actual AD, excluding IV */ unsigned int assoclen; /* request flags, with MAY_SLEEP cleared if needed */ u32 flags; union { struct poly_req poly; struct chacha_req chacha; } u; }; static inline void async_done_continue(struct aead_request *req, int err, int (*cont)(struct aead_request *)) { if (!err) { struct chachapoly_req_ctx *rctx = aead_request_ctx(req); rctx->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; err = cont(req); } if (err != -EINPROGRESS && err != -EBUSY) aead_request_complete(req, err); } static void chacha_iv(u8 *iv, struct aead_request *req, u32 icb) { struct chachapoly_ctx *ctx = crypto_aead_ctx(crypto_aead_reqtfm(req)); __le32 leicb = cpu_to_le32(icb); memcpy(iv, &leicb, sizeof(leicb)); memcpy(iv + sizeof(leicb), ctx->salt, ctx->saltlen); memcpy(iv + sizeof(leicb) + ctx->saltlen, req->iv, CHACHA_IV_SIZE - sizeof(leicb) - ctx->saltlen); } static int poly_verify_tag(struct aead_request *req) { struct chachapoly_req_ctx *rctx = aead_request_ctx(req); u8 tag[sizeof(rctx->tag)]; scatterwalk_map_and_copy(tag, req->src, req->assoclen + rctx->cryptlen, sizeof(tag), 0); if (crypto_memneq(tag, rctx->tag, sizeof(tag))) return -EBADMSG; return 0; } static int poly_copy_tag(struct aead_request *req) { struct chachapoly_req_ctx *rctx = aead_request_ctx(req); scatterwalk_map_and_copy(rctx->tag, req->dst, req->assoclen + rctx->cryptlen, sizeof(rctx->tag), 1); return 0; } static void chacha_decrypt_done(void *data, int err) { async_done_continue(data, err, poly_verify_tag); } static int chacha_decrypt(struct aead_request *req) { struct chachapoly_ctx *ctx = crypto_aead_ctx(crypto_aead_reqtfm(req)); struct chachapoly_req_ctx *rctx = aead_request_ctx(req); struct chacha_req *creq = &rctx->u.chacha; struct scatterlist *src, *dst; int err; if (rctx->cryptlen == 0) goto skip; chacha_iv(creq->iv, req, 1); src = scatterwalk_ffwd(rctx->src, req->src, req->assoclen); dst = src; if (req->src != req->dst) dst = scatterwalk_ffwd(rctx->dst, req->dst, req->assoclen); skcipher_request_set_callback(&creq->req, rctx->flags, chacha_decrypt_done, req); skcipher_request_set_tfm(&creq->req, ctx->chacha); skcipher_request_set_crypt(&creq->req, src, dst, rctx->cryptlen, creq->iv); err = crypto_skcipher_decrypt(&creq->req); if (err) return err; skip: return poly_verify_tag(req); } static int poly_tail_continue(struct aead_request *req) { struct chachapoly_req_ctx *rctx = aead_request_ctx(req); if (rctx->cryptlen == req->cryptlen) /* encrypting */ return poly_copy_tag(req); return chacha_decrypt(req); } static void poly_tail_done(void *data, int err) { async_done_continue(data, err, poly_tail_continue); } static int poly_tail(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct chachapoly_ctx *ctx = crypto_aead_ctx(tfm); struct chachapoly_req_ctx *rctx = aead_request_ctx(req); struct poly_req *preq = &rctx->u.poly; int err; preq->tail.assoclen = cpu_to_le64(rctx->assoclen); preq->tail.cryptlen = cpu_to_le64(rctx->cryptlen); sg_init_one(preq->src, &preq->tail, sizeof(preq->tail)); ahash_request_set_callback(&preq->req, rctx->flags, poly_tail_done, req); ahash_request_set_tfm(&preq->req, ctx->poly); ahash_request_set_crypt(&preq->req, preq->src, rctx->tag, sizeof(preq->tail)); err = crypto_ahash_finup(&preq->req); if (err) return err; return poly_tail_continue(req); } static void poly_cipherpad_done(void *data, int err) { async_done_continue(data, err, poly_tail); } static int poly_cipherpad(struct aead_request *req) { struct chachapoly_ctx *ctx = crypto_aead_ctx(crypto_aead_reqtfm(req)); struct chachapoly_req_ctx *rctx = aead_request_ctx(req); struct poly_req *preq = &rctx->u.poly; unsigned int padlen; int err; padlen = -rctx->cryptlen % POLY1305_BLOCK_SIZE; memset(preq->pad, 0, sizeof(preq->pad)); sg_init_one(preq->src, preq->pad, padlen); ahash_request_set_callback(&preq->req, rctx->flags, poly_cipherpad_done, req); ahash_request_set_tfm(&preq->req, ctx->poly); ahash_request_set_crypt(&preq->req, preq->src, NULL, padlen); err = crypto_ahash_update(&preq->req); if (err) return err; return poly_tail(req); } static void poly_cipher_done(void *data, int err) { async_done_continue(data, err, poly_cipherpad); } static int poly_cipher(struct aead_request *req) { struct chachapoly_ctx *ctx = crypto_aead_ctx(crypto_aead_reqtfm(req)); struct chachapoly_req_ctx *rctx = aead_request_ctx(req); struct poly_req *preq = &rctx->u.poly; struct scatterlist *crypt = req->src; int err; if (rctx->cryptlen == req->cryptlen) /* encrypting */ crypt = req->dst; crypt = scatterwalk_ffwd(rctx->src, crypt, req->assoclen); ahash_request_set_callback(&preq->req, rctx->flags, poly_cipher_done, req); ahash_request_set_tfm(&preq->req, ctx->poly); ahash_request_set_crypt(&preq->req, crypt, NULL, rctx->cryptlen); err = crypto_ahash_update(&preq->req); if (err) return err; return poly_cipherpad(req); } static void poly_adpad_done(void *data, int err) { async_done_continue(data, err, poly_cipher); } static int poly_adpad(struct aead_request *req) { struct chachapoly_ctx *ctx = crypto_aead_ctx(crypto_aead_reqtfm(req)); struct chachapoly_req_ctx *rctx = aead_request_ctx(req); struct poly_req *preq = &rctx->u.poly; unsigned int padlen; int err; padlen = -rctx->assoclen % POLY1305_BLOCK_SIZE; memset(preq->pad, 0, sizeof(preq->pad)); sg_init_one(preq->src, preq->pad, padlen); ahash_request_set_callback(&preq->req, rctx->flags, poly_adpad_done, req); ahash_request_set_tfm(&preq->req, ctx->poly); ahash_request_set_crypt(&preq->req, preq->src, NULL, padlen); err = crypto_ahash_update(&preq->req); if (err) return err; return poly_cipher(req); } static void poly_ad_done(void *data, int err) { async_done_continue(data, err, poly_adpad); } static int poly_ad(struct aead_request *req) { struct chachapoly_ctx *ctx = crypto_aead_ctx(crypto_aead_reqtfm(req)); struct chachapoly_req_ctx *rctx = aead_request_ctx(req); struct poly_req *preq = &rctx->u.poly; int err; ahash_request_set_callback(&preq->req, rctx->flags, poly_ad_done, req); ahash_request_set_tfm(&preq->req, ctx->poly); ahash_request_set_crypt(&preq->req, req->src, NULL, rctx->assoclen); err = crypto_ahash_update(&preq->req); if (err) return err; return poly_adpad(req); } static void poly_setkey_done(void *data, int err) { async_done_continue(data, err, poly_ad); } static int poly_setkey(struct aead_request *req) { struct chachapoly_ctx *ctx = crypto_aead_ctx(crypto_aead_reqtfm(req)); struct chachapoly_req_ctx *rctx = aead_request_ctx(req); struct poly_req *preq = &rctx->u.poly; int err; sg_init_one(preq->src, rctx->key, sizeof(rctx->key)); ahash_request_set_callback(&preq->req, rctx->flags, poly_setkey_done, req); ahash_request_set_tfm(&preq->req, ctx->poly); ahash_request_set_crypt(&preq->req, preq->src, NULL, sizeof(rctx->key)); err = crypto_ahash_update(&preq->req); if (err) return err; return poly_ad(req); } static void poly_init_done(void *data, int err) { async_done_continue(data, err, poly_setkey); } static int poly_init(struct aead_request *req) { struct chachapoly_ctx *ctx = crypto_aead_ctx(crypto_aead_reqtfm(req)); struct chachapoly_req_ctx *rctx = aead_request_ctx(req); struct poly_req *preq = &rctx->u.poly; int err; ahash_request_set_callback(&preq->req, rctx->flags, poly_init_done, req); ahash_request_set_tfm(&preq->req, ctx->poly); err = crypto_ahash_init(&preq->req); if (err) return err; return poly_setkey(req); } static void poly_genkey_done(void *data, int err) { async_done_continue(data, err, poly_init); } static int poly_genkey(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct chachapoly_ctx *ctx = crypto_aead_ctx(tfm); struct chachapoly_req_ctx *rctx = aead_request_ctx(req); struct chacha_req *creq = &rctx->u.chacha; int err; rctx->assoclen = req->assoclen; if (crypto_aead_ivsize(tfm) == 8) { if (rctx->assoclen < 8) return -EINVAL; rctx->assoclen -= 8; } memset(rctx->key, 0, sizeof(rctx->key)); sg_init_one(creq->src, rctx->key, sizeof(rctx->key)); chacha_iv(creq->iv, req, 0); skcipher_request_set_callback(&creq->req, rctx->flags, poly_genkey_done, req); skcipher_request_set_tfm(&creq->req, ctx->chacha); skcipher_request_set_crypt(&creq->req, creq->src, creq->src, POLY1305_KEY_SIZE, creq->iv); err = crypto_skcipher_decrypt(&creq->req); if (err) return err; return poly_init(req); } static void chacha_encrypt_done(void *data, int err) { async_done_continue(data, err, poly_genkey); } static int chacha_encrypt(struct aead_request *req) { struct chachapoly_ctx *ctx = crypto_aead_ctx(crypto_aead_reqtfm(req)); struct chachapoly_req_ctx *rctx = aead_request_ctx(req); struct chacha_req *creq = &rctx->u.chacha; struct scatterlist *src, *dst; int err; if (req->cryptlen == 0) goto skip; chacha_iv(creq->iv, req, 1); src = scatterwalk_ffwd(rctx->src, req->src, req->assoclen); dst = src; if (req->src != req->dst) dst = scatterwalk_ffwd(rctx->dst, req->dst, req->assoclen); skcipher_request_set_callback(&creq->req, rctx->flags, chacha_encrypt_done, req); skcipher_request_set_tfm(&creq->req, ctx->chacha); skcipher_request_set_crypt(&creq->req, src, dst, req->cryptlen, creq->iv); err = crypto_skcipher_encrypt(&creq->req); if (err) return err; skip: return poly_genkey(req); } static int chachapoly_encrypt(struct aead_request *req) { struct chachapoly_req_ctx *rctx = aead_request_ctx(req); rctx->cryptlen = req->cryptlen; rctx->flags = aead_request_flags(req); /* encrypt call chain: * - chacha_encrypt/done() * - poly_genkey/done() * - poly_init/done() * - poly_setkey/done() * - poly_ad/done() * - poly_adpad/done() * - poly_cipher/done() * - poly_cipherpad/done() * - poly_tail/done/continue() * - poly_copy_tag() */ return chacha_encrypt(req); } static int chachapoly_decrypt(struct aead_request *req) { struct chachapoly_req_ctx *rctx = aead_request_ctx(req); rctx->cryptlen = req->cryptlen - POLY1305_DIGEST_SIZE; rctx->flags = aead_request_flags(req); /* decrypt call chain: * - poly_genkey/done() * - poly_init/done() * - poly_setkey/done() * - poly_ad/done() * - poly_adpad/done() * - poly_cipher/done() * - poly_cipherpad/done() * - poly_tail/done/continue() * - chacha_decrypt/done() * - poly_verify_tag() */ return poly_genkey(req); } static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { struct chachapoly_ctx *ctx = crypto_aead_ctx(aead); if (keylen != ctx->saltlen + CHACHA_KEY_SIZE) return -EINVAL; keylen -= ctx->saltlen; memcpy(ctx->salt, key + keylen, ctx->saltlen); crypto_skcipher_clear_flags(ctx->chacha, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(ctx->chacha, crypto_aead_get_flags(aead) & CRYPTO_TFM_REQ_MASK); return crypto_skcipher_setkey(ctx->chacha, key, keylen); } static int chachapoly_setauthsize(struct crypto_aead *tfm, unsigned int authsize) { if (authsize != POLY1305_DIGEST_SIZE) return -EINVAL; return 0; } static int chachapoly_init(struct crypto_aead *tfm) { struct aead_instance *inst = aead_alg_instance(tfm); struct chachapoly_instance_ctx *ictx = aead_instance_ctx(inst); struct chachapoly_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_skcipher *chacha; struct crypto_ahash *poly; unsigned long align; poly = crypto_spawn_ahash(&ictx->poly); if (IS_ERR(poly)) return PTR_ERR(poly); chacha = crypto_spawn_skcipher(&ictx->chacha); if (IS_ERR(chacha)) { crypto_free_ahash(poly); return PTR_ERR(chacha); } ctx->chacha = chacha; ctx->poly = poly; ctx->saltlen = ictx->saltlen; align = crypto_aead_alignmask(tfm); align &= ~(crypto_tfm_ctx_alignment() - 1); crypto_aead_set_reqsize( tfm, align + offsetof(struct chachapoly_req_ctx, u) + max(offsetof(struct chacha_req, req) + sizeof(struct skcipher_request) + crypto_skcipher_reqsize(chacha), offsetof(struct poly_req, req) + sizeof(struct ahash_request) + crypto_ahash_reqsize(poly))); return 0; } static void chachapoly_exit(struct crypto_aead *tfm) { struct chachapoly_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_ahash(ctx->poly); crypto_free_skcipher(ctx->chacha); } static void chachapoly_free(struct aead_instance *inst) { struct chachapoly_instance_ctx *ctx = aead_instance_ctx(inst); crypto_drop_skcipher(&ctx->chacha); crypto_drop_ahash(&ctx->poly); kfree(inst); } static int chachapoly_create(struct crypto_template *tmpl, struct rtattr **tb, const char *name, unsigned int ivsize) { u32 mask; struct aead_instance *inst; struct chachapoly_instance_ctx *ctx; struct skcipher_alg_common *chacha; struct hash_alg_common *poly; int err; if (ivsize > CHACHAPOLY_IV_SIZE) return -EINVAL; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL); if (!inst) return -ENOMEM; ctx = aead_instance_ctx(inst); ctx->saltlen = CHACHAPOLY_IV_SIZE - ivsize; err = crypto_grab_skcipher(&ctx->chacha, aead_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; chacha = crypto_spawn_skcipher_alg_common(&ctx->chacha); err = crypto_grab_ahash(&ctx->poly, aead_crypto_instance(inst), crypto_attr_alg_name(tb[2]), 0, mask); if (err) goto err_free_inst; poly = crypto_spawn_ahash_alg(&ctx->poly); err = -EINVAL; if (poly->digestsize != POLY1305_DIGEST_SIZE) goto err_free_inst; /* Need 16-byte IV size, including Initial Block Counter value */ if (chacha->ivsize != CHACHA_IV_SIZE) goto err_free_inst; /* Not a stream cipher? */ if (chacha->base.cra_blocksize != 1) goto err_free_inst; err = -ENAMETOOLONG; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "%s(%s,%s)", name, chacha->base.cra_name, poly->base.cra_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s(%s,%s)", name, chacha->base.cra_driver_name, poly->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_priority = (chacha->base.cra_priority + poly->base.cra_priority) / 2; inst->alg.base.cra_blocksize = 1; inst->alg.base.cra_alignmask = chacha->base.cra_alignmask; inst->alg.base.cra_ctxsize = sizeof(struct chachapoly_ctx) + ctx->saltlen; inst->alg.ivsize = ivsize; inst->alg.chunksize = chacha->chunksize; inst->alg.maxauthsize = POLY1305_DIGEST_SIZE; inst->alg.init = chachapoly_init; inst->alg.exit = chachapoly_exit; inst->alg.encrypt = chachapoly_encrypt; inst->alg.decrypt = chachapoly_decrypt; inst->alg.setkey = chachapoly_setkey; inst->alg.setauthsize = chachapoly_setauthsize; inst->free = chachapoly_free; err = aead_register_instance(tmpl, inst); if (err) { err_free_inst: chachapoly_free(inst); } return err; } static int rfc7539_create(struct crypto_template *tmpl, struct rtattr **tb) { return chachapoly_create(tmpl, tb, "rfc7539", 12); } static int rfc7539esp_create(struct crypto_template *tmpl, struct rtattr **tb) { return chachapoly_create(tmpl, tb, "rfc7539esp", 8); } static struct crypto_template rfc7539_tmpls[] = { { .name = "rfc7539", .create = rfc7539_create, .module = THIS_MODULE, }, { .name = "rfc7539esp", .create = rfc7539esp_create, .module = THIS_MODULE, }, }; static int __init chacha20poly1305_module_init(void) { return crypto_register_templates(rfc7539_tmpls, ARRAY_SIZE(rfc7539_tmpls)); } static void __exit chacha20poly1305_module_exit(void) { crypto_unregister_templates(rfc7539_tmpls, ARRAY_SIZE(rfc7539_tmpls)); } subsys_initcall(chacha20poly1305_module_init); module_exit(chacha20poly1305_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); MODULE_DESCRIPTION("ChaCha20-Poly1305 AEAD"); MODULE_ALIAS_CRYPTO("rfc7539"); MODULE_ALIAS_CRYPTO("rfc7539esp"); |
| 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 | /* * Copyright (c) 2006 Luc Verhaegen (quirks list) * Copyright (c) 2007-2008 Intel Corporation * Jesse Barnes <jesse.barnes@intel.com> * Copyright 2010 Red Hat, Inc. * * DDC probing routines (drm_ddc_read & drm_do_probe_ddc_edid) originally from * FB layer. * Copyright (C) 2006 Dennis Munsie <dmunsie@cecropia.com> * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sub license, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the * next paragraph) shall be included in all copies or substantial portions * of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #include <linux/bitfield.h> #include <linux/byteorder/generic.h> #include <linux/cec.h> #include <linux/hdmi.h> #include <linux/i2c.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/pci.h> #include <linux/seq_buf.h> #include <linux/slab.h> #include <linux/vga_switcheroo.h> #include <drm/drm_drv.h> #include <drm/drm_edid.h> #include <drm/drm_eld.h> #include <drm/drm_encoder.h> #include <drm/drm_print.h> #include "drm_crtc_internal.h" #include "drm_displayid_internal.h" #include "drm_internal.h" static int oui(u8 first, u8 second, u8 third) { return (first << 16) | (second << 8) | third; } #define EDID_EST_TIMINGS 16 #define EDID_STD_TIMINGS 8 #define EDID_DETAILED_TIMINGS 4 /* * EDID blocks out in the wild have a variety of bugs, try to collect * them here (note that userspace may work around broken monitors first, * but fixes should make their way here so that the kernel "just works" * on as many displays as possible). */ /* First detailed mode wrong, use largest 60Hz mode */ #define EDID_QUIRK_PREFER_LARGE_60 (1 << 0) /* Reported 135MHz pixel clock is too high, needs adjustment */ #define EDID_QUIRK_135_CLOCK_TOO_HIGH (1 << 1) /* Prefer the largest mode at 75 Hz */ #define EDID_QUIRK_PREFER_LARGE_75 (1 << 2) /* Detail timing is in cm not mm */ #define EDID_QUIRK_DETAILED_IN_CM (1 << 3) /* Detailed timing descriptors have bogus size values, so just take the * maximum size and use that. */ #define EDID_QUIRK_DETAILED_USE_MAXIMUM_SIZE (1 << 4) /* use +hsync +vsync for detailed mode */ #define EDID_QUIRK_DETAILED_SYNC_PP (1 << 6) /* Force reduced-blanking timings for detailed modes */ #define EDID_QUIRK_FORCE_REDUCED_BLANKING (1 << 7) /* Force 8bpc */ #define EDID_QUIRK_FORCE_8BPC (1 << 8) /* Force 12bpc */ #define EDID_QUIRK_FORCE_12BPC (1 << 9) /* Force 6bpc */ #define EDID_QUIRK_FORCE_6BPC (1 << 10) /* Force 10bpc */ #define EDID_QUIRK_FORCE_10BPC (1 << 11) /* Non desktop display (i.e. HMD) */ #define EDID_QUIRK_NON_DESKTOP (1 << 12) /* Cap the DSC target bitrate to 15bpp */ #define EDID_QUIRK_CAP_DSC_15BPP (1 << 13) #define MICROSOFT_IEEE_OUI 0xca125c struct detailed_mode_closure { struct drm_connector *connector; const struct drm_edid *drm_edid; bool preferred; int modes; }; struct drm_edid_match_closure { const struct drm_edid_ident *ident; bool matched; }; #define LEVEL_DMT 0 #define LEVEL_GTF 1 #define LEVEL_GTF2 2 #define LEVEL_CVT 3 #define EDID_QUIRK(vend_chr_0, vend_chr_1, vend_chr_2, product_id, _quirks) \ { \ .ident = { \ .panel_id = drm_edid_encode_panel_id(vend_chr_0, vend_chr_1, \ vend_chr_2, product_id), \ }, \ .quirks = _quirks \ } static const struct edid_quirk { const struct drm_edid_ident ident; u32 quirks; } edid_quirk_list[] = { /* Acer AL1706 */ EDID_QUIRK('A', 'C', 'R', 44358, EDID_QUIRK_PREFER_LARGE_60), /* Acer F51 */ EDID_QUIRK('A', 'P', 'I', 0x7602, EDID_QUIRK_PREFER_LARGE_60), /* AEO model 0 reports 8 bpc, but is a 6 bpc panel */ EDID_QUIRK('A', 'E', 'O', 0, EDID_QUIRK_FORCE_6BPC), /* BenQ GW2765 */ EDID_QUIRK('B', 'N', 'Q', 0x78d6, EDID_QUIRK_FORCE_8BPC), /* BOE model on HP Pavilion 15-n233sl reports 8 bpc, but is a 6 bpc panel */ EDID_QUIRK('B', 'O', 'E', 0x78b, EDID_QUIRK_FORCE_6BPC), /* CPT panel of Asus UX303LA reports 8 bpc, but is a 6 bpc panel */ EDID_QUIRK('C', 'P', 'T', 0x17df, EDID_QUIRK_FORCE_6BPC), /* SDC panel of Lenovo B50-80 reports 8 bpc, but is a 6 bpc panel */ EDID_QUIRK('S', 'D', 'C', 0x3652, EDID_QUIRK_FORCE_6BPC), /* BOE model 0x0771 reports 8 bpc, but is a 6 bpc panel */ EDID_QUIRK('B', 'O', 'E', 0x0771, EDID_QUIRK_FORCE_6BPC), /* Belinea 10 15 55 */ EDID_QUIRK('M', 'A', 'X', 1516, EDID_QUIRK_PREFER_LARGE_60), EDID_QUIRK('M', 'A', 'X', 0x77e, EDID_QUIRK_PREFER_LARGE_60), /* Envision Peripherals, Inc. EN-7100e */ EDID_QUIRK('E', 'P', 'I', 59264, EDID_QUIRK_135_CLOCK_TOO_HIGH), /* Envision EN2028 */ EDID_QUIRK('E', 'P', 'I', 8232, EDID_QUIRK_PREFER_LARGE_60), /* Funai Electronics PM36B */ EDID_QUIRK('F', 'C', 'M', 13600, EDID_QUIRK_PREFER_LARGE_75 | EDID_QUIRK_DETAILED_IN_CM), /* LG 27GP950 */ EDID_QUIRK('G', 'S', 'M', 0x5bbf, EDID_QUIRK_CAP_DSC_15BPP), /* LG 27GN950 */ EDID_QUIRK('G', 'S', 'M', 0x5b9a, EDID_QUIRK_CAP_DSC_15BPP), /* LGD panel of HP zBook 17 G2, eDP 10 bpc, but reports unknown bpc */ EDID_QUIRK('L', 'G', 'D', 764, EDID_QUIRK_FORCE_10BPC), /* LG Philips LCD LP154W01-A5 */ EDID_QUIRK('L', 'P', 'L', 0, EDID_QUIRK_DETAILED_USE_MAXIMUM_SIZE), EDID_QUIRK('L', 'P', 'L', 0x2a00, EDID_QUIRK_DETAILED_USE_MAXIMUM_SIZE), /* Samsung SyncMaster 205BW. Note: irony */ EDID_QUIRK('S', 'A', 'M', 541, EDID_QUIRK_DETAILED_SYNC_PP), /* Samsung SyncMaster 22[5-6]BW */ EDID_QUIRK('S', 'A', 'M', 596, EDID_QUIRK_PREFER_LARGE_60), EDID_QUIRK('S', 'A', 'M', 638, EDID_QUIRK_PREFER_LARGE_60), /* Sony PVM-2541A does up to 12 bpc, but only reports max 8 bpc */ EDID_QUIRK('S', 'N', 'Y', 0x2541, EDID_QUIRK_FORCE_12BPC), /* ViewSonic VA2026w */ EDID_QUIRK('V', 'S', 'C', 5020, EDID_QUIRK_FORCE_REDUCED_BLANKING), /* Medion MD 30217 PG */ EDID_QUIRK('M', 'E', 'D', 0x7b8, EDID_QUIRK_PREFER_LARGE_75), /* Lenovo G50 */ EDID_QUIRK('S', 'D', 'C', 18514, EDID_QUIRK_FORCE_6BPC), /* Panel in Samsung NP700G7A-S01PL notebook reports 6bpc */ EDID_QUIRK('S', 'E', 'C', 0xd033, EDID_QUIRK_FORCE_8BPC), /* Rotel RSX-1058 forwards sink's EDID but only does HDMI 1.1*/ EDID_QUIRK('E', 'T', 'R', 13896, EDID_QUIRK_FORCE_8BPC), /* Valve Index Headset */ EDID_QUIRK('V', 'L', 'V', 0x91a8, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('V', 'L', 'V', 0x91b0, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('V', 'L', 'V', 0x91b1, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('V', 'L', 'V', 0x91b2, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('V', 'L', 'V', 0x91b3, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('V', 'L', 'V', 0x91b4, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('V', 'L', 'V', 0x91b5, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('V', 'L', 'V', 0x91b6, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('V', 'L', 'V', 0x91b7, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('V', 'L', 'V', 0x91b8, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('V', 'L', 'V', 0x91b9, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('V', 'L', 'V', 0x91ba, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('V', 'L', 'V', 0x91bb, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('V', 'L', 'V', 0x91bc, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('V', 'L', 'V', 0x91bd, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('V', 'L', 'V', 0x91be, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('V', 'L', 'V', 0x91bf, EDID_QUIRK_NON_DESKTOP), /* HTC Vive and Vive Pro VR Headsets */ EDID_QUIRK('H', 'V', 'R', 0xaa01, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('H', 'V', 'R', 0xaa02, EDID_QUIRK_NON_DESKTOP), /* Oculus Rift DK1, DK2, CV1 and Rift S VR Headsets */ EDID_QUIRK('O', 'V', 'R', 0x0001, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('O', 'V', 'R', 0x0003, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('O', 'V', 'R', 0x0004, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('O', 'V', 'R', 0x0012, EDID_QUIRK_NON_DESKTOP), /* Windows Mixed Reality Headsets */ EDID_QUIRK('A', 'C', 'R', 0x7fce, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('L', 'E', 'N', 0x0408, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('F', 'U', 'J', 0x1970, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('D', 'E', 'L', 0x7fce, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('S', 'E', 'C', 0x144a, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('A', 'U', 'S', 0xc102, EDID_QUIRK_NON_DESKTOP), /* Sony PlayStation VR Headset */ EDID_QUIRK('S', 'N', 'Y', 0x0704, EDID_QUIRK_NON_DESKTOP), /* Sensics VR Headsets */ EDID_QUIRK('S', 'E', 'N', 0x1019, EDID_QUIRK_NON_DESKTOP), /* OSVR HDK and HDK2 VR Headsets */ EDID_QUIRK('S', 'V', 'R', 0x1019, EDID_QUIRK_NON_DESKTOP), EDID_QUIRK('A', 'U', 'O', 0x1111, EDID_QUIRK_NON_DESKTOP), }; /* * Autogenerated from the DMT spec. * This table is copied from xfree86/modes/xf86EdidModes.c. */ static const struct drm_display_mode drm_dmt_modes[] = { /* 0x01 - 640x350@85Hz */ { DRM_MODE("640x350", DRM_MODE_TYPE_DRIVER, 31500, 640, 672, 736, 832, 0, 350, 382, 385, 445, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x02 - 640x400@85Hz */ { DRM_MODE("640x400", DRM_MODE_TYPE_DRIVER, 31500, 640, 672, 736, 832, 0, 400, 401, 404, 445, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x03 - 720x400@85Hz */ { DRM_MODE("720x400", DRM_MODE_TYPE_DRIVER, 35500, 720, 756, 828, 936, 0, 400, 401, 404, 446, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x04 - 640x480@60Hz */ { DRM_MODE("640x480", DRM_MODE_TYPE_DRIVER, 25175, 640, 656, 752, 800, 0, 480, 490, 492, 525, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x05 - 640x480@72Hz */ { DRM_MODE("640x480", DRM_MODE_TYPE_DRIVER, 31500, 640, 664, 704, 832, 0, 480, 489, 492, 520, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x06 - 640x480@75Hz */ { DRM_MODE("640x480", DRM_MODE_TYPE_DRIVER, 31500, 640, 656, 720, 840, 0, 480, 481, 484, 500, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x07 - 640x480@85Hz */ { DRM_MODE("640x480", DRM_MODE_TYPE_DRIVER, 36000, 640, 696, 752, 832, 0, 480, 481, 484, 509, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x08 - 800x600@56Hz */ { DRM_MODE("800x600", DRM_MODE_TYPE_DRIVER, 36000, 800, 824, 896, 1024, 0, 600, 601, 603, 625, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x09 - 800x600@60Hz */ { DRM_MODE("800x600", DRM_MODE_TYPE_DRIVER, 40000, 800, 840, 968, 1056, 0, 600, 601, 605, 628, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x0a - 800x600@72Hz */ { DRM_MODE("800x600", DRM_MODE_TYPE_DRIVER, 50000, 800, 856, 976, 1040, 0, 600, 637, 643, 666, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x0b - 800x600@75Hz */ { DRM_MODE("800x600", DRM_MODE_TYPE_DRIVER, 49500, 800, 816, 896, 1056, 0, 600, 601, 604, 625, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x0c - 800x600@85Hz */ { DRM_MODE("800x600", DRM_MODE_TYPE_DRIVER, 56250, 800, 832, 896, 1048, 0, 600, 601, 604, 631, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x0d - 800x600@120Hz RB */ { DRM_MODE("800x600", DRM_MODE_TYPE_DRIVER, 73250, 800, 848, 880, 960, 0, 600, 603, 607, 636, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x0e - 848x480@60Hz */ { DRM_MODE("848x480", DRM_MODE_TYPE_DRIVER, 33750, 848, 864, 976, 1088, 0, 480, 486, 494, 517, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x0f - 1024x768@43Hz, interlace */ { DRM_MODE("1024x768i", DRM_MODE_TYPE_DRIVER, 44900, 1024, 1032, 1208, 1264, 0, 768, 768, 776, 817, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC | DRM_MODE_FLAG_INTERLACE) }, /* 0x10 - 1024x768@60Hz */ { DRM_MODE("1024x768", DRM_MODE_TYPE_DRIVER, 65000, 1024, 1048, 1184, 1344, 0, 768, 771, 777, 806, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x11 - 1024x768@70Hz */ { DRM_MODE("1024x768", DRM_MODE_TYPE_DRIVER, 75000, 1024, 1048, 1184, 1328, 0, 768, 771, 777, 806, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x12 - 1024x768@75Hz */ { DRM_MODE("1024x768", DRM_MODE_TYPE_DRIVER, 78750, 1024, 1040, 1136, 1312, 0, 768, 769, 772, 800, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x13 - 1024x768@85Hz */ { DRM_MODE("1024x768", DRM_MODE_TYPE_DRIVER, 94500, 1024, 1072, 1168, 1376, 0, 768, 769, 772, 808, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x14 - 1024x768@120Hz RB */ { DRM_MODE("1024x768", DRM_MODE_TYPE_DRIVER, 115500, 1024, 1072, 1104, 1184, 0, 768, 771, 775, 813, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x15 - 1152x864@75Hz */ { DRM_MODE("1152x864", DRM_MODE_TYPE_DRIVER, 108000, 1152, 1216, 1344, 1600, 0, 864, 865, 868, 900, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x55 - 1280x720@60Hz */ { DRM_MODE("1280x720", DRM_MODE_TYPE_DRIVER, 74250, 1280, 1390, 1430, 1650, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x16 - 1280x768@60Hz RB */ { DRM_MODE("1280x768", DRM_MODE_TYPE_DRIVER, 68250, 1280, 1328, 1360, 1440, 0, 768, 771, 778, 790, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x17 - 1280x768@60Hz */ { DRM_MODE("1280x768", DRM_MODE_TYPE_DRIVER, 79500, 1280, 1344, 1472, 1664, 0, 768, 771, 778, 798, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x18 - 1280x768@75Hz */ { DRM_MODE("1280x768", DRM_MODE_TYPE_DRIVER, 102250, 1280, 1360, 1488, 1696, 0, 768, 771, 778, 805, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x19 - 1280x768@85Hz */ { DRM_MODE("1280x768", DRM_MODE_TYPE_DRIVER, 117500, 1280, 1360, 1496, 1712, 0, 768, 771, 778, 809, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x1a - 1280x768@120Hz RB */ { DRM_MODE("1280x768", DRM_MODE_TYPE_DRIVER, 140250, 1280, 1328, 1360, 1440, 0, 768, 771, 778, 813, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x1b - 1280x800@60Hz RB */ { DRM_MODE("1280x800", DRM_MODE_TYPE_DRIVER, 71000, 1280, 1328, 1360, 1440, 0, 800, 803, 809, 823, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x1c - 1280x800@60Hz */ { DRM_MODE("1280x800", DRM_MODE_TYPE_DRIVER, 83500, 1280, 1352, 1480, 1680, 0, 800, 803, 809, 831, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x1d - 1280x800@75Hz */ { DRM_MODE("1280x800", DRM_MODE_TYPE_DRIVER, 106500, 1280, 1360, 1488, 1696, 0, 800, 803, 809, 838, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x1e - 1280x800@85Hz */ { DRM_MODE("1280x800", DRM_MODE_TYPE_DRIVER, 122500, 1280, 1360, 1496, 1712, 0, 800, 803, 809, 843, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x1f - 1280x800@120Hz RB */ { DRM_MODE("1280x800", DRM_MODE_TYPE_DRIVER, 146250, 1280, 1328, 1360, 1440, 0, 800, 803, 809, 847, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x20 - 1280x960@60Hz */ { DRM_MODE("1280x960", DRM_MODE_TYPE_DRIVER, 108000, 1280, 1376, 1488, 1800, 0, 960, 961, 964, 1000, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x21 - 1280x960@85Hz */ { DRM_MODE("1280x960", DRM_MODE_TYPE_DRIVER, 148500, 1280, 1344, 1504, 1728, 0, 960, 961, 964, 1011, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x22 - 1280x960@120Hz RB */ { DRM_MODE("1280x960", DRM_MODE_TYPE_DRIVER, 175500, 1280, 1328, 1360, 1440, 0, 960, 963, 967, 1017, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x23 - 1280x1024@60Hz */ { DRM_MODE("1280x1024", DRM_MODE_TYPE_DRIVER, 108000, 1280, 1328, 1440, 1688, 0, 1024, 1025, 1028, 1066, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x24 - 1280x1024@75Hz */ { DRM_MODE("1280x1024", DRM_MODE_TYPE_DRIVER, 135000, 1280, 1296, 1440, 1688, 0, 1024, 1025, 1028, 1066, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x25 - 1280x1024@85Hz */ { DRM_MODE("1280x1024", DRM_MODE_TYPE_DRIVER, 157500, 1280, 1344, 1504, 1728, 0, 1024, 1025, 1028, 1072, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x26 - 1280x1024@120Hz RB */ { DRM_MODE("1280x1024", DRM_MODE_TYPE_DRIVER, 187250, 1280, 1328, 1360, 1440, 0, 1024, 1027, 1034, 1084, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x27 - 1360x768@60Hz */ { DRM_MODE("1360x768", DRM_MODE_TYPE_DRIVER, 85500, 1360, 1424, 1536, 1792, 0, 768, 771, 777, 795, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x28 - 1360x768@120Hz RB */ { DRM_MODE("1360x768", DRM_MODE_TYPE_DRIVER, 148250, 1360, 1408, 1440, 1520, 0, 768, 771, 776, 813, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x51 - 1366x768@60Hz */ { DRM_MODE("1366x768", DRM_MODE_TYPE_DRIVER, 85500, 1366, 1436, 1579, 1792, 0, 768, 771, 774, 798, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x56 - 1366x768@60Hz */ { DRM_MODE("1366x768", DRM_MODE_TYPE_DRIVER, 72000, 1366, 1380, 1436, 1500, 0, 768, 769, 772, 800, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x29 - 1400x1050@60Hz RB */ { DRM_MODE("1400x1050", DRM_MODE_TYPE_DRIVER, 101000, 1400, 1448, 1480, 1560, 0, 1050, 1053, 1057, 1080, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x2a - 1400x1050@60Hz */ { DRM_MODE("1400x1050", DRM_MODE_TYPE_DRIVER, 121750, 1400, 1488, 1632, 1864, 0, 1050, 1053, 1057, 1089, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x2b - 1400x1050@75Hz */ { DRM_MODE("1400x1050", DRM_MODE_TYPE_DRIVER, 156000, 1400, 1504, 1648, 1896, 0, 1050, 1053, 1057, 1099, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x2c - 1400x1050@85Hz */ { DRM_MODE("1400x1050", DRM_MODE_TYPE_DRIVER, 179500, 1400, 1504, 1656, 1912, 0, 1050, 1053, 1057, 1105, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x2d - 1400x1050@120Hz RB */ { DRM_MODE("1400x1050", DRM_MODE_TYPE_DRIVER, 208000, 1400, 1448, 1480, 1560, 0, 1050, 1053, 1057, 1112, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x2e - 1440x900@60Hz RB */ { DRM_MODE("1440x900", DRM_MODE_TYPE_DRIVER, 88750, 1440, 1488, 1520, 1600, 0, 900, 903, 909, 926, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x2f - 1440x900@60Hz */ { DRM_MODE("1440x900", DRM_MODE_TYPE_DRIVER, 106500, 1440, 1520, 1672, 1904, 0, 900, 903, 909, 934, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x30 - 1440x900@75Hz */ { DRM_MODE("1440x900", DRM_MODE_TYPE_DRIVER, 136750, 1440, 1536, 1688, 1936, 0, 900, 903, 909, 942, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x31 - 1440x900@85Hz */ { DRM_MODE("1440x900", DRM_MODE_TYPE_DRIVER, 157000, 1440, 1544, 1696, 1952, 0, 900, 903, 909, 948, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x32 - 1440x900@120Hz RB */ { DRM_MODE("1440x900", DRM_MODE_TYPE_DRIVER, 182750, 1440, 1488, 1520, 1600, 0, 900, 903, 909, 953, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x53 - 1600x900@60Hz */ { DRM_MODE("1600x900", DRM_MODE_TYPE_DRIVER, 108000, 1600, 1624, 1704, 1800, 0, 900, 901, 904, 1000, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x33 - 1600x1200@60Hz */ { DRM_MODE("1600x1200", DRM_MODE_TYPE_DRIVER, 162000, 1600, 1664, 1856, 2160, 0, 1200, 1201, 1204, 1250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x34 - 1600x1200@65Hz */ { DRM_MODE("1600x1200", DRM_MODE_TYPE_DRIVER, 175500, 1600, 1664, 1856, 2160, 0, 1200, 1201, 1204, 1250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x35 - 1600x1200@70Hz */ { DRM_MODE("1600x1200", DRM_MODE_TYPE_DRIVER, 189000, 1600, 1664, 1856, 2160, 0, 1200, 1201, 1204, 1250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x36 - 1600x1200@75Hz */ { DRM_MODE("1600x1200", DRM_MODE_TYPE_DRIVER, 202500, 1600, 1664, 1856, 2160, 0, 1200, 1201, 1204, 1250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x37 - 1600x1200@85Hz */ { DRM_MODE("1600x1200", DRM_MODE_TYPE_DRIVER, 229500, 1600, 1664, 1856, 2160, 0, 1200, 1201, 1204, 1250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x38 - 1600x1200@120Hz RB */ { DRM_MODE("1600x1200", DRM_MODE_TYPE_DRIVER, 268250, 1600, 1648, 1680, 1760, 0, 1200, 1203, 1207, 1271, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x39 - 1680x1050@60Hz RB */ { DRM_MODE("1680x1050", DRM_MODE_TYPE_DRIVER, 119000, 1680, 1728, 1760, 1840, 0, 1050, 1053, 1059, 1080, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x3a - 1680x1050@60Hz */ { DRM_MODE("1680x1050", DRM_MODE_TYPE_DRIVER, 146250, 1680, 1784, 1960, 2240, 0, 1050, 1053, 1059, 1089, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x3b - 1680x1050@75Hz */ { DRM_MODE("1680x1050", DRM_MODE_TYPE_DRIVER, 187000, 1680, 1800, 1976, 2272, 0, 1050, 1053, 1059, 1099, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x3c - 1680x1050@85Hz */ { DRM_MODE("1680x1050", DRM_MODE_TYPE_DRIVER, 214750, 1680, 1808, 1984, 2288, 0, 1050, 1053, 1059, 1105, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x3d - 1680x1050@120Hz RB */ { DRM_MODE("1680x1050", DRM_MODE_TYPE_DRIVER, 245500, 1680, 1728, 1760, 1840, 0, 1050, 1053, 1059, 1112, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x3e - 1792x1344@60Hz */ { DRM_MODE("1792x1344", DRM_MODE_TYPE_DRIVER, 204750, 1792, 1920, 2120, 2448, 0, 1344, 1345, 1348, 1394, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x3f - 1792x1344@75Hz */ { DRM_MODE("1792x1344", DRM_MODE_TYPE_DRIVER, 261000, 1792, 1888, 2104, 2456, 0, 1344, 1345, 1348, 1417, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x40 - 1792x1344@120Hz RB */ { DRM_MODE("1792x1344", DRM_MODE_TYPE_DRIVER, 333250, 1792, 1840, 1872, 1952, 0, 1344, 1347, 1351, 1423, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x41 - 1856x1392@60Hz */ { DRM_MODE("1856x1392", DRM_MODE_TYPE_DRIVER, 218250, 1856, 1952, 2176, 2528, 0, 1392, 1393, 1396, 1439, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x42 - 1856x1392@75Hz */ { DRM_MODE("1856x1392", DRM_MODE_TYPE_DRIVER, 288000, 1856, 1984, 2208, 2560, 0, 1392, 1393, 1396, 1500, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x43 - 1856x1392@120Hz RB */ { DRM_MODE("1856x1392", DRM_MODE_TYPE_DRIVER, 356500, 1856, 1904, 1936, 2016, 0, 1392, 1395, 1399, 1474, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x52 - 1920x1080@60Hz */ { DRM_MODE("1920x1080", DRM_MODE_TYPE_DRIVER, 148500, 1920, 2008, 2052, 2200, 0, 1080, 1084, 1089, 1125, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x44 - 1920x1200@60Hz RB */ { DRM_MODE("1920x1200", DRM_MODE_TYPE_DRIVER, 154000, 1920, 1968, 2000, 2080, 0, 1200, 1203, 1209, 1235, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x45 - 1920x1200@60Hz */ { DRM_MODE("1920x1200", DRM_MODE_TYPE_DRIVER, 193250, 1920, 2056, 2256, 2592, 0, 1200, 1203, 1209, 1245, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x46 - 1920x1200@75Hz */ { DRM_MODE("1920x1200", DRM_MODE_TYPE_DRIVER, 245250, 1920, 2056, 2264, 2608, 0, 1200, 1203, 1209, 1255, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x47 - 1920x1200@85Hz */ { DRM_MODE("1920x1200", DRM_MODE_TYPE_DRIVER, 281250, 1920, 2064, 2272, 2624, 0, 1200, 1203, 1209, 1262, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x48 - 1920x1200@120Hz RB */ { DRM_MODE("1920x1200", DRM_MODE_TYPE_DRIVER, 317000, 1920, 1968, 2000, 2080, 0, 1200, 1203, 1209, 1271, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x49 - 1920x1440@60Hz */ { DRM_MODE("1920x1440", DRM_MODE_TYPE_DRIVER, 234000, 1920, 2048, 2256, 2600, 0, 1440, 1441, 1444, 1500, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x4a - 1920x1440@75Hz */ { DRM_MODE("1920x1440", DRM_MODE_TYPE_DRIVER, 297000, 1920, 2064, 2288, 2640, 0, 1440, 1441, 1444, 1500, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x4b - 1920x1440@120Hz RB */ { DRM_MODE("1920x1440", DRM_MODE_TYPE_DRIVER, 380500, 1920, 1968, 2000, 2080, 0, 1440, 1443, 1447, 1525, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x54 - 2048x1152@60Hz */ { DRM_MODE("2048x1152", DRM_MODE_TYPE_DRIVER, 162000, 2048, 2074, 2154, 2250, 0, 1152, 1153, 1156, 1200, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x4c - 2560x1600@60Hz RB */ { DRM_MODE("2560x1600", DRM_MODE_TYPE_DRIVER, 268500, 2560, 2608, 2640, 2720, 0, 1600, 1603, 1609, 1646, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x4d - 2560x1600@60Hz */ { DRM_MODE("2560x1600", DRM_MODE_TYPE_DRIVER, 348500, 2560, 2752, 3032, 3504, 0, 1600, 1603, 1609, 1658, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x4e - 2560x1600@75Hz */ { DRM_MODE("2560x1600", DRM_MODE_TYPE_DRIVER, 443250, 2560, 2768, 3048, 3536, 0, 1600, 1603, 1609, 1672, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x4f - 2560x1600@85Hz */ { DRM_MODE("2560x1600", DRM_MODE_TYPE_DRIVER, 505250, 2560, 2768, 3048, 3536, 0, 1600, 1603, 1609, 1682, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 0x50 - 2560x1600@120Hz RB */ { DRM_MODE("2560x1600", DRM_MODE_TYPE_DRIVER, 552750, 2560, 2608, 2640, 2720, 0, 1600, 1603, 1609, 1694, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x57 - 4096x2160@60Hz RB */ { DRM_MODE("4096x2160", DRM_MODE_TYPE_DRIVER, 556744, 4096, 4104, 4136, 4176, 0, 2160, 2208, 2216, 2222, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 0x58 - 4096x2160@59.94Hz RB */ { DRM_MODE("4096x2160", DRM_MODE_TYPE_DRIVER, 556188, 4096, 4104, 4136, 4176, 0, 2160, 2208, 2216, 2222, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC) }, }; /* * These more or less come from the DMT spec. The 720x400 modes are * inferred from historical 80x25 practice. The 640x480@67 and 832x624@75 * modes are old-school Mac modes. The EDID spec says the 1152x864@75 mode * should be 1152x870, again for the Mac, but instead we use the x864 DMT * mode. * * The DMT modes have been fact-checked; the rest are mild guesses. */ static const struct drm_display_mode edid_est_modes[] = { { DRM_MODE("800x600", DRM_MODE_TYPE_DRIVER, 40000, 800, 840, 968, 1056, 0, 600, 601, 605, 628, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 800x600@60Hz */ { DRM_MODE("800x600", DRM_MODE_TYPE_DRIVER, 36000, 800, 824, 896, 1024, 0, 600, 601, 603, 625, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 800x600@56Hz */ { DRM_MODE("640x480", DRM_MODE_TYPE_DRIVER, 31500, 640, 656, 720, 840, 0, 480, 481, 484, 500, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 640x480@75Hz */ { DRM_MODE("640x480", DRM_MODE_TYPE_DRIVER, 31500, 640, 664, 704, 832, 0, 480, 489, 492, 520, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 640x480@72Hz */ { DRM_MODE("640x480", DRM_MODE_TYPE_DRIVER, 30240, 640, 704, 768, 864, 0, 480, 483, 486, 525, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 640x480@67Hz */ { DRM_MODE("640x480", DRM_MODE_TYPE_DRIVER, 25175, 640, 656, 752, 800, 0, 480, 490, 492, 525, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 640x480@60Hz */ { DRM_MODE("720x400", DRM_MODE_TYPE_DRIVER, 35500, 720, 738, 846, 900, 0, 400, 421, 423, 449, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 720x400@88Hz */ { DRM_MODE("720x400", DRM_MODE_TYPE_DRIVER, 28320, 720, 738, 846, 900, 0, 400, 412, 414, 449, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 720x400@70Hz */ { DRM_MODE("1280x1024", DRM_MODE_TYPE_DRIVER, 135000, 1280, 1296, 1440, 1688, 0, 1024, 1025, 1028, 1066, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 1280x1024@75Hz */ { DRM_MODE("1024x768", DRM_MODE_TYPE_DRIVER, 78750, 1024, 1040, 1136, 1312, 0, 768, 769, 772, 800, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 1024x768@75Hz */ { DRM_MODE("1024x768", DRM_MODE_TYPE_DRIVER, 75000, 1024, 1048, 1184, 1328, 0, 768, 771, 777, 806, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 1024x768@70Hz */ { DRM_MODE("1024x768", DRM_MODE_TYPE_DRIVER, 65000, 1024, 1048, 1184, 1344, 0, 768, 771, 777, 806, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 1024x768@60Hz */ { DRM_MODE("1024x768i", DRM_MODE_TYPE_DRIVER,44900, 1024, 1032, 1208, 1264, 0, 768, 768, 776, 817, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC | DRM_MODE_FLAG_INTERLACE) }, /* 1024x768@43Hz */ { DRM_MODE("832x624", DRM_MODE_TYPE_DRIVER, 57284, 832, 864, 928, 1152, 0, 624, 625, 628, 667, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 832x624@75Hz */ { DRM_MODE("800x600", DRM_MODE_TYPE_DRIVER, 49500, 800, 816, 896, 1056, 0, 600, 601, 604, 625, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 800x600@75Hz */ { DRM_MODE("800x600", DRM_MODE_TYPE_DRIVER, 50000, 800, 856, 976, 1040, 0, 600, 637, 643, 666, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 800x600@72Hz */ { DRM_MODE("1152x864", DRM_MODE_TYPE_DRIVER, 108000, 1152, 1216, 1344, 1600, 0, 864, 865, 868, 900, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 1152x864@75Hz */ }; struct minimode { short w; short h; short r; short rb; }; static const struct minimode est3_modes[] = { /* byte 6 */ { 640, 350, 85, 0 }, { 640, 400, 85, 0 }, { 720, 400, 85, 0 }, { 640, 480, 85, 0 }, { 848, 480, 60, 0 }, { 800, 600, 85, 0 }, { 1024, 768, 85, 0 }, { 1152, 864, 75, 0 }, /* byte 7 */ { 1280, 768, 60, 1 }, { 1280, 768, 60, 0 }, { 1280, 768, 75, 0 }, { 1280, 768, 85, 0 }, { 1280, 960, 60, 0 }, { 1280, 960, 85, 0 }, { 1280, 1024, 60, 0 }, { 1280, 1024, 85, 0 }, /* byte 8 */ { 1360, 768, 60, 0 }, { 1440, 900, 60, 1 }, { 1440, 900, 60, 0 }, { 1440, 900, 75, 0 }, { 1440, 900, 85, 0 }, { 1400, 1050, 60, 1 }, { 1400, 1050, 60, 0 }, { 1400, 1050, 75, 0 }, /* byte 9 */ { 1400, 1050, 85, 0 }, { 1680, 1050, 60, 1 }, { 1680, 1050, 60, 0 }, { 1680, 1050, 75, 0 }, { 1680, 1050, 85, 0 }, { 1600, 1200, 60, 0 }, { 1600, 1200, 65, 0 }, { 1600, 1200, 70, 0 }, /* byte 10 */ { 1600, 1200, 75, 0 }, { 1600, 1200, 85, 0 }, { 1792, 1344, 60, 0 }, { 1792, 1344, 75, 0 }, { 1856, 1392, 60, 0 }, { 1856, 1392, 75, 0 }, { 1920, 1200, 60, 1 }, { 1920, 1200, 60, 0 }, /* byte 11 */ { 1920, 1200, 75, 0 }, { 1920, 1200, 85, 0 }, { 1920, 1440, 60, 0 }, { 1920, 1440, 75, 0 }, }; static const struct minimode extra_modes[] = { { 1024, 576, 60, 0 }, { 1366, 768, 60, 0 }, { 1600, 900, 60, 0 }, { 1680, 945, 60, 0 }, { 1920, 1080, 60, 0 }, { 2048, 1152, 60, 0 }, { 2048, 1536, 60, 0 }, }; /* * From CEA/CTA-861 spec. * * Do not access directly, instead always use cea_mode_for_vic(). */ static const struct drm_display_mode edid_cea_modes_1[] = { /* 1 - 640x480@60Hz 4:3 */ { DRM_MODE("640x480", DRM_MODE_TYPE_DRIVER, 25175, 640, 656, 752, 800, 0, 480, 490, 492, 525, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 2 - 720x480@60Hz 4:3 */ { DRM_MODE("720x480", DRM_MODE_TYPE_DRIVER, 27000, 720, 736, 798, 858, 0, 480, 489, 495, 525, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 3 - 720x480@60Hz 16:9 */ { DRM_MODE("720x480", DRM_MODE_TYPE_DRIVER, 27000, 720, 736, 798, 858, 0, 480, 489, 495, 525, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 4 - 1280x720@60Hz 16:9 */ { DRM_MODE("1280x720", DRM_MODE_TYPE_DRIVER, 74250, 1280, 1390, 1430, 1650, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 5 - 1920x1080i@60Hz 16:9 */ { DRM_MODE("1920x1080i", DRM_MODE_TYPE_DRIVER, 74250, 1920, 2008, 2052, 2200, 0, 1080, 1084, 1094, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC | DRM_MODE_FLAG_INTERLACE), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 6 - 720(1440)x480i@60Hz 4:3 */ { DRM_MODE("720x480i", DRM_MODE_TYPE_DRIVER, 13500, 720, 739, 801, 858, 0, 480, 488, 494, 525, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_INTERLACE | DRM_MODE_FLAG_DBLCLK), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 7 - 720(1440)x480i@60Hz 16:9 */ { DRM_MODE("720x480i", DRM_MODE_TYPE_DRIVER, 13500, 720, 739, 801, 858, 0, 480, 488, 494, 525, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_INTERLACE | DRM_MODE_FLAG_DBLCLK), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 8 - 720(1440)x240@60Hz 4:3 */ { DRM_MODE("720x240", DRM_MODE_TYPE_DRIVER, 13500, 720, 739, 801, 858, 0, 240, 244, 247, 262, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_DBLCLK), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 9 - 720(1440)x240@60Hz 16:9 */ { DRM_MODE("720x240", DRM_MODE_TYPE_DRIVER, 13500, 720, 739, 801, 858, 0, 240, 244, 247, 262, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_DBLCLK), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 10 - 2880x480i@60Hz 4:3 */ { DRM_MODE("2880x480i", DRM_MODE_TYPE_DRIVER, 54000, 2880, 2956, 3204, 3432, 0, 480, 488, 494, 525, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_INTERLACE), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 11 - 2880x480i@60Hz 16:9 */ { DRM_MODE("2880x480i", DRM_MODE_TYPE_DRIVER, 54000, 2880, 2956, 3204, 3432, 0, 480, 488, 494, 525, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_INTERLACE), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 12 - 2880x240@60Hz 4:3 */ { DRM_MODE("2880x240", DRM_MODE_TYPE_DRIVER, 54000, 2880, 2956, 3204, 3432, 0, 240, 244, 247, 262, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 13 - 2880x240@60Hz 16:9 */ { DRM_MODE("2880x240", DRM_MODE_TYPE_DRIVER, 54000, 2880, 2956, 3204, 3432, 0, 240, 244, 247, 262, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 14 - 1440x480@60Hz 4:3 */ { DRM_MODE("1440x480", DRM_MODE_TYPE_DRIVER, 54000, 1440, 1472, 1596, 1716, 0, 480, 489, 495, 525, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 15 - 1440x480@60Hz 16:9 */ { DRM_MODE("1440x480", DRM_MODE_TYPE_DRIVER, 54000, 1440, 1472, 1596, 1716, 0, 480, 489, 495, 525, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 16 - 1920x1080@60Hz 16:9 */ { DRM_MODE("1920x1080", DRM_MODE_TYPE_DRIVER, 148500, 1920, 2008, 2052, 2200, 0, 1080, 1084, 1089, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 17 - 720x576@50Hz 4:3 */ { DRM_MODE("720x576", DRM_MODE_TYPE_DRIVER, 27000, 720, 732, 796, 864, 0, 576, 581, 586, 625, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 18 - 720x576@50Hz 16:9 */ { DRM_MODE("720x576", DRM_MODE_TYPE_DRIVER, 27000, 720, 732, 796, 864, 0, 576, 581, 586, 625, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 19 - 1280x720@50Hz 16:9 */ { DRM_MODE("1280x720", DRM_MODE_TYPE_DRIVER, 74250, 1280, 1720, 1760, 1980, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 20 - 1920x1080i@50Hz 16:9 */ { DRM_MODE("1920x1080i", DRM_MODE_TYPE_DRIVER, 74250, 1920, 2448, 2492, 2640, 0, 1080, 1084, 1094, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC | DRM_MODE_FLAG_INTERLACE), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 21 - 720(1440)x576i@50Hz 4:3 */ { DRM_MODE("720x576i", DRM_MODE_TYPE_DRIVER, 13500, 720, 732, 795, 864, 0, 576, 580, 586, 625, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_INTERLACE | DRM_MODE_FLAG_DBLCLK), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 22 - 720(1440)x576i@50Hz 16:9 */ { DRM_MODE("720x576i", DRM_MODE_TYPE_DRIVER, 13500, 720, 732, 795, 864, 0, 576, 580, 586, 625, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_INTERLACE | DRM_MODE_FLAG_DBLCLK), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 23 - 720(1440)x288@50Hz 4:3 */ { DRM_MODE("720x288", DRM_MODE_TYPE_DRIVER, 13500, 720, 732, 795, 864, 0, 288, 290, 293, 312, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_DBLCLK), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 24 - 720(1440)x288@50Hz 16:9 */ { DRM_MODE("720x288", DRM_MODE_TYPE_DRIVER, 13500, 720, 732, 795, 864, 0, 288, 290, 293, 312, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_DBLCLK), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 25 - 2880x576i@50Hz 4:3 */ { DRM_MODE("2880x576i", DRM_MODE_TYPE_DRIVER, 54000, 2880, 2928, 3180, 3456, 0, 576, 580, 586, 625, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_INTERLACE), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 26 - 2880x576i@50Hz 16:9 */ { DRM_MODE("2880x576i", DRM_MODE_TYPE_DRIVER, 54000, 2880, 2928, 3180, 3456, 0, 576, 580, 586, 625, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_INTERLACE), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 27 - 2880x288@50Hz 4:3 */ { DRM_MODE("2880x288", DRM_MODE_TYPE_DRIVER, 54000, 2880, 2928, 3180, 3456, 0, 288, 290, 293, 312, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 28 - 2880x288@50Hz 16:9 */ { DRM_MODE("2880x288", DRM_MODE_TYPE_DRIVER, 54000, 2880, 2928, 3180, 3456, 0, 288, 290, 293, 312, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 29 - 1440x576@50Hz 4:3 */ { DRM_MODE("1440x576", DRM_MODE_TYPE_DRIVER, 54000, 1440, 1464, 1592, 1728, 0, 576, 581, 586, 625, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 30 - 1440x576@50Hz 16:9 */ { DRM_MODE("1440x576", DRM_MODE_TYPE_DRIVER, 54000, 1440, 1464, 1592, 1728, 0, 576, 581, 586, 625, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 31 - 1920x1080@50Hz 16:9 */ { DRM_MODE("1920x1080", DRM_MODE_TYPE_DRIVER, 148500, 1920, 2448, 2492, 2640, 0, 1080, 1084, 1089, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 32 - 1920x1080@24Hz 16:9 */ { DRM_MODE("1920x1080", DRM_MODE_TYPE_DRIVER, 74250, 1920, 2558, 2602, 2750, 0, 1080, 1084, 1089, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 33 - 1920x1080@25Hz 16:9 */ { DRM_MODE("1920x1080", DRM_MODE_TYPE_DRIVER, 74250, 1920, 2448, 2492, 2640, 0, 1080, 1084, 1089, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 34 - 1920x1080@30Hz 16:9 */ { DRM_MODE("1920x1080", DRM_MODE_TYPE_DRIVER, 74250, 1920, 2008, 2052, 2200, 0, 1080, 1084, 1089, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 35 - 2880x480@60Hz 4:3 */ { DRM_MODE("2880x480", DRM_MODE_TYPE_DRIVER, 108000, 2880, 2944, 3192, 3432, 0, 480, 489, 495, 525, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 36 - 2880x480@60Hz 16:9 */ { DRM_MODE("2880x480", DRM_MODE_TYPE_DRIVER, 108000, 2880, 2944, 3192, 3432, 0, 480, 489, 495, 525, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 37 - 2880x576@50Hz 4:3 */ { DRM_MODE("2880x576", DRM_MODE_TYPE_DRIVER, 108000, 2880, 2928, 3184, 3456, 0, 576, 581, 586, 625, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 38 - 2880x576@50Hz 16:9 */ { DRM_MODE("2880x576", DRM_MODE_TYPE_DRIVER, 108000, 2880, 2928, 3184, 3456, 0, 576, 581, 586, 625, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 39 - 1920x1080i@50Hz 16:9 */ { DRM_MODE("1920x1080i", DRM_MODE_TYPE_DRIVER, 72000, 1920, 1952, 2120, 2304, 0, 1080, 1126, 1136, 1250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_INTERLACE), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 40 - 1920x1080i@100Hz 16:9 */ { DRM_MODE("1920x1080i", DRM_MODE_TYPE_DRIVER, 148500, 1920, 2448, 2492, 2640, 0, 1080, 1084, 1094, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC | DRM_MODE_FLAG_INTERLACE), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 41 - 1280x720@100Hz 16:9 */ { DRM_MODE("1280x720", DRM_MODE_TYPE_DRIVER, 148500, 1280, 1720, 1760, 1980, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 42 - 720x576@100Hz 4:3 */ { DRM_MODE("720x576", DRM_MODE_TYPE_DRIVER, 54000, 720, 732, 796, 864, 0, 576, 581, 586, 625, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 43 - 720x576@100Hz 16:9 */ { DRM_MODE("720x576", DRM_MODE_TYPE_DRIVER, 54000, 720, 732, 796, 864, 0, 576, 581, 586, 625, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 44 - 720(1440)x576i@100Hz 4:3 */ { DRM_MODE("720x576i", DRM_MODE_TYPE_DRIVER, 27000, 720, 732, 795, 864, 0, 576, 580, 586, 625, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_INTERLACE | DRM_MODE_FLAG_DBLCLK), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 45 - 720(1440)x576i@100Hz 16:9 */ { DRM_MODE("720x576i", DRM_MODE_TYPE_DRIVER, 27000, 720, 732, 795, 864, 0, 576, 580, 586, 625, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_INTERLACE | DRM_MODE_FLAG_DBLCLK), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 46 - 1920x1080i@120Hz 16:9 */ { DRM_MODE("1920x1080i", DRM_MODE_TYPE_DRIVER, 148500, 1920, 2008, 2052, 2200, 0, 1080, 1084, 1094, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC | DRM_MODE_FLAG_INTERLACE), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 47 - 1280x720@120Hz 16:9 */ { DRM_MODE("1280x720", DRM_MODE_TYPE_DRIVER, 148500, 1280, 1390, 1430, 1650, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 48 - 720x480@120Hz 4:3 */ { DRM_MODE("720x480", DRM_MODE_TYPE_DRIVER, 54000, 720, 736, 798, 858, 0, 480, 489, 495, 525, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 49 - 720x480@120Hz 16:9 */ { DRM_MODE("720x480", DRM_MODE_TYPE_DRIVER, 54000, 720, 736, 798, 858, 0, 480, 489, 495, 525, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 50 - 720(1440)x480i@120Hz 4:3 */ { DRM_MODE("720x480i", DRM_MODE_TYPE_DRIVER, 27000, 720, 739, 801, 858, 0, 480, 488, 494, 525, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_INTERLACE | DRM_MODE_FLAG_DBLCLK), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 51 - 720(1440)x480i@120Hz 16:9 */ { DRM_MODE("720x480i", DRM_MODE_TYPE_DRIVER, 27000, 720, 739, 801, 858, 0, 480, 488, 494, 525, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_INTERLACE | DRM_MODE_FLAG_DBLCLK), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 52 - 720x576@200Hz 4:3 */ { DRM_MODE("720x576", DRM_MODE_TYPE_DRIVER, 108000, 720, 732, 796, 864, 0, 576, 581, 586, 625, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 53 - 720x576@200Hz 16:9 */ { DRM_MODE("720x576", DRM_MODE_TYPE_DRIVER, 108000, 720, 732, 796, 864, 0, 576, 581, 586, 625, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 54 - 720(1440)x576i@200Hz 4:3 */ { DRM_MODE("720x576i", DRM_MODE_TYPE_DRIVER, 54000, 720, 732, 795, 864, 0, 576, 580, 586, 625, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_INTERLACE | DRM_MODE_FLAG_DBLCLK), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 55 - 720(1440)x576i@200Hz 16:9 */ { DRM_MODE("720x576i", DRM_MODE_TYPE_DRIVER, 54000, 720, 732, 795, 864, 0, 576, 580, 586, 625, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_INTERLACE | DRM_MODE_FLAG_DBLCLK), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 56 - 720x480@240Hz 4:3 */ { DRM_MODE("720x480", DRM_MODE_TYPE_DRIVER, 108000, 720, 736, 798, 858, 0, 480, 489, 495, 525, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 57 - 720x480@240Hz 16:9 */ { DRM_MODE("720x480", DRM_MODE_TYPE_DRIVER, 108000, 720, 736, 798, 858, 0, 480, 489, 495, 525, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 58 - 720(1440)x480i@240Hz 4:3 */ { DRM_MODE("720x480i", DRM_MODE_TYPE_DRIVER, 54000, 720, 739, 801, 858, 0, 480, 488, 494, 525, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_INTERLACE | DRM_MODE_FLAG_DBLCLK), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_4_3, }, /* 59 - 720(1440)x480i@240Hz 16:9 */ { DRM_MODE("720x480i", DRM_MODE_TYPE_DRIVER, 54000, 720, 739, 801, 858, 0, 480, 488, 494, 525, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_INTERLACE | DRM_MODE_FLAG_DBLCLK), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 60 - 1280x720@24Hz 16:9 */ { DRM_MODE("1280x720", DRM_MODE_TYPE_DRIVER, 59400, 1280, 3040, 3080, 3300, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 61 - 1280x720@25Hz 16:9 */ { DRM_MODE("1280x720", DRM_MODE_TYPE_DRIVER, 74250, 1280, 3700, 3740, 3960, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 62 - 1280x720@30Hz 16:9 */ { DRM_MODE("1280x720", DRM_MODE_TYPE_DRIVER, 74250, 1280, 3040, 3080, 3300, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 63 - 1920x1080@120Hz 16:9 */ { DRM_MODE("1920x1080", DRM_MODE_TYPE_DRIVER, 297000, 1920, 2008, 2052, 2200, 0, 1080, 1084, 1089, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 64 - 1920x1080@100Hz 16:9 */ { DRM_MODE("1920x1080", DRM_MODE_TYPE_DRIVER, 297000, 1920, 2448, 2492, 2640, 0, 1080, 1084, 1089, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 65 - 1280x720@24Hz 64:27 */ { DRM_MODE("1280x720", DRM_MODE_TYPE_DRIVER, 59400, 1280, 3040, 3080, 3300, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 66 - 1280x720@25Hz 64:27 */ { DRM_MODE("1280x720", DRM_MODE_TYPE_DRIVER, 74250, 1280, 3700, 3740, 3960, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 67 - 1280x720@30Hz 64:27 */ { DRM_MODE("1280x720", DRM_MODE_TYPE_DRIVER, 74250, 1280, 3040, 3080, 3300, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 68 - 1280x720@50Hz 64:27 */ { DRM_MODE("1280x720", DRM_MODE_TYPE_DRIVER, 74250, 1280, 1720, 1760, 1980, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 69 - 1280x720@60Hz 64:27 */ { DRM_MODE("1280x720", DRM_MODE_TYPE_DRIVER, 74250, 1280, 1390, 1430, 1650, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 70 - 1280x720@100Hz 64:27 */ { DRM_MODE("1280x720", DRM_MODE_TYPE_DRIVER, 148500, 1280, 1720, 1760, 1980, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 71 - 1280x720@120Hz 64:27 */ { DRM_MODE("1280x720", DRM_MODE_TYPE_DRIVER, 148500, 1280, 1390, 1430, 1650, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 72 - 1920x1080@24Hz 64:27 */ { DRM_MODE("1920x1080", DRM_MODE_TYPE_DRIVER, 74250, 1920, 2558, 2602, 2750, 0, 1080, 1084, 1089, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 73 - 1920x1080@25Hz 64:27 */ { DRM_MODE("1920x1080", DRM_MODE_TYPE_DRIVER, 74250, 1920, 2448, 2492, 2640, 0, 1080, 1084, 1089, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 74 - 1920x1080@30Hz 64:27 */ { DRM_MODE("1920x1080", DRM_MODE_TYPE_DRIVER, 74250, 1920, 2008, 2052, 2200, 0, 1080, 1084, 1089, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 75 - 1920x1080@50Hz 64:27 */ { DRM_MODE("1920x1080", DRM_MODE_TYPE_DRIVER, 148500, 1920, 2448, 2492, 2640, 0, 1080, 1084, 1089, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 76 - 1920x1080@60Hz 64:27 */ { DRM_MODE("1920x1080", DRM_MODE_TYPE_DRIVER, 148500, 1920, 2008, 2052, 2200, 0, 1080, 1084, 1089, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 77 - 1920x1080@100Hz 64:27 */ { DRM_MODE("1920x1080", DRM_MODE_TYPE_DRIVER, 297000, 1920, 2448, 2492, 2640, 0, 1080, 1084, 1089, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 78 - 1920x1080@120Hz 64:27 */ { DRM_MODE("1920x1080", DRM_MODE_TYPE_DRIVER, 297000, 1920, 2008, 2052, 2200, 0, 1080, 1084, 1089, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 79 - 1680x720@24Hz 64:27 */ { DRM_MODE("1680x720", DRM_MODE_TYPE_DRIVER, 59400, 1680, 3040, 3080, 3300, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 80 - 1680x720@25Hz 64:27 */ { DRM_MODE("1680x720", DRM_MODE_TYPE_DRIVER, 59400, 1680, 2908, 2948, 3168, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 81 - 1680x720@30Hz 64:27 */ { DRM_MODE("1680x720", DRM_MODE_TYPE_DRIVER, 59400, 1680, 2380, 2420, 2640, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 82 - 1680x720@50Hz 64:27 */ { DRM_MODE("1680x720", DRM_MODE_TYPE_DRIVER, 82500, 1680, 1940, 1980, 2200, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 83 - 1680x720@60Hz 64:27 */ { DRM_MODE("1680x720", DRM_MODE_TYPE_DRIVER, 99000, 1680, 1940, 1980, 2200, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 84 - 1680x720@100Hz 64:27 */ { DRM_MODE("1680x720", DRM_MODE_TYPE_DRIVER, 165000, 1680, 1740, 1780, 2000, 0, 720, 725, 730, 825, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 85 - 1680x720@120Hz 64:27 */ { DRM_MODE("1680x720", DRM_MODE_TYPE_DRIVER, 198000, 1680, 1740, 1780, 2000, 0, 720, 725, 730, 825, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 86 - 2560x1080@24Hz 64:27 */ { DRM_MODE("2560x1080", DRM_MODE_TYPE_DRIVER, 99000, 2560, 3558, 3602, 3750, 0, 1080, 1084, 1089, 1100, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 87 - 2560x1080@25Hz 64:27 */ { DRM_MODE("2560x1080", DRM_MODE_TYPE_DRIVER, 90000, 2560, 3008, 3052, 3200, 0, 1080, 1084, 1089, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 88 - 2560x1080@30Hz 64:27 */ { DRM_MODE("2560x1080", DRM_MODE_TYPE_DRIVER, 118800, 2560, 3328, 3372, 3520, 0, 1080, 1084, 1089, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 89 - 2560x1080@50Hz 64:27 */ { DRM_MODE("2560x1080", DRM_MODE_TYPE_DRIVER, 185625, 2560, 3108, 3152, 3300, 0, 1080, 1084, 1089, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 90 - 2560x1080@60Hz 64:27 */ { DRM_MODE("2560x1080", DRM_MODE_TYPE_DRIVER, 198000, 2560, 2808, 2852, 3000, 0, 1080, 1084, 1089, 1100, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 91 - 2560x1080@100Hz 64:27 */ { DRM_MODE("2560x1080", DRM_MODE_TYPE_DRIVER, 371250, 2560, 2778, 2822, 2970, 0, 1080, 1084, 1089, 1250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 92 - 2560x1080@120Hz 64:27 */ { DRM_MODE("2560x1080", DRM_MODE_TYPE_DRIVER, 495000, 2560, 3108, 3152, 3300, 0, 1080, 1084, 1089, 1250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 93 - 3840x2160@24Hz 16:9 */ { DRM_MODE("3840x2160", DRM_MODE_TYPE_DRIVER, 297000, 3840, 5116, 5204, 5500, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 94 - 3840x2160@25Hz 16:9 */ { DRM_MODE("3840x2160", DRM_MODE_TYPE_DRIVER, 297000, 3840, 4896, 4984, 5280, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 95 - 3840x2160@30Hz 16:9 */ { DRM_MODE("3840x2160", DRM_MODE_TYPE_DRIVER, 297000, 3840, 4016, 4104, 4400, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 96 - 3840x2160@50Hz 16:9 */ { DRM_MODE("3840x2160", DRM_MODE_TYPE_DRIVER, 594000, 3840, 4896, 4984, 5280, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 97 - 3840x2160@60Hz 16:9 */ { DRM_MODE("3840x2160", DRM_MODE_TYPE_DRIVER, 594000, 3840, 4016, 4104, 4400, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 98 - 4096x2160@24Hz 256:135 */ { DRM_MODE("4096x2160", DRM_MODE_TYPE_DRIVER, 297000, 4096, 5116, 5204, 5500, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_256_135, }, /* 99 - 4096x2160@25Hz 256:135 */ { DRM_MODE("4096x2160", DRM_MODE_TYPE_DRIVER, 297000, 4096, 5064, 5152, 5280, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_256_135, }, /* 100 - 4096x2160@30Hz 256:135 */ { DRM_MODE("4096x2160", DRM_MODE_TYPE_DRIVER, 297000, 4096, 4184, 4272, 4400, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_256_135, }, /* 101 - 4096x2160@50Hz 256:135 */ { DRM_MODE("4096x2160", DRM_MODE_TYPE_DRIVER, 594000, 4096, 5064, 5152, 5280, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_256_135, }, /* 102 - 4096x2160@60Hz 256:135 */ { DRM_MODE("4096x2160", DRM_MODE_TYPE_DRIVER, 594000, 4096, 4184, 4272, 4400, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_256_135, }, /* 103 - 3840x2160@24Hz 64:27 */ { DRM_MODE("3840x2160", DRM_MODE_TYPE_DRIVER, 297000, 3840, 5116, 5204, 5500, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 104 - 3840x2160@25Hz 64:27 */ { DRM_MODE("3840x2160", DRM_MODE_TYPE_DRIVER, 297000, 3840, 4896, 4984, 5280, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 105 - 3840x2160@30Hz 64:27 */ { DRM_MODE("3840x2160", DRM_MODE_TYPE_DRIVER, 297000, 3840, 4016, 4104, 4400, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 106 - 3840x2160@50Hz 64:27 */ { DRM_MODE("3840x2160", DRM_MODE_TYPE_DRIVER, 594000, 3840, 4896, 4984, 5280, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 107 - 3840x2160@60Hz 64:27 */ { DRM_MODE("3840x2160", DRM_MODE_TYPE_DRIVER, 594000, 3840, 4016, 4104, 4400, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 108 - 1280x720@48Hz 16:9 */ { DRM_MODE("1280x720", DRM_MODE_TYPE_DRIVER, 90000, 1280, 2240, 2280, 2500, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 109 - 1280x720@48Hz 64:27 */ { DRM_MODE("1280x720", DRM_MODE_TYPE_DRIVER, 90000, 1280, 2240, 2280, 2500, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 110 - 1680x720@48Hz 64:27 */ { DRM_MODE("1680x720", DRM_MODE_TYPE_DRIVER, 99000, 1680, 2490, 2530, 2750, 0, 720, 725, 730, 750, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 111 - 1920x1080@48Hz 16:9 */ { DRM_MODE("1920x1080", DRM_MODE_TYPE_DRIVER, 148500, 1920, 2558, 2602, 2750, 0, 1080, 1084, 1089, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 112 - 1920x1080@48Hz 64:27 */ { DRM_MODE("1920x1080", DRM_MODE_TYPE_DRIVER, 148500, 1920, 2558, 2602, 2750, 0, 1080, 1084, 1089, 1125, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 113 - 2560x1080@48Hz 64:27 */ { DRM_MODE("2560x1080", DRM_MODE_TYPE_DRIVER, 198000, 2560, 3558, 3602, 3750, 0, 1080, 1084, 1089, 1100, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 114 - 3840x2160@48Hz 16:9 */ { DRM_MODE("3840x2160", DRM_MODE_TYPE_DRIVER, 594000, 3840, 5116, 5204, 5500, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 115 - 4096x2160@48Hz 256:135 */ { DRM_MODE("4096x2160", DRM_MODE_TYPE_DRIVER, 594000, 4096, 5116, 5204, 5500, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_256_135, }, /* 116 - 3840x2160@48Hz 64:27 */ { DRM_MODE("3840x2160", DRM_MODE_TYPE_DRIVER, 594000, 3840, 5116, 5204, 5500, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 117 - 3840x2160@100Hz 16:9 */ { DRM_MODE("3840x2160", DRM_MODE_TYPE_DRIVER, 1188000, 3840, 4896, 4984, 5280, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 118 - 3840x2160@120Hz 16:9 */ { DRM_MODE("3840x2160", DRM_MODE_TYPE_DRIVER, 1188000, 3840, 4016, 4104, 4400, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 119 - 3840x2160@100Hz 64:27 */ { DRM_MODE("3840x2160", DRM_MODE_TYPE_DRIVER, 1188000, 3840, 4896, 4984, 5280, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 120 - 3840x2160@120Hz 64:27 */ { DRM_MODE("3840x2160", DRM_MODE_TYPE_DRIVER, 1188000, 3840, 4016, 4104, 4400, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 121 - 5120x2160@24Hz 64:27 */ { DRM_MODE("5120x2160", DRM_MODE_TYPE_DRIVER, 396000, 5120, 7116, 7204, 7500, 0, 2160, 2168, 2178, 2200, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 122 - 5120x2160@25Hz 64:27 */ { DRM_MODE("5120x2160", DRM_MODE_TYPE_DRIVER, 396000, 5120, 6816, 6904, 7200, 0, 2160, 2168, 2178, 2200, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 123 - 5120x2160@30Hz 64:27 */ { DRM_MODE("5120x2160", DRM_MODE_TYPE_DRIVER, 396000, 5120, 5784, 5872, 6000, 0, 2160, 2168, 2178, 2200, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 124 - 5120x2160@48Hz 64:27 */ { DRM_MODE("5120x2160", DRM_MODE_TYPE_DRIVER, 742500, 5120, 5866, 5954, 6250, 0, 2160, 2168, 2178, 2475, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 125 - 5120x2160@50Hz 64:27 */ { DRM_MODE("5120x2160", DRM_MODE_TYPE_DRIVER, 742500, 5120, 6216, 6304, 6600, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 126 - 5120x2160@60Hz 64:27 */ { DRM_MODE("5120x2160", DRM_MODE_TYPE_DRIVER, 742500, 5120, 5284, 5372, 5500, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 127 - 5120x2160@100Hz 64:27 */ { DRM_MODE("5120x2160", DRM_MODE_TYPE_DRIVER, 1485000, 5120, 6216, 6304, 6600, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, }; /* * From CEA/CTA-861 spec. * * Do not access directly, instead always use cea_mode_for_vic(). */ static const struct drm_display_mode edid_cea_modes_193[] = { /* 193 - 5120x2160@120Hz 64:27 */ { DRM_MODE("5120x2160", DRM_MODE_TYPE_DRIVER, 1485000, 5120, 5284, 5372, 5500, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 194 - 7680x4320@24Hz 16:9 */ { DRM_MODE("7680x4320", DRM_MODE_TYPE_DRIVER, 1188000, 7680, 10232, 10408, 11000, 0, 4320, 4336, 4356, 4500, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 195 - 7680x4320@25Hz 16:9 */ { DRM_MODE("7680x4320", DRM_MODE_TYPE_DRIVER, 1188000, 7680, 10032, 10208, 10800, 0, 4320, 4336, 4356, 4400, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 196 - 7680x4320@30Hz 16:9 */ { DRM_MODE("7680x4320", DRM_MODE_TYPE_DRIVER, 1188000, 7680, 8232, 8408, 9000, 0, 4320, 4336, 4356, 4400, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 197 - 7680x4320@48Hz 16:9 */ { DRM_MODE("7680x4320", DRM_MODE_TYPE_DRIVER, 2376000, 7680, 10232, 10408, 11000, 0, 4320, 4336, 4356, 4500, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 198 - 7680x4320@50Hz 16:9 */ { DRM_MODE("7680x4320", DRM_MODE_TYPE_DRIVER, 2376000, 7680, 10032, 10208, 10800, 0, 4320, 4336, 4356, 4400, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 199 - 7680x4320@60Hz 16:9 */ { DRM_MODE("7680x4320", DRM_MODE_TYPE_DRIVER, 2376000, 7680, 8232, 8408, 9000, 0, 4320, 4336, 4356, 4400, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 200 - 7680x4320@100Hz 16:9 */ { DRM_MODE("7680x4320", DRM_MODE_TYPE_DRIVER, 4752000, 7680, 9792, 9968, 10560, 0, 4320, 4336, 4356, 4500, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 201 - 7680x4320@120Hz 16:9 */ { DRM_MODE("7680x4320", DRM_MODE_TYPE_DRIVER, 4752000, 7680, 8032, 8208, 8800, 0, 4320, 4336, 4356, 4500, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 202 - 7680x4320@24Hz 64:27 */ { DRM_MODE("7680x4320", DRM_MODE_TYPE_DRIVER, 1188000, 7680, 10232, 10408, 11000, 0, 4320, 4336, 4356, 4500, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 203 - 7680x4320@25Hz 64:27 */ { DRM_MODE("7680x4320", DRM_MODE_TYPE_DRIVER, 1188000, 7680, 10032, 10208, 10800, 0, 4320, 4336, 4356, 4400, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 204 - 7680x4320@30Hz 64:27 */ { DRM_MODE("7680x4320", DRM_MODE_TYPE_DRIVER, 1188000, 7680, 8232, 8408, 9000, 0, 4320, 4336, 4356, 4400, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 205 - 7680x4320@48Hz 64:27 */ { DRM_MODE("7680x4320", DRM_MODE_TYPE_DRIVER, 2376000, 7680, 10232, 10408, 11000, 0, 4320, 4336, 4356, 4500, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 206 - 7680x4320@50Hz 64:27 */ { DRM_MODE("7680x4320", DRM_MODE_TYPE_DRIVER, 2376000, 7680, 10032, 10208, 10800, 0, 4320, 4336, 4356, 4400, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 207 - 7680x4320@60Hz 64:27 */ { DRM_MODE("7680x4320", DRM_MODE_TYPE_DRIVER, 2376000, 7680, 8232, 8408, 9000, 0, 4320, 4336, 4356, 4400, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 208 - 7680x4320@100Hz 64:27 */ { DRM_MODE("7680x4320", DRM_MODE_TYPE_DRIVER, 4752000, 7680, 9792, 9968, 10560, 0, 4320, 4336, 4356, 4500, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 209 - 7680x4320@120Hz 64:27 */ { DRM_MODE("7680x4320", DRM_MODE_TYPE_DRIVER, 4752000, 7680, 8032, 8208, 8800, 0, 4320, 4336, 4356, 4500, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 210 - 10240x4320@24Hz 64:27 */ { DRM_MODE("10240x4320", DRM_MODE_TYPE_DRIVER, 1485000, 10240, 11732, 11908, 12500, 0, 4320, 4336, 4356, 4950, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 211 - 10240x4320@25Hz 64:27 */ { DRM_MODE("10240x4320", DRM_MODE_TYPE_DRIVER, 1485000, 10240, 12732, 12908, 13500, 0, 4320, 4336, 4356, 4400, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 212 - 10240x4320@30Hz 64:27 */ { DRM_MODE("10240x4320", DRM_MODE_TYPE_DRIVER, 1485000, 10240, 10528, 10704, 11000, 0, 4320, 4336, 4356, 4500, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 213 - 10240x4320@48Hz 64:27 */ { DRM_MODE("10240x4320", DRM_MODE_TYPE_DRIVER, 2970000, 10240, 11732, 11908, 12500, 0, 4320, 4336, 4356, 4950, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 214 - 10240x4320@50Hz 64:27 */ { DRM_MODE("10240x4320", DRM_MODE_TYPE_DRIVER, 2970000, 10240, 12732, 12908, 13500, 0, 4320, 4336, 4356, 4400, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 215 - 10240x4320@60Hz 64:27 */ { DRM_MODE("10240x4320", DRM_MODE_TYPE_DRIVER, 2970000, 10240, 10528, 10704, 11000, 0, 4320, 4336, 4356, 4500, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 216 - 10240x4320@100Hz 64:27 */ { DRM_MODE("10240x4320", DRM_MODE_TYPE_DRIVER, 5940000, 10240, 12432, 12608, 13200, 0, 4320, 4336, 4356, 4500, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 217 - 10240x4320@120Hz 64:27 */ { DRM_MODE("10240x4320", DRM_MODE_TYPE_DRIVER, 5940000, 10240, 10528, 10704, 11000, 0, 4320, 4336, 4356, 4500, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_64_27, }, /* 218 - 4096x2160@100Hz 256:135 */ { DRM_MODE("4096x2160", DRM_MODE_TYPE_DRIVER, 1188000, 4096, 4896, 4984, 5280, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_256_135, }, /* 219 - 4096x2160@120Hz 256:135 */ { DRM_MODE("4096x2160", DRM_MODE_TYPE_DRIVER, 1188000, 4096, 4184, 4272, 4400, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_256_135, }, }; /* * HDMI 1.4 4k modes. Index using the VIC. */ static const struct drm_display_mode edid_4k_modes[] = { /* 0 - dummy, VICs start at 1 */ { }, /* 1 - 3840x2160@30Hz */ { DRM_MODE("3840x2160", DRM_MODE_TYPE_DRIVER, 297000, 3840, 4016, 4104, 4400, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 2 - 3840x2160@25Hz */ { DRM_MODE("3840x2160", DRM_MODE_TYPE_DRIVER, 297000, 3840, 4896, 4984, 5280, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 3 - 3840x2160@24Hz */ { DRM_MODE("3840x2160", DRM_MODE_TYPE_DRIVER, 297000, 3840, 5116, 5204, 5500, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_16_9, }, /* 4 - 4096x2160@24Hz (SMPTE) */ { DRM_MODE("4096x2160", DRM_MODE_TYPE_DRIVER, 297000, 4096, 5116, 5204, 5500, 0, 2160, 2168, 2178, 2250, 0, DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC), .picture_aspect_ratio = HDMI_PICTURE_ASPECT_256_135, }, }; /*** DDC fetch and block validation ***/ /* * The opaque EDID type, internal to drm_edid.c. */ struct drm_edid { /* Size allocated for edid */ size_t size; const struct edid *edid; }; static int edid_hfeeodb_extension_block_count(const struct edid *edid); static int edid_hfeeodb_block_count(const struct edid *edid) { int eeodb = edid_hfeeodb_extension_block_count(edid); return eeodb ? eeodb + 1 : 0; } static int edid_extension_block_count(const struct edid *edid) { return edid->extensions; } static int edid_block_count(const struct edid *edid) { return edid_extension_block_count(edid) + 1; } static int edid_size_by_blocks(int num_blocks) { return num_blocks * EDID_LENGTH; } static int edid_size(const struct edid *edid) { return edid_size_by_blocks(edid_block_count(edid)); } static const void *edid_block_data(const struct edid *edid, int index) { BUILD_BUG_ON(sizeof(*edid) != EDID_LENGTH); return edid + index; } static const void *edid_extension_block_data(const struct edid *edid, int index) { return edid_block_data(edid, index + 1); } /* EDID block count indicated in EDID, may exceed allocated size */ static int __drm_edid_block_count(const struct drm_edid *drm_edid) { int num_blocks; /* Starting point */ num_blocks = edid_block_count(drm_edid->edid); /* HF-EEODB override */ if (drm_edid->size >= edid_size_by_blocks(2)) { int eeodb; /* * Note: HF-EEODB may specify a smaller extension count than the * regular one. Unlike in buffer allocation, here we can use it. */ eeodb = edid_hfeeodb_block_count(drm_edid->edid); if (eeodb) num_blocks = eeodb; } return num_blocks; } /* EDID block count, limited by allocated size */ static int drm_edid_block_count(const struct drm_edid *drm_edid) { /* Limit by allocated size */ return min(__drm_edid_block_count(drm_edid), (int)drm_edid->size / EDID_LENGTH); } /* EDID extension block count, limited by allocated size */ static int drm_edid_extension_block_count(const struct drm_edid *drm_edid) { return drm_edid_block_count(drm_edid) - 1; } static const void *drm_edid_block_data(const struct drm_edid *drm_edid, int index) { return edid_block_data(drm_edid->edid, index); } static const void *drm_edid_extension_block_data(const struct drm_edid *drm_edid, int index) { return edid_extension_block_data(drm_edid->edid, index); } /* * Initializer helper for legacy interfaces, where we have no choice but to * trust edid size. Not for general purpose use. */ static const struct drm_edid *drm_edid_legacy_init(struct drm_edid *drm_edid, const struct edid *edid) { if (!edid) return NULL; memset(drm_edid, 0, sizeof(*drm_edid)); drm_edid->edid = edid; drm_edid->size = edid_size(edid); return drm_edid; } /* * EDID base and extension block iterator. * * struct drm_edid_iter iter; * const u8 *block; * * drm_edid_iter_begin(drm_edid, &iter); * drm_edid_iter_for_each(block, &iter) { * // do stuff with block * } * drm_edid_iter_end(&iter); */ struct drm_edid_iter { const struct drm_edid *drm_edid; /* Current block index. */ int index; }; static void drm_edid_iter_begin(const struct drm_edid *drm_edid, struct drm_edid_iter *iter) { memset(iter, 0, sizeof(*iter)); iter->drm_edid = drm_edid; } static const void *__drm_edid_iter_next(struct drm_edid_iter *iter) { const void *block = NULL; if (!iter->drm_edid) return NULL; if (iter->index < drm_edid_block_count(iter->drm_edid)) block = drm_edid_block_data(iter->drm_edid, iter->index++); return block; } #define drm_edid_iter_for_each(__block, __iter) \ while (((__block) = __drm_edid_iter_next(__iter))) static void drm_edid_iter_end(struct drm_edid_iter *iter) { memset(iter, 0, sizeof(*iter)); } static const u8 edid_header[] = { 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 }; static void edid_header_fix(void *edid) { memcpy(edid, edid_header, sizeof(edid_header)); } /** * drm_edid_header_is_valid - sanity check the header of the base EDID block * @_edid: pointer to raw base EDID block * * Sanity check the header of the base EDID block. * * Return: 8 if the header is perfect, down to 0 if it's totally wrong. */ int drm_edid_header_is_valid(const void *_edid) { const struct edid *edid = _edid; int i, score = 0; for (i = 0; i < sizeof(edid_header); i++) { if (edid->header[i] == edid_header[i]) score++; } return score; } EXPORT_SYMBOL(drm_edid_header_is_valid); static int edid_fixup __read_mostly = 6; module_param_named(edid_fixup, edid_fixup, int, 0400); MODULE_PARM_DESC(edid_fixup, "Minimum number of valid EDID header bytes (0-8, default 6)"); static int edid_block_compute_checksum(const void *_block) { const u8 *block = _block; int i; u8 csum = 0, crc = 0; for (i = 0; i < EDID_LENGTH - 1; i++) csum += block[i]; crc = 0x100 - csum; return crc; } static int edid_block_get_checksum(const void *_block) { const struct edid *block = _block; return block->checksum; } static int edid_block_tag(const void *_block) { const u8 *block = _block; return block[0]; } static bool edid_block_is_zero(const void *edid) { return !memchr_inv(edid, 0, EDID_LENGTH); } static bool drm_edid_eq(const struct drm_edid *drm_edid, const void *raw_edid, size_t raw_edid_size) { bool edid1_present = drm_edid && drm_edid->edid && drm_edid->size; bool edid2_present = raw_edid && raw_edid_size; if (edid1_present != edid2_present) return false; if (edid1_present) { if (drm_edid->size != raw_edid_size) return false; if (memcmp(drm_edid->edid, raw_edid, drm_edid->size)) return false; } return true; } enum edid_block_status { EDID_BLOCK_OK = 0, EDID_BLOCK_READ_FAIL, EDID_BLOCK_NULL, EDID_BLOCK_ZERO, EDID_BLOCK_HEADER_CORRUPT, EDID_BLOCK_HEADER_REPAIR, EDID_BLOCK_HEADER_FIXED, EDID_BLOCK_CHECKSUM, EDID_BLOCK_VERSION, }; static enum edid_block_status edid_block_check(const void *_block, bool is_base_block) { const struct edid *block = _block; if (!block) return EDID_BLOCK_NULL; if (is_base_block) { int score = drm_edid_header_is_valid(block); if (score < clamp(edid_fixup, 0, 8)) { if (edid_block_is_zero(block)) return EDID_BLOCK_ZERO; else return EDID_BLOCK_HEADER_CORRUPT; } if (score < 8) return EDID_BLOCK_HEADER_REPAIR; } if (edid_block_compute_checksum(block) != edid_block_get_checksum(block)) { if (edid_block_is_zero(block)) return EDID_BLOCK_ZERO; else return EDID_BLOCK_CHECKSUM; } if (is_base_block) { if (block->version != 1) return EDID_BLOCK_VERSION; } return EDID_BLOCK_OK; } static bool edid_block_status_valid(enum edid_block_status status, int tag) { return status == EDID_BLOCK_OK || status == EDID_BLOCK_HEADER_FIXED || (status == EDID_BLOCK_CHECKSUM && tag == CEA_EXT); } static bool edid_block_valid(const void *block, bool base) { return edid_block_status_valid(edid_block_check(block, base), edid_block_tag(block)); } static void edid_block_status_print(enum edid_block_status status, const struct edid *block, int block_num) { switch (status) { case EDID_BLOCK_OK: break; case EDID_BLOCK_READ_FAIL: pr_debug("EDID block %d read failed\n", block_num); break; case EDID_BLOCK_NULL: pr_debug("EDID block %d pointer is NULL\n", block_num); break; case EDID_BLOCK_ZERO: pr_notice("EDID block %d is all zeroes\n", block_num); break; case EDID_BLOCK_HEADER_CORRUPT: pr_notice("EDID has corrupt header\n"); break; case EDID_BLOCK_HEADER_REPAIR: pr_debug("EDID corrupt header needs repair\n"); break; case EDID_BLOCK_HEADER_FIXED: pr_debug("EDID corrupt header fixed\n"); break; case EDID_BLOCK_CHECKSUM: if (edid_block_status_valid(status, edid_block_tag(block))) { pr_debug("EDID block %d (tag 0x%02x) checksum is invalid, remainder is %d, ignoring\n", block_num, edid_block_tag(block), edid_block_compute_checksum(block)); } else { pr_notice("EDID block %d (tag 0x%02x) checksum is invalid, remainder is %d\n", block_num, edid_block_tag(block), edid_block_compute_checksum(block)); } break; case EDID_BLOCK_VERSION: pr_notice("EDID has major version %d, instead of 1\n", block->version); break; default: WARN(1, "EDID block %d unknown edid block status code %d\n", block_num, status); break; } } static void edid_block_dump(const char *level, const void *block, int block_num) { enum edid_block_status status; char prefix[20]; status = edid_block_check(block, block_num == 0); if (status == EDID_BLOCK_ZERO) sprintf(prefix, "\t[%02x] ZERO ", block_num); else if (!edid_block_status_valid(status, edid_block_tag(block))) sprintf(prefix, "\t[%02x] BAD ", block_num); else sprintf(prefix, "\t[%02x] GOOD ", block_num); print_hex_dump(level, prefix, DUMP_PREFIX_NONE, 16, 1, block, EDID_LENGTH, false); } /** * drm_edid_block_valid - Sanity check the EDID block (base or extension) * @_block: pointer to raw EDID block * @block_num: type of block to validate (0 for base, extension otherwise) * @print_bad_edid: if true, dump bad EDID blocks to the console * @edid_corrupt: if true, the header or checksum is invalid * * Validate a base or extension EDID block and optionally dump bad blocks to * the console. * * Return: True if the block is valid, false otherwise. */ bool drm_edid_block_valid(u8 *_block, int block_num, bool print_bad_edid, bool *edid_corrupt) { struct edid *block = (struct edid *)_block; enum edid_block_status status; bool is_base_block = block_num == 0; bool valid; if (WARN_ON(!block)) return false; status = edid_block_check(block, is_base_block); if (status == EDID_BLOCK_HEADER_REPAIR) { DRM_DEBUG_KMS("Fixing EDID header, your hardware may be failing\n"); edid_header_fix(block); /* Retry with fixed header, update status if that worked. */ status = edid_block_check(block, is_base_block); if (status == EDID_BLOCK_OK) status = EDID_BLOCK_HEADER_FIXED; } if (edid_corrupt) { /* * Unknown major version isn't corrupt but we can't use it. Only * the base block can reset edid_corrupt to false. */ if (is_base_block && (status == EDID_BLOCK_OK || status == EDID_BLOCK_VERSION)) *edid_corrupt = false; else if (status != EDID_BLOCK_OK) *edid_corrupt = true; } edid_block_status_print(status, block, block_num); /* Determine whether we can use this block with this status. */ valid = edid_block_status_valid(status, edid_block_tag(block)); if (!valid && print_bad_edid && status != EDID_BLOCK_ZERO) { pr_notice("Raw EDID:\n"); edid_block_dump(KERN_NOTICE, block, block_num); } return valid; } EXPORT_SYMBOL(drm_edid_block_valid); /** * drm_edid_is_valid - sanity check EDID data * @edid: EDID data * * Sanity-check an entire EDID record (including extensions) * * Return: True if the EDID data is valid, false otherwise. */ bool drm_edid_is_valid(struct edid *edid) { int i; if (!edid) return false; for (i = 0; i < edid_block_count(edid); i++) { void *block = (void *)edid_block_data(edid, i); if (!drm_edid_block_valid(block, i, true, NULL)) return false; } return true; } EXPORT_SYMBOL(drm_edid_is_valid); /** * drm_edid_valid - sanity check EDID data * @drm_edid: EDID data * * Sanity check an EDID. Cross check block count against allocated size and * checksum the blocks. * * Return: True if the EDID data is valid, false otherwise. */ bool drm_edid_valid(const struct drm_edid *drm_edid) { int i; if (!drm_edid) return false; if (edid_size_by_blocks(__drm_edid_block_count(drm_edid)) != drm_edid->size) return false; for (i = 0; i < drm_edid_block_count(drm_edid); i++) { const void *block = drm_edid_block_data(drm_edid, i); if (!edid_block_valid(block, i == 0)) return false; } return true; } EXPORT_SYMBOL(drm_edid_valid); static struct edid *edid_filter_invalid_blocks(struct edid *edid, size_t *alloc_size) { struct edid *new; int i, valid_blocks = 0; /* * Note: If the EDID uses HF-EEODB, but has invalid blocks, we'll revert * back to regular extension count here. We don't want to start * modifying the HF-EEODB extension too. */ for (i = 0; i < edid_block_count(edid); i++) { const void *src_block = edid_block_data(edid, i); if (edid_block_valid(src_block, i == 0)) { void *dst_block = (void *)edid_block_data(edid, valid_blocks); memmove(dst_block, src_block, EDID_LENGTH); valid_blocks++; } } /* We already trusted the base block to be valid here... */ if (WARN_ON(!valid_blocks)) { kfree(edid); return NULL; } edid->extensions = valid_blocks - 1; edid->checksum = edid_block_compute_checksum(edid); *alloc_size = edid_size_by_blocks(valid_blocks); new = krealloc(edid, *alloc_size, GFP_KERNEL); if (!new) kfree(edid); return new; } #define DDC_SEGMENT_ADDR 0x30 /** * drm_do_probe_ddc_edid() - get EDID information via I2C * @data: I2C device adapter * @buf: EDID data buffer to be filled * @block: 128 byte EDID block to start fetching from * @len: EDID data buffer length to fetch * * Try to fetch EDID information by calling I2C driver functions. * * Return: 0 on success or -1 on failure. */ static int drm_do_probe_ddc_edid(void *data, u8 *buf, unsigned int block, size_t len) { struct i2c_adapter *adapter = data; unsigned char start = block * EDID_LENGTH; unsigned char segment = block >> 1; unsigned char xfers = segment ? 3 : 2; int ret, retries = 5; /* * The core I2C driver will automatically retry the transfer if the * adapter reports EAGAIN. However, we find that bit-banging transfers * are susceptible to errors under a heavily loaded machine and * generate spurious NAKs and timeouts. Retrying the transfer * of the individual block a few times seems to overcome this. */ do { struct i2c_msg msgs[] = { { .addr = DDC_SEGMENT_ADDR, .flags = 0, .len = 1, .buf = &segment, }, { .addr = DDC_ADDR, .flags = 0, .len = 1, .buf = &start, }, { .addr = DDC_ADDR, .flags = I2C_M_RD, .len = len, .buf = buf, } }; /* * Avoid sending the segment addr to not upset non-compliant * DDC monitors. */ ret = i2c_transfer(adapter, &msgs[3 - xfers], xfers); if (ret == -ENXIO) { DRM_DEBUG_KMS("drm: skipping non-existent adapter %s\n", adapter->name); break; } } while (ret != xfers && --retries); return ret == xfers ? 0 : -1; } static void connector_bad_edid(struct drm_connector *connector, const struct edid *edid, int num_blocks) { int i; u8 last_block; /* * 0x7e in the EDID is the number of extension blocks. The EDID * is 1 (base block) + num_ext_blocks big. That means we can think * of 0x7e in the EDID of the _index_ of the last block in the * combined chunk of memory. */ last_block = edid->extensions; /* Calculate real checksum for the last edid extension block data */ if (last_block < num_blocks) connector->real_edid_checksum = edid_block_compute_checksum(edid + last_block); if (connector->bad_edid_counter++ && !drm_debug_enabled(DRM_UT_KMS)) return; drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] EDID is invalid:\n", connector->base.id, connector->name); for (i = 0; i < num_blocks; i++) edid_block_dump(KERN_DEBUG, edid + i, i); } /* Get override or firmware EDID */ static const struct drm_edid *drm_edid_override_get(struct drm_connector *connector) { const struct drm_edid *override = NULL; mutex_lock(&connector->edid_override_mutex); if (connector->edid_override) override = drm_edid_dup(connector->edid_override); mutex_unlock(&connector->edid_override_mutex); if (!override) override = drm_edid_load_firmware(connector); return IS_ERR(override) ? NULL : override; } /* For debugfs edid_override implementation */ int drm_edid_override_show(struct drm_connector *connector, struct seq_file *m) { const struct drm_edid *drm_edid; mutex_lock(&connector->edid_override_mutex); drm_edid = connector->edid_override; if (drm_edid) seq_write(m, drm_edid->edid, drm_edid->size); mutex_unlock(&connector->edid_override_mutex); return 0; } /* For debugfs edid_override implementation */ int drm_edid_override_set(struct drm_connector *connector, const void *edid, size_t size) { const struct drm_edid *drm_edid; drm_edid = drm_edid_alloc(edid, size); if (!drm_edid_valid(drm_edid)) { drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] EDID override invalid\n", connector->base.id, connector->name); drm_edid_free(drm_edid); return -EINVAL; } drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] EDID override set\n", connector->base.id, connector->name); mutex_lock(&connector->edid_override_mutex); drm_edid_free(connector->edid_override); connector->edid_override = drm_edid; mutex_unlock(&connector->edid_override_mutex); return 0; } /* For debugfs edid_override implementation */ int drm_edid_override_reset(struct drm_connector *connector) { drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] EDID override reset\n", connector->base.id, connector->name); mutex_lock(&connector->edid_override_mutex); drm_edid_free(connector->edid_override); connector->edid_override = NULL; mutex_unlock(&connector->edid_override_mutex); return 0; } /** * drm_edid_override_connector_update - add modes from override/firmware EDID * @connector: connector we're probing * * Add modes from the override/firmware EDID, if available. Only to be used from * drm_helper_probe_single_connector_modes() as a fallback for when DDC probe * failed during drm_get_edid() and caused the override/firmware EDID to be * skipped. * * Return: The number of modes added or 0 if we couldn't find any. */ int drm_edid_override_connector_update(struct drm_connector *connector) { const struct drm_edid *override; int num_modes = 0; override = drm_edid_override_get(connector); if (override) { if (drm_edid_connector_update(connector, override) == 0) num_modes = drm_edid_connector_add_modes(connector); drm_edid_free(override); drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] adding %d modes via fallback override/firmware EDID\n", connector->base.id, connector->name, num_modes); } return num_modes; } EXPORT_SYMBOL(drm_edid_override_connector_update); typedef int read_block_fn(void *context, u8 *buf, unsigned int block, size_t len); static enum edid_block_status edid_block_read(void *block, unsigned int block_num, read_block_fn read_block, void *context) { enum edid_block_status status; bool is_base_block = block_num == 0; int try; for (try = 0; try < 4; try++) { if (read_block(context, block, block_num, EDID_LENGTH)) return EDID_BLOCK_READ_FAIL; status = edid_block_check(block, is_base_block); if (status == EDID_BLOCK_HEADER_REPAIR) { edid_header_fix(block); /* Retry with fixed header, update status if that worked. */ status = edid_block_check(block, is_base_block); if (status == EDID_BLOCK_OK) status = EDID_BLOCK_HEADER_FIXED; } if (edid_block_status_valid(status, edid_block_tag(block))) break; /* Fail early for unrepairable base block all zeros. */ if (try == 0 && is_base_block && status == EDID_BLOCK_ZERO) break; } return status; } static struct edid *_drm_do_get_edid(struct drm_connector *connector, read_block_fn read_block, void *context, size_t *size) { enum edid_block_status status; int i, num_blocks, invalid_blocks = 0; const struct drm_edid *override; struct edid *edid, *new; size_t alloc_size = EDID_LENGTH; override = drm_edid_override_get(connector); if (override) { alloc_size = override->size; edid = kmemdup(override->edid, alloc_size, GFP_KERNEL); drm_edid_free(override); if (!edid) return NULL; goto ok; } edid = kmalloc(alloc_size, GFP_KERNEL); if (!edid) return NULL; status = edid_block_read(edid, 0, read_block, context); edid_block_status_print(status, edid, 0); if (status == EDID_BLOCK_READ_FAIL) goto fail; /* FIXME: Clarify what a corrupt EDID actually means. */ if (status == EDID_BLOCK_OK || status == EDID_BLOCK_VERSION) connector->edid_corrupt = false; else connector->edid_corrupt = true; if (!edid_block_status_valid(status, edid_block_tag(edid))) { if (status == EDID_BLOCK_ZERO) connector->null_edid_counter++; connector_bad_edid(connector, edid, 1); goto fail; } if (!edid_extension_block_count(edid)) goto ok; alloc_size = edid_size(edid); new = krealloc(edid, alloc_size, GFP_KERNEL); if (!new) goto fail; edid = new; num_blocks = edid_block_count(edid); for (i = 1; i < num_blocks; i++) { void *block = (void *)edid_block_data(edid, i); status = edid_block_read(block, i, read_block, context); edid_block_status_print(status, block, i); if (!edid_block_status_valid(status, edid_block_tag(block))) { if (status == EDID_BLOCK_READ_FAIL) goto fail; invalid_blocks++; } else if (i == 1) { /* * If the first EDID extension is a CTA extension, and * the first Data Block is HF-EEODB, override the * extension block count. * * Note: HF-EEODB could specify a smaller extension * count too, but we can't risk allocating a smaller * amount. */ int eeodb = edid_hfeeodb_block_count(edid); if (eeodb > num_blocks) { num_blocks = eeodb; alloc_size = edid_size_by_blocks(num_blocks); new = krealloc(edid, alloc_size, GFP_KERNEL); if (!new) goto fail; edid = new; } } } if (invalid_blocks) { connector_bad_edid(connector, edid, num_blocks); edid = edid_filter_invalid_blocks(edid, &alloc_size); } ok: if (size) *size = alloc_size; return edid; fail: kfree(edid); return NULL; } /** * drm_do_get_edid - get EDID data using a custom EDID block read function * @connector: connector we're probing * @read_block: EDID block read function * @context: private data passed to the block read function * * When the I2C adapter connected to the DDC bus is hidden behind a device that * exposes a different interface to read EDID blocks this function can be used * to get EDID data using a custom block read function. * * As in the general case the DDC bus is accessible by the kernel at the I2C * level, drivers must make all reasonable efforts to expose it as an I2C * adapter and use drm_get_edid() instead of abusing this function. * * The EDID may be overridden using debugfs override_edid or firmware EDID * (drm_edid_load_firmware() and drm.edid_firmware parameter), in this priority * order. Having either of them bypasses actual EDID reads. * * Return: Pointer to valid EDID or NULL if we couldn't find any. */ struct edid *drm_do_get_edid(struct drm_connector *connector, read_block_fn read_block, void *context) { return _drm_do_get_edid(connector, read_block, context, NULL); } EXPORT_SYMBOL_GPL(drm_do_get_edid); /** * drm_edid_raw - Get a pointer to the raw EDID data. * @drm_edid: drm_edid container * * Get a pointer to the raw EDID data. * * This is for transition only. Avoid using this like the plague. * * Return: Pointer to raw EDID data. */ const struct edid *drm_edid_raw(const struct drm_edid *drm_edid) { if (!drm_edid || !drm_edid->size) return NULL; /* * Do not return pointers where relying on EDID extension count would * lead to buffer overflow. */ if (WARN_ON(edid_size(drm_edid->edid) > drm_edid->size)) return NULL; return drm_edid->edid; } EXPORT_SYMBOL(drm_edid_raw); /* Allocate struct drm_edid container *without* duplicating the edid data */ static const struct drm_edid *_drm_edid_alloc(const void *edid, size_t size) { struct drm_edid *drm_edid; if (!edid || !size || size < EDID_LENGTH) return NULL; drm_edid = kzalloc(sizeof(*drm_edid), GFP_KERNEL); if (drm_edid) { drm_edid->edid = edid; drm_edid->size = size; } return drm_edid; } /** * drm_edid_alloc - Allocate a new drm_edid container * @edid: Pointer to raw EDID data * @size: Size of memory allocated for EDID * * Allocate a new drm_edid container. Do not calculate edid size from edid, pass * the actual size that has been allocated for the data. There is no validation * of the raw EDID data against the size, but at least the EDID base block must * fit in the buffer. * * The returned pointer must be freed using drm_edid_free(). * * Return: drm_edid container, or NULL on errors */ const struct drm_edid *drm_edid_alloc(const void *edid, size_t size) { const struct drm_edid *drm_edid; if (!edid || !size || size < EDID_LENGTH) return NULL; edid = kmemdup(edid, size, GFP_KERNEL); if (!edid) return NULL; drm_edid = _drm_edid_alloc(edid, size); if (!drm_edid) kfree(edid); return drm_edid; } EXPORT_SYMBOL(drm_edid_alloc); /** * drm_edid_dup - Duplicate a drm_edid container * @drm_edid: EDID to duplicate * * The returned pointer must be freed using drm_edid_free(). * * Returns: drm_edid container copy, or NULL on errors */ const struct drm_edid *drm_edid_dup(const struct drm_edid *drm_edid) { if (!drm_edid) return NULL; return drm_edid_alloc(drm_edid->edid, drm_edid->size); } EXPORT_SYMBOL(drm_edid_dup); /** * drm_edid_free - Free the drm_edid container * @drm_edid: EDID to free */ void drm_edid_free(const struct drm_edid *drm_edid) { if (!drm_edid) return; kfree(drm_edid->edid); kfree(drm_edid); } EXPORT_SYMBOL(drm_edid_free); /** * drm_probe_ddc() - probe DDC presence * @adapter: I2C adapter to probe * * Return: True on success, false on failure. */ bool drm_probe_ddc(struct i2c_adapter *adapter) { unsigned char out; return (drm_do_probe_ddc_edid(adapter, &out, 0, 1) == 0); } EXPORT_SYMBOL(drm_probe_ddc); /** * drm_get_edid - get EDID data, if available * @connector: connector we're probing * @adapter: I2C adapter to use for DDC * * Poke the given I2C channel to grab EDID data if possible. If found, * attach it to the connector. * * Return: Pointer to valid EDID or NULL if we couldn't find any. */ struct edid *drm_get_edid(struct drm_connector *connector, struct i2c_adapter *adapter) { struct edid *edid; if (connector->force == DRM_FORCE_OFF) return NULL; if (connector->force == DRM_FORCE_UNSPECIFIED && !drm_probe_ddc(adapter)) return NULL; edid = _drm_do_get_edid(connector, drm_do_probe_ddc_edid, adapter, NULL); drm_connector_update_edid_property(connector, edid); return edid; } EXPORT_SYMBOL(drm_get_edid); /** * drm_edid_read_custom - Read EDID data using given EDID block read function * @connector: Connector to use * @read_block: EDID block read function * @context: Private data passed to the block read function * * When the I2C adapter connected to the DDC bus is hidden behind a device that * exposes a different interface to read EDID blocks this function can be used * to get EDID data using a custom block read function. * * As in the general case the DDC bus is accessible by the kernel at the I2C * level, drivers must make all reasonable efforts to expose it as an I2C * adapter and use drm_edid_read() or drm_edid_read_ddc() instead of abusing * this function. * * The EDID may be overridden using debugfs override_edid or firmware EDID * (drm_edid_load_firmware() and drm.edid_firmware parameter), in this priority * order. Having either of them bypasses actual EDID reads. * * The returned pointer must be freed using drm_edid_free(). * * Return: Pointer to EDID, or NULL if probe/read failed. */ const struct drm_edid *drm_edid_read_custom(struct drm_connector *connector, read_block_fn read_block, void *context) { const struct drm_edid *drm_edid; struct edid *edid; size_t size = 0; edid = _drm_do_get_edid(connector, read_block, context, &size); if (!edid) return NULL; /* Sanity check for now */ drm_WARN_ON(connector->dev, !size); drm_edid = _drm_edid_alloc(edid, size); if (!drm_edid) kfree(edid); return drm_edid; } EXPORT_SYMBOL(drm_edid_read_custom); /** * drm_edid_read_ddc - Read EDID data using given I2C adapter * @connector: Connector to use * @adapter: I2C adapter to use for DDC * * Read EDID using the given I2C adapter. * * The EDID may be overridden using debugfs override_edid or firmware EDID * (drm_edid_load_firmware() and drm.edid_firmware parameter), in this priority * order. Having either of them bypasses actual EDID reads. * * Prefer initializing connector->ddc with drm_connector_init_with_ddc() and * using drm_edid_read() instead of this function. * * The returned pointer must be freed using drm_edid_free(). * * Return: Pointer to EDID, or NULL if probe/read failed. */ const struct drm_edid *drm_edid_read_ddc(struct drm_connector *connector, struct i2c_adapter *adapter) { const struct drm_edid *drm_edid; if (connector->force == DRM_FORCE_OFF) return NULL; if (connector->force == DRM_FORCE_UNSPECIFIED && !drm_probe_ddc(adapter)) return NULL; drm_edid = drm_edid_read_custom(connector, drm_do_probe_ddc_edid, adapter); /* Note: Do *not* call connector updates here. */ return drm_edid; } EXPORT_SYMBOL(drm_edid_read_ddc); /** * drm_edid_read - Read EDID data using connector's I2C adapter * @connector: Connector to use * * Read EDID using the connector's I2C adapter. * * The EDID may be overridden using debugfs override_edid or firmware EDID * (drm_edid_load_firmware() and drm.edid_firmware parameter), in this priority * order. Having either of them bypasses actual EDID reads. * * The returned pointer must be freed using drm_edid_free(). * * Return: Pointer to EDID, or NULL if probe/read failed. */ const struct drm_edid *drm_edid_read(struct drm_connector *connector) { if (drm_WARN_ON(connector->dev, !connector->ddc)) return NULL; return drm_edid_read_ddc(connector, connector->ddc); } EXPORT_SYMBOL(drm_edid_read); /** * drm_edid_get_product_id - Get the vendor and product identification * @drm_edid: EDID * @id: Where to place the product id */ void drm_edid_get_product_id(const struct drm_edid *drm_edid, struct drm_edid_product_id *id) { if (drm_edid && drm_edid->edid && drm_edid->size >= EDID_LENGTH) memcpy(id, &drm_edid->edid->product_id, sizeof(*id)); else memset(id, 0, sizeof(*id)); } EXPORT_SYMBOL(drm_edid_get_product_id); static void decode_date(struct seq_buf *s, const struct drm_edid_product_id *id) { int week = id->week_of_manufacture; int year = id->year_of_manufacture + 1990; if (week == 0xff) seq_buf_printf(s, "model year: %d", year); else if (!week) seq_buf_printf(s, "year of manufacture: %d", year); else seq_buf_printf(s, "week/year of manufacture: %d/%d", week, year); } /** * drm_edid_print_product_id - Print decoded product id to printer * @p: drm printer * @id: EDID product id * @raw: If true, also print the raw hex * * See VESA E-EDID 1.4 section 3.4. */ void drm_edid_print_product_id(struct drm_printer *p, const struct drm_edid_product_id *id, bool raw) { DECLARE_SEQ_BUF(date, 40); char vend[4]; drm_edid_decode_mfg_id(be16_to_cpu(id->manufacturer_name), vend); decode_date(&date, id); drm_printf(p, "manufacturer name: %s, product code: %u, serial number: %u, %s\n", vend, le16_to_cpu(id->product_code), le32_to_cpu(id->serial_number), seq_buf_str(&date)); if (raw) drm_printf(p, "raw product id: %*ph\n", (int)sizeof(*id), id); WARN_ON(seq_buf_has_overflowed(&date)); } EXPORT_SYMBOL(drm_edid_print_product_id); /** * drm_edid_get_panel_id - Get a panel's ID from EDID * @drm_edid: EDID that contains panel ID. * * This function uses the first block of the EDID of a panel and (assuming * that the EDID is valid) extracts the ID out of it. The ID is a 32-bit value * (16 bits of manufacturer ID and 16 bits of per-manufacturer ID) that's * supposed to be different for each different modem of panel. * * Return: A 32-bit ID that should be different for each make/model of panel. * See the functions drm_edid_encode_panel_id() and * drm_edid_decode_panel_id() for some details on the structure of this * ID. Return 0 if the EDID size is less than a base block. */ u32 drm_edid_get_panel_id(const struct drm_edid *drm_edid) { const struct edid *edid = drm_edid->edid; if (drm_edid->size < EDID_LENGTH) return 0; /* * We represent the ID as a 32-bit number so it can easily be compared * with "==". * * NOTE that we deal with endianness differently for the top half * of this ID than for the bottom half. The bottom half (the product * id) gets decoded as little endian by the EDID_PRODUCT_ID because * that's how everyone seems to interpret it. The top half (the mfg_id) * gets stored as big endian because that makes * drm_edid_encode_panel_id() and drm_edid_decode_panel_id() easier * to write (it's easier to extract the ASCII). It doesn't really * matter, though, as long as the number here is unique. */ return (u32)edid->mfg_id[0] << 24 | (u32)edid->mfg_id[1] << 16 | (u32)EDID_PRODUCT_ID(edid); } EXPORT_SYMBOL(drm_edid_get_panel_id); /** * drm_edid_read_base_block - Get a panel's EDID base block * @adapter: I2C adapter to use for DDC * * This function returns the drm_edid containing the first block of the EDID of * a panel. * * This function is intended to be used during early probing on devices where * more than one panel might be present. Because of its intended use it must * assume that the EDID of the panel is correct, at least as far as the base * block is concerned (in other words, we don't process any overrides here). * * Caller should call drm_edid_free() after use. * * NOTE: it's expected that this function and drm_do_get_edid() will both * be read the EDID, but there is no caching between them. Since we're only * reading the first block, hopefully this extra overhead won't be too big. * * WARNING: Only use this function when the connector is unknown. For example, * during the early probe of panel. The EDID read from the function is temporary * and should be replaced by the full EDID returned from other drm_edid_read. * * Return: Pointer to allocated EDID base block, or NULL on any failure. */ const struct drm_edid *drm_edid_read_base_block(struct i2c_adapter *adapter) { enum edid_block_status status; void *base_block; base_block = kzalloc(EDID_LENGTH, GFP_KERNEL); if (!base_block) return NULL; status = edid_block_read(base_block, 0, drm_do_probe_ddc_edid, adapter); edid_block_status_print(status, base_block, 0); if (!edid_block_status_valid(status, edid_block_tag(base_block))) { edid_block_dump(KERN_NOTICE, base_block, 0); kfree(base_block); return NULL; } return _drm_edid_alloc(base_block, EDID_LENGTH); } EXPORT_SYMBOL(drm_edid_read_base_block); /** * drm_get_edid_switcheroo - get EDID data for a vga_switcheroo output * @connector: connector we're probing * @adapter: I2C adapter to use for DDC * * Wrapper around drm_get_edid() for laptops with dual GPUs using one set of * outputs. The wrapper adds the requisite vga_switcheroo calls to temporarily * switch DDC to the GPU which is retrieving EDID. * * Return: Pointer to valid EDID or %NULL if we couldn't find any. */ struct edid *drm_get_edid_switcheroo(struct drm_connector *connector, struct i2c_adapter *adapter) { struct drm_device *dev = connector->dev; struct pci_dev *pdev = to_pci_dev(dev->dev); struct edid *edid; if (drm_WARN_ON_ONCE(dev, !dev_is_pci(dev->dev))) return NULL; vga_switcheroo_lock_ddc(pdev); edid = drm_get_edid(connector, adapter); vga_switcheroo_unlock_ddc(pdev); return edid; } EXPORT_SYMBOL(drm_get_edid_switcheroo); /** * drm_edid_read_switcheroo - get EDID data for a vga_switcheroo output * @connector: connector we're probing * @adapter: I2C adapter to use for DDC * * Wrapper around drm_edid_read_ddc() for laptops with dual GPUs using one set * of outputs. The wrapper adds the requisite vga_switcheroo calls to * temporarily switch DDC to the GPU which is retrieving EDID. * * Return: Pointer to valid EDID or %NULL if we couldn't find any. */ const struct drm_edid *drm_edid_read_switcheroo(struct drm_connector *connector, struct i2c_adapter *adapter) { struct drm_device *dev = connector->dev; struct pci_dev *pdev = to_pci_dev(dev->dev); const struct drm_edid *drm_edid; if (drm_WARN_ON_ONCE(dev, !dev_is_pci(dev->dev))) return NULL; vga_switcheroo_lock_ddc(pdev); drm_edid = drm_edid_read_ddc(connector, adapter); vga_switcheroo_unlock_ddc(pdev); return drm_edid; } EXPORT_SYMBOL(drm_edid_read_switcheroo); /** * drm_edid_duplicate - duplicate an EDID and the extensions * @edid: EDID to duplicate * * Return: Pointer to duplicated EDID or NULL on allocation failure. */ struct edid *drm_edid_duplicate(const struct edid *edid) { if (!edid) return NULL; return kmemdup(edid, edid_size(edid), GFP_KERNEL); } EXPORT_SYMBOL(drm_edid_duplicate); /*** EDID parsing ***/ /** * edid_get_quirks - return quirk flags for a given EDID * @drm_edid: EDID to process * * This tells subsequent routines what fixes they need to apply. * * Return: A u32 represents the quirks to apply. */ static u32 edid_get_quirks(const struct drm_edid *drm_edid) { const struct edid_quirk *quirk; int i; for (i = 0; i < ARRAY_SIZE(edid_quirk_list); i++) { quirk = &edid_quirk_list[i]; if (drm_edid_match(drm_edid, &quirk->ident)) return quirk->quirks; } return 0; } #define MODE_SIZE(m) ((m)->hdisplay * (m)->vdisplay) #define MODE_REFRESH_DIFF(c,t) (abs((c) - (t))) /* * Walk the mode list for connector, clearing the preferred status on existing * modes and setting it anew for the right mode ala quirks. */ static void edid_fixup_preferred(struct drm_connector *connector) { const struct drm_display_info *info = &connector->display_info; struct drm_display_mode *t, *cur_mode, *preferred_mode; int target_refresh = 0; int cur_vrefresh, preferred_vrefresh; if (list_empty(&connector->probed_modes)) return; if (info->quirks & EDID_QUIRK_PREFER_LARGE_60) target_refresh = 60; if (info->quirks & EDID_QUIRK_PREFER_LARGE_75) target_refresh = 75; preferred_mode = list_first_entry(&connector->probed_modes, struct drm_display_mode, head); list_for_each_entry_safe(cur_mode, t, &connector->probed_modes, head) { cur_mode->type &= ~DRM_MODE_TYPE_PREFERRED; if (cur_mode == preferred_mode) continue; /* Largest mode is preferred */ if (MODE_SIZE(cur_mode) > MODE_SIZE(preferred_mode)) preferred_mode = cur_mode; cur_vrefresh = drm_mode_vrefresh(cur_mode); preferred_vrefresh = drm_mode_vrefresh(preferred_mode); /* At a given size, try to get closest to target refresh */ if ((MODE_SIZE(cur_mode) == MODE_SIZE(preferred_mode)) && MODE_REFRESH_DIFF(cur_vrefresh, target_refresh) < MODE_REFRESH_DIFF(preferred_vrefresh, target_refresh)) { preferred_mode = cur_mode; } } preferred_mode->type |= DRM_MODE_TYPE_PREFERRED; } static bool mode_is_rb(const struct drm_display_mode *mode) { return (mode->htotal - mode->hdisplay == 160) && (mode->hsync_end - mode->hdisplay == 80) && (mode->hsync_end - mode->hsync_start == 32) && (mode->vsync_start - mode->vdisplay == 3); } /* * drm_mode_find_dmt - Create a copy of a mode if present in DMT * @dev: Device to duplicate against * @hsize: Mode width * @vsize: Mode height * @fresh: Mode refresh rate * @rb: Mode reduced-blanking-ness * * Walk the DMT mode list looking for a match for the given parameters. * * Return: A newly allocated copy of the mode, or NULL if not found. */ struct drm_display_mode *drm_mode_find_dmt(struct drm_device *dev, int hsize, int vsize, int fresh, bool rb) { int i; for (i = 0; i < ARRAY_SIZE(drm_dmt_modes); i++) { const struct drm_display_mode *ptr = &drm_dmt_modes[i]; if (hsize != ptr->hdisplay) continue; if (vsize != ptr->vdisplay) continue; if (fresh != drm_mode_vrefresh(ptr)) continue; if (rb != mode_is_rb(ptr)) continue; return drm_mode_duplicate(dev, ptr); } return NULL; } EXPORT_SYMBOL(drm_mode_find_dmt); static bool is_display_descriptor(const struct detailed_timing *descriptor, u8 type) { BUILD_BUG_ON(offsetof(typeof(*descriptor), pixel_clock) != 0); BUILD_BUG_ON(offsetof(typeof(*descriptor), data.other_data.pad1) != 2); BUILD_BUG_ON(offsetof(typeof(*descriptor), data.other_data.type) != 3); return descriptor->pixel_clock == 0 && descriptor->data.other_data.pad1 == 0 && descriptor->data.other_data.type == type; } static bool is_detailed_timing_descriptor(const struct detailed_timing *descriptor) { BUILD_BUG_ON(offsetof(typeof(*descriptor), pixel_clock) != 0); return descriptor->pixel_clock != 0; } typedef void detailed_cb(const struct detailed_timing *timing, void *closure); static void cea_for_each_detailed_block(const u8 *ext, detailed_cb *cb, void *closure) { int i, n; u8 d = ext[0x02]; const u8 *det_base = ext + d; if (d < 4 || d > 127) return; n = (127 - d) / 18; for (i = 0; i < n; i++) cb((const struct detailed_timing *)(det_base + 18 * i), closure); } static void vtb_for_each_detailed_block(const u8 *ext, detailed_cb *cb, void *closure) { unsigned int i, n = min((int)ext[0x02], 6); const u8 *det_base = ext + 5; if (ext[0x01] != 1) return; /* unknown version */ for (i = 0; i < n; i++) cb((const struct detailed_timing *)(det_base + 18 * i), closure); } static void drm_for_each_detailed_block(const struct drm_edid *drm_edid, detailed_cb *cb, void *closure) { struct drm_edid_iter edid_iter; const u8 *ext; int i; if (!drm_edid) return; for (i = 0; i < EDID_DETAILED_TIMINGS; i++) cb(&drm_edid->edid->detailed_timings[i], closure); drm_edid_iter_begin(drm_edid, &edid_iter); drm_edid_iter_for_each(ext, &edid_iter) { switch (*ext) { case CEA_EXT: cea_for_each_detailed_block(ext, cb, closure); break; case VTB_EXT: vtb_for_each_detailed_block(ext, cb, closure); break; default: break; } } drm_edid_iter_end(&edid_iter); } static void is_rb(const struct detailed_timing *descriptor, void *data) { bool *res = data; if (!is_display_descriptor(descriptor, EDID_DETAIL_MONITOR_RANGE)) return; BUILD_BUG_ON(offsetof(typeof(*descriptor), data.other_data.data.range.flags) != 10); BUILD_BUG_ON(offsetof(typeof(*descriptor), data.other_data.data.range.formula.cvt.flags) != 15); if (descriptor->data.other_data.data.range.flags == DRM_EDID_CVT_SUPPORT_FLAG && descriptor->data.other_data.data.range.formula.cvt.flags & DRM_EDID_CVT_FLAGS_REDUCED_BLANKING) *res = true; } /* EDID 1.4 defines this explicitly. For EDID 1.3, we guess, badly. */ static bool drm_monitor_supports_rb(const struct drm_edid *drm_edid) { if (drm_edid->edid->revision >= 4) { bool ret = false; drm_for_each_detailed_block(drm_edid, is_rb, &ret); return ret; } return drm_edid_is_digital(drm_edid); } static void find_gtf2(const struct detailed_timing *descriptor, void *data) { const struct detailed_timing **res = data; if (!is_display_descriptor(descriptor, EDID_DETAIL_MONITOR_RANGE)) return; BUILD_BUG_ON(offsetof(typeof(*descriptor), data.other_data.data.range.flags) != 10); if (descriptor->data.other_data.data.range.flags == DRM_EDID_SECONDARY_GTF_SUPPORT_FLAG) *res = descriptor; } /* Secondary GTF curve kicks in above some break frequency */ static int drm_gtf2_hbreak(const struct drm_edid *drm_edid) { const struct detailed_timing *descriptor = NULL; drm_for_each_detailed_block(drm_edid, find_gtf2, &descriptor); BUILD_BUG_ON(offsetof(typeof(*descriptor), data.other_data.data.range.formula.gtf2.hfreq_start_khz) != 12); return descriptor ? descriptor->data.other_data.data.range.formula.gtf2.hfreq_start_khz * 2 : 0; } static int drm_gtf2_2c(const struct drm_edid *drm_edid) { const struct detailed_timing *descriptor = NULL; drm_for_each_detailed_block(drm_edid, find_gtf2, &descriptor); BUILD_BUG_ON(offsetof(typeof(*descriptor), data.other_data.data.range.formula.gtf2.c) != 13); return descriptor ? descriptor->data.other_data.data.range.formula.gtf2.c : 0; } static int drm_gtf2_m(const struct drm_edid *drm_edid) { const struct detailed_timing *descriptor = NULL; drm_for_each_detailed_block(drm_edid, find_gtf2, &descriptor); BUILD_BUG_ON(offsetof(typeof(*descriptor), data.other_data.data.range.formula.gtf2.m) != 14); return descriptor ? le16_to_cpu(descriptor->data.other_data.data.range.formula.gtf2.m) : 0; } static int drm_gtf2_k(const struct drm_edid *drm_edid) { const struct detailed_timing *descriptor = NULL; drm_for_each_detailed_block(drm_edid, find_gtf2, &descriptor); BUILD_BUG_ON(offsetof(typeof(*descriptor), data.other_data.data.range.formula.gtf2.k) != 16); return descriptor ? descriptor->data.other_data.data.range.formula.gtf2.k : 0; } static int drm_gtf2_2j(const struct drm_edid *drm_edid) { const struct detailed_timing *descriptor = NULL; drm_for_each_detailed_block(drm_edid, find_gtf2, &descriptor); BUILD_BUG_ON(offsetof(typeof(*descriptor), data.other_data.data.range.formula.gtf2.j) != 17); return descriptor ? descriptor->data.other_data.data.range.formula.gtf2.j : 0; } static void get_timing_level(const struct detailed_timing *descriptor, void *data) { int *res = data; if (!is_display_descriptor(descriptor, EDID_DETAIL_MONITOR_RANGE)) return; BUILD_BUG_ON(offsetof(typeof(*descriptor), data.other_data.data.range.flags) != 10); switch (descriptor->data.other_data.data.range.flags) { case DRM_EDID_DEFAULT_GTF_SUPPORT_FLAG: *res = LEVEL_GTF; break; case DRM_EDID_SECONDARY_GTF_SUPPORT_FLAG: *res = LEVEL_GTF2; break; case DRM_EDID_CVT_SUPPORT_FLAG: *res = LEVEL_CVT; break; default: break; } } /* Get standard timing level (CVT/GTF/DMT). */ static int standard_timing_level(const struct drm_edid *drm_edid) { const struct edid *edid = drm_edid->edid; if (edid->revision >= 4) { /* * If the range descriptor doesn't * indicate otherwise default to CVT */ int ret = LEVEL_CVT; drm_for_each_detailed_block(drm_edid, get_timing_level, &ret); return ret; } else if (edid->revision >= 3 && drm_gtf2_hbreak(drm_edid)) { return LEVEL_GTF2; } else if (edid->revision >= 2) { return LEVEL_GTF; } else { return LEVEL_DMT; } } /* * 0 is reserved. The spec says 0x01 fill for unused timings. Some old * monitors fill with ascii space (0x20) instead. */ static int bad_std_timing(u8 a, u8 b) { return (a == 0x00 && b == 0x00) || (a == 0x01 && b == 0x01) || (a == 0x20 && b == 0x20); } static int drm_mode_hsync(const struct drm_display_mode *mode) { if (mode->htotal <= 0) return 0; return DIV_ROUND_CLOSEST(mode->clock, mode->htotal); } static struct drm_display_mode * drm_gtf2_mode(struct drm_device *dev, const struct drm_edid *drm_edid, int hsize, int vsize, int vrefresh_rate) { struct drm_display_mode *mode; /* * This is potentially wrong if there's ever a monitor with * more than one ranges section, each claiming a different * secondary GTF curve. Please don't do that. */ mode = drm_gtf_mode(dev, hsize, vsize, vrefresh_rate, 0, 0); if (!mode) return NULL; if (drm_mode_hsync(mode) > drm_gtf2_hbreak(drm_edid)) { drm_mode_destroy(dev, mode); mode = drm_gtf_mode_complex(dev, hsize, vsize, vrefresh_rate, 0, 0, drm_gtf2_m(drm_edid), drm_gtf2_2c(drm_edid), drm_gtf2_k(drm_edid), drm_gtf2_2j(drm_edid)); } return mode; } /* * Take the standard timing params (in this case width, aspect, and refresh) * and convert them into a real mode using CVT/GTF/DMT. */ static struct drm_display_mode *drm_mode_std(struct drm_connector *connector, const struct drm_edid *drm_edid, const struct std_timing *t) { struct drm_device *dev = connector->dev; struct drm_display_mode *m, *mode = NULL; int hsize, vsize; int vrefresh_rate; unsigned aspect_ratio = (t->vfreq_aspect & EDID_TIMING_ASPECT_MASK) >> EDID_TIMING_ASPECT_SHIFT; unsigned vfreq = (t->vfreq_aspect & EDID_TIMING_VFREQ_MASK) >> EDID_TIMING_VFREQ_SHIFT; int timing_level = standard_timing_level(drm_edid); if (bad_std_timing(t->hsize, t->vfreq_aspect)) return NULL; /* According to the EDID spec, the hdisplay = hsize * 8 + 248 */ hsize = t->hsize * 8 + 248; /* vrefresh_rate = vfreq + 60 */ vrefresh_rate = vfreq + 60; /* the vdisplay is calculated based on the aspect ratio */ if (aspect_ratio == 0) { if (drm_edid->edid->revision < 3) vsize = hsize; else vsize = (hsize * 10) / 16; } else if (aspect_ratio == 1) vsize = (hsize * 3) / 4; else if (aspect_ratio == 2) vsize = (hsize * 4) / 5; else vsize = (hsize * 9) / 16; /* HDTV hack, part 1 */ if (vrefresh_rate == 60 && ((hsize == 1360 && vsize == 765) || (hsize == 1368 && vsize == 769))) { hsize = 1366; vsize = 768; } /* * If this connector already has a mode for this size and refresh * rate (because it came from detailed or CVT info), use that * instead. This way we don't have to guess at interlace or * reduced blanking. */ list_for_each_entry(m, &connector->probed_modes, head) if (m->hdisplay == hsize && m->vdisplay == vsize && drm_mode_vrefresh(m) == vrefresh_rate) return NULL; /* HDTV hack, part 2 */ if (hsize == 1366 && vsize == 768 && vrefresh_rate == 60) { mode = drm_cvt_mode(dev, 1366, 768, vrefresh_rate, 0, 0, false); if (!mode) return NULL; mode->hdisplay = 1366; mode->hsync_start = mode->hsync_start - 1; mode->hsync_end = mode->hsync_end - 1; return mode; } /* check whether it can be found in default mode table */ if (drm_monitor_supports_rb(drm_edid)) { mode = drm_mode_find_dmt(dev, hsize, vsize, vrefresh_rate, true); if (mode) return mode; } mode = drm_mode_find_dmt(dev, hsize, vsize, vrefresh_rate, false); if (mode) return mode; /* okay, generate it */ switch (timing_level) { case LEVEL_DMT: break; case LEVEL_GTF: mode = drm_gtf_mode(dev, hsize, vsize, vrefresh_rate, 0, 0); break; case LEVEL_GTF2: mode = drm_gtf2_mode(dev, drm_edid, hsize, vsize, vrefresh_rate); break; case LEVEL_CVT: mode = drm_cvt_mode(dev, hsize, vsize, vrefresh_rate, 0, 0, false); break; } return mode; } /* * EDID is delightfully ambiguous about how interlaced modes are to be * encoded. Our internal representation is of frame height, but some * HDTV detailed timings are encoded as field height. * * The format list here is from CEA, in frame size. Technically we * should be checking refresh rate too. Whatever. */ static void drm_mode_do_interlace_quirk(struct drm_display_mode *mode, const struct detailed_pixel_timing *pt) { int i; static const struct { int w, h; } cea_interlaced[] = { { 1920, 1080 }, { 720, 480 }, { 1440, 480 }, { 2880, 480 }, { 720, 576 }, { 1440, 576 }, { 2880, 576 }, }; if (!(pt->misc & DRM_EDID_PT_INTERLACED)) return; for (i = 0; i < ARRAY_SIZE(cea_interlaced); i++) { if ((mode->hdisplay == cea_interlaced[i].w) && (mode->vdisplay == cea_interlaced[i].h / 2)) { mode->vdisplay *= 2; mode->vsync_start *= 2; mode->vsync_end *= 2; mode->vtotal *= 2; mode->vtotal |= 1; } } mode->flags |= DRM_MODE_FLAG_INTERLACE; } /* * Create a new mode from an EDID detailed timing section. An EDID detailed * timing block contains enough info for us to create and return a new struct * drm_display_mode. */ static struct drm_display_mode *drm_mode_detailed(struct drm_connector *connector, const struct drm_edid *drm_edid, const struct detailed_timing *timing) { const struct drm_display_info *info = &connector->display_info; struct drm_device *dev = connector->dev; struct drm_display_mode *mode; const struct detailed_pixel_timing *pt = &timing->data.pixel_data; unsigned hactive = (pt->hactive_hblank_hi & 0xf0) << 4 | pt->hactive_lo; unsigned vactive = (pt->vactive_vblank_hi & 0xf0) << 4 | pt->vactive_lo; unsigned hblank = (pt->hactive_hblank_hi & 0xf) << 8 | pt->hblank_lo; unsigned vblank = (pt->vactive_vblank_hi & 0xf) << 8 | pt->vblank_lo; unsigned hsync_offset = (pt->hsync_vsync_offset_pulse_width_hi & 0xc0) << 2 | pt->hsync_offset_lo; unsigned hsync_pulse_width = (pt->hsync_vsync_offset_pulse_width_hi & 0x30) << 4 | pt->hsync_pulse_width_lo; unsigned vsync_offset = (pt->hsync_vsync_offset_pulse_width_hi & 0xc) << 2 | pt->vsync_offset_pulse_width_lo >> 4; unsigned vsync_pulse_width = (pt->hsync_vsync_offset_pulse_width_hi & 0x3) << 4 | (pt->vsync_offset_pulse_width_lo & 0xf); /* ignore tiny modes */ if (hactive < 64 || vactive < 64) return NULL; if (pt->misc & DRM_EDID_PT_STEREO) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] Stereo mode not supported\n", connector->base.id, connector->name); return NULL; } if (!(pt->misc & DRM_EDID_PT_SEPARATE_SYNC)) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] Composite sync not supported\n", connector->base.id, connector->name); } /* it is incorrect if hsync/vsync width is zero */ if (!hsync_pulse_width || !vsync_pulse_width) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] Incorrect Detailed timing. Wrong Hsync/Vsync pulse width\n", connector->base.id, connector->name); return NULL; } if (info->quirks & EDID_QUIRK_FORCE_REDUCED_BLANKING) { mode = drm_cvt_mode(dev, hactive, vactive, 60, true, false, false); if (!mode) return NULL; goto set_size; } mode = drm_mode_create(dev); if (!mode) return NULL; if (info->quirks & EDID_QUIRK_135_CLOCK_TOO_HIGH) mode->clock = 1088 * 10; else mode->clock = le16_to_cpu(timing->pixel_clock) * 10; mode->hdisplay = hactive; mode->hsync_start = mode->hdisplay + hsync_offset; mode->hsync_end = mode->hsync_start + hsync_pulse_width; mode->htotal = mode->hdisplay + hblank; mode->vdisplay = vactive; mode->vsync_start = mode->vdisplay + vsync_offset; mode->vsync_end = mode->vsync_start + vsync_pulse_width; mode->vtotal = mode->vdisplay + vblank; /* Some EDIDs have bogus h/vsync_end values */ if (mode->hsync_end > mode->htotal) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] reducing hsync_end %d->%d\n", connector->base.id, connector->name, mode->hsync_end, mode->htotal); mode->hsync_end = mode->htotal; } if (mode->vsync_end > mode->vtotal) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] reducing vsync_end %d->%d\n", connector->base.id, connector->name, mode->vsync_end, mode->vtotal); mode->vsync_end = mode->vtotal; } drm_mode_do_interlace_quirk(mode, pt); if (info->quirks & EDID_QUIRK_DETAILED_SYNC_PP) { mode->flags |= DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC; } else { mode->flags |= (pt->misc & DRM_EDID_PT_HSYNC_POSITIVE) ? DRM_MODE_FLAG_PHSYNC : DRM_MODE_FLAG_NHSYNC; mode->flags |= (pt->misc & DRM_EDID_PT_VSYNC_POSITIVE) ? DRM_MODE_FLAG_PVSYNC : DRM_MODE_FLAG_NVSYNC; } set_size: mode->width_mm = pt->width_mm_lo | (pt->width_height_mm_hi & 0xf0) << 4; mode->height_mm = pt->height_mm_lo | (pt->width_height_mm_hi & 0xf) << 8; if (info->quirks & EDID_QUIRK_DETAILED_IN_CM) { mode->width_mm *= 10; mode->height_mm *= 10; } if (info->quirks & EDID_QUIRK_DETAILED_USE_MAXIMUM_SIZE) { mode->width_mm = drm_edid->edid->width_cm * 10; mode->height_mm = drm_edid->edid->height_cm * 10; } mode->type = DRM_MODE_TYPE_DRIVER; drm_mode_set_name(mode); return mode; } static bool mode_in_hsync_range(const struct drm_display_mode *mode, const struct edid *edid, const u8 *t) { int hsync, hmin, hmax; hmin = t[7]; if (edid->revision >= 4) hmin += ((t[4] & 0x04) ? 255 : 0); hmax = t[8]; if (edid->revision >= 4) hmax += ((t[4] & 0x08) ? 255 : 0); hsync = drm_mode_hsync(mode); return (hsync <= hmax && hsync >= hmin); } static bool mode_in_vsync_range(const struct drm_display_mode *mode, const struct edid *edid, const u8 *t) { int vsync, vmin, vmax; vmin = t[5]; if (edid->revision >= 4) vmin += ((t[4] & 0x01) ? 255 : 0); vmax = t[6]; if (edid->revision >= 4) vmax += ((t[4] & 0x02) ? 255 : 0); vsync = drm_mode_vrefresh(mode); return (vsync <= vmax && vsync >= vmin); } static u32 range_pixel_clock(const struct edid *edid, const u8 *t) { /* unspecified */ if (t[9] == 0 || t[9] == 255) return 0; /* 1.4 with CVT support gives us real precision, yay */ if (edid->revision >= 4 && t[10] == DRM_EDID_CVT_SUPPORT_FLAG) return (t[9] * 10000) - ((t[12] >> 2) * 250); /* 1.3 is pathetic, so fuzz up a bit */ return t[9] * 10000 + 5001; } static bool mode_in_range(const struct drm_display_mode *mode, const struct drm_edid *drm_edid, const struct detailed_timing *timing) { const struct edid *edid = drm_edid->edid; u32 max_clock; const u8 *t = (const u8 *)timing; if (!mode_in_hsync_range(mode, edid, t)) return false; if (!mode_in_vsync_range(mode, edid, t)) return false; max_clock = range_pixel_clock(edid, t); if (max_clock) if (mode->clock > max_clock) return false; /* 1.4 max horizontal check */ if (edid->revision >= 4 && t[10] == DRM_EDID_CVT_SUPPORT_FLAG) if (t[13] && mode->hdisplay > 8 * (t[13] + (256 * (t[12]&0x3)))) return false; if (mode_is_rb(mode) && !drm_monitor_supports_rb(drm_edid)) return false; return true; } static bool valid_inferred_mode(const struct drm_connector *connector, const struct drm_display_mode *mode) { const struct drm_display_mode *m; bool ok = false; list_for_each_entry(m, &connector->probed_modes, head) { if (mode->hdisplay == m->hdisplay && mode->vdisplay == m->vdisplay && drm_mode_vrefresh(mode) == drm_mode_vrefresh(m)) return false; /* duplicated */ if (mode->hdisplay <= m->hdisplay && mode->vdisplay <= m->vdisplay) ok = true; } return ok; } static int drm_dmt_modes_for_range(struct drm_connector *connector, const struct drm_edid *drm_edid, const struct detailed_timing *timing) { int i, modes = 0; struct drm_display_mode *newmode; struct drm_device *dev = connector->dev; for (i = 0; i < ARRAY_SIZE(drm_dmt_modes); i++) { if (mode_in_range(drm_dmt_modes + i, drm_edid, timing) && valid_inferred_mode(connector, drm_dmt_modes + i)) { newmode = drm_mode_duplicate(dev, &drm_dmt_modes[i]); if (newmode) { drm_mode_probed_add(connector, newmode); modes++; } } } return modes; } /* fix up 1366x768 mode from 1368x768; * GFT/CVT can't express 1366 width which isn't dividable by 8 */ void drm_mode_fixup_1366x768(struct drm_display_mode *mode) { if (mode->hdisplay == 1368 && mode->vdisplay == 768) { mode->hdisplay = 1366; mode->hsync_start--; mode->hsync_end--; drm_mode_set_name(mode); } } static int drm_gtf_modes_for_range(struct drm_connector *connector, const struct drm_edid *drm_edid, const struct detailed_timing *timing) { int i, modes = 0; struct drm_display_mode *newmode; struct drm_device *dev = connector->dev; for (i = 0; i < ARRAY_SIZE(extra_modes); i++) { const struct minimode *m = &extra_modes[i]; newmode = drm_gtf_mode(dev, m->w, m->h, m->r, 0, 0); if (!newmode) return modes; drm_mode_fixup_1366x768(newmode); if (!mode_in_range(newmode, drm_edid, timing) || !valid_inferred_mode(connector, newmode)) { drm_mode_destroy(dev, newmode); continue; } drm_mode_probed_add(connector, newmode); modes++; } return modes; } static int drm_gtf2_modes_for_range(struct drm_connector *connector, const struct drm_edid *drm_edid, const struct detailed_timing *timing) { int i, modes = 0; struct drm_display_mode *newmode; struct drm_device *dev = connector->dev; for (i = 0; i < ARRAY_SIZE(extra_modes); i++) { const struct minimode *m = &extra_modes[i]; newmode = drm_gtf2_mode(dev, drm_edid, m->w, m->h, m->r); if (!newmode) return modes; drm_mode_fixup_1366x768(newmode); if (!mode_in_range(newmode, drm_edid, timing) || !valid_inferred_mode(connector, newmode)) { drm_mode_destroy(dev, newmode); continue; } drm_mode_probed_add(connector, newmode); modes++; } return modes; } static int drm_cvt_modes_for_range(struct drm_connector *connector, const struct drm_edid *drm_edid, const struct detailed_timing *timing) { int i, modes = 0; struct drm_display_mode *newmode; struct drm_device *dev = connector->dev; bool rb = drm_monitor_supports_rb(drm_edid); for (i = 0; i < ARRAY_SIZE(extra_modes); i++) { const struct minimode *m = &extra_modes[i]; newmode = drm_cvt_mode(dev, m->w, m->h, m->r, rb, 0, 0); if (!newmode) return modes; drm_mode_fixup_1366x768(newmode); if (!mode_in_range(newmode, drm_edid, timing) || !valid_inferred_mode(connector, newmode)) { drm_mode_destroy(dev, newmode); continue; } drm_mode_probed_add(connector, newmode); modes++; } return modes; } static void do_inferred_modes(const struct detailed_timing *timing, void *c) { struct detailed_mode_closure *closure = c; const struct detailed_non_pixel *data = &timing->data.other_data; const struct detailed_data_monitor_range *range = &data->data.range; if (!is_display_descriptor(timing, EDID_DETAIL_MONITOR_RANGE)) return; closure->modes += drm_dmt_modes_for_range(closure->connector, closure->drm_edid, timing); if (closure->drm_edid->edid->revision < 2) return; /* GTF not defined yet */ switch (range->flags) { case DRM_EDID_SECONDARY_GTF_SUPPORT_FLAG: closure->modes += drm_gtf2_modes_for_range(closure->connector, closure->drm_edid, timing); break; case DRM_EDID_DEFAULT_GTF_SUPPORT_FLAG: closure->modes += drm_gtf_modes_for_range(closure->connector, closure->drm_edid, timing); break; case DRM_EDID_CVT_SUPPORT_FLAG: if (closure->drm_edid->edid->revision < 4) break; closure->modes += drm_cvt_modes_for_range(closure->connector, closure->drm_edid, timing); break; case DRM_EDID_RANGE_LIMITS_ONLY_FLAG: default: break; } } static int add_inferred_modes(struct drm_connector *connector, const struct drm_edid *drm_edid) { struct detailed_mode_closure closure = { .connector = connector, .drm_edid = drm_edid, }; if (drm_edid->edid->revision >= 1) drm_for_each_detailed_block(drm_edid, do_inferred_modes, &closure); return closure.modes; } static int drm_est3_modes(struct drm_connector *connector, const struct detailed_timing *timing) { int i, j, m, modes = 0; struct drm_display_mode *mode; const u8 *est = ((const u8 *)timing) + 6; for (i = 0; i < 6; i++) { for (j = 7; j >= 0; j--) { m = (i * 8) + (7 - j); if (m >= ARRAY_SIZE(est3_modes)) break; if (est[i] & (1 << j)) { mode = drm_mode_find_dmt(connector->dev, est3_modes[m].w, est3_modes[m].h, est3_modes[m].r, est3_modes[m].rb); if (mode) { drm_mode_probed_add(connector, mode); modes++; } } } } return modes; } static void do_established_modes(const struct detailed_timing *timing, void *c) { struct detailed_mode_closure *closure = c; if (!is_display_descriptor(timing, EDID_DETAIL_EST_TIMINGS)) return; closure->modes += drm_est3_modes(closure->connector, timing); } /* * Get established modes from EDID and add them. Each EDID block contains a * bitmap of the supported "established modes" list (defined above). Tease them * out and add them to the global modes list. */ static int add_established_modes(struct drm_connector *connector, const struct drm_edid *drm_edid) { struct drm_device *dev = connector->dev; const struct edid *edid = drm_edid->edid; unsigned long est_bits = edid->established_timings.t1 | (edid->established_timings.t2 << 8) | ((edid->established_timings.mfg_rsvd & 0x80) << 9); int i, modes = 0; struct detailed_mode_closure closure = { .connector = connector, .drm_edid = drm_edid, }; for (i = 0; i <= EDID_EST_TIMINGS; i++) { if (est_bits & (1<<i)) { struct drm_display_mode *newmode; newmode = drm_mode_duplicate(dev, &edid_est_modes[i]); if (newmode) { drm_mode_probed_add(connector, newmode); modes++; } } } if (edid->revision >= 1) drm_for_each_detailed_block(drm_edid, do_established_modes, &closure); return modes + closure.modes; } static void do_standard_modes(const struct detailed_timing *timing, void *c) { struct detailed_mode_closure *closure = c; const struct detailed_non_pixel *data = &timing->data.other_data; struct drm_connector *connector = closure->connector; int i; if (!is_display_descriptor(timing, EDID_DETAIL_STD_MODES)) return; for (i = 0; i < 6; i++) { const struct std_timing *std = &data->data.timings[i]; struct drm_display_mode *newmode; newmode = drm_mode_std(connector, closure->drm_edid, std); if (newmode) { drm_mode_probed_add(connector, newmode); closure->modes++; } } } /* * Get standard modes from EDID and add them. Standard modes can be calculated * using the appropriate standard (DMT, GTF, or CVT). Grab them from EDID and * add them to the list. */ static int add_standard_modes(struct drm_connector *connector, const struct drm_edid *drm_edid) { int i, modes = 0; struct detailed_mode_closure closure = { .connector = connector, .drm_edid = drm_edid, }; for (i = 0; i < EDID_STD_TIMINGS; i++) { struct drm_display_mode *newmode; newmode = drm_mode_std(connector, drm_edid, &drm_edid->edid->standard_timings[i]); if (newmode) { drm_mode_probed_add(connector, newmode); modes++; } } if (drm_edid->edid->revision >= 1) drm_for_each_detailed_block(drm_edid, do_standard_modes, &closure); /* XXX should also look for standard codes in VTB blocks */ return modes + closure.modes; } static int drm_cvt_modes(struct drm_connector *connector, const struct detailed_timing *timing) { int i, j, modes = 0; struct drm_display_mode *newmode; struct drm_device *dev = connector->dev; const struct cvt_timing *cvt; static const int rates[] = { 60, 85, 75, 60, 50 }; const u8 empty[3] = { 0, 0, 0 }; for (i = 0; i < 4; i++) { int width, height; cvt = &(timing->data.other_data.data.cvt[i]); if (!memcmp(cvt->code, empty, 3)) continue; height = (cvt->code[0] + ((cvt->code[1] & 0xf0) << 4) + 1) * 2; switch (cvt->code[1] & 0x0c) { /* default - because compiler doesn't see that we've enumerated all cases */ default: case 0x00: width = height * 4 / 3; break; case 0x04: width = height * 16 / 9; break; case 0x08: width = height * 16 / 10; break; case 0x0c: width = height * 15 / 9; break; } for (j = 1; j < 5; j++) { if (cvt->code[2] & (1 << j)) { newmode = drm_cvt_mode(dev, width, height, rates[j], j == 0, false, false); if (newmode) { drm_mode_probed_add(connector, newmode); modes++; } } } } return modes; } static void do_cvt_mode(const struct detailed_timing *timing, void *c) { struct detailed_mode_closure *closure = c; if (!is_display_descriptor(timing, EDID_DETAIL_CVT_3BYTE)) return; closure->modes += drm_cvt_modes(closure->connector, timing); } static int add_cvt_modes(struct drm_connector *connector, const struct drm_edid *drm_edid) { struct detailed_mode_closure closure = { .connector = connector, .drm_edid = drm_edid, }; if (drm_edid->edid->revision >= 3) drm_for_each_detailed_block(drm_edid, do_cvt_mode, &closure); /* XXX should also look for CVT codes in VTB blocks */ return closure.modes; } static void fixup_detailed_cea_mode_clock(struct drm_connector *connector, struct drm_display_mode *mode); static void do_detailed_mode(const struct detailed_timing *timing, void *c) { struct detailed_mode_closure *closure = c; struct drm_display_mode *newmode; if (!is_detailed_timing_descriptor(timing)) return; newmode = drm_mode_detailed(closure->connector, closure->drm_edid, timing); if (!newmode) return; if (closure->preferred) newmode->type |= DRM_MODE_TYPE_PREFERRED; /* * Detailed modes are limited to 10kHz pixel clock resolution, * so fix up anything that looks like CEA/HDMI mode, but the clock * is just slightly off. */ fixup_detailed_cea_mode_clock(closure->connector, newmode); drm_mode_probed_add(closure->connector, newmode); closure->modes++; closure->preferred = false; } /* * add_detailed_modes - Add modes from detailed timings * @connector: attached connector * @drm_edid: EDID block to scan */ static int add_detailed_modes(struct drm_connector *connector, const struct drm_edid *drm_edid) { struct detailed_mode_closure closure = { .connector = connector, .drm_edid = drm_edid, }; if (drm_edid->edid->revision >= 4) closure.preferred = true; /* first detailed timing is always preferred */ else closure.preferred = drm_edid->edid->features & DRM_EDID_FEATURE_PREFERRED_TIMING; drm_for_each_detailed_block(drm_edid, do_detailed_mode, &closure); return closure.modes; } /* CTA-861-H Table 60 - CTA Tag Codes */ #define CTA_DB_AUDIO 1 #define CTA_DB_VIDEO 2 #define CTA_DB_VENDOR 3 #define CTA_DB_SPEAKER 4 #define CTA_DB_EXTENDED_TAG 7 /* CTA-861-H Table 62 - CTA Extended Tag Codes */ #define CTA_EXT_DB_VIDEO_CAP 0 #define CTA_EXT_DB_VENDOR 1 #define CTA_EXT_DB_HDR_STATIC_METADATA 6 #define CTA_EXT_DB_420_VIDEO_DATA 14 #define CTA_EXT_DB_420_VIDEO_CAP_MAP 15 #define CTA_EXT_DB_HF_EEODB 0x78 #define CTA_EXT_DB_HF_SCDB 0x79 #define EDID_BASIC_AUDIO (1 << 6) #define EDID_CEA_YCRCB444 (1 << 5) #define EDID_CEA_YCRCB422 (1 << 4) #define EDID_CEA_VCDB_QS (1 << 6) /* * Search EDID for CEA extension block. * * FIXME: Prefer not returning pointers to raw EDID data. */ const u8 *drm_edid_find_extension(const struct drm_edid *drm_edid, int ext_id, int *ext_index) { const u8 *edid_ext = NULL; int i; /* No EDID or EDID extensions */ if (!drm_edid || !drm_edid_extension_block_count(drm_edid)) return NULL; /* Find CEA extension */ for (i = *ext_index; i < drm_edid_extension_block_count(drm_edid); i++) { edid_ext = drm_edid_extension_block_data(drm_edid, i); if (edid_block_tag(edid_ext) == ext_id) break; } if (i >= drm_edid_extension_block_count(drm_edid)) return NULL; *ext_index = i + 1; return edid_ext; } /* Return true if the EDID has a CTA extension or a DisplayID CTA data block */ static bool drm_edid_has_cta_extension(const struct drm_edid *drm_edid) { const struct displayid_block *block; struct displayid_iter iter; struct drm_edid_iter edid_iter; const u8 *ext; bool found = false; /* Look for a top level CEA extension block */ drm_edid_iter_begin(drm_edid, &edid_iter); drm_edid_iter_for_each(ext, &edid_iter) { if (ext[0] == CEA_EXT) { found = true; break; } } drm_edid_iter_end(&edid_iter); if (found) return true; /* CEA blocks can also be found embedded in a DisplayID block */ displayid_iter_edid_begin(drm_edid, &iter); displayid_iter_for_each(block, &iter) { if (block->tag == DATA_BLOCK_CTA) { found = true; break; } } displayid_iter_end(&iter); return found; } static __always_inline const struct drm_display_mode *cea_mode_for_vic(u8 vic) { BUILD_BUG_ON(1 + ARRAY_SIZE(edid_cea_modes_1) - 1 != 127); BUILD_BUG_ON(193 + ARRAY_SIZE(edid_cea_modes_193) - 1 != 219); if (vic >= 1 && vic < 1 + ARRAY_SIZE(edid_cea_modes_1)) return &edid_cea_modes_1[vic - 1]; if (vic >= 193 && vic < 193 + ARRAY_SIZE(edid_cea_modes_193)) return &edid_cea_modes_193[vic - 193]; return NULL; } static u8 cea_num_vics(void) { return 193 + ARRAY_SIZE(edid_cea_modes_193); } static u8 cea_next_vic(u8 vic) { if (++vic == 1 + ARRAY_SIZE(edid_cea_modes_1)) vic = 193; return vic; } /* * Calculate the alternate clock for the CEA mode * (60Hz vs. 59.94Hz etc.) */ static unsigned int cea_mode_alternate_clock(const struct drm_display_mode *cea_mode) { unsigned int clock = cea_mode->clock; if (drm_mode_vrefresh(cea_mode) % 6 != 0) return clock; /* * edid_cea_modes contains the 59.94Hz * variant for 240 and 480 line modes, * and the 60Hz variant otherwise. */ if (cea_mode->vdisplay == 240 || cea_mode->vdisplay == 480) clock = DIV_ROUND_CLOSEST(clock * 1001, 1000); else clock = DIV_ROUND_CLOSEST(clock * 1000, 1001); return clock; } static bool cea_mode_alternate_timings(u8 vic, struct drm_display_mode *mode) { /* * For certain VICs the spec allows the vertical * front porch to vary by one or two lines. * * cea_modes[] stores the variant with the shortest * vertical front porch. We can adjust the mode to * get the other variants by simply increasing the * vertical front porch length. */ BUILD_BUG_ON(cea_mode_for_vic(8)->vtotal != 262 || cea_mode_for_vic(9)->vtotal != 262 || cea_mode_for_vic(12)->vtotal != 262 || cea_mode_for_vic(13)->vtotal != 262 || cea_mode_for_vic(23)->vtotal != 312 || cea_mode_for_vic(24)->vtotal != 312 || cea_mode_for_vic(27)->vtotal != 312 || cea_mode_for_vic(28)->vtotal != 312); if (((vic == 8 || vic == 9 || vic == 12 || vic == 13) && mode->vtotal < 263) || ((vic == 23 || vic == 24 || vic == 27 || vic == 28) && mode->vtotal < 314)) { mode->vsync_start++; mode->vsync_end++; mode->vtotal++; return true; } return false; } static u8 drm_match_cea_mode_clock_tolerance(const struct drm_display_mode *to_match, unsigned int clock_tolerance) { unsigned int match_flags = DRM_MODE_MATCH_TIMINGS | DRM_MODE_MATCH_FLAGS; u8 vic; if (!to_match->clock) return 0; if (to_match->picture_aspect_ratio) match_flags |= DRM_MODE_MATCH_ASPECT_RATIO; for (vic = 1; vic < cea_num_vics(); vic = cea_next_vic(vic)) { struct drm_display_mode cea_mode; unsigned int clock1, clock2; drm_mode_init(&cea_mode, cea_mode_for_vic(vic)); /* Check both 60Hz and 59.94Hz */ clock1 = cea_mode.clock; clock2 = cea_mode_alternate_clock(&cea_mode); if (abs(to_match->clock - clock1) > clock_tolerance && abs(to_match->clock - clock2) > clock_tolerance) continue; do { if (drm_mode_match(to_match, &cea_mode, match_flags)) return vic; } while (cea_mode_alternate_timings(vic, &cea_mode)); } return 0; } /** * drm_match_cea_mode - look for a CEA mode matching given mode * @to_match: display mode * * Return: The CEA Video ID (VIC) of the mode or 0 if it isn't a CEA-861 * mode. */ u8 drm_match_cea_mode(const struct drm_display_mode *to_match) { unsigned int match_flags = DRM_MODE_MATCH_TIMINGS | DRM_MODE_MATCH_FLAGS; u8 vic; if (!to_match->clock) return 0; if (to_match->picture_aspect_ratio) match_flags |= DRM_MODE_MATCH_ASPECT_RATIO; for (vic = 1; vic < cea_num_vics(); vic = cea_next_vic(vic)) { struct drm_display_mode cea_mode; unsigned int clock1, clock2; drm_mode_init(&cea_mode, cea_mode_for_vic(vic)); /* Check both 60Hz and 59.94Hz */ clock1 = cea_mode.clock; clock2 = cea_mode_alternate_clock(&cea_mode); if (KHZ2PICOS(to_match->clock) != KHZ2PICOS(clock1) && KHZ2PICOS(to_match->clock) != KHZ2PICOS(clock2)) continue; do { if (drm_mode_match(to_match, &cea_mode, match_flags)) return vic; } while (cea_mode_alternate_timings(vic, &cea_mode)); } return 0; } EXPORT_SYMBOL(drm_match_cea_mode); static bool drm_valid_cea_vic(u8 vic) { return cea_mode_for_vic(vic) != NULL; } static enum hdmi_picture_aspect drm_get_cea_aspect_ratio(const u8 video_code) { const struct drm_display_mode *mode = cea_mode_for_vic(video_code); if (mode) return mode->picture_aspect_ratio; return HDMI_PICTURE_ASPECT_NONE; } static enum hdmi_picture_aspect drm_get_hdmi_aspect_ratio(const u8 video_code) { return edid_4k_modes[video_code].picture_aspect_ratio; } /* * Calculate the alternate clock for HDMI modes (those from the HDMI vendor * specific block). */ static unsigned int hdmi_mode_alternate_clock(const struct drm_display_mode *hdmi_mode) { return cea_mode_alternate_clock(hdmi_mode); } static u8 drm_match_hdmi_mode_clock_tolerance(const struct drm_display_mode *to_match, unsigned int clock_tolerance) { unsigned int match_flags = DRM_MODE_MATCH_TIMINGS | DRM_MODE_MATCH_FLAGS; u8 vic; if (!to_match->clock) return 0; if (to_match->picture_aspect_ratio) match_flags |= DRM_MODE_MATCH_ASPECT_RATIO; for (vic = 1; vic < ARRAY_SIZE(edid_4k_modes); vic++) { const struct drm_display_mode *hdmi_mode = &edid_4k_modes[vic]; unsigned int clock1, clock2; /* Make sure to also match alternate clocks */ clock1 = hdmi_mode->clock; clock2 = hdmi_mode_alternate_clock(hdmi_mode); if (abs(to_match->clock - clock1) > clock_tolerance && abs(to_match->clock - clock2) > clock_tolerance) continue; if (drm_mode_match(to_match, hdmi_mode, match_flags)) return vic; } return 0; } /* * drm_match_hdmi_mode - look for a HDMI mode matching given mode * @to_match: display mode * * An HDMI mode is one defined in the HDMI vendor specific block. * * Returns the HDMI Video ID (VIC) of the mode or 0 if it isn't one. */ static u8 drm_match_hdmi_mode(const struct drm_display_mode *to_match) { unsigned int match_flags = DRM_MODE_MATCH_TIMINGS | DRM_MODE_MATCH_FLAGS; u8 vic; if (!to_match->clock) return 0; if (to_match->picture_aspect_ratio) match_flags |= DRM_MODE_MATCH_ASPECT_RATIO; for (vic = 1; vic < ARRAY_SIZE(edid_4k_modes); vic++) { const struct drm_display_mode *hdmi_mode = &edid_4k_modes[vic]; unsigned int clock1, clock2; /* Make sure to also match alternate clocks */ clock1 = hdmi_mode->clock; clock2 = hdmi_mode_alternate_clock(hdmi_mode); if ((KHZ2PICOS(to_match->clock) == KHZ2PICOS(clock1) || KHZ2PICOS(to_match->clock) == KHZ2PICOS(clock2)) && drm_mode_match(to_match, hdmi_mode, match_flags)) return vic; } return 0; } static bool drm_valid_hdmi_vic(u8 vic) { return vic > 0 && vic < ARRAY_SIZE(edid_4k_modes); } static int add_alternate_cea_modes(struct drm_connector *connector, const struct drm_edid *drm_edid) { struct drm_device *dev = connector->dev; struct drm_display_mode *mode, *tmp; LIST_HEAD(list); int modes = 0; /* Don't add CTA modes if the CTA extension block is missing */ if (!drm_edid_has_cta_extension(drm_edid)) return 0; /* * Go through all probed modes and create a new mode * with the alternate clock for certain CEA modes. */ list_for_each_entry(mode, &connector->probed_modes, head) { const struct drm_display_mode *cea_mode = NULL; struct drm_display_mode *newmode; u8 vic = drm_match_cea_mode(mode); unsigned int clock1, clock2; if (drm_valid_cea_vic(vic)) { cea_mode = cea_mode_for_vic(vic); clock2 = cea_mode_alternate_clock(cea_mode); } else { vic = drm_match_hdmi_mode(mode); if (drm_valid_hdmi_vic(vic)) { cea_mode = &edid_4k_modes[vic]; clock2 = hdmi_mode_alternate_clock(cea_mode); } } if (!cea_mode) continue; clock1 = cea_mode->clock; if (clock1 == clock2) continue; if (mode->clock != clock1 && mode->clock != clock2) continue; newmode = drm_mode_duplicate(dev, cea_mode); if (!newmode) continue; /* Carry over the stereo flags */ newmode->flags |= mode->flags & DRM_MODE_FLAG_3D_MASK; /* * The current mode could be either variant. Make * sure to pick the "other" clock for the new mode. */ if (mode->clock != clock1) newmode->clock = clock1; else newmode->clock = clock2; list_add_tail(&newmode->head, &list); } list_for_each_entry_safe(mode, tmp, &list, head) { list_del(&mode->head); drm_mode_probed_add(connector, mode); modes++; } return modes; } static u8 svd_to_vic(u8 svd) { /* 0-6 bit vic, 7th bit native mode indicator */ if ((svd >= 1 && svd <= 64) || (svd >= 129 && svd <= 192)) return svd & 127; return svd; } /* * Return a display mode for the 0-based vic_index'th VIC across all CTA VDBs in * the EDID, or NULL on errors. */ static struct drm_display_mode * drm_display_mode_from_vic_index(struct drm_connector *connector, int vic_index) { const struct drm_display_info *info = &connector->display_info; struct drm_device *dev = connector->dev; if (!info->vics || vic_index >= info->vics_len || !info->vics[vic_index]) return NULL; return drm_display_mode_from_cea_vic(dev, info->vics[vic_index]); } /* * do_y420vdb_modes - Parse YCBCR 420 only modes * @connector: connector corresponding to the HDMI sink * @svds: start of the data block of CEA YCBCR 420 VDB * @len: length of the CEA YCBCR 420 VDB * * Parse the CEA-861-F YCBCR 420 Video Data Block (Y420VDB) * which contains modes which can be supported in YCBCR 420 * output format only. */ static int do_y420vdb_modes(struct drm_connector *connector, const u8 *svds, u8 svds_len) { struct drm_device *dev = connector->dev; int modes = 0, i; for (i = 0; i < svds_len; i++) { u8 vic = svd_to_vic(svds[i]); struct drm_display_mode *newmode; if (!drm_valid_cea_vic(vic)) continue; newmode = drm_mode_duplicate(dev, cea_mode_for_vic(vic)); if (!newmode) break; drm_mode_probed_add(connector, newmode); modes++; } return modes; } /** * drm_display_mode_from_cea_vic() - return a mode for CEA VIC * @dev: DRM device * @video_code: CEA VIC of the mode * * Creates a new mode matching the specified CEA VIC. * * Returns: A new drm_display_mode on success or NULL on failure */ struct drm_display_mode * drm_display_mode_from_cea_vic(struct drm_device *dev, u8 video_code) { const struct drm_display_mode *cea_mode; struct drm_display_mode *newmode; cea_mode = cea_mode_for_vic(video_code); if (!cea_mode) return NULL; newmode = drm_mode_duplicate(dev, cea_mode); if (!newmode) return NULL; return newmode; } EXPORT_SYMBOL(drm_display_mode_from_cea_vic); /* Add modes based on VICs parsed in parse_cta_vdb() */ static int add_cta_vdb_modes(struct drm_connector *connector) { const struct drm_display_info *info = &connector->display_info; int i, modes = 0; if (!info->vics) return 0; for (i = 0; i < info->vics_len; i++) { struct drm_display_mode *mode; mode = drm_display_mode_from_vic_index(connector, i); if (mode) { drm_mode_probed_add(connector, mode); modes++; } } return modes; } struct stereo_mandatory_mode { int width, height, vrefresh; unsigned int flags; }; static const struct stereo_mandatory_mode stereo_mandatory_modes[] = { { 1920, 1080, 24, DRM_MODE_FLAG_3D_TOP_AND_BOTTOM }, { 1920, 1080, 24, DRM_MODE_FLAG_3D_FRAME_PACKING }, { 1920, 1080, 50, DRM_MODE_FLAG_INTERLACE | DRM_MODE_FLAG_3D_SIDE_BY_SIDE_HALF }, { 1920, 1080, 60, DRM_MODE_FLAG_INTERLACE | DRM_MODE_FLAG_3D_SIDE_BY_SIDE_HALF }, { 1280, 720, 50, DRM_MODE_FLAG_3D_TOP_AND_BOTTOM }, { 1280, 720, 50, DRM_MODE_FLAG_3D_FRAME_PACKING }, { 1280, 720, 60, DRM_MODE_FLAG_3D_TOP_AND_BOTTOM }, { 1280, 720, 60, DRM_MODE_FLAG_3D_FRAME_PACKING } }; static bool stereo_match_mandatory(const struct drm_display_mode *mode, const struct stereo_mandatory_mode *stereo_mode) { unsigned int interlaced = mode->flags & DRM_MODE_FLAG_INTERLACE; return mode->hdisplay == stereo_mode->width && mode->vdisplay == stereo_mode->height && interlaced == (stereo_mode->flags & DRM_MODE_FLAG_INTERLACE) && drm_mode_vrefresh(mode) == stereo_mode->vrefresh; } static int add_hdmi_mandatory_stereo_modes(struct drm_connector *connector) { struct drm_device *dev = connector->dev; const struct drm_display_mode *mode; struct list_head stereo_modes; int modes = 0, i; INIT_LIST_HEAD(&stereo_modes); list_for_each_entry(mode, &connector->probed_modes, head) { for (i = 0; i < ARRAY_SIZE(stereo_mandatory_modes); i++) { const struct stereo_mandatory_mode *mandatory; struct drm_display_mode *new_mode; if (!stereo_match_mandatory(mode, &stereo_mandatory_modes[i])) continue; mandatory = &stereo_mandatory_modes[i]; new_mode = drm_mode_duplicate(dev, mode); if (!new_mode) continue; new_mode->flags |= mandatory->flags; list_add_tail(&new_mode->head, &stereo_modes); modes++; } } list_splice_tail(&stereo_modes, &connector->probed_modes); return modes; } static int add_hdmi_mode(struct drm_connector *connector, u8 vic) { struct drm_device *dev = connector->dev; struct drm_display_mode *newmode; if (!drm_valid_hdmi_vic(vic)) { drm_err(connector->dev, "[CONNECTOR:%d:%s] Unknown HDMI VIC: %d\n", connector->base.id, connector->name, vic); return 0; } newmode = drm_mode_duplicate(dev, &edid_4k_modes[vic]); if (!newmode) return 0; drm_mode_probed_add(connector, newmode); return 1; } static int add_3d_struct_modes(struct drm_connector *connector, u16 structure, int vic_index) { struct drm_display_mode *newmode; int modes = 0; if (structure & (1 << 0)) { newmode = drm_display_mode_from_vic_index(connector, vic_index); if (newmode) { newmode->flags |= DRM_MODE_FLAG_3D_FRAME_PACKING; drm_mode_probed_add(connector, newmode); modes++; } } if (structure & (1 << 6)) { newmode = drm_display_mode_from_vic_index(connector, vic_index); if (newmode) { newmode->flags |= DRM_MODE_FLAG_3D_TOP_AND_BOTTOM; drm_mode_probed_add(connector, newmode); modes++; } } if (structure & (1 << 8)) { newmode = drm_display_mode_from_vic_index(connector, vic_index); if (newmode) { newmode->flags |= DRM_MODE_FLAG_3D_SIDE_BY_SIDE_HALF; drm_mode_probed_add(connector, newmode); modes++; } } return modes; } static bool hdmi_vsdb_latency_present(const u8 *db) { return db[8] & BIT(7); } static bool hdmi_vsdb_i_latency_present(const u8 *db) { return hdmi_vsdb_latency_present(db) && db[8] & BIT(6); } static int hdmi_vsdb_latency_length(const u8 *db) { if (hdmi_vsdb_i_latency_present(db)) return 4; else if (hdmi_vsdb_latency_present(db)) return 2; else return 0; } /* * do_hdmi_vsdb_modes - Parse the HDMI Vendor Specific data block * @connector: connector corresponding to the HDMI sink * @db: start of the CEA vendor specific block * @len: length of the CEA block payload, ie. one can access up to db[len] * * Parses the HDMI VSDB looking for modes to add to @connector. This function * also adds the stereo 3d modes when applicable. */ static int do_hdmi_vsdb_modes(struct drm_connector *connector, const u8 *db, u8 len) { int modes = 0, offset = 0, i, multi_present = 0, multi_len; u8 vic_len, hdmi_3d_len = 0; u16 mask; u16 structure_all; if (len < 8) goto out; /* no HDMI_Video_Present */ if (!(db[8] & (1 << 5))) goto out; offset += hdmi_vsdb_latency_length(db); /* the declared length is not long enough for the 2 first bytes * of additional video format capabilities */ if (len < (8 + offset + 2)) goto out; /* 3D_Present */ offset++; if (db[8 + offset] & (1 << 7)) { modes += add_hdmi_mandatory_stereo_modes(connector); /* 3D_Multi_present */ multi_present = (db[8 + offset] & 0x60) >> 5; } offset++; vic_len = db[8 + offset] >> 5; hdmi_3d_len = db[8 + offset] & 0x1f; for (i = 0; i < vic_len && len >= (9 + offset + i); i++) { u8 vic; vic = db[9 + offset + i]; modes += add_hdmi_mode(connector, vic); } offset += 1 + vic_len; if (multi_present == 1) multi_len = 2; else if (multi_present == 2) multi_len = 4; else multi_len = 0; if (len < (8 + offset + hdmi_3d_len - 1)) goto out; if (hdmi_3d_len < multi_len) goto out; if (multi_present == 1 || multi_present == 2) { /* 3D_Structure_ALL */ structure_all = (db[8 + offset] << 8) | db[9 + offset]; /* check if 3D_MASK is present */ if (multi_present == 2) mask = (db[10 + offset] << 8) | db[11 + offset]; else mask = 0xffff; for (i = 0; i < 16; i++) { if (mask & (1 << i)) modes += add_3d_struct_modes(connector, structure_all, i); } } offset += multi_len; for (i = 0; i < (hdmi_3d_len - multi_len); i++) { int vic_index; struct drm_display_mode *newmode = NULL; unsigned int newflag = 0; bool detail_present; detail_present = ((db[8 + offset + i] & 0x0f) > 7); if (detail_present && (i + 1 == hdmi_3d_len - multi_len)) break; /* 2D_VIC_order_X */ vic_index = db[8 + offset + i] >> 4; /* 3D_Structure_X */ switch (db[8 + offset + i] & 0x0f) { case 0: newflag = DRM_MODE_FLAG_3D_FRAME_PACKING; break; case 6: newflag = DRM_MODE_FLAG_3D_TOP_AND_BOTTOM; break; case 8: /* 3D_Detail_X */ if ((db[9 + offset + i] >> 4) == 1) newflag = DRM_MODE_FLAG_3D_SIDE_BY_SIDE_HALF; break; } if (newflag != 0) { newmode = drm_display_mode_from_vic_index(connector, vic_index); if (newmode) { newmode->flags |= newflag; drm_mode_probed_add(connector, newmode); modes++; } } if (detail_present) i++; } out: return modes; } static int cea_revision(const u8 *cea) { /* * FIXME is this correct for the DispID variant? * The DispID spec doesn't really specify whether * this is the revision of the CEA extension or * the DispID CEA data block. And the only value * given as an example is 0. */ return cea[1]; } /* * CTA Data Block iterator. * * Iterate through all CTA Data Blocks in both EDID CTA Extensions and DisplayID * CTA Data Blocks. * * struct cea_db *db: * struct cea_db_iter iter; * * cea_db_iter_edid_begin(edid, &iter); * cea_db_iter_for_each(db, &iter) { * // do stuff with db * } * cea_db_iter_end(&iter); */ struct cea_db_iter { struct drm_edid_iter edid_iter; struct displayid_iter displayid_iter; /* Current Data Block Collection. */ const u8 *collection; /* Current Data Block index in current collection. */ int index; /* End index in current collection. */ int end; }; /* CTA-861-H section 7.4 CTA Data BLock Collection */ struct cea_db { u8 tag_length; u8 data[]; } __packed; static int cea_db_tag(const struct cea_db *db) { return db->tag_length >> 5; } static int cea_db_payload_len(const void *_db) { /* FIXME: Transition to passing struct cea_db * everywhere. */ const struct cea_db *db = _db; return db->tag_length & 0x1f; } static const void *cea_db_data(const struct cea_db *db) { return db->data; } static bool cea_db_is_extended_tag(const struct cea_db *db, int tag) { return cea_db_tag(db) == CTA_DB_EXTENDED_TAG && cea_db_payload_len(db) >= 1 && db->data[0] == tag; } static bool cea_db_is_vendor(const struct cea_db *db, int vendor_oui) { const u8 *data = cea_db_data(db); return cea_db_tag(db) == CTA_DB_VENDOR && cea_db_payload_len(db) >= 3 && oui(data[2], data[1], data[0]) == vendor_oui; } static void cea_db_iter_edid_begin(const struct drm_edid *drm_edid, struct cea_db_iter *iter) { memset(iter, 0, sizeof(*iter)); drm_edid_iter_begin(drm_edid, &iter->edid_iter); displayid_iter_edid_begin(drm_edid, &iter->displayid_iter); } static const struct cea_db * __cea_db_iter_current_block(const struct cea_db_iter *iter) { const struct cea_db *db; if (!iter->collection) return NULL; db = (const struct cea_db *)&iter->collection[iter->index]; if (iter->index + sizeof(*db) <= iter->end && iter->index + sizeof(*db) + cea_db_payload_len(db) <= iter->end) return db; return NULL; } /* * References: * - CTA-861-H section 7.3.3 CTA Extension Version 3 */ static int cea_db_collection_size(const u8 *cta) { u8 d = cta[2]; if (d < 4 || d > 127) return 0; return d - 4; } /* * References: * - VESA E-EDID v1.4 * - CTA-861-H section 7.3.3 CTA Extension Version 3 */ static const void *__cea_db_iter_edid_next(struct cea_db_iter *iter) { const u8 *ext; drm_edid_iter_for_each(ext, &iter->edid_iter) { int size; /* Only support CTA Extension revision 3+ */ if (ext[0] != CEA_EXT || cea_revision(ext) < 3) continue; size = cea_db_collection_size(ext); if (!size) continue; iter->index = 4; iter->end = iter->index + size; return ext; } return NULL; } /* * References: * - DisplayID v1.3 Appendix C: CEA Data Block within a DisplayID Data Block * - DisplayID v2.0 section 4.10 CTA DisplayID Data Block * * Note that the above do not specify any connection between DisplayID Data * Block revision and CTA Extension versions. */ static const void *__cea_db_iter_displayid_next(struct cea_db_iter *iter) { const struct displayid_block *block; displayid_iter_for_each(block, &iter->displayid_iter) { if (block->tag != DATA_BLOCK_CTA) continue; /* * The displayid iterator has already verified the block bounds * in displayid_iter_block(). */ iter->index = sizeof(*block); iter->end = iter->index + block->num_bytes; return block; } return NULL; } static const struct cea_db *__cea_db_iter_next(struct cea_db_iter *iter) { const struct cea_db *db; if (iter->collection) { /* Current collection should always be valid. */ db = __cea_db_iter_current_block(iter); if (WARN_ON(!db)) { iter->collection = NULL; return NULL; } /* Next block in CTA Data Block Collection */ iter->index += sizeof(*db) + cea_db_payload_len(db); db = __cea_db_iter_current_block(iter); if (db) return db; } for (;;) { /* * Find the next CTA Data Block Collection. First iterate all * the EDID CTA Extensions, then all the DisplayID CTA blocks. * * Per DisplayID v1.3 Appendix B: DisplayID as an EDID * Extension, it's recommended that DisplayID extensions are * exposed after all of the CTA Extensions. */ iter->collection = __cea_db_iter_edid_next(iter); if (!iter->collection) iter->collection = __cea_db_iter_displayid_next(iter); if (!iter->collection) return NULL; db = __cea_db_iter_current_block(iter); if (db) return db; } } #define cea_db_iter_for_each(__db, __iter) \ while (((__db) = __cea_db_iter_next(__iter))) static void cea_db_iter_end(struct cea_db_iter *iter) { displayid_iter_end(&iter->displayid_iter); drm_edid_iter_end(&iter->edid_iter); memset(iter, 0, sizeof(*iter)); } static bool cea_db_is_hdmi_vsdb(const struct cea_db *db) { return cea_db_is_vendor(db, HDMI_IEEE_OUI) && cea_db_payload_len(db) >= 5; } static bool cea_db_is_hdmi_forum_vsdb(const struct cea_db *db) { return cea_db_is_vendor(db, HDMI_FORUM_IEEE_OUI) && cea_db_payload_len(db) >= 7; } static bool cea_db_is_hdmi_forum_eeodb(const void *db) { return cea_db_is_extended_tag(db, CTA_EXT_DB_HF_EEODB) && cea_db_payload_len(db) >= 2; } static bool cea_db_is_microsoft_vsdb(const struct cea_db *db) { return cea_db_is_vendor(db, MICROSOFT_IEEE_OUI) && cea_db_payload_len(db) == 21; } static bool cea_db_is_vcdb(const struct cea_db *db) { return cea_db_is_extended_tag(db, CTA_EXT_DB_VIDEO_CAP) && cea_db_payload_len(db) == 2; } static bool cea_db_is_hdmi_forum_scdb(const struct cea_db *db) { return cea_db_is_extended_tag(db, CTA_EXT_DB_HF_SCDB) && cea_db_payload_len(db) >= 7; } static bool cea_db_is_y420cmdb(const struct cea_db *db) { return cea_db_is_extended_tag(db, CTA_EXT_DB_420_VIDEO_CAP_MAP); } static bool cea_db_is_y420vdb(const struct cea_db *db) { return cea_db_is_extended_tag(db, CTA_EXT_DB_420_VIDEO_DATA); } static bool cea_db_is_hdmi_hdr_metadata_block(const struct cea_db *db) { return cea_db_is_extended_tag(db, CTA_EXT_DB_HDR_STATIC_METADATA) && cea_db_payload_len(db) >= 3; } /* * Get the HF-EEODB override extension block count from EDID. * * The passed in EDID may be partially read, as long as it has at least two * blocks (base block and one extension block) if EDID extension count is > 0. * * Note that this is *not* how you should parse CTA Data Blocks in general; this * is only to handle partially read EDIDs. Normally, use the CTA Data Block * iterators instead. * * References: * - HDMI 2.1 section 10.3.6 HDMI Forum EDID Extension Override Data Block */ static int edid_hfeeodb_extension_block_count(const struct edid *edid) { const u8 *cta; /* No extensions according to base block, no HF-EEODB. */ if (!edid_extension_block_count(edid)) return 0; /* HF-EEODB is always in the first EDID extension block only */ cta = edid_extension_block_data(edid, 0); if (edid_block_tag(cta) != CEA_EXT || cea_revision(cta) < 3) return 0; /* Need to have the data block collection, and at least 3 bytes. */ if (cea_db_collection_size(cta) < 3) return 0; /* * Sinks that include the HF-EEODB in their E-EDID shall include one and * only one instance of the HF-EEODB in the E-EDID, occupying bytes 4 * through 6 of Block 1 of the E-EDID. */ if (!cea_db_is_hdmi_forum_eeodb(&cta[4])) return 0; return cta[4 + 2]; } /* * CTA-861 YCbCr 4:2:0 Capability Map Data Block (CTA Y420CMDB) * * Y420CMDB contains a bitmap which gives the index of CTA modes from CTA VDB, * which can support YCBCR 420 sampling output also (apart from RGB/YCBCR444 * etc). For example, if the bit 0 in bitmap is set, first mode in VDB can * support YCBCR420 output too. */ static void parse_cta_y420cmdb(struct drm_connector *connector, const struct cea_db *db, u64 *y420cmdb_map) { struct drm_display_info *info = &connector->display_info; int i, map_len = cea_db_payload_len(db) - 1; const u8 *data = cea_db_data(db) + 1; u64 map = 0; if (map_len == 0) { /* All CEA modes support ycbcr420 sampling also.*/ map = U64_MAX; goto out; } /* * This map indicates which of the existing CEA block modes * from VDB can support YCBCR420 output too. So if bit=0 is * set, first mode from VDB can support YCBCR420 output too. * We will parse and keep this map, before parsing VDB itself * to avoid going through the same block again and again. * * Spec is not clear about max possible size of this block. * Clamping max bitmap block size at 8 bytes. Every byte can * address 8 CEA modes, in this way this map can address * 8*8 = first 64 SVDs. */ if (WARN_ON_ONCE(map_len > 8)) map_len = 8; for (i = 0; i < map_len; i++) map |= (u64)data[i] << (8 * i); out: if (map) info->color_formats |= DRM_COLOR_FORMAT_YCBCR420; *y420cmdb_map = map; } static int add_cea_modes(struct drm_connector *connector, const struct drm_edid *drm_edid) { const struct cea_db *db; struct cea_db_iter iter; int modes; /* CTA VDB block VICs parsed earlier */ modes = add_cta_vdb_modes(connector); cea_db_iter_edid_begin(drm_edid, &iter); cea_db_iter_for_each(db, &iter) { if (cea_db_is_hdmi_vsdb(db)) { modes += do_hdmi_vsdb_modes(connector, (const u8 *)db, cea_db_payload_len(db)); } else if (cea_db_is_y420vdb(db)) { const u8 *vdb420 = cea_db_data(db) + 1; /* Add 4:2:0(only) modes present in EDID */ modes += do_y420vdb_modes(connector, vdb420, cea_db_payload_len(db) - 1); } } cea_db_iter_end(&iter); return modes; } static void fixup_detailed_cea_mode_clock(struct drm_connector *connector, struct drm_display_mode *mode) { const struct drm_display_mode *cea_mode; int clock1, clock2, clock; u8 vic; const char *type; /* * allow 5kHz clock difference either way to account for * the 10kHz clock resolution limit of detailed timings. */ vic = drm_match_cea_mode_clock_tolerance(mode, 5); if (drm_valid_cea_vic(vic)) { type = "CEA"; cea_mode = cea_mode_for_vic(vic); clock1 = cea_mode->clock; clock2 = cea_mode_alternate_clock(cea_mode); } else { vic = drm_match_hdmi_mode_clock_tolerance(mode, 5); if (drm_valid_hdmi_vic(vic)) { type = "HDMI"; cea_mode = &edid_4k_modes[vic]; clock1 = cea_mode->clock; clock2 = hdmi_mode_alternate_clock(cea_mode); } else { return; } } /* pick whichever is closest */ if (abs(mode->clock - clock1) < abs(mode->clock - clock2)) clock = clock1; else clock = clock2; if (mode->clock == clock) return; drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] detailed mode matches %s VIC %d, adjusting clock %d -> %d\n", connector->base.id, connector->name, type, vic, mode->clock, clock); mode->clock = clock; } static void drm_calculate_luminance_range(struct drm_connector *connector) { struct hdr_static_metadata *hdr_metadata = &connector->hdr_sink_metadata.hdmi_type1; struct drm_luminance_range_info *luminance_range = &connector->display_info.luminance_range; static const u8 pre_computed_values[] = { 50, 51, 52, 53, 55, 56, 57, 58, 59, 61, 62, 63, 65, 66, 68, 69, 71, 72, 74, 75, 77, 79, 81, 82, 84, 86, 88, 90, 92, 94, 96, 98 }; u32 max_avg, min_cll, max, min, q, r; if (!(hdr_metadata->metadata_type & BIT(HDMI_STATIC_METADATA_TYPE1))) return; max_avg = hdr_metadata->max_fall; min_cll = hdr_metadata->min_cll; /* * From the specification (CTA-861-G), for calculating the maximum * luminance we need to use: * Luminance = 50*2**(CV/32) * Where CV is a one-byte value. * For calculating this expression we may need float point precision; * to avoid this complexity level, we take advantage that CV is divided * by a constant. From the Euclids division algorithm, we know that CV * can be written as: CV = 32*q + r. Next, we replace CV in the * Luminance expression and get 50*(2**q)*(2**(r/32)), hence we just * need to pre-compute the value of r/32. For pre-computing the values * We just used the following Ruby line: * (0...32).each {|cv| puts (50*2**(cv/32.0)).round} * The results of the above expressions can be verified at * pre_computed_values. */ q = max_avg >> 5; r = max_avg % 32; max = (1 << q) * pre_computed_values[r]; /* min luminance: maxLum * (CV/255)^2 / 100 */ q = DIV_ROUND_CLOSEST(min_cll, 255); min = max * DIV_ROUND_CLOSEST((q * q), 100); luminance_range->min_luminance = min; luminance_range->max_luminance = max; } static uint8_t eotf_supported(const u8 *edid_ext) { return edid_ext[2] & (BIT(HDMI_EOTF_TRADITIONAL_GAMMA_SDR) | BIT(HDMI_EOTF_TRADITIONAL_GAMMA_HDR) | BIT(HDMI_EOTF_SMPTE_ST2084) | BIT(HDMI_EOTF_BT_2100_HLG)); } static uint8_t hdr_metadata_type(const u8 *edid_ext) { return edid_ext[3] & BIT(HDMI_STATIC_METADATA_TYPE1); } static void drm_parse_hdr_metadata_block(struct drm_connector *connector, const u8 *db) { u16 len; len = cea_db_payload_len(db); connector->hdr_sink_metadata.hdmi_type1.eotf = eotf_supported(db); connector->hdr_sink_metadata.hdmi_type1.metadata_type = hdr_metadata_type(db); if (len >= 4) connector->hdr_sink_metadata.hdmi_type1.max_cll = db[4]; if (len >= 5) connector->hdr_sink_metadata.hdmi_type1.max_fall = db[5]; if (len >= 6) { connector->hdr_sink_metadata.hdmi_type1.min_cll = db[6]; /* Calculate only when all values are available */ drm_calculate_luminance_range(connector); } } /* HDMI Vendor-Specific Data Block (HDMI VSDB, H14b-VSDB) */ static void drm_parse_hdmi_vsdb_audio(struct drm_connector *connector, const u8 *db) { u8 len = cea_db_payload_len(db); if (len >= 6 && (db[6] & (1 << 7))) connector->eld[DRM_ELD_SAD_COUNT_CONN_TYPE] |= DRM_ELD_SUPPORTS_AI; if (len >= 10 && hdmi_vsdb_latency_present(db)) { connector->latency_present[0] = true; connector->video_latency[0] = db[9]; connector->audio_latency[0] = db[10]; } if (len >= 12 && hdmi_vsdb_i_latency_present(db)) { connector->latency_present[1] = true; connector->video_latency[1] = db[11]; connector->audio_latency[1] = db[12]; } drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] HDMI: latency present %d %d, video latency %d %d, audio latency %d %d\n", connector->base.id, connector->name, connector->latency_present[0], connector->latency_present[1], connector->video_latency[0], connector->video_latency[1], connector->audio_latency[0], connector->audio_latency[1]); } static void match_identity(const struct detailed_timing *timing, void *data) { struct drm_edid_match_closure *closure = data; unsigned int i; const char *name = closure->ident->name; unsigned int name_len = strlen(name); const char *desc = timing->data.other_data.data.str.str; unsigned int desc_len = ARRAY_SIZE(timing->data.other_data.data.str.str); if (name_len > desc_len || !(is_display_descriptor(timing, EDID_DETAIL_MONITOR_NAME) || is_display_descriptor(timing, EDID_DETAIL_MONITOR_STRING))) return; if (strncmp(name, desc, name_len)) return; for (i = name_len; i < desc_len; i++) { if (desc[i] == '\n') break; /* Allow white space before EDID string terminator. */ if (!isspace(desc[i])) return; } closure->matched = true; } /** * drm_edid_match - match drm_edid with given identity * @drm_edid: EDID * @ident: the EDID identity to match with * * Check if the EDID matches with the given identity. * * Return: True if the given identity matched with EDID, false otherwise. */ bool drm_edid_match(const struct drm_edid *drm_edid, const struct drm_edid_ident *ident) { if (!drm_edid || drm_edid_get_panel_id(drm_edid) != ident->panel_id) return false; /* Match with name only if it's not NULL. */ if (ident->name) { struct drm_edid_match_closure closure = { .ident = ident, .matched = false, }; drm_for_each_detailed_block(drm_edid, match_identity, &closure); return closure.matched; } return true; } EXPORT_SYMBOL(drm_edid_match); static void monitor_name(const struct detailed_timing *timing, void *data) { const char **res = data; if (!is_display_descriptor(timing, EDID_DETAIL_MONITOR_NAME)) return; *res = timing->data.other_data.data.str.str; } static int get_monitor_name(const struct drm_edid *drm_edid, char name[13]) { const char *edid_name = NULL; int mnl; if (!drm_edid || !name) return 0; drm_for_each_detailed_block(drm_edid, monitor_name, &edid_name); for (mnl = 0; edid_name && mnl < 13; mnl++) { if (edid_name[mnl] == 0x0a) break; name[mnl] = edid_name[mnl]; } return mnl; } /** * drm_edid_get_monitor_name - fetch the monitor name from the edid * @edid: monitor EDID information * @name: pointer to a character array to hold the name of the monitor * @bufsize: The size of the name buffer (should be at least 14 chars.) * */ void drm_edid_get_monitor_name(const struct edid *edid, char *name, int bufsize) { int name_length = 0; if (bufsize <= 0) return; if (edid) { char buf[13]; struct drm_edid drm_edid = { .edid = edid, .size = edid_size(edid), }; name_length = min(get_monitor_name(&drm_edid, buf), bufsize - 1); memcpy(name, buf, name_length); } name[name_length] = '\0'; } EXPORT_SYMBOL(drm_edid_get_monitor_name); static void clear_eld(struct drm_connector *connector) { memset(connector->eld, 0, sizeof(connector->eld)); connector->latency_present[0] = false; connector->latency_present[1] = false; connector->video_latency[0] = 0; connector->audio_latency[0] = 0; connector->video_latency[1] = 0; connector->audio_latency[1] = 0; } /* * Get 3-byte SAD buffer from struct cea_sad. */ void drm_edid_cta_sad_get(const struct cea_sad *cta_sad, u8 *sad) { sad[0] = cta_sad->format << 3 | cta_sad->channels; sad[1] = cta_sad->freq; sad[2] = cta_sad->byte2; } /* * Set struct cea_sad from 3-byte SAD buffer. */ void drm_edid_cta_sad_set(struct cea_sad *cta_sad, const u8 *sad) { cta_sad->format = (sad[0] & 0x78) >> 3; cta_sad->channels = sad[0] & 0x07; cta_sad->freq = sad[1] & 0x7f; cta_sad->byte2 = sad[2]; } /* * drm_edid_to_eld - build ELD from EDID * @connector: connector corresponding to the HDMI/DP sink * @drm_edid: EDID to parse * * Fill the ELD (EDID-Like Data) buffer for passing to the audio driver. The * HDCP and Port_ID ELD fields are left for the graphics driver to fill in. */ static void drm_edid_to_eld(struct drm_connector *connector, const struct drm_edid *drm_edid) { const struct drm_display_info *info = &connector->display_info; const struct cea_db *db; struct cea_db_iter iter; uint8_t *eld = connector->eld; int total_sad_count = 0; int mnl; if (!drm_edid) return; mnl = get_monitor_name(drm_edid, &eld[DRM_ELD_MONITOR_NAME_STRING]); drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] ELD monitor %s\n", connector->base.id, connector->name, &eld[DRM_ELD_MONITOR_NAME_STRING]); eld[DRM_ELD_CEA_EDID_VER_MNL] = info->cea_rev << DRM_ELD_CEA_EDID_VER_SHIFT; eld[DRM_ELD_CEA_EDID_VER_MNL] |= mnl; eld[DRM_ELD_VER] = DRM_ELD_VER_CEA861D; eld[DRM_ELD_MANUFACTURER_NAME0] = drm_edid->edid->mfg_id[0]; eld[DRM_ELD_MANUFACTURER_NAME1] = drm_edid->edid->mfg_id[1]; eld[DRM_ELD_PRODUCT_CODE0] = drm_edid->edid->prod_code[0]; eld[DRM_ELD_PRODUCT_CODE1] = drm_edid->edid->prod_code[1]; cea_db_iter_edid_begin(drm_edid, &iter); cea_db_iter_for_each(db, &iter) { const u8 *data = cea_db_data(db); int len = cea_db_payload_len(db); int sad_count; switch (cea_db_tag(db)) { case CTA_DB_AUDIO: /* Audio Data Block, contains SADs */ sad_count = min(len / 3, 15 - total_sad_count); if (sad_count >= 1) memcpy(&eld[DRM_ELD_CEA_SAD(mnl, total_sad_count)], data, sad_count * 3); total_sad_count += sad_count; break; case CTA_DB_SPEAKER: /* Speaker Allocation Data Block */ if (len >= 1) eld[DRM_ELD_SPEAKER] = data[0]; break; case CTA_DB_VENDOR: /* HDMI Vendor-Specific Data Block */ if (cea_db_is_hdmi_vsdb(db)) drm_parse_hdmi_vsdb_audio(connector, (const u8 *)db); break; default: break; } } cea_db_iter_end(&iter); eld[DRM_ELD_SAD_COUNT_CONN_TYPE] |= total_sad_count << DRM_ELD_SAD_COUNT_SHIFT; if (connector->connector_type == DRM_MODE_CONNECTOR_DisplayPort || connector->connector_type == DRM_MODE_CONNECTOR_eDP) eld[DRM_ELD_SAD_COUNT_CONN_TYPE] |= DRM_ELD_CONN_TYPE_DP; else eld[DRM_ELD_SAD_COUNT_CONN_TYPE] |= DRM_ELD_CONN_TYPE_HDMI; eld[DRM_ELD_BASELINE_ELD_LEN] = DIV_ROUND_UP(drm_eld_calc_baseline_block_size(eld), 4); drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] ELD size %d, SAD count %d\n", connector->base.id, connector->name, drm_eld_size(eld), total_sad_count); } static int _drm_edid_to_sad(const struct drm_edid *drm_edid, struct cea_sad **psads) { const struct cea_db *db; struct cea_db_iter iter; int count = 0; cea_db_iter_edid_begin(drm_edid, &iter); cea_db_iter_for_each(db, &iter) { if (cea_db_tag(db) == CTA_DB_AUDIO) { struct cea_sad *sads; int i; count = cea_db_payload_len(db) / 3; /* SAD is 3B */ sads = kcalloc(count, sizeof(*sads), GFP_KERNEL); *psads = sads; if (!sads) return -ENOMEM; for (i = 0; i < count; i++) drm_edid_cta_sad_set(&sads[i], &db->data[i * 3]); break; } } cea_db_iter_end(&iter); DRM_DEBUG_KMS("Found %d Short Audio Descriptors\n", count); return count; } /** * drm_edid_to_sad - extracts SADs from EDID * @edid: EDID to parse * @sads: pointer that will be set to the extracted SADs * * Looks for CEA EDID block and extracts SADs (Short Audio Descriptors) from it. * * Note: The returned pointer needs to be freed using kfree(). * * Return: The number of found SADs or negative number on error. */ int drm_edid_to_sad(const struct edid *edid, struct cea_sad **sads) { struct drm_edid drm_edid; return _drm_edid_to_sad(drm_edid_legacy_init(&drm_edid, edid), sads); } EXPORT_SYMBOL(drm_edid_to_sad); static int _drm_edid_to_speaker_allocation(const struct drm_edid *drm_edid, u8 **sadb) { const struct cea_db *db; struct cea_db_iter iter; int count = 0; cea_db_iter_edid_begin(drm_edid, &iter); cea_db_iter_for_each(db, &iter) { if (cea_db_tag(db) == CTA_DB_SPEAKER && cea_db_payload_len(db) == 3) { *sadb = kmemdup(db->data, cea_db_payload_len(db), GFP_KERNEL); if (!*sadb) return -ENOMEM; count = cea_db_payload_len(db); break; } } cea_db_iter_end(&iter); DRM_DEBUG_KMS("Found %d Speaker Allocation Data Blocks\n", count); return count; } /** * drm_edid_to_speaker_allocation - extracts Speaker Allocation Data Blocks from EDID * @edid: EDID to parse * @sadb: pointer to the speaker block * * Looks for CEA EDID block and extracts the Speaker Allocation Data Block from it. * * Note: The returned pointer needs to be freed using kfree(). * * Return: The number of found Speaker Allocation Blocks or negative number on * error. */ int drm_edid_to_speaker_allocation(const struct edid *edid, u8 **sadb) { struct drm_edid drm_edid; return _drm_edid_to_speaker_allocation(drm_edid_legacy_init(&drm_edid, edid), sadb); } EXPORT_SYMBOL(drm_edid_to_speaker_allocation); /** * drm_av_sync_delay - compute the HDMI/DP sink audio-video sync delay * @connector: connector associated with the HDMI/DP sink * @mode: the display mode * * Return: The HDMI/DP sink's audio-video sync delay in milliseconds or 0 if * the sink doesn't support audio or video. */ int drm_av_sync_delay(struct drm_connector *connector, const struct drm_display_mode *mode) { int i = !!(mode->flags & DRM_MODE_FLAG_INTERLACE); int a, v; if (!connector->latency_present[0]) return 0; if (!connector->latency_present[1]) i = 0; a = connector->audio_latency[i]; v = connector->video_latency[i]; /* * HDMI/DP sink doesn't support audio or video? */ if (a == 255 || v == 255) return 0; /* * Convert raw EDID values to millisecond. * Treat unknown latency as 0ms. */ if (a) a = min(2 * (a - 1), 500); if (v) v = min(2 * (v - 1), 500); return max(v - a, 0); } EXPORT_SYMBOL(drm_av_sync_delay); static bool _drm_detect_hdmi_monitor(const struct drm_edid *drm_edid) { const struct cea_db *db; struct cea_db_iter iter; bool hdmi = false; /* * Because HDMI identifier is in Vendor Specific Block, * search it from all data blocks of CEA extension. */ cea_db_iter_edid_begin(drm_edid, &iter); cea_db_iter_for_each(db, &iter) { if (cea_db_is_hdmi_vsdb(db)) { hdmi = true; break; } } cea_db_iter_end(&iter); return hdmi; } /** * drm_detect_hdmi_monitor - detect whether monitor is HDMI * @edid: monitor EDID information * * Parse the CEA extension according to CEA-861-B. * * Drivers that have added the modes parsed from EDID to drm_display_info * should use &drm_display_info.is_hdmi instead of calling this function. * * Return: True if the monitor is HDMI, false if not or unknown. */ bool drm_detect_hdmi_monitor(const struct edid *edid) { struct drm_edid drm_edid; return _drm_detect_hdmi_monitor(drm_edid_legacy_init(&drm_edid, edid)); } EXPORT_SYMBOL(drm_detect_hdmi_monitor); static bool _drm_detect_monitor_audio(const struct drm_edid *drm_edid) { struct drm_edid_iter edid_iter; const struct cea_db *db; struct cea_db_iter iter; const u8 *edid_ext; bool has_audio = false; drm_edid_iter_begin(drm_edid, &edid_iter); drm_edid_iter_for_each(edid_ext, &edid_iter) { if (edid_ext[0] == CEA_EXT) { has_audio = edid_ext[3] & EDID_BASIC_AUDIO; if (has_audio) break; } } drm_edid_iter_end(&edid_iter); if (has_audio) { DRM_DEBUG_KMS("Monitor has basic audio support\n"); goto end; } cea_db_iter_edid_begin(drm_edid, &iter); cea_db_iter_for_each(db, &iter) { if (cea_db_tag(db) == CTA_DB_AUDIO) { const u8 *data = cea_db_data(db); int i; for (i = 0; i < cea_db_payload_len(db); i += 3) DRM_DEBUG_KMS("CEA audio format %d\n", (data[i] >> 3) & 0xf); has_audio = true; break; } } cea_db_iter_end(&iter); end: return has_audio; } /** * drm_detect_monitor_audio - check monitor audio capability * @edid: EDID block to scan * * Monitor should have CEA extension block. * If monitor has 'basic audio', but no CEA audio blocks, it's 'basic * audio' only. If there is any audio extension block and supported * audio format, assume at least 'basic audio' support, even if 'basic * audio' is not defined in EDID. * * Return: True if the monitor supports audio, false otherwise. */ bool drm_detect_monitor_audio(const struct edid *edid) { struct drm_edid drm_edid; return _drm_detect_monitor_audio(drm_edid_legacy_init(&drm_edid, edid)); } EXPORT_SYMBOL(drm_detect_monitor_audio); /** * drm_default_rgb_quant_range - default RGB quantization range * @mode: display mode * * Determine the default RGB quantization range for the mode, * as specified in CEA-861. * * Return: The default RGB quantization range for the mode */ enum hdmi_quantization_range drm_default_rgb_quant_range(const struct drm_display_mode *mode) { /* All CEA modes other than VIC 1 use limited quantization range. */ return drm_match_cea_mode(mode) > 1 ? HDMI_QUANTIZATION_RANGE_LIMITED : HDMI_QUANTIZATION_RANGE_FULL; } EXPORT_SYMBOL(drm_default_rgb_quant_range); /* CTA-861 Video Data Block (CTA VDB) */ static void parse_cta_vdb(struct drm_connector *connector, const struct cea_db *db) { struct drm_display_info *info = &connector->display_info; int i, vic_index, len = cea_db_payload_len(db); const u8 *svds = cea_db_data(db); u8 *vics; if (!len) return; /* Gracefully handle multiple VDBs, however unlikely that is */ vics = krealloc(info->vics, info->vics_len + len, GFP_KERNEL); if (!vics) return; vic_index = info->vics_len; info->vics_len += len; info->vics = vics; for (i = 0; i < len; i++) { u8 vic = svd_to_vic(svds[i]); if (!drm_valid_cea_vic(vic)) vic = 0; info->vics[vic_index++] = vic; } } /* * Update y420_cmdb_modes based on previously parsed CTA VDB and Y420CMDB. * * Translate the y420cmdb_map based on VIC indexes to y420_cmdb_modes indexed * using the VICs themselves. */ static void update_cta_y420cmdb(struct drm_connector *connector, u64 y420cmdb_map) { struct drm_display_info *info = &connector->display_info; struct drm_hdmi_info *hdmi = &info->hdmi; int i, len = min_t(int, info->vics_len, BITS_PER_TYPE(y420cmdb_map)); for (i = 0; i < len; i++) { u8 vic = info->vics[i]; if (vic && y420cmdb_map & BIT_ULL(i)) bitmap_set(hdmi->y420_cmdb_modes, vic, 1); } } static bool cta_vdb_has_vic(const struct drm_connector *connector, u8 vic) { const struct drm_display_info *info = &connector->display_info; int i; if (!vic || !info->vics) return false; for (i = 0; i < info->vics_len; i++) { if (info->vics[i] == vic) return true; } return false; } /* CTA-861-H YCbCr 4:2:0 Video Data Block (CTA Y420VDB) */ static void parse_cta_y420vdb(struct drm_connector *connector, const struct cea_db *db) { struct drm_display_info *info = &connector->display_info; struct drm_hdmi_info *hdmi = &info->hdmi; const u8 *svds = cea_db_data(db) + 1; int i; for (i = 0; i < cea_db_payload_len(db) - 1; i++) { u8 vic = svd_to_vic(svds[i]); if (!drm_valid_cea_vic(vic)) continue; bitmap_set(hdmi->y420_vdb_modes, vic, 1); info->color_formats |= DRM_COLOR_FORMAT_YCBCR420; } } static void drm_parse_vcdb(struct drm_connector *connector, const u8 *db) { struct drm_display_info *info = &connector->display_info; drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] CEA VCDB 0x%02x\n", connector->base.id, connector->name, db[2]); if (db[2] & EDID_CEA_VCDB_QS) info->rgb_quant_range_selectable = true; } static void drm_get_max_frl_rate(int max_frl_rate, u8 *max_lanes, u8 *max_rate_per_lane) { switch (max_frl_rate) { case 1: *max_lanes = 3; *max_rate_per_lane = 3; break; case 2: *max_lanes = 3; *max_rate_per_lane = 6; break; case 3: *max_lanes = 4; *max_rate_per_lane = 6; break; case 4: *max_lanes = 4; *max_rate_per_lane = 8; break; case 5: *max_lanes = 4; *max_rate_per_lane = 10; break; case 6: *max_lanes = 4; *max_rate_per_lane = 12; break; case 0: default: *max_lanes = 0; *max_rate_per_lane = 0; } } static void drm_parse_ycbcr420_deep_color_info(struct drm_connector *connector, const u8 *db) { u8 dc_mask; struct drm_hdmi_info *hdmi = &connector->display_info.hdmi; dc_mask = db[7] & DRM_EDID_YCBCR420_DC_MASK; hdmi->y420_dc_modes = dc_mask; } static void drm_parse_dsc_info(struct drm_hdmi_dsc_cap *hdmi_dsc, const u8 *hf_scds) { hdmi_dsc->v_1p2 = hf_scds[11] & DRM_EDID_DSC_1P2; if (!hdmi_dsc->v_1p2) return; hdmi_dsc->native_420 = hf_scds[11] & DRM_EDID_DSC_NATIVE_420; hdmi_dsc->all_bpp = hf_scds[11] & DRM_EDID_DSC_ALL_BPP; if (hf_scds[11] & DRM_EDID_DSC_16BPC) hdmi_dsc->bpc_supported = 16; else if (hf_scds[11] & DRM_EDID_DSC_12BPC) hdmi_dsc->bpc_supported = 12; else if (hf_scds[11] & DRM_EDID_DSC_10BPC) hdmi_dsc->bpc_supported = 10; else /* Supports min 8 BPC if DSC 1.2 is supported*/ hdmi_dsc->bpc_supported = 8; if (cea_db_payload_len(hf_scds) >= 12 && hf_scds[12]) { u8 dsc_max_slices; u8 dsc_max_frl_rate; dsc_max_frl_rate = (hf_scds[12] & DRM_EDID_DSC_MAX_FRL_RATE_MASK) >> 4; drm_get_max_frl_rate(dsc_max_frl_rate, &hdmi_dsc->max_lanes, &hdmi_dsc->max_frl_rate_per_lane); dsc_max_slices = hf_scds[12] & DRM_EDID_DSC_MAX_SLICES; switch (dsc_max_slices) { case 1: hdmi_dsc->max_slices = 1; hdmi_dsc->clk_per_slice = 340; break; case 2: hdmi_dsc->max_slices = 2; hdmi_dsc->clk_per_slice = 340; break; case 3: hdmi_dsc->max_slices = 4; hdmi_dsc->clk_per_slice = 340; break; case 4: hdmi_dsc->max_slices = 8; hdmi_dsc->clk_per_slice = 340; break; case 5: hdmi_dsc->max_slices = 8; hdmi_dsc->clk_per_slice = 400; break; case 6: hdmi_dsc->max_slices = 12; hdmi_dsc->clk_per_slice = 400; break; case 7: hdmi_dsc->max_slices = 16; hdmi_dsc->clk_per_slice = 400; break; case 0: default: hdmi_dsc->max_slices = 0; hdmi_dsc->clk_per_slice = 0; } } if (cea_db_payload_len(hf_scds) >= 13 && hf_scds[13]) hdmi_dsc->total_chunk_kbytes = hf_scds[13] & DRM_EDID_DSC_TOTAL_CHUNK_KBYTES; } /* Sink Capability Data Structure */ static void drm_parse_hdmi_forum_scds(struct drm_connector *connector, const u8 *hf_scds) { struct drm_display_info *info = &connector->display_info; struct drm_hdmi_info *hdmi = &info->hdmi; struct drm_hdmi_dsc_cap *hdmi_dsc = &hdmi->dsc_cap; int max_tmds_clock = 0; u8 max_frl_rate = 0; bool dsc_support = false; info->has_hdmi_infoframe = true; if (hf_scds[6] & 0x80) { hdmi->scdc.supported = true; if (hf_scds[6] & 0x40) hdmi->scdc.read_request = true; } /* * All HDMI 2.0 monitors must support scrambling at rates > 340 MHz. * And as per the spec, three factors confirm this: * * Availability of a HF-VSDB block in EDID (check) * * Non zero Max_TMDS_Char_Rate filed in HF-VSDB (let's check) * * SCDC support available (let's check) * Lets check it out. */ if (hf_scds[5]) { struct drm_scdc *scdc = &hdmi->scdc; /* max clock is 5000 KHz times block value */ max_tmds_clock = hf_scds[5] * 5000; if (max_tmds_clock > 340000) { info->max_tmds_clock = max_tmds_clock; } if (scdc->supported) { scdc->scrambling.supported = true; /* Few sinks support scrambling for clocks < 340M */ if ((hf_scds[6] & 0x8)) scdc->scrambling.low_rates = true; } } if (hf_scds[7]) { max_frl_rate = (hf_scds[7] & DRM_EDID_MAX_FRL_RATE_MASK) >> 4; drm_get_max_frl_rate(max_frl_rate, &hdmi->max_lanes, &hdmi->max_frl_rate_per_lane); } drm_parse_ycbcr420_deep_color_info(connector, hf_scds); if (cea_db_payload_len(hf_scds) >= 11 && hf_scds[11]) { drm_parse_dsc_info(hdmi_dsc, hf_scds); dsc_support = true; } drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] HF-VSDB: max TMDS clock: %d KHz, HDMI 2.1 support: %s, DSC 1.2 support: %s\n", connector->base.id, connector->name, max_tmds_clock, str_yes_no(max_frl_rate), str_yes_no(dsc_support)); } static void drm_parse_hdmi_deep_color_info(struct drm_connector *connector, const u8 *hdmi) { struct drm_display_info *info = &connector->display_info; unsigned int dc_bpc = 0; /* HDMI supports at least 8 bpc */ info->bpc = 8; if (cea_db_payload_len(hdmi) < 6) return; if (hdmi[6] & DRM_EDID_HDMI_DC_30) { dc_bpc = 10; info->edid_hdmi_rgb444_dc_modes |= DRM_EDID_HDMI_DC_30; drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] HDMI sink does deep color 30.\n", connector->base.id, connector->name); } if (hdmi[6] & DRM_EDID_HDMI_DC_36) { dc_bpc = 12; info->edid_hdmi_rgb444_dc_modes |= DRM_EDID_HDMI_DC_36; drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] HDMI sink does deep color 36.\n", connector->base.id, connector->name); } if (hdmi[6] & DRM_EDID_HDMI_DC_48) { dc_bpc = 16; info->edid_hdmi_rgb444_dc_modes |= DRM_EDID_HDMI_DC_48; drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] HDMI sink does deep color 48.\n", connector->base.id, connector->name); } if (dc_bpc == 0) { drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] No deep color support on this HDMI sink.\n", connector->base.id, connector->name); return; } drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] Assigning HDMI sink color depth as %d bpc.\n", connector->base.id, connector->name, dc_bpc); info->bpc = dc_bpc; /* YCRCB444 is optional according to spec. */ if (hdmi[6] & DRM_EDID_HDMI_DC_Y444) { info->edid_hdmi_ycbcr444_dc_modes = info->edid_hdmi_rgb444_dc_modes; drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] HDMI sink does YCRCB444 in deep color.\n", connector->base.id, connector->name); } /* * Spec says that if any deep color mode is supported at all, * then deep color 36 bit must be supported. */ if (!(hdmi[6] & DRM_EDID_HDMI_DC_36)) { drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] HDMI sink should do DC_36, but does not!\n", connector->base.id, connector->name); } } /* HDMI Vendor-Specific Data Block (HDMI VSDB, H14b-VSDB) */ static void drm_parse_hdmi_vsdb_video(struct drm_connector *connector, const u8 *db) { struct drm_display_info *info = &connector->display_info; u8 len = cea_db_payload_len(db); info->is_hdmi = true; info->source_physical_address = (db[4] << 8) | db[5]; if (len >= 6) info->dvi_dual = db[6] & 1; if (len >= 7) info->max_tmds_clock = db[7] * 5000; /* * Try to infer whether the sink supports HDMI infoframes. * * HDMI infoframe support was first added in HDMI 1.4. Assume the sink * supports infoframes if HDMI_Video_present is set. */ if (len >= 8 && db[8] & BIT(5)) info->has_hdmi_infoframe = true; drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] HDMI: DVI dual %d, max TMDS clock %d kHz\n", connector->base.id, connector->name, info->dvi_dual, info->max_tmds_clock); drm_parse_hdmi_deep_color_info(connector, db); } /* * See EDID extension for head-mounted and specialized monitors, specified at: * https://docs.microsoft.com/en-us/windows-hardware/drivers/display/specialized-monitors-edid-extension */ static void drm_parse_microsoft_vsdb(struct drm_connector *connector, const u8 *db) { struct drm_display_info *info = &connector->display_info; u8 version = db[4]; bool desktop_usage = db[5] & BIT(6); /* Version 1 and 2 for HMDs, version 3 flags desktop usage explicitly */ if (version == 1 || version == 2 || (version == 3 && !desktop_usage)) info->non_desktop = true; drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] HMD or specialized display VSDB version %u: 0x%02x\n", connector->base.id, connector->name, version, db[5]); } static void drm_parse_cea_ext(struct drm_connector *connector, const struct drm_edid *drm_edid) { struct drm_display_info *info = &connector->display_info; struct drm_edid_iter edid_iter; const struct cea_db *db; struct cea_db_iter iter; const u8 *edid_ext; u64 y420cmdb_map = 0; drm_edid_iter_begin(drm_edid, &edid_iter); drm_edid_iter_for_each(edid_ext, &edid_iter) { if (edid_ext[0] != CEA_EXT) continue; if (!info->cea_rev) info->cea_rev = edid_ext[1]; if (info->cea_rev != edid_ext[1]) drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] CEA extension version mismatch %u != %u\n", connector->base.id, connector->name, info->cea_rev, edid_ext[1]); /* The existence of a CTA extension should imply RGB support */ info->color_formats = DRM_COLOR_FORMAT_RGB444; if (edid_ext[3] & EDID_CEA_YCRCB444) info->color_formats |= DRM_COLOR_FORMAT_YCBCR444; if (edid_ext[3] & EDID_CEA_YCRCB422) info->color_formats |= DRM_COLOR_FORMAT_YCBCR422; if (edid_ext[3] & EDID_BASIC_AUDIO) info->has_audio = true; } drm_edid_iter_end(&edid_iter); cea_db_iter_edid_begin(drm_edid, &iter); cea_db_iter_for_each(db, &iter) { /* FIXME: convert parsers to use struct cea_db */ const u8 *data = (const u8 *)db; if (cea_db_is_hdmi_vsdb(db)) drm_parse_hdmi_vsdb_video(connector, data); else if (cea_db_is_hdmi_forum_vsdb(db) || cea_db_is_hdmi_forum_scdb(db)) drm_parse_hdmi_forum_scds(connector, data); else if (cea_db_is_microsoft_vsdb(db)) drm_parse_microsoft_vsdb(connector, data); else if (cea_db_is_y420cmdb(db)) parse_cta_y420cmdb(connector, db, &y420cmdb_map); else if (cea_db_is_y420vdb(db)) parse_cta_y420vdb(connector, db); else if (cea_db_is_vcdb(db)) drm_parse_vcdb(connector, data); else if (cea_db_is_hdmi_hdr_metadata_block(db)) drm_parse_hdr_metadata_block(connector, data); else if (cea_db_tag(db) == CTA_DB_VIDEO) parse_cta_vdb(connector, db); else if (cea_db_tag(db) == CTA_DB_AUDIO) info->has_audio = true; } cea_db_iter_end(&iter); if (y420cmdb_map) update_cta_y420cmdb(connector, y420cmdb_map); } static void get_monitor_range(const struct detailed_timing *timing, void *c) { struct detailed_mode_closure *closure = c; struct drm_display_info *info = &closure->connector->display_info; struct drm_monitor_range_info *monitor_range = &info->monitor_range; const struct detailed_non_pixel *data = &timing->data.other_data; const struct detailed_data_monitor_range *range = &data->data.range; const struct edid *edid = closure->drm_edid->edid; if (!is_display_descriptor(timing, EDID_DETAIL_MONITOR_RANGE)) return; /* * These limits are used to determine the VRR refresh * rate range. Only the "range limits only" variant * of the range descriptor seems to guarantee that * any and all timings are accepted by the sink, as * opposed to just timings conforming to the indicated * formula (GTF/GTF2/CVT). Thus other variants of the * range descriptor are not accepted here. */ if (range->flags != DRM_EDID_RANGE_LIMITS_ONLY_FLAG) return; monitor_range->min_vfreq = range->min_vfreq; monitor_range->max_vfreq = range->max_vfreq; if (edid->revision >= 4) { if (data->pad2 & DRM_EDID_RANGE_OFFSET_MIN_VFREQ) monitor_range->min_vfreq += 255; if (data->pad2 & DRM_EDID_RANGE_OFFSET_MAX_VFREQ) monitor_range->max_vfreq += 255; } } static void drm_get_monitor_range(struct drm_connector *connector, const struct drm_edid *drm_edid) { const struct drm_display_info *info = &connector->display_info; struct detailed_mode_closure closure = { .connector = connector, .drm_edid = drm_edid, }; if (drm_edid->edid->revision < 4) return; if (!(drm_edid->edid->features & DRM_EDID_FEATURE_CONTINUOUS_FREQ)) return; drm_for_each_detailed_block(drm_edid, get_monitor_range, &closure); drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] Supported Monitor Refresh rate range is %d Hz - %d Hz\n", connector->base.id, connector->name, info->monitor_range.min_vfreq, info->monitor_range.max_vfreq); } static void drm_parse_vesa_mso_data(struct drm_connector *connector, const struct displayid_block *block) { struct displayid_vesa_vendor_specific_block *vesa = (struct displayid_vesa_vendor_specific_block *)block; struct drm_display_info *info = &connector->display_info; if (block->num_bytes < 3) { drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] Unexpected vendor block size %u\n", connector->base.id, connector->name, block->num_bytes); return; } if (oui(vesa->oui[0], vesa->oui[1], vesa->oui[2]) != VESA_IEEE_OUI) return; if (sizeof(*vesa) != sizeof(*block) + block->num_bytes) { drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] Unexpected VESA vendor block size\n", connector->base.id, connector->name); return; } switch (FIELD_GET(DISPLAYID_VESA_MSO_MODE, vesa->mso)) { default: drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] Reserved MSO mode value\n", connector->base.id, connector->name); fallthrough; case 0: info->mso_stream_count = 0; break; case 1: info->mso_stream_count = 2; /* 2 or 4 links */ break; case 2: info->mso_stream_count = 4; /* 4 links */ break; } if (!info->mso_stream_count) { info->mso_pixel_overlap = 0; return; } info->mso_pixel_overlap = FIELD_GET(DISPLAYID_VESA_MSO_OVERLAP, vesa->mso); if (info->mso_pixel_overlap > 8) { drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] Reserved MSO pixel overlap value %u\n", connector->base.id, connector->name, info->mso_pixel_overlap); info->mso_pixel_overlap = 8; } drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] MSO stream count %u, pixel overlap %u\n", connector->base.id, connector->name, info->mso_stream_count, info->mso_pixel_overlap); } static void drm_update_mso(struct drm_connector *connector, const struct drm_edid *drm_edid) { const struct displayid_block *block; struct displayid_iter iter; displayid_iter_edid_begin(drm_edid, &iter); displayid_iter_for_each(block, &iter) { if (block->tag == DATA_BLOCK_2_VENDOR_SPECIFIC) drm_parse_vesa_mso_data(connector, block); } displayid_iter_end(&iter); } /* A connector has no EDID information, so we've got no EDID to compute quirks from. Reset * all of the values which would have been set from EDID */ static void drm_reset_display_info(struct drm_connector *connector) { struct drm_display_info *info = &connector->display_info; info->width_mm = 0; info->height_mm = 0; info->bpc = 0; info->color_formats = 0; info->cea_rev = 0; info->max_tmds_clock = 0; info->dvi_dual = false; info->is_hdmi = false; info->has_audio = false; info->has_hdmi_infoframe = false; info->rgb_quant_range_selectable = false; memset(&info->hdmi, 0, sizeof(info->hdmi)); info->edid_hdmi_rgb444_dc_modes = 0; info->edid_hdmi_ycbcr444_dc_modes = 0; info->non_desktop = 0; memset(&info->monitor_range, 0, sizeof(info->monitor_range)); memset(&info->luminance_range, 0, sizeof(info->luminance_range)); info->mso_stream_count = 0; info->mso_pixel_overlap = 0; info->max_dsc_bpp = 0; kfree(info->vics); info->vics = NULL; info->vics_len = 0; info->quirks = 0; info->source_physical_address = CEC_PHYS_ADDR_INVALID; } static void update_displayid_info(struct drm_connector *connector, const struct drm_edid *drm_edid) { struct drm_display_info *info = &connector->display_info; const struct displayid_block *block; struct displayid_iter iter; displayid_iter_edid_begin(drm_edid, &iter); displayid_iter_for_each(block, &iter) { if (displayid_version(&iter) == DISPLAY_ID_STRUCTURE_VER_20 && (displayid_primary_use(&iter) == PRIMARY_USE_HEAD_MOUNTED_VR || displayid_primary_use(&iter) == PRIMARY_USE_HEAD_MOUNTED_AR)) info->non_desktop = true; /* * We're only interested in the base section here, no need to * iterate further. */ break; } displayid_iter_end(&iter); } static void update_display_info(struct drm_connector *connector, const struct drm_edid *drm_edid) { struct drm_display_info *info = &connector->display_info; const struct edid *edid; drm_reset_display_info(connector); clear_eld(connector); if (!drm_edid) return; edid = drm_edid->edid; info->quirks = edid_get_quirks(drm_edid); info->width_mm = edid->width_cm * 10; info->height_mm = edid->height_cm * 10; drm_get_monitor_range(connector, drm_edid); if (edid->revision < 3) goto out; if (!drm_edid_is_digital(drm_edid)) goto out; info->color_formats |= DRM_COLOR_FORMAT_RGB444; drm_parse_cea_ext(connector, drm_edid); update_displayid_info(connector, drm_edid); /* * Digital sink with "DFP 1.x compliant TMDS" according to EDID 1.3? * * For such displays, the DFP spec 1.0, section 3.10 "EDID support" * tells us to assume 8 bpc color depth if the EDID doesn't have * extensions which tell otherwise. */ if (info->bpc == 0 && edid->revision == 3 && edid->input & DRM_EDID_DIGITAL_DFP_1_X) { info->bpc = 8; drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] Assigning DFP sink color depth as %d bpc.\n", connector->base.id, connector->name, info->bpc); } /* Only defined for 1.4 with digital displays */ if (edid->revision < 4) goto out; switch (edid->input & DRM_EDID_DIGITAL_DEPTH_MASK) { case DRM_EDID_DIGITAL_DEPTH_6: info->bpc = 6; break; case DRM_EDID_DIGITAL_DEPTH_8: info->bpc = 8; break; case DRM_EDID_DIGITAL_DEPTH_10: info->bpc = 10; break; case DRM_EDID_DIGITAL_DEPTH_12: info->bpc = 12; break; case DRM_EDID_DIGITAL_DEPTH_14: info->bpc = 14; break; case DRM_EDID_DIGITAL_DEPTH_16: info->bpc = 16; break; case DRM_EDID_DIGITAL_DEPTH_UNDEF: default: info->bpc = 0; break; } drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] Assigning EDID-1.4 digital sink color depth as %d bpc.\n", connector->base.id, connector->name, info->bpc); if (edid->features & DRM_EDID_FEATURE_RGB_YCRCB444) info->color_formats |= DRM_COLOR_FORMAT_YCBCR444; if (edid->features & DRM_EDID_FEATURE_RGB_YCRCB422) info->color_formats |= DRM_COLOR_FORMAT_YCBCR422; drm_update_mso(connector, drm_edid); out: if (info->quirks & EDID_QUIRK_NON_DESKTOP) { drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] Non-desktop display%s\n", connector->base.id, connector->name, info->non_desktop ? " (redundant quirk)" : ""); info->non_desktop = true; } if (info->quirks & EDID_QUIRK_CAP_DSC_15BPP) info->max_dsc_bpp = 15; if (info->quirks & EDID_QUIRK_FORCE_6BPC) info->bpc = 6; if (info->quirks & EDID_QUIRK_FORCE_8BPC) info->bpc = 8; if (info->quirks & EDID_QUIRK_FORCE_10BPC) info->bpc = 10; if (info->quirks & EDID_QUIRK_FORCE_12BPC) info->bpc = 12; /* Depends on info->cea_rev set by drm_parse_cea_ext() above */ drm_edid_to_eld(connector, drm_edid); } static struct drm_display_mode *drm_mode_displayid_detailed(struct drm_device *dev, struct displayid_detailed_timings_1 *timings, bool type_7) { struct drm_display_mode *mode; unsigned pixel_clock = (timings->pixel_clock[0] | (timings->pixel_clock[1] << 8) | (timings->pixel_clock[2] << 16)) + 1; unsigned hactive = (timings->hactive[0] | timings->hactive[1] << 8) + 1; unsigned hblank = (timings->hblank[0] | timings->hblank[1] << 8) + 1; unsigned hsync = (timings->hsync[0] | (timings->hsync[1] & 0x7f) << 8) + 1; unsigned hsync_width = (timings->hsw[0] | timings->hsw[1] << 8) + 1; unsigned vactive = (timings->vactive[0] | timings->vactive[1] << 8) + 1; unsigned vblank = (timings->vblank[0] | timings->vblank[1] << 8) + 1; unsigned vsync = (timings->vsync[0] | (timings->vsync[1] & 0x7f) << 8) + 1; unsigned vsync_width = (timings->vsw[0] | timings->vsw[1] << 8) + 1; bool hsync_positive = (timings->hsync[1] >> 7) & 0x1; bool vsync_positive = (timings->vsync[1] >> 7) & 0x1; mode = drm_mode_create(dev); if (!mode) return NULL; /* resolution is kHz for type VII, and 10 kHz for type I */ mode->clock = type_7 ? pixel_clock : pixel_clock * 10; mode->hdisplay = hactive; mode->hsync_start = mode->hdisplay + hsync; mode->hsync_end = mode->hsync_start + hsync_width; mode->htotal = mode->hdisplay + hblank; mode->vdisplay = vactive; mode->vsync_start = mode->vdisplay + vsync; mode->vsync_end = mode->vsync_start + vsync_width; mode->vtotal = mode->vdisplay + vblank; mode->flags = 0; mode->flags |= hsync_positive ? DRM_MODE_FLAG_PHSYNC : DRM_MODE_FLAG_NHSYNC; mode->flags |= vsync_positive ? DRM_MODE_FLAG_PVSYNC : DRM_MODE_FLAG_NVSYNC; mode->type = DRM_MODE_TYPE_DRIVER; if (timings->flags & 0x80) mode->type |= DRM_MODE_TYPE_PREFERRED; drm_mode_set_name(mode); return mode; } static int add_displayid_detailed_1_modes(struct drm_connector *connector, const struct displayid_block *block) { struct displayid_detailed_timing_block *det = (struct displayid_detailed_timing_block *)block; int i; int num_timings; struct drm_display_mode *newmode; int num_modes = 0; bool type_7 = block->tag == DATA_BLOCK_2_TYPE_7_DETAILED_TIMING; /* blocks must be multiple of 20 bytes length */ if (block->num_bytes % 20) return 0; num_timings = block->num_bytes / 20; for (i = 0; i < num_timings; i++) { struct displayid_detailed_timings_1 *timings = &det->timings[i]; newmode = drm_mode_displayid_detailed(connector->dev, timings, type_7); if (!newmode) continue; drm_mode_probed_add(connector, newmode); num_modes++; } return num_modes; } static int add_displayid_detailed_modes(struct drm_connector *connector, const struct drm_edid *drm_edid) { const struct displayid_block *block; struct displayid_iter iter; int num_modes = 0; displayid_iter_edid_begin(drm_edid, &iter); displayid_iter_for_each(block, &iter) { if (block->tag == DATA_BLOCK_TYPE_1_DETAILED_TIMING || block->tag == DATA_BLOCK_2_TYPE_7_DETAILED_TIMING) num_modes += add_displayid_detailed_1_modes(connector, block); } displayid_iter_end(&iter); return num_modes; } static int _drm_edid_connector_add_modes(struct drm_connector *connector, const struct drm_edid *drm_edid) { const struct drm_display_info *info = &connector->display_info; int num_modes = 0; if (!drm_edid) return 0; /* * EDID spec says modes should be preferred in this order: * - preferred detailed mode * - other detailed modes from base block * - detailed modes from extension blocks * - CVT 3-byte code modes * - standard timing codes * - established timing codes * - modes inferred from GTF or CVT range information * * We get this pretty much right. * * XXX order for additional mode types in extension blocks? */ num_modes += add_detailed_modes(connector, drm_edid); num_modes += add_cvt_modes(connector, drm_edid); num_modes += add_standard_modes(connector, drm_edid); num_modes += add_established_modes(connector, drm_edid); num_modes += add_cea_modes(connector, drm_edid); num_modes += add_alternate_cea_modes(connector, drm_edid); num_modes += add_displayid_detailed_modes(connector, drm_edid); if (drm_edid->edid->features & DRM_EDID_FEATURE_CONTINUOUS_FREQ) num_modes += add_inferred_modes(connector, drm_edid); if (info->quirks & (EDID_QUIRK_PREFER_LARGE_60 | EDID_QUIRK_PREFER_LARGE_75)) edid_fixup_preferred(connector); return num_modes; } static void _drm_update_tile_info(struct drm_connector *connector, const struct drm_edid *drm_edid); static int _drm_edid_connector_property_update(struct drm_connector *connector, const struct drm_edid *drm_edid) { struct drm_device *dev = connector->dev; int ret; if (connector->edid_blob_ptr) { const void *old_edid = connector->edid_blob_ptr->data; size_t old_edid_size = connector->edid_blob_ptr->length; if (old_edid && !drm_edid_eq(drm_edid, old_edid, old_edid_size)) { connector->epoch_counter++; drm_dbg_kms(dev, "[CONNECTOR:%d:%s] EDID changed, epoch counter %llu\n", connector->base.id, connector->name, connector->epoch_counter); } } ret = drm_property_replace_global_blob(dev, &connector->edid_blob_ptr, drm_edid ? drm_edid->size : 0, drm_edid ? drm_edid->edid : NULL, &connector->base, dev->mode_config.edid_property); if (ret) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] EDID property update failed (%d)\n", connector->base.id, connector->name, ret); goto out; } ret = drm_object_property_set_value(&connector->base, dev->mode_config.non_desktop_property, connector->display_info.non_desktop); if (ret) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] Non-desktop property update failed (%d)\n", connector->base.id, connector->name, ret); goto out; } ret = drm_connector_set_tile_property(connector); if (ret) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] Tile property update failed (%d)\n", connector->base.id, connector->name, ret); goto out; } out: return ret; } /** * drm_edid_connector_update - Update connector information from EDID * @connector: Connector * @drm_edid: EDID * * Update the connector display info, ELD, HDR metadata, relevant properties, * etc. from the passed in EDID. * * If EDID is NULL, reset the information. * * Must be called before calling drm_edid_connector_add_modes(). * * Return: 0 on success, negative error on errors. */ int drm_edid_connector_update(struct drm_connector *connector, const struct drm_edid *drm_edid) { update_display_info(connector, drm_edid); _drm_update_tile_info(connector, drm_edid); return _drm_edid_connector_property_update(connector, drm_edid); } EXPORT_SYMBOL(drm_edid_connector_update); /** * drm_edid_connector_add_modes - Update probed modes from the EDID property * @connector: Connector * * Add the modes from the previously updated EDID property to the connector * probed modes list. * * drm_edid_connector_update() must have been called before this to update the * EDID property. * * Return: The number of modes added, or 0 if we couldn't find any. */ int drm_edid_connector_add_modes(struct drm_connector *connector) { const struct drm_edid *drm_edid = NULL; int count; if (connector->edid_blob_ptr) drm_edid = drm_edid_alloc(connector->edid_blob_ptr->data, connector->edid_blob_ptr->length); count = _drm_edid_connector_add_modes(connector, drm_edid); drm_edid_free(drm_edid); return count; } EXPORT_SYMBOL(drm_edid_connector_add_modes); /** * drm_connector_update_edid_property - update the edid property of a connector * @connector: drm connector * @edid: new value of the edid property * * This function creates a new blob modeset object and assigns its id to the * connector's edid property. * Since we also parse tile information from EDID's displayID block, we also * set the connector's tile property here. See drm_connector_set_tile_property() * for more details. * * This function is deprecated. Use drm_edid_connector_update() instead. * * Returns: * Zero on success, negative errno on failure. */ int drm_connector_update_edid_property(struct drm_connector *connector, const struct edid *edid) { struct drm_edid drm_edid; return drm_edid_connector_update(connector, drm_edid_legacy_init(&drm_edid, edid)); } EXPORT_SYMBOL(drm_connector_update_edid_property); /** * drm_add_edid_modes - add modes from EDID data, if available * @connector: connector we're probing * @edid: EDID data * * Add the specified modes to the connector's mode list. Also fills out the * &drm_display_info structure and ELD in @connector with any information which * can be derived from the edid. * * This function is deprecated. Use drm_edid_connector_add_modes() instead. * * Return: The number of modes added or 0 if we couldn't find any. */ int drm_add_edid_modes(struct drm_connector *connector, struct edid *edid) { struct drm_edid _drm_edid; const struct drm_edid *drm_edid; if (edid && !drm_edid_is_valid(edid)) { drm_warn(connector->dev, "[CONNECTOR:%d:%s] EDID invalid.\n", connector->base.id, connector->name); edid = NULL; } drm_edid = drm_edid_legacy_init(&_drm_edid, edid); update_display_info(connector, drm_edid); return _drm_edid_connector_add_modes(connector, drm_edid); } EXPORT_SYMBOL(drm_add_edid_modes); /** * drm_add_modes_noedid - add modes for the connectors without EDID * @connector: connector we're probing * @hdisplay: the horizontal display limit * @vdisplay: the vertical display limit * * Add the specified modes to the connector's mode list. Only when the * hdisplay/vdisplay is not beyond the given limit, it will be added. * * Return: The number of modes added or 0 if we couldn't find any. */ int drm_add_modes_noedid(struct drm_connector *connector, int hdisplay, int vdisplay) { int i, count, num_modes = 0; struct drm_display_mode *mode; struct drm_device *dev = connector->dev; count = ARRAY_SIZE(drm_dmt_modes); if (hdisplay < 0) hdisplay = 0; if (vdisplay < 0) vdisplay = 0; for (i = 0; i < count; i++) { const struct drm_display_mode *ptr = &drm_dmt_modes[i]; if (hdisplay && vdisplay) { /* * Only when two are valid, they will be used to check * whether the mode should be added to the mode list of * the connector. */ if (ptr->hdisplay > hdisplay || ptr->vdisplay > vdisplay) continue; } if (drm_mode_vrefresh(ptr) > 61) continue; mode = drm_mode_duplicate(dev, ptr); if (mode) { drm_mode_probed_add(connector, mode); num_modes++; } } return num_modes; } EXPORT_SYMBOL(drm_add_modes_noedid); static bool is_hdmi2_sink(const struct drm_connector *connector) { /* * FIXME: sil-sii8620 doesn't have a connector around when * we need one, so we have to be prepared for a NULL connector. */ if (!connector) return true; return connector->display_info.hdmi.scdc.supported || connector->display_info.color_formats & DRM_COLOR_FORMAT_YCBCR420; } static u8 drm_mode_hdmi_vic(const struct drm_connector *connector, const struct drm_display_mode *mode) { bool has_hdmi_infoframe = connector ? connector->display_info.has_hdmi_infoframe : false; if (!has_hdmi_infoframe) return 0; /* No HDMI VIC when signalling 3D video format */ if (mode->flags & DRM_MODE_FLAG_3D_MASK) return 0; return drm_match_hdmi_mode(mode); } static u8 drm_mode_cea_vic(const struct drm_connector *connector, const struct drm_display_mode *mode) { /* * HDMI spec says if a mode is found in HDMI 1.4b 4K modes * we should send its VIC in vendor infoframes, else send the * VIC in AVI infoframes. Lets check if this mode is present in * HDMI 1.4b 4K modes */ if (drm_mode_hdmi_vic(connector, mode)) return 0; return drm_match_cea_mode(mode); } /* * Avoid sending VICs defined in HDMI 2.0 in AVI infoframes to sinks that * conform to HDMI 1.4. * * HDMI 1.4 (CTA-861-D) VIC range: [1..64] * HDMI 2.0 (CTA-861-F) VIC range: [1..107] * * If the sink lists the VIC in CTA VDB, assume it's fine, regardless of HDMI * version. */ static u8 vic_for_avi_infoframe(const struct drm_connector *connector, u8 vic) { if (!is_hdmi2_sink(connector) && vic > 64 && !cta_vdb_has_vic(connector, vic)) return 0; return vic; } /** * drm_hdmi_avi_infoframe_from_display_mode() - fill an HDMI AVI infoframe with * data from a DRM display mode * @frame: HDMI AVI infoframe * @connector: the connector * @mode: DRM display mode * * Return: 0 on success or a negative error code on failure. */ int drm_hdmi_avi_infoframe_from_display_mode(struct hdmi_avi_infoframe *frame, const struct drm_connector *connector, const struct drm_display_mode *mode) { enum hdmi_picture_aspect picture_aspect; u8 vic, hdmi_vic; if (!frame || !mode) return -EINVAL; hdmi_avi_infoframe_init(frame); if (mode->flags & DRM_MODE_FLAG_DBLCLK) frame->pixel_repeat = 1; vic = drm_mode_cea_vic(connector, mode); hdmi_vic = drm_mode_hdmi_vic(connector, mode); frame->picture_aspect = HDMI_PICTURE_ASPECT_NONE; /* * As some drivers don't support atomic, we can't use connector state. * So just initialize the frame with default values, just the same way * as it's done with other properties here. */ frame->content_type = HDMI_CONTENT_TYPE_GRAPHICS; frame->itc = 0; /* * Populate picture aspect ratio from either * user input (if specified) or from the CEA/HDMI mode lists. */ picture_aspect = mode->picture_aspect_ratio; if (picture_aspect == HDMI_PICTURE_ASPECT_NONE) { if (vic) picture_aspect = drm_get_cea_aspect_ratio(vic); else if (hdmi_vic) picture_aspect = drm_get_hdmi_aspect_ratio(hdmi_vic); } /* * The infoframe can't convey anything but none, 4:3 * and 16:9, so if the user has asked for anything else * we can only satisfy it by specifying the right VIC. */ if (picture_aspect > HDMI_PICTURE_ASPECT_16_9) { if (vic) { if (picture_aspect != drm_get_cea_aspect_ratio(vic)) return -EINVAL; } else if (hdmi_vic) { if (picture_aspect != drm_get_hdmi_aspect_ratio(hdmi_vic)) return -EINVAL; } else { return -EINVAL; } picture_aspect = HDMI_PICTURE_ASPECT_NONE; } frame->video_code = vic_for_avi_infoframe(connector, vic); frame->picture_aspect = picture_aspect; frame->active_aspect = HDMI_ACTIVE_ASPECT_PICTURE; frame->scan_mode = HDMI_SCAN_MODE_UNDERSCAN; return 0; } EXPORT_SYMBOL(drm_hdmi_avi_infoframe_from_display_mode); /** * drm_hdmi_avi_infoframe_quant_range() - fill the HDMI AVI infoframe * quantization range information * @frame: HDMI AVI infoframe * @connector: the connector * @mode: DRM display mode * @rgb_quant_range: RGB quantization range (Q) */ void drm_hdmi_avi_infoframe_quant_range(struct hdmi_avi_infoframe *frame, const struct drm_connector *connector, const struct drm_display_mode *mode, enum hdmi_quantization_range rgb_quant_range) { const struct drm_display_info *info = &connector->display_info; /* * CEA-861: * "A Source shall not send a non-zero Q value that does not correspond * to the default RGB Quantization Range for the transmitted Picture * unless the Sink indicates support for the Q bit in a Video * Capabilities Data Block." * * HDMI 2.0 recommends sending non-zero Q when it does match the * default RGB quantization range for the mode, even when QS=0. */ if (info->rgb_quant_range_selectable || rgb_quant_range == drm_default_rgb_quant_range(mode)) frame->quantization_range = rgb_quant_range; else frame->quantization_range = HDMI_QUANTIZATION_RANGE_DEFAULT; /* * CEA-861-F: * "When transmitting any RGB colorimetry, the Source should set the * YQ-field to match the RGB Quantization Range being transmitted * (e.g., when Limited Range RGB, set YQ=0 or when Full Range RGB, * set YQ=1) and the Sink shall ignore the YQ-field." * * Unfortunate certain sinks (eg. VIZ Model 67/E261VA) get confused * by non-zero YQ when receiving RGB. There doesn't seem to be any * good way to tell which version of CEA-861 the sink supports, so * we limit non-zero YQ to HDMI 2.0 sinks only as HDMI 2.0 is based * on CEA-861-F. */ if (!is_hdmi2_sink(connector) || rgb_quant_range == HDMI_QUANTIZATION_RANGE_LIMITED) frame->ycc_quantization_range = HDMI_YCC_QUANTIZATION_RANGE_LIMITED; else frame->ycc_quantization_range = HDMI_YCC_QUANTIZATION_RANGE_FULL; } EXPORT_SYMBOL(drm_hdmi_avi_infoframe_quant_range); static enum hdmi_3d_structure s3d_structure_from_display_mode(const struct drm_display_mode *mode) { u32 layout = mode->flags & DRM_MODE_FLAG_3D_MASK; switch (layout) { case DRM_MODE_FLAG_3D_FRAME_PACKING: return HDMI_3D_STRUCTURE_FRAME_PACKING; case DRM_MODE_FLAG_3D_FIELD_ALTERNATIVE: return HDMI_3D_STRUCTURE_FIELD_ALTERNATIVE; case DRM_MODE_FLAG_3D_LINE_ALTERNATIVE: return HDMI_3D_STRUCTURE_LINE_ALTERNATIVE; case DRM_MODE_FLAG_3D_SIDE_BY_SIDE_FULL: return HDMI_3D_STRUCTURE_SIDE_BY_SIDE_FULL; case DRM_MODE_FLAG_3D_L_DEPTH: return HDMI_3D_STRUCTURE_L_DEPTH; case DRM_MODE_FLAG_3D_L_DEPTH_GFX_GFX_DEPTH: return HDMI_3D_STRUCTURE_L_DEPTH_GFX_GFX_DEPTH; case DRM_MODE_FLAG_3D_TOP_AND_BOTTOM: return HDMI_3D_STRUCTURE_TOP_AND_BOTTOM; case DRM_MODE_FLAG_3D_SIDE_BY_SIDE_HALF: return HDMI_3D_STRUCTURE_SIDE_BY_SIDE_HALF; default: return HDMI_3D_STRUCTURE_INVALID; } } /** * drm_hdmi_vendor_infoframe_from_display_mode() - fill an HDMI infoframe with * data from a DRM display mode * @frame: HDMI vendor infoframe * @connector: the connector * @mode: DRM display mode * * Note that there's is a need to send HDMI vendor infoframes only when using a * 4k or stereoscopic 3D mode. So when giving any other mode as input this * function will return -EINVAL, error that can be safely ignored. * * Return: 0 on success or a negative error code on failure. */ int drm_hdmi_vendor_infoframe_from_display_mode(struct hdmi_vendor_infoframe *frame, const struct drm_connector *connector, const struct drm_display_mode *mode) { /* * FIXME: sil-sii8620 doesn't have a connector around when * we need one, so we have to be prepared for a NULL connector. */ bool has_hdmi_infoframe = connector ? connector->display_info.has_hdmi_infoframe : false; int err; if (!frame || !mode) return -EINVAL; if (!has_hdmi_infoframe) return -EINVAL; err = hdmi_vendor_infoframe_init(frame); if (err < 0) return err; /* * Even if it's not absolutely necessary to send the infoframe * (ie.vic==0 and s3d_struct==0) we will still send it if we * know that the sink can handle it. This is based on a * suggestion in HDMI 2.0 Appendix F. Apparently some sinks * have trouble realizing that they should switch from 3D to 2D * mode if the source simply stops sending the infoframe when * it wants to switch from 3D to 2D. */ frame->vic = drm_mode_hdmi_vic(connector, mode); frame->s3d_struct = s3d_structure_from_display_mode(mode); return 0; } EXPORT_SYMBOL(drm_hdmi_vendor_infoframe_from_display_mode); static void drm_parse_tiled_block(struct drm_connector *connector, const struct displayid_block *block) { const struct displayid_tiled_block *tile = (struct displayid_tiled_block *)block; u16 w, h; u8 tile_v_loc, tile_h_loc; u8 num_v_tile, num_h_tile; struct drm_tile_group *tg; w = tile->tile_size[0] | tile->tile_size[1] << 8; h = tile->tile_size[2] | tile->tile_size[3] << 8; num_v_tile = (tile->topo[0] & 0xf) | (tile->topo[2] & 0x30); num_h_tile = (tile->topo[0] >> 4) | ((tile->topo[2] >> 2) & 0x30); tile_v_loc = (tile->topo[1] & 0xf) | ((tile->topo[2] & 0x3) << 4); tile_h_loc = (tile->topo[1] >> 4) | (((tile->topo[2] >> 2) & 0x3) << 4); connector->has_tile = true; if (tile->tile_cap & 0x80) connector->tile_is_single_monitor = true; connector->num_h_tile = num_h_tile + 1; connector->num_v_tile = num_v_tile + 1; connector->tile_h_loc = tile_h_loc; connector->tile_v_loc = tile_v_loc; connector->tile_h_size = w + 1; connector->tile_v_size = h + 1; drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] tile cap 0x%x, size %dx%d, num tiles %dx%d, location %dx%d, vend %c%c%c", connector->base.id, connector->name, tile->tile_cap, connector->tile_h_size, connector->tile_v_size, connector->num_h_tile, connector->num_v_tile, connector->tile_h_loc, connector->tile_v_loc, tile->topology_id[0], tile->topology_id[1], tile->topology_id[2]); tg = drm_mode_get_tile_group(connector->dev, tile->topology_id); if (!tg) tg = drm_mode_create_tile_group(connector->dev, tile->topology_id); if (!tg) return; if (connector->tile_group != tg) { /* if we haven't got a pointer, take the reference, drop ref to old tile group */ if (connector->tile_group) drm_mode_put_tile_group(connector->dev, connector->tile_group); connector->tile_group = tg; } else { /* if same tile group, then release the ref we just took. */ drm_mode_put_tile_group(connector->dev, tg); } } static bool displayid_is_tiled_block(const struct displayid_iter *iter, const struct displayid_block *block) { return (displayid_version(iter) < DISPLAY_ID_STRUCTURE_VER_20 && block->tag == DATA_BLOCK_TILED_DISPLAY) || (displayid_version(iter) == DISPLAY_ID_STRUCTURE_VER_20 && block->tag == DATA_BLOCK_2_TILED_DISPLAY_TOPOLOGY); } static void _drm_update_tile_info(struct drm_connector *connector, const struct drm_edid *drm_edid) { const struct displayid_block *block; struct displayid_iter iter; connector->has_tile = false; displayid_iter_edid_begin(drm_edid, &iter); displayid_iter_for_each(block, &iter) { if (displayid_is_tiled_block(&iter, block)) drm_parse_tiled_block(connector, block); } displayid_iter_end(&iter); if (!connector->has_tile && connector->tile_group) { drm_mode_put_tile_group(connector->dev, connector->tile_group); connector->tile_group = NULL; } } /** * drm_edid_is_digital - is digital? * @drm_edid: The EDID * * Return true if input is digital. */ bool drm_edid_is_digital(const struct drm_edid *drm_edid) { return drm_edid && drm_edid->edid && drm_edid->edid->input & DRM_EDID_INPUT_DIGITAL; } EXPORT_SYMBOL(drm_edid_is_digital); |
| 4 1 4 4 4 4 4 5 5 5 4 5 2 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 | // SPDX-License-Identifier: GPL-2.0 /* * Performance events callchain code, extracted from core.c: * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> */ #include <linux/perf_event.h> #include <linux/slab.h> #include <linux/sched/task_stack.h> #include "internal.h" struct callchain_cpus_entries { struct rcu_head rcu_head; struct perf_callchain_entry *cpu_entries[]; }; int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK; static inline size_t perf_callchain_entry__sizeof(void) { return (sizeof(struct perf_callchain_entry) + sizeof(__u64) * (sysctl_perf_event_max_stack + sysctl_perf_event_max_contexts_per_stack)); } static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); static atomic_t nr_callchain_events; static DEFINE_MUTEX(callchain_mutex); static struct callchain_cpus_entries *callchain_cpus_entries; __weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } __weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } static void release_callchain_buffers_rcu(struct rcu_head *head) { struct callchain_cpus_entries *entries; int cpu; entries = container_of(head, struct callchain_cpus_entries, rcu_head); for_each_possible_cpu(cpu) kfree(entries->cpu_entries[cpu]); kfree(entries); } static void release_callchain_buffers(void) { struct callchain_cpus_entries *entries; entries = callchain_cpus_entries; RCU_INIT_POINTER(callchain_cpus_entries, NULL); call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); } static int alloc_callchain_buffers(void) { int cpu; int size; struct callchain_cpus_entries *entries; /* * We can't use the percpu allocation API for data that can be * accessed from NMI. Use a temporary manual per cpu allocation * until that gets sorted out. */ size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); entries = kzalloc(size, GFP_KERNEL); if (!entries) return -ENOMEM; size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS; for_each_possible_cpu(cpu) { entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); if (!entries->cpu_entries[cpu]) goto fail; } rcu_assign_pointer(callchain_cpus_entries, entries); return 0; fail: for_each_possible_cpu(cpu) kfree(entries->cpu_entries[cpu]); kfree(entries); return -ENOMEM; } int get_callchain_buffers(int event_max_stack) { int err = 0; int count; mutex_lock(&callchain_mutex); count = atomic_inc_return(&nr_callchain_events); if (WARN_ON_ONCE(count < 1)) { err = -EINVAL; goto exit; } /* * If requesting per event more than the global cap, * return a different error to help userspace figure * this out. * * And also do it here so that we have &callchain_mutex held. */ if (event_max_stack > sysctl_perf_event_max_stack) { err = -EOVERFLOW; goto exit; } if (count == 1) err = alloc_callchain_buffers(); exit: if (err) atomic_dec(&nr_callchain_events); mutex_unlock(&callchain_mutex); return err; } void put_callchain_buffers(void) { if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { release_callchain_buffers(); mutex_unlock(&callchain_mutex); } } struct perf_callchain_entry *get_callchain_entry(int *rctx) { int cpu; struct callchain_cpus_entries *entries; *rctx = get_recursion_context(this_cpu_ptr(callchain_recursion)); if (*rctx == -1) return NULL; entries = rcu_dereference(callchain_cpus_entries); if (!entries) { put_recursion_context(this_cpu_ptr(callchain_recursion), *rctx); return NULL; } cpu = smp_processor_id(); return (((void *)entries->cpu_entries[cpu]) + (*rctx * perf_callchain_entry__sizeof())); } void put_callchain_entry(int rctx) { put_recursion_context(this_cpu_ptr(callchain_recursion), rctx); } struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark) { struct perf_callchain_entry *entry; struct perf_callchain_entry_ctx ctx; int rctx; entry = get_callchain_entry(&rctx); if (!entry) return NULL; ctx.entry = entry; ctx.max_stack = max_stack; ctx.nr = entry->nr = init_nr; ctx.contexts = 0; ctx.contexts_maxed = false; if (kernel && !user_mode(regs)) { if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL); perf_callchain_kernel(&ctx, regs); } if (user) { if (!user_mode(regs)) { if (current->mm) regs = task_pt_regs(current); else regs = NULL; } if (regs) { if (crosstask) goto exit_put; if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); perf_callchain_user(&ctx, regs); } } exit_put: put_callchain_entry(rctx); return entry; } /* * Used for sysctl_perf_event_max_stack and * sysctl_perf_event_max_contexts_per_stack. */ int perf_event_max_stack_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *value = table->data; int new_value = *value, ret; struct ctl_table new_table = *table; new_table.data = &new_value; ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos); if (ret || !write) return ret; mutex_lock(&callchain_mutex); if (atomic_read(&nr_callchain_events)) ret = -EBUSY; else *value = new_value; mutex_unlock(&callchain_mutex); return ret; } |
| 4113 4103 4143 4097 4107 4111 4144 4110 4141 4114 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 | // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor network mediation * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2017 Canonical Ltd. */ #include "include/apparmor.h" #include "include/audit.h" #include "include/cred.h" #include "include/label.h" #include "include/net.h" #include "include/policy.h" #include "include/secid.h" #include "net_names.h" struct aa_sfs_entry aa_sfs_entry_network[] = { AA_SFS_FILE_STRING("af_mask", AA_SFS_AF_MASK), { } }; static const char * const net_mask_names[] = { "unknown", "send", "receive", "unknown", "create", "shutdown", "connect", "unknown", "setattr", "getattr", "setcred", "getcred", "chmod", "chown", "chgrp", "lock", "mmap", "mprot", "unknown", "unknown", "accept", "bind", "listen", "unknown", "setopt", "getopt", "unknown", "unknown", "unknown", "unknown", "unknown", "unknown", }; /* audit callback for net specific fields */ void audit_net_cb(struct audit_buffer *ab, void *va) { struct common_audit_data *sa = va; struct apparmor_audit_data *ad = aad(sa); if (address_family_names[sa->u.net->family]) audit_log_format(ab, " family=\"%s\"", address_family_names[sa->u.net->family]); else audit_log_format(ab, " family=\"unknown(%d)\"", sa->u.net->family); if (sock_type_names[ad->net.type]) audit_log_format(ab, " sock_type=\"%s\"", sock_type_names[ad->net.type]); else audit_log_format(ab, " sock_type=\"unknown(%d)\"", ad->net.type); audit_log_format(ab, " protocol=%d", ad->net.protocol); if (ad->request & NET_PERMS_MASK) { audit_log_format(ab, " requested_mask="); aa_audit_perm_mask(ab, ad->request, NULL, 0, net_mask_names, NET_PERMS_MASK); if (ad->denied & NET_PERMS_MASK) { audit_log_format(ab, " denied_mask="); aa_audit_perm_mask(ab, ad->denied, NULL, 0, net_mask_names, NET_PERMS_MASK); } } if (ad->peer) { audit_log_format(ab, " peer="); aa_label_xaudit(ab, labels_ns(ad->subj_label), ad->peer, FLAGS_NONE, GFP_ATOMIC); } } /* Generic af perm */ int aa_profile_af_perm(struct aa_profile *profile, struct apparmor_audit_data *ad, u32 request, u16 family, int type) { struct aa_ruleset *rules = list_first_entry(&profile->rules, typeof(*rules), list); struct aa_perms perms = { }; aa_state_t state; __be16 buffer[2]; AA_BUG(family >= AF_MAX); AA_BUG(type < 0 || type >= SOCK_MAX); if (profile_unconfined(profile)) return 0; state = RULE_MEDIATES(rules, AA_CLASS_NET); if (!state) return 0; buffer[0] = cpu_to_be16(family); buffer[1] = cpu_to_be16((u16) type); state = aa_dfa_match_len(rules->policy->dfa, state, (char *) &buffer, 4); perms = *aa_lookup_perms(rules->policy, state); aa_apply_modes_to_perms(profile, &perms); return aa_check_perms(profile, &perms, request, ad, audit_net_cb); } int aa_af_perm(const struct cred *subj_cred, struct aa_label *label, const char *op, u32 request, u16 family, int type, int protocol) { struct aa_profile *profile; DEFINE_AUDIT_NET(ad, op, NULL, family, type, protocol); return fn_for_each_confined(label, profile, aa_profile_af_perm(profile, &ad, request, family, type)); } static int aa_label_sk_perm(const struct cred *subj_cred, struct aa_label *label, const char *op, u32 request, struct sock *sk) { struct aa_sk_ctx *ctx = SK_CTX(sk); int error = 0; AA_BUG(!label); AA_BUG(!sk); if (ctx->label != kernel_t && !unconfined(label)) { struct aa_profile *profile; DEFINE_AUDIT_SK(ad, op, sk); ad.subj_cred = subj_cred; error = fn_for_each_confined(label, profile, aa_profile_af_sk_perm(profile, &ad, request, sk)); } return error; } int aa_sk_perm(const char *op, u32 request, struct sock *sk) { struct aa_label *label; int error; AA_BUG(!sk); AA_BUG(in_interrupt()); /* TODO: switch to begin_current_label ???? */ label = begin_current_label_crit_section(); error = aa_label_sk_perm(current_cred(), label, op, request, sk); end_current_label_crit_section(label); return error; } int aa_sock_file_perm(const struct cred *subj_cred, struct aa_label *label, const char *op, u32 request, struct socket *sock) { AA_BUG(!label); AA_BUG(!sock); AA_BUG(!sock->sk); return aa_label_sk_perm(subj_cred, label, op, request, sock->sk); } #ifdef CONFIG_NETWORK_SECMARK static int apparmor_secmark_init(struct aa_secmark *secmark) { struct aa_label *label; if (secmark->label[0] == '*') { secmark->secid = AA_SECID_WILDCARD; return 0; } label = aa_label_strn_parse(&root_ns->unconfined->label, secmark->label, strlen(secmark->label), GFP_ATOMIC, false, false); if (IS_ERR(label)) return PTR_ERR(label); secmark->secid = label->secid; return 0; } static int aa_secmark_perm(struct aa_profile *profile, u32 request, u32 secid, struct apparmor_audit_data *ad) { int i, ret; struct aa_perms perms = { }; struct aa_ruleset *rules = list_first_entry(&profile->rules, typeof(*rules), list); if (rules->secmark_count == 0) return 0; for (i = 0; i < rules->secmark_count; i++) { if (!rules->secmark[i].secid) { ret = apparmor_secmark_init(&rules->secmark[i]); if (ret) return ret; } if (rules->secmark[i].secid == secid || rules->secmark[i].secid == AA_SECID_WILDCARD) { if (rules->secmark[i].deny) perms.deny = ALL_PERMS_MASK; else perms.allow = ALL_PERMS_MASK; if (rules->secmark[i].audit) perms.audit = ALL_PERMS_MASK; } } aa_apply_modes_to_perms(profile, &perms); return aa_check_perms(profile, &perms, request, ad, audit_net_cb); } int apparmor_secmark_check(struct aa_label *label, char *op, u32 request, u32 secid, const struct sock *sk) { struct aa_profile *profile; DEFINE_AUDIT_SK(ad, op, sk); return fn_for_each_confined(label, profile, aa_secmark_perm(profile, request, secid, &ad)); } #endif |
| 26 26 1 1 1 1 2 2 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 | // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2022, Intel Corporation. */ #include "protocol.h" #include "mib.h" #include "mptcp_pm_gen.h" void mptcp_free_local_addr_list(struct mptcp_sock *msk) { struct mptcp_pm_addr_entry *entry, *tmp; struct sock *sk = (struct sock *)msk; LIST_HEAD(free_list); if (!mptcp_pm_is_userspace(msk)) return; spin_lock_bh(&msk->pm.lock); list_splice_init(&msk->pm.userspace_pm_local_addr_list, &free_list); spin_unlock_bh(&msk->pm.lock); list_for_each_entry_safe(entry, tmp, &free_list, list) { sock_kfree_s(sk, entry, sizeof(*entry)); } } static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *entry, bool needs_id) { DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); struct mptcp_pm_addr_entry *match = NULL; struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *e; bool addr_match = false; bool id_match = false; int ret = -EINVAL; bitmap_zero(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); spin_lock_bh(&msk->pm.lock); list_for_each_entry(e, &msk->pm.userspace_pm_local_addr_list, list) { addr_match = mptcp_addresses_equal(&e->addr, &entry->addr, true); if (addr_match && entry->addr.id == 0 && needs_id) entry->addr.id = e->addr.id; id_match = (e->addr.id == entry->addr.id); if (addr_match && id_match) { match = e; break; } else if (addr_match || id_match) { break; } __set_bit(e->addr.id, id_bitmap); } if (!match && !addr_match && !id_match) { /* Memory for the entry is allocated from the * sock option buffer. */ e = sock_kmalloc(sk, sizeof(*e), GFP_ATOMIC); if (!e) { ret = -ENOMEM; goto append_err; } *e = *entry; if (!e->addr.id && needs_id) e->addr.id = find_next_zero_bit(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, 1); list_add_tail_rcu(&e->list, &msk->pm.userspace_pm_local_addr_list); msk->pm.local_addr_used++; ret = e->addr.id; } else if (match) { ret = entry->addr.id; } append_err: spin_unlock_bh(&msk->pm.lock); return ret; } /* If the subflow is closed from the other peer (not via a * subflow destroy command then), we want to keep the entry * not to assign the same ID to another address and to be * able to send RM_ADDR after the removal of the subflow. */ static int mptcp_userspace_pm_delete_local_addr(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *addr) { struct mptcp_pm_addr_entry *entry, *tmp; list_for_each_entry_safe(entry, tmp, &msk->pm.userspace_pm_local_addr_list, list) { if (mptcp_addresses_equal(&entry->addr, &addr->addr, false)) { /* TODO: a refcount is needed because the entry can * be used multiple times (e.g. fullmesh mode). */ list_del_rcu(&entry->list); kfree(entry); msk->pm.local_addr_used--; return 0; } } return -EINVAL; } static struct mptcp_pm_addr_entry * mptcp_userspace_pm_lookup_addr_by_id(struct mptcp_sock *msk, unsigned int id) { struct mptcp_pm_addr_entry *entry; list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { if (entry->addr.id == id) return entry; } return NULL; } int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex) { struct mptcp_pm_addr_entry *match; spin_lock_bh(&msk->pm.lock); match = mptcp_userspace_pm_lookup_addr_by_id(msk, id); spin_unlock_bh(&msk->pm.lock); if (match) { *flags = match->flags; *ifindex = match->ifindex; } return 0; } int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc) { struct mptcp_pm_addr_entry *entry = NULL, *e, new_entry; __be16 msk_sport = ((struct inet_sock *) inet_sk((struct sock *)msk))->inet_sport; spin_lock_bh(&msk->pm.lock); list_for_each_entry(e, &msk->pm.userspace_pm_local_addr_list, list) { if (mptcp_addresses_equal(&e->addr, skc, false)) { entry = e; break; } } spin_unlock_bh(&msk->pm.lock); if (entry) return entry->addr.id; memset(&new_entry, 0, sizeof(struct mptcp_pm_addr_entry)); new_entry.addr = *skc; new_entry.addr.id = 0; new_entry.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; if (new_entry.addr.port == msk_sport) new_entry.addr.port = 0; return mptcp_userspace_pm_append_new_local_addr(msk, &new_entry, true); } int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *addr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_pm_addr_entry addr_val; struct mptcp_sock *msk; int err = -EINVAL; struct sock *sk; u32 token_val; if (!addr || !token) { GENL_SET_ERR_MSG(info, "missing required inputs"); return err; } token_val = nla_get_u32(token); msk = mptcp_token_get_sock(sock_net(skb->sk), token_val); if (!msk) { NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); return err; } sk = (struct sock *)msk; if (!mptcp_pm_is_userspace(msk)) { GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); goto announce_err; } err = mptcp_pm_parse_entry(addr, info, true, &addr_val); if (err < 0) { GENL_SET_ERR_MSG(info, "error parsing local address"); goto announce_err; } if (addr_val.addr.id == 0 || !(addr_val.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { GENL_SET_ERR_MSG(info, "invalid addr id or flags"); err = -EINVAL; goto announce_err; } err = mptcp_userspace_pm_append_new_local_addr(msk, &addr_val, false); if (err < 0) { GENL_SET_ERR_MSG(info, "did not match address and id"); goto announce_err; } lock_sock(sk); spin_lock_bh(&msk->pm.lock); if (mptcp_pm_alloc_anno_list(msk, &addr_val.addr)) { msk->pm.add_addr_signaled++; mptcp_pm_announce_addr(msk, &addr_val.addr, false); mptcp_pm_nl_addr_send_ack(msk); } spin_unlock_bh(&msk->pm.lock); release_sock(sk); err = 0; announce_err: sock_put(sk); return err; } static int mptcp_userspace_pm_remove_id_zero_address(struct mptcp_sock *msk, struct genl_info *info) { struct mptcp_rm_list list = { .nr = 0 }; struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; bool has_id_0 = false; int err = -EINVAL; lock_sock(sk); mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->local_id) == 0) { has_id_0 = true; break; } } if (!has_id_0) { GENL_SET_ERR_MSG(info, "address with id 0 not found"); goto remove_err; } list.ids[list.nr++] = 0; spin_lock_bh(&msk->pm.lock); mptcp_pm_remove_addr(msk, &list); spin_unlock_bh(&msk->pm.lock); err = 0; remove_err: release_sock(sk); return err; } int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *id = info->attrs[MPTCP_PM_ATTR_LOC_ID]; struct mptcp_pm_addr_entry *match; struct mptcp_pm_addr_entry *entry; struct mptcp_sock *msk; LIST_HEAD(free_list); int err = -EINVAL; struct sock *sk; u32 token_val; u8 id_val; if (!id || !token) { GENL_SET_ERR_MSG(info, "missing required inputs"); return err; } id_val = nla_get_u8(id); token_val = nla_get_u32(token); msk = mptcp_token_get_sock(sock_net(skb->sk), token_val); if (!msk) { NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); return err; } sk = (struct sock *)msk; if (!mptcp_pm_is_userspace(msk)) { GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); goto out; } if (id_val == 0) { err = mptcp_userspace_pm_remove_id_zero_address(msk, info); goto out; } lock_sock(sk); match = mptcp_userspace_pm_lookup_addr_by_id(msk, id_val); if (!match) { GENL_SET_ERR_MSG(info, "address with specified id not found"); release_sock(sk); goto out; } list_move(&match->list, &free_list); mptcp_pm_remove_addrs(msk, &free_list); release_sock(sk); list_for_each_entry_safe(match, entry, &free_list, list) { sock_kfree_s(sk, match, sizeof(*match)); } err = 0; out: sock_put(sk); return err; } int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_pm_addr_entry local = { 0 }; struct mptcp_addr_info addr_r; struct mptcp_sock *msk; int err = -EINVAL; struct sock *sk; u32 token_val; if (!laddr || !raddr || !token) { GENL_SET_ERR_MSG(info, "missing required inputs"); return err; } token_val = nla_get_u32(token); msk = mptcp_token_get_sock(genl_info_net(info), token_val); if (!msk) { NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); return err; } sk = (struct sock *)msk; if (!mptcp_pm_is_userspace(msk)) { GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); goto create_err; } err = mptcp_pm_parse_entry(laddr, info, true, &local); if (err < 0) { NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); goto create_err; } if (local.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { GENL_SET_ERR_MSG(info, "invalid addr flags"); err = -EINVAL; goto create_err; } local.flags |= MPTCP_PM_ADDR_FLAG_SUBFLOW; err = mptcp_pm_parse_addr(raddr, info, &addr_r); if (err < 0) { NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); goto create_err; } if (!mptcp_pm_addr_families_match(sk, &local.addr, &addr_r)) { GENL_SET_ERR_MSG(info, "families mismatch"); err = -EINVAL; goto create_err; } err = mptcp_userspace_pm_append_new_local_addr(msk, &local, false); if (err < 0) { GENL_SET_ERR_MSG(info, "did not match address and id"); goto create_err; } lock_sock(sk); err = __mptcp_subflow_connect(sk, &local.addr, &addr_r); release_sock(sk); spin_lock_bh(&msk->pm.lock); if (err) mptcp_userspace_pm_delete_local_addr(msk, &local); else msk->pm.subflows++; spin_unlock_bh(&msk->pm.lock); create_err: sock_put(sk); return err; } static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk, const struct mptcp_addr_info *local, const struct mptcp_addr_info *remote) { struct mptcp_subflow_context *subflow; if (local->family != remote->family) return NULL; mptcp_for_each_subflow(msk, subflow) { const struct inet_sock *issk; struct sock *ssk; ssk = mptcp_subflow_tcp_sock(subflow); if (local->family != ssk->sk_family) continue; issk = inet_sk(ssk); switch (ssk->sk_family) { case AF_INET: if (issk->inet_saddr != local->addr.s_addr || issk->inet_daddr != remote->addr.s_addr) continue; break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { const struct ipv6_pinfo *pinfo = inet6_sk(ssk); if (!ipv6_addr_equal(&local->addr6, &pinfo->saddr) || !ipv6_addr_equal(&remote->addr6, &ssk->sk_v6_daddr)) continue; break; } #endif default: continue; } if (issk->inet_sport == local->port && issk->inet_dport == remote->port) return ssk; } return NULL; } int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_addr_info addr_l; struct mptcp_addr_info addr_r; struct mptcp_sock *msk; struct sock *sk, *ssk; int err = -EINVAL; u32 token_val; if (!laddr || !raddr || !token) { GENL_SET_ERR_MSG(info, "missing required inputs"); return err; } token_val = nla_get_u32(token); msk = mptcp_token_get_sock(genl_info_net(info), token_val); if (!msk) { NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); return err; } sk = (struct sock *)msk; if (!mptcp_pm_is_userspace(msk)) { GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); goto destroy_err; } err = mptcp_pm_parse_addr(laddr, info, &addr_l); if (err < 0) { NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); goto destroy_err; } err = mptcp_pm_parse_addr(raddr, info, &addr_r); if (err < 0) { NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); goto destroy_err; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (addr_l.family == AF_INET && ipv6_addr_v4mapped(&addr_r.addr6)) { ipv6_addr_set_v4mapped(addr_l.addr.s_addr, &addr_l.addr6); addr_l.family = AF_INET6; } if (addr_r.family == AF_INET && ipv6_addr_v4mapped(&addr_l.addr6)) { ipv6_addr_set_v4mapped(addr_r.addr.s_addr, &addr_r.addr6); addr_r.family = AF_INET6; } #endif if (addr_l.family != addr_r.family) { GENL_SET_ERR_MSG(info, "address families do not match"); err = -EINVAL; goto destroy_err; } if (!addr_l.port || !addr_r.port) { GENL_SET_ERR_MSG(info, "missing local or remote port"); err = -EINVAL; goto destroy_err; } lock_sock(sk); ssk = mptcp_nl_find_ssk(msk, &addr_l, &addr_r); if (ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_pm_addr_entry entry = { .addr = addr_l }; spin_lock_bh(&msk->pm.lock); mptcp_userspace_pm_delete_local_addr(msk, &entry); spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN); mptcp_close_ssk(sk, ssk, subflow); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW); err = 0; } else { err = -ESRCH; } release_sock(sk); destroy_err: sock_put(sk); return err; } int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info) { struct mptcp_pm_addr_entry loc = { .addr = { .family = AF_UNSPEC }, }; struct mptcp_pm_addr_entry rem = { .addr = { .family = AF_UNSPEC }, }; struct nlattr *attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct net *net = sock_net(skb->sk); struct mptcp_sock *msk; int ret = -EINVAL; struct sock *sk; u32 token_val; u8 bkup = 0; token_val = nla_get_u32(token); msk = mptcp_token_get_sock(net, token_val); if (!msk) { NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); return ret; } sk = (struct sock *)msk; if (!mptcp_pm_is_userspace(msk)) { GENL_SET_ERR_MSG(info, "userspace PM not selected"); goto set_flags_err; } ret = mptcp_pm_parse_entry(attr, info, false, &loc); if (ret < 0) goto set_flags_err; if (attr_rem) { ret = mptcp_pm_parse_entry(attr_rem, info, false, &rem); if (ret < 0) goto set_flags_err; } if (loc.addr.family == AF_UNSPEC || rem.addr.family == AF_UNSPEC) { GENL_SET_ERR_MSG(info, "invalid address families"); ret = -EINVAL; goto set_flags_err; } if (loc.flags & MPTCP_PM_ADDR_FLAG_BACKUP) bkup = 1; lock_sock(sk); ret = mptcp_pm_nl_mp_prio_send_ack(msk, &loc.addr, &rem.addr, bkup); release_sock(sk); set_flags_err: sock_put(sk); return ret; } int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb) { struct id_bitmap { DECLARE_BITMAP(map, MPTCP_PM_MAX_ADDR_ID + 1); } *bitmap; const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(msg->sk); struct mptcp_pm_addr_entry *entry; struct mptcp_sock *msk; struct nlattr *token; int ret = -EINVAL; struct sock *sk; void *hdr; bitmap = (struct id_bitmap *)cb->ctx; token = info->attrs[MPTCP_PM_ATTR_TOKEN]; msk = mptcp_token_get_sock(net, nla_get_u32(token)); if (!msk) { NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); return ret; } sk = (struct sock *)msk; if (!mptcp_pm_is_userspace(msk)) { GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); goto out; } lock_sock(sk); spin_lock_bh(&msk->pm.lock); list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { if (test_bit(entry->addr.id, bitmap->map)) continue; hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &mptcp_genl_family, NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); if (!hdr) break; if (mptcp_nl_fill_addr(msg, entry) < 0) { genlmsg_cancel(msg, hdr); break; } __set_bit(entry->addr.id, bitmap->map); genlmsg_end(msg, hdr); } spin_unlock_bh(&msk->pm.lock); release_sock(sk); ret = msg->len; out: sock_put(sk); return ret; } int mptcp_userspace_pm_get_addr(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct mptcp_pm_addr_entry addr, *entry; struct net *net = sock_net(skb->sk); struct mptcp_sock *msk; struct sk_buff *msg; int ret = -EINVAL; struct sock *sk; void *reply; msk = mptcp_token_get_sock(net, nla_get_u32(token)); if (!msk) { NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); return ret; } sk = (struct sock *)msk; if (!mptcp_pm_is_userspace(msk)) { GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); goto out; } ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) goto out; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto out; } reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, info->genlhdr->cmd); if (!reply) { GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); ret = -EMSGSIZE; goto fail; } lock_sock(sk); spin_lock_bh(&msk->pm.lock); entry = mptcp_userspace_pm_lookup_addr_by_id(msk, addr.addr.id); if (!entry) { GENL_SET_ERR_MSG(info, "address not found"); ret = -EINVAL; goto unlock_fail; } ret = mptcp_nl_fill_addr(msg, entry); if (ret) goto unlock_fail; genlmsg_end(msg, reply); ret = genlmsg_reply(msg, info); spin_unlock_bh(&msk->pm.lock); release_sock(sk); sock_put(sk); return ret; unlock_fail: spin_unlock_bh(&msk->pm.lock); release_sock(sk); fail: nlmsg_free(msg); out: sock_put(sk); return ret; } |
| 130 129 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HEX_H #define _LINUX_HEX_H #include <linux/types.h> extern const char hex_asc[]; #define hex_asc_lo(x) hex_asc[((x) & 0x0f)] #define hex_asc_hi(x) hex_asc[((x) & 0xf0) >> 4] static inline char *hex_byte_pack(char *buf, u8 byte) { *buf++ = hex_asc_hi(byte); *buf++ = hex_asc_lo(byte); return buf; } extern const char hex_asc_upper[]; #define hex_asc_upper_lo(x) hex_asc_upper[((x) & 0x0f)] #define hex_asc_upper_hi(x) hex_asc_upper[((x) & 0xf0) >> 4] static inline char *hex_byte_pack_upper(char *buf, u8 byte) { *buf++ = hex_asc_upper_hi(byte); *buf++ = hex_asc_upper_lo(byte); return buf; } extern int hex_to_bin(unsigned char ch); extern int __must_check hex2bin(u8 *dst, const char *src, size_t count); extern char *bin2hex(char *dst, const void *src, size_t count); bool mac_pton(const char *s, u8 *mac); #endif |
| 2 2 2 8 8 7 11 11 5 9 9 3 3 3 2 2 2 2 3 1 3 1 2 2 2 3 3 3 3 2 2 2 28 28 28 28 4 4 4 4 4 4 13 13 13 13 13 13 4 4 13 3 3 2 2 1 2 2 2 2 2 1 1 1 2 1 1 1 1 2 3 4 4 4 4 4 2 4 4 2 1 3 6 7 7 7 3 7 3 3 3 3 7 7 5 7 10 10 10 9 8 8 3 1 10 2 2 1 2 3 3 3 3 2 2 1 1 2 6 6 6 6 6 5 5 4 5 2 4 1 1 1 1 3 2 2 1 2 2 2 2 4 2 7 8 4 4 4 4 4 4 8 8 10 9 8 6 4 6 3 6 2 8 8 8 1 2 7 7 7 8 7 8 8 7 8 8 8 6 8 8 8 4 7 7 7 2 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/nospec.h> #include <linux/hugetlb.h> #include <linux/compat.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "alloc_cache.h" #include "openclose.h" #include "rsrc.h" #include "memmap.h" struct io_rsrc_update { struct file *file; u64 arg; u32 nr_args; u32 offset; }; static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct io_mapped_ubuf **pimu, struct page **last_hpage); /* only define max */ #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) static const struct io_mapped_ubuf dummy_ubuf = { /* set invalid range, so io_import_fixed() fails meeting it */ .ubuf = -1UL, .ubuf_end = 0, }; int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; if (!nr_pages) return 0; /* Don't allow more pages than we can safely lock */ page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; cur_pages = atomic_long_read(&user->locked_vm); do { new_pages = cur_pages + nr_pages; if (new_pages > page_limit) return -ENOMEM; } while (!atomic_long_try_cmpxchg(&user->locked_vm, &cur_pages, new_pages)); return 0; } static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { if (ctx->user) __io_unaccount_mem(ctx->user, nr_pages); if (ctx->mm_account) atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); } static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { int ret; if (ctx->user) { ret = __io_account_mem(ctx->user, nr_pages); if (ret) return ret; } if (ctx->mm_account) atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); return 0; } static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, void __user *arg, unsigned index) { struct iovec __user *src; #ifdef CONFIG_COMPAT if (ctx->compat) { struct compat_iovec __user *ciovs; struct compat_iovec ciov; ciovs = (struct compat_iovec __user *) arg; if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) return -EFAULT; dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); dst->iov_len = ciov.iov_len; return 0; } #endif src = (struct iovec __user *) arg; if (copy_from_user(dst, &src[index], sizeof(*dst))) return -EFAULT; return 0; } static int io_buffer_validate(struct iovec *iov) { unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); /* * Don't impose further limits on the size and buffer * constraints here, we'll -EINVAL later when IO is * submitted if they are wrong. */ if (!iov->iov_base) return iov->iov_len ? -EFAULT : 0; if (!iov->iov_len) return -EFAULT; /* arbitrary limit, but we need something */ if (iov->iov_len > SZ_1G) return -EFAULT; if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) return -EOVERFLOW; return 0; } static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) { struct io_mapped_ubuf *imu = *slot; unsigned int i; if (imu != &dummy_ubuf) { for (i = 0; i < imu->nr_bvecs; i++) unpin_user_page(imu->bvec[i].bv_page); if (imu->acct_pages) io_unaccount_mem(ctx, imu->acct_pages); kvfree(imu); } *slot = NULL; } static void io_rsrc_put_work(struct io_rsrc_node *node) { struct io_rsrc_put *prsrc = &node->item; if (prsrc->tag) io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0); switch (node->type) { case IORING_RSRC_FILE: fput(prsrc->file); break; case IORING_RSRC_BUFFER: io_rsrc_buf_put(node->ctx, prsrc); break; default: WARN_ON_ONCE(1); break; } } void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node)) kfree(node); } void io_rsrc_node_ref_zero(struct io_rsrc_node *node) __must_hold(&node->ctx->uring_lock) { struct io_ring_ctx *ctx = node->ctx; while (!list_empty(&ctx->rsrc_ref_list)) { node = list_first_entry(&ctx->rsrc_ref_list, struct io_rsrc_node, node); /* recycle ref nodes in order */ if (node->refs) break; list_del(&node->node); if (likely(!node->empty)) io_rsrc_put_work(node); io_rsrc_node_destroy(ctx, node); } if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce)) wake_up_all(&ctx->rsrc_quiesce_wq); } struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) { struct io_rsrc_node *ref_node; ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache); if (!ref_node) { ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); if (!ref_node) return NULL; } ref_node->ctx = ctx; ref_node->empty = 0; ref_node->refs = 1; return ref_node; } __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx) { struct io_rsrc_node *backup; DEFINE_WAIT(we); int ret; /* As We may drop ->uring_lock, other task may have started quiesce */ if (data->quiesce) return -ENXIO; backup = io_rsrc_node_alloc(ctx); if (!backup) return -ENOMEM; ctx->rsrc_node->empty = true; ctx->rsrc_node->type = -1; list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list); io_put_rsrc_node(ctx, ctx->rsrc_node); ctx->rsrc_node = backup; if (list_empty(&ctx->rsrc_ref_list)) return 0; if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { atomic_set(&ctx->cq_wait_nr, 1); smp_mb(); } ctx->rsrc_quiesce++; data->quiesce = true; do { prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE); mutex_unlock(&ctx->uring_lock); ret = io_run_task_work_sig(ctx); if (ret < 0) { __set_current_state(TASK_RUNNING); mutex_lock(&ctx->uring_lock); if (list_empty(&ctx->rsrc_ref_list)) ret = 0; break; } schedule(); __set_current_state(TASK_RUNNING); mutex_lock(&ctx->uring_lock); ret = 0; } while (!list_empty(&ctx->rsrc_ref_list)); finish_wait(&ctx->rsrc_quiesce_wq, &we); data->quiesce = false; ctx->rsrc_quiesce--; if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { atomic_set(&ctx->cq_wait_nr, 0); smp_mb(); } return ret; } static void io_free_page_table(void **table, size_t size) { unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); for (i = 0; i < nr_tables; i++) kfree(table[i]); kfree(table); } static void io_rsrc_data_free(struct io_rsrc_data *data) { size_t size = data->nr * sizeof(data->tags[0][0]); if (data->tags) io_free_page_table((void **)data->tags, size); kfree(data); } static __cold void **io_alloc_page_table(size_t size) { unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); size_t init_size = size; void **table; table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); if (!table) return NULL; for (i = 0; i < nr_tables; i++) { unsigned int this_size = min_t(size_t, size, PAGE_SIZE); table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); if (!table[i]) { io_free_page_table(table, init_size); return NULL; } size -= this_size; } return table; } __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type, u64 __user *utags, unsigned nr, struct io_rsrc_data **pdata) { struct io_rsrc_data *data; int ret = 0; unsigned i; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0])); if (!data->tags) { kfree(data); return -ENOMEM; } data->nr = nr; data->ctx = ctx; data->rsrc_type = type; if (utags) { ret = -EFAULT; for (i = 0; i < nr; i++) { u64 *tag_slot = io_get_tag_slot(data, i); if (copy_from_user(tag_slot, &utags[i], sizeof(*tag_slot))) goto fail; } } *pdata = data; return 0; fail: io_rsrc_data_free(data); return ret; } static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned nr_args) { u64 __user *tags = u64_to_user_ptr(up->tags); __s32 __user *fds = u64_to_user_ptr(up->data); struct io_rsrc_data *data = ctx->file_data; struct io_fixed_file *file_slot; int fd, i, err = 0; unsigned int done; if (!ctx->file_data) return -ENXIO; if (up->offset + nr_args > ctx->nr_user_files) return -EINVAL; for (done = 0; done < nr_args; done++) { u64 tag = 0; if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || copy_from_user(&fd, &fds[done], sizeof(fd))) { err = -EFAULT; break; } if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { err = -EINVAL; break; } if (fd == IORING_REGISTER_FILES_SKIP) continue; i = array_index_nospec(up->offset + done, ctx->nr_user_files); file_slot = io_fixed_file_slot(&ctx->file_table, i); if (file_slot->file_ptr) { err = io_queue_rsrc_removal(data, i, io_slot_file(file_slot)); if (err) break; file_slot->file_ptr = 0; io_file_bitmap_clear(&ctx->file_table, i); } if (fd != -1) { struct file *file = fget(fd); if (!file) { err = -EBADF; break; } /* * Don't allow io_uring instances to be registered. */ if (io_is_uring_fops(file)) { fput(file); err = -EBADF; break; } *io_get_tag_slot(data, i) = tag; io_fixed_file_set(file_slot, file); io_file_bitmap_set(&ctx->file_table, i); } } return done ? done : err; } static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned int nr_args) { u64 __user *tags = u64_to_user_ptr(up->tags); struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); struct page *last_hpage = NULL; __u32 done; int i, err; if (!ctx->buf_data) return -ENXIO; if (up->offset + nr_args > ctx->nr_user_bufs) return -EINVAL; for (done = 0; done < nr_args; done++) { struct io_mapped_ubuf *imu; u64 tag = 0; err = io_copy_iov(ctx, &iov, iovs, done); if (err) break; if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { err = -EFAULT; break; } err = io_buffer_validate(&iov); if (err) break; if (!iov.iov_base && tag) { err = -EINVAL; break; } err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); if (err) break; i = array_index_nospec(up->offset + done, ctx->nr_user_bufs); if (ctx->user_bufs[i] != &dummy_ubuf) { err = io_queue_rsrc_removal(ctx->buf_data, i, ctx->user_bufs[i]); if (unlikely(err)) { io_buffer_unmap(ctx, &imu); break; } ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf; } ctx->user_bufs[i] = imu; *io_get_tag_slot(ctx->buf_data, i) = tag; } return done ? done : err; } static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, struct io_uring_rsrc_update2 *up, unsigned nr_args) { __u32 tmp; lockdep_assert_held(&ctx->uring_lock); if (check_add_overflow(up->offset, nr_args, &tmp)) return -EOVERFLOW; switch (type) { case IORING_RSRC_FILE: return __io_sqe_files_update(ctx, up, nr_args); case IORING_RSRC_BUFFER: return __io_sqe_buffers_update(ctx, up, nr_args); } return -EINVAL; } int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { struct io_uring_rsrc_update2 up; if (!nr_args) return -EINVAL; memset(&up, 0, sizeof(up)); if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) return -EFAULT; if (up.resv || up.resv2) return -EINVAL; return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); } int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, unsigned size, unsigned type) { struct io_uring_rsrc_update2 up; if (size != sizeof(up)) return -EINVAL; if (copy_from_user(&up, arg, sizeof(up))) return -EFAULT; if (!up.nr || up.resv || up.resv2) return -EINVAL; return __io_register_rsrc_update(ctx, type, &up, up.nr); } __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type) { struct io_uring_rsrc_register rr; /* keep it extendible */ if (size != sizeof(rr)) return -EINVAL; memset(&rr, 0, sizeof(rr)); if (copy_from_user(&rr, arg, size)) return -EFAULT; if (!rr.nr || rr.resv2) return -EINVAL; if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE) return -EINVAL; switch (type) { case IORING_RSRC_FILE: if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) break; return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); case IORING_RSRC_BUFFER: if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) break; return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); } return -EINVAL; } int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; if (sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; up->offset = READ_ONCE(sqe->off); up->nr_args = READ_ONCE(sqe->len); if (!up->nr_args) return -EINVAL; up->arg = READ_ONCE(sqe->addr); return 0; } static int io_files_update_with_index_alloc(struct io_kiocb *req, unsigned int issue_flags) { struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); __s32 __user *fds = u64_to_user_ptr(up->arg); unsigned int done; struct file *file; int ret, fd; if (!req->ctx->file_data) return -ENXIO; for (done = 0; done < up->nr_args; done++) { if (copy_from_user(&fd, &fds[done], sizeof(fd))) { ret = -EFAULT; break; } file = fget(fd); if (!file) { ret = -EBADF; break; } ret = io_fixed_fd_install(req, issue_flags, file, IORING_FILE_INDEX_ALLOC); if (ret < 0) break; if (copy_to_user(&fds[done], &ret, sizeof(ret))) { __io_close_fixed(req->ctx, issue_flags, ret); ret = -EFAULT; break; } } if (done) return done; return ret; } int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); struct io_ring_ctx *ctx = req->ctx; struct io_uring_rsrc_update2 up2; int ret; up2.offset = up->offset; up2.data = up->arg; up2.nr = 0; up2.tags = 0; up2.resv = 0; up2.resv2 = 0; if (up->offset == IORING_FILE_INDEX_ALLOC) { ret = io_files_update_with_index_alloc(req, issue_flags); } else { io_ring_submit_lock(ctx, issue_flags); ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up2, up->nr_args); io_ring_submit_unlock(ctx, issue_flags); } if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; } int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc) { struct io_ring_ctx *ctx = data->ctx; struct io_rsrc_node *node = ctx->rsrc_node; u64 *tag_slot = io_get_tag_slot(data, idx); ctx->rsrc_node = io_rsrc_node_alloc(ctx); if (unlikely(!ctx->rsrc_node)) { ctx->rsrc_node = node; return -ENOMEM; } node->item.rsrc = rsrc; node->type = data->rsrc_type; node->item.tag = *tag_slot; *tag_slot = 0; list_add_tail(&node->node, &ctx->rsrc_ref_list); io_put_rsrc_node(ctx, node); return 0; } void __io_sqe_files_unregister(struct io_ring_ctx *ctx) { int i; for (i = 0; i < ctx->nr_user_files; i++) { struct file *file = io_file_from_index(&ctx->file_table, i); if (!file) continue; io_file_bitmap_clear(&ctx->file_table, i); fput(file); } io_free_file_tables(&ctx->file_table); io_file_table_set_alloc_range(ctx, 0, 0); io_rsrc_data_free(ctx->file_data); ctx->file_data = NULL; ctx->nr_user_files = 0; } int io_sqe_files_unregister(struct io_ring_ctx *ctx) { unsigned nr = ctx->nr_user_files; int ret; if (!ctx->file_data) return -ENXIO; /* * Quiesce may unlock ->uring_lock, and while it's not held * prevent new requests using the table. */ ctx->nr_user_files = 0; ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); ctx->nr_user_files = nr; if (!ret) __io_sqe_files_unregister(ctx); return ret; } int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags) { __s32 __user *fds = (__s32 __user *) arg; struct file *file; int fd, ret; unsigned i; if (ctx->file_data) return -EBUSY; if (!nr_args) return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; if (nr_args > rlimit(RLIMIT_NOFILE)) return -EMFILE; ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args, &ctx->file_data); if (ret) return ret; if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { io_rsrc_data_free(ctx->file_data); ctx->file_data = NULL; return -ENOMEM; } for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { struct io_fixed_file *file_slot; if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) { ret = -EFAULT; goto fail; } /* allow sparse sets */ if (!fds || fd == -1) { ret = -EINVAL; if (unlikely(*io_get_tag_slot(ctx->file_data, i))) goto fail; continue; } file = fget(fd); ret = -EBADF; if (unlikely(!file)) goto fail; /* * Don't allow io_uring instances to be registered. */ if (io_is_uring_fops(file)) { fput(file); goto fail; } file_slot = io_fixed_file_slot(&ctx->file_table, i); io_fixed_file_set(file_slot, file); io_file_bitmap_set(&ctx->file_table, i); } /* default it to the whole table */ io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files); return 0; fail: __io_sqe_files_unregister(ctx); return ret; } static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) { io_buffer_unmap(ctx, &prsrc->buf); prsrc->buf = NULL; } void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) { unsigned int i; for (i = 0; i < ctx->nr_user_bufs; i++) io_buffer_unmap(ctx, &ctx->user_bufs[i]); kfree(ctx->user_bufs); io_rsrc_data_free(ctx->buf_data); ctx->user_bufs = NULL; ctx->buf_data = NULL; ctx->nr_user_bufs = 0; } int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) { unsigned nr = ctx->nr_user_bufs; int ret; if (!ctx->buf_data) return -ENXIO; /* * Quiesce may unlock ->uring_lock, and while it's not held * prevent new requests using the table. */ ctx->nr_user_bufs = 0; ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); ctx->nr_user_bufs = nr; if (!ret) __io_sqe_buffers_unregister(ctx); return ret; } /* * Not super efficient, but this is just a registration time. And we do cache * the last compound head, so generally we'll only do a full search if we don't * match that one. * * We check if the given compound head page has already been accounted, to * avoid double accounting it. This allows us to account the full size of the * page, not just the constituent pages of a huge page. */ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, int nr_pages, struct page *hpage) { int i, j; /* check current page array */ for (i = 0; i < nr_pages; i++) { if (!PageCompound(pages[i])) continue; if (compound_head(pages[i]) == hpage) return true; } /* check previously registered pages */ for (i = 0; i < ctx->nr_user_bufs; i++) { struct io_mapped_ubuf *imu = ctx->user_bufs[i]; for (j = 0; j < imu->nr_bvecs; j++) { if (!PageCompound(imu->bvec[j].bv_page)) continue; if (compound_head(imu->bvec[j].bv_page) == hpage) return true; } } return false; } static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, int nr_pages, struct io_mapped_ubuf *imu, struct page **last_hpage) { int i, ret; imu->acct_pages = 0; for (i = 0; i < nr_pages; i++) { if (!PageCompound(pages[i])) { imu->acct_pages++; } else { struct page *hpage; hpage = compound_head(pages[i]); if (hpage == *last_hpage) continue; *last_hpage = hpage; if (headpage_already_acct(ctx, pages, i, hpage)) continue; imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; } } if (!imu->acct_pages) return 0; ret = io_account_mem(ctx, imu->acct_pages); if (ret) imu->acct_pages = 0; return ret; } static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct io_mapped_ubuf **pimu, struct page **last_hpage) { struct io_mapped_ubuf *imu = NULL; struct page **pages = NULL; unsigned long off; size_t size; int ret, nr_pages, i; struct folio *folio = NULL; *pimu = (struct io_mapped_ubuf *)&dummy_ubuf; if (!iov->iov_base) return 0; ret = -ENOMEM; pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, &nr_pages); if (IS_ERR(pages)) { ret = PTR_ERR(pages); pages = NULL; goto done; } /* If it's a huge page, try to coalesce them into a single bvec entry */ if (nr_pages > 1) { folio = page_folio(pages[0]); for (i = 1; i < nr_pages; i++) { /* * Pages must be consecutive and on the same folio for * this to work */ if (page_folio(pages[i]) != folio || pages[i] != pages[i - 1] + 1) { folio = NULL; break; } } if (folio) { /* * The pages are bound to the folio, it doesn't * actually unpin them but drops all but one reference, * which is usually put down by io_buffer_unmap(). * Note, needs a better helper. */ unpin_user_pages(&pages[1], nr_pages - 1); nr_pages = 1; } } imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); if (!imu) goto done; ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); if (ret) { unpin_user_pages(pages, nr_pages); goto done; } off = (unsigned long) iov->iov_base & ~PAGE_MASK; size = iov->iov_len; /* store original address for later verification */ imu->ubuf = (unsigned long) iov->iov_base; imu->ubuf_end = imu->ubuf + iov->iov_len; imu->nr_bvecs = nr_pages; *pimu = imu; ret = 0; if (folio) { bvec_set_page(&imu->bvec[0], pages[0], size, off); goto done; } for (i = 0; i < nr_pages; i++) { size_t vec_len; vec_len = min_t(size_t, size, PAGE_SIZE - off); bvec_set_page(&imu->bvec[i], pages[i], vec_len, off); off = 0; size -= vec_len; } done: if (ret) kvfree(imu); kvfree(pages); return ret; } static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) { ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); return ctx->user_bufs ? 0 : -ENOMEM; } int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int nr_args, u64 __user *tags) { struct page *last_hpage = NULL; struct io_rsrc_data *data; int i, ret; struct iovec iov; BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); if (ctx->user_bufs) return -EBUSY; if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) return -EINVAL; ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data); if (ret) return ret; ret = io_buffers_map_alloc(ctx, nr_args); if (ret) { io_rsrc_data_free(data); return ret; } for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { if (arg) { ret = io_copy_iov(ctx, &iov, arg, i); if (ret) break; ret = io_buffer_validate(&iov); if (ret) break; } else { memset(&iov, 0, sizeof(iov)); } if (!iov.iov_base && *io_get_tag_slot(data, i)) { ret = -EINVAL; break; } ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], &last_hpage); if (ret) break; } WARN_ON_ONCE(ctx->buf_data); ctx->buf_data = data; if (ret) __io_sqe_buffers_unregister(ctx); return ret; } int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, u64 buf_addr, size_t len) { u64 buf_end; size_t offset; if (WARN_ON_ONCE(!imu)) return -EFAULT; if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) return -EFAULT; /* not inside the mapped region */ if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) return -EFAULT; /* * Might not be a start of buffer, set size appropriately * and advance us to the beginning. */ offset = buf_addr - imu->ubuf; iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); if (offset) { /* * Don't use iov_iter_advance() here, as it's really slow for * using the latter parts of a big fixed buffer - it iterates * over each segment manually. We can cheat a bit here, because * we know that: * * 1) it's a BVEC iter, we set it up * 2) all bvecs are PAGE_SIZE in size, except potentially the * first and last bvec * * So just find our index, and adjust the iterator afterwards. * If the offset is within the first bvec (or the whole first * bvec, just use iov_iter_advance(). This makes it easier * since we can just skip the first segment, which may not * be PAGE_SIZE aligned. */ const struct bio_vec *bvec = imu->bvec; if (offset < bvec->bv_len) { /* * Note, huge pages buffers consists of one large * bvec entry and should always go this way. The other * branch doesn't expect non PAGE_SIZE'd chunks. */ iter->bvec = bvec; iter->count -= offset; iter->iov_offset = offset; } else { unsigned long seg_skip; /* skip first vec */ offset -= bvec->bv_len; seg_skip = 1 + (offset >> PAGE_SHIFT); iter->bvec = bvec + seg_skip; iter->nr_segs -= seg_skip; iter->count -= bvec->bv_len + offset; iter->iov_offset = offset & ~PAGE_MASK; } } return 0; } |
| 42 561 44 92 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Descending-priority-sorted double-linked list * * (C) 2002-2003 Intel Corp * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>. * * 2001-2005 (c) MontaVista Software, Inc. * Daniel Walker <dwalker@mvista.com> * * (C) 2005 Thomas Gleixner <tglx@linutronix.de> * * Simplifications of the original code by * Oleg Nesterov <oleg@tv-sign.ru> * * Based on simple lists (include/linux/list.h). * * This is a priority-sorted list of nodes; each node has a * priority from INT_MIN (highest) to INT_MAX (lowest). * * Addition is O(K), removal is O(1), change of priority of a node is * O(K) and K is the number of RT priority levels used in the system. * (1 <= K <= 99) * * This list is really a list of lists: * * - The tier 1 list is the prio_list, different priority nodes. * * - The tier 2 list is the node_list, serialized nodes. * * Simple ASCII art explanation: * * pl:prio_list (only for plist_node) * nl:node_list * HEAD| NODE(S) * | * ||------------------------------------| * ||->|pl|<->|pl|<--------------->|pl|<-| * | |10| |21| |21| |21| |40| (prio) * | | | | | | | | | | | * | | | | | | | | | | | * |->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<-| * |-------------------------------------------| * * The nodes on the prio_list list are sorted by priority to simplify * the insertion of new nodes. There are no nodes with duplicate * priorites on the list. * * The nodes on the node_list are ordered by priority and can contain * entries which have the same priority. Those entries are ordered * FIFO * * Addition means: look for the prio_list node in the prio_list * for the priority of the node and insert it before the node_list * entry of the next prio_list node. If it is the first node of * that priority, add it to the prio_list in the right position and * insert it into the serialized node_list list * * Removal means remove it from the node_list and remove it from * the prio_list if the node_list list_head is non empty. In case * of removal from the prio_list it must be checked whether other * entries of the same priority are on the list or not. If there * is another entry of the same priority then this entry has to * replace the removed entry on the prio_list. If the entry which * is removed is the only entry of this priority then a simple * remove from both list is sufficient. * * INT_MIN is the highest priority, 0 is the medium highest, INT_MAX * is lowest priority. * * No locking is done, up to the caller. */ #ifndef _LINUX_PLIST_H_ #define _LINUX_PLIST_H_ #include <linux/container_of.h> #include <linux/list.h> #include <linux/plist_types.h> #include <asm/bug.h> /** * PLIST_HEAD_INIT - static struct plist_head initializer * @head: struct plist_head variable name */ #define PLIST_HEAD_INIT(head) \ { \ .node_list = LIST_HEAD_INIT((head).node_list) \ } /** * PLIST_HEAD - declare and init plist_head * @head: name for struct plist_head variable */ #define PLIST_HEAD(head) \ struct plist_head head = PLIST_HEAD_INIT(head) /** * PLIST_NODE_INIT - static struct plist_node initializer * @node: struct plist_node variable name * @__prio: initial node priority */ #define PLIST_NODE_INIT(node, __prio) \ { \ .prio = (__prio), \ .prio_list = LIST_HEAD_INIT((node).prio_list), \ .node_list = LIST_HEAD_INIT((node).node_list), \ } /** * plist_head_init - dynamic struct plist_head initializer * @head: &struct plist_head pointer */ static inline void plist_head_init(struct plist_head *head) { INIT_LIST_HEAD(&head->node_list); } /** * plist_node_init - Dynamic struct plist_node initializer * @node: &struct plist_node pointer * @prio: initial node priority */ static inline void plist_node_init(struct plist_node *node, int prio) { node->prio = prio; INIT_LIST_HEAD(&node->prio_list); INIT_LIST_HEAD(&node->node_list); } extern void plist_add(struct plist_node *node, struct plist_head *head); extern void plist_del(struct plist_node *node, struct plist_head *head); extern void plist_requeue(struct plist_node *node, struct plist_head *head); /** * plist_for_each - iterate over the plist * @pos: the type * to use as a loop counter * @head: the head for your list */ #define plist_for_each(pos, head) \ list_for_each_entry(pos, &(head)->node_list, node_list) /** * plist_for_each_continue - continue iteration over the plist * @pos: the type * to use as a loop cursor * @head: the head for your list * * Continue to iterate over plist, continuing after the current position. */ #define plist_for_each_continue(pos, head) \ list_for_each_entry_continue(pos, &(head)->node_list, node_list) /** * plist_for_each_safe - iterate safely over a plist of given type * @pos: the type * to use as a loop counter * @n: another type * to use as temporary storage * @head: the head for your list * * Iterate over a plist of given type, safe against removal of list entry. */ #define plist_for_each_safe(pos, n, head) \ list_for_each_entry_safe(pos, n, &(head)->node_list, node_list) /** * plist_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop counter * @head: the head for your list * @mem: the name of the list_head within the struct */ #define plist_for_each_entry(pos, head, mem) \ list_for_each_entry(pos, &(head)->node_list, mem.node_list) /** * plist_for_each_entry_continue - continue iteration over list of given type * @pos: the type * to use as a loop cursor * @head: the head for your list * @m: the name of the list_head within the struct * * Continue to iterate over list of given type, continuing after * the current position. */ #define plist_for_each_entry_continue(pos, head, m) \ list_for_each_entry_continue(pos, &(head)->node_list, m.node_list) /** * plist_for_each_entry_safe - iterate safely over list of given type * @pos: the type * to use as a loop counter * @n: another type * to use as temporary storage * @head: the head for your list * @m: the name of the list_head within the struct * * Iterate over list of given type, safe against removal of list entry. */ #define plist_for_each_entry_safe(pos, n, head, m) \ list_for_each_entry_safe(pos, n, &(head)->node_list, m.node_list) /** * plist_head_empty - return !0 if a plist_head is empty * @head: &struct plist_head pointer */ static inline int plist_head_empty(const struct plist_head *head) { return list_empty(&head->node_list); } /** * plist_node_empty - return !0 if plist_node is not on a list * @node: &struct plist_node pointer */ static inline int plist_node_empty(const struct plist_node *node) { return list_empty(&node->node_list); } /* All functions below assume the plist_head is not empty. */ /** * plist_first_entry - get the struct for the first entry * @head: the &struct plist_head pointer * @type: the type of the struct this is embedded in * @member: the name of the list_head within the struct */ #ifdef CONFIG_DEBUG_PLIST # define plist_first_entry(head, type, member) \ ({ \ WARN_ON(plist_head_empty(head)); \ container_of(plist_first(head), type, member); \ }) #else # define plist_first_entry(head, type, member) \ container_of(plist_first(head), type, member) #endif /** * plist_last_entry - get the struct for the last entry * @head: the &struct plist_head pointer * @type: the type of the struct this is embedded in * @member: the name of the list_head within the struct */ #ifdef CONFIG_DEBUG_PLIST # define plist_last_entry(head, type, member) \ ({ \ WARN_ON(plist_head_empty(head)); \ container_of(plist_last(head), type, member); \ }) #else # define plist_last_entry(head, type, member) \ container_of(plist_last(head), type, member) #endif /** * plist_next - get the next entry in list * @pos: the type * to cursor */ #define plist_next(pos) \ list_next_entry(pos, node_list) /** * plist_prev - get the prev entry in list * @pos: the type * to cursor */ #define plist_prev(pos) \ list_prev_entry(pos, node_list) /** * plist_first - return the first node (and thus, highest priority) * @head: the &struct plist_head pointer * * Assumes the plist is _not_ empty. */ static inline struct plist_node *plist_first(const struct plist_head *head) { return list_entry(head->node_list.next, struct plist_node, node_list); } /** * plist_last - return the last node (and thus, lowest priority) * @head: the &struct plist_head pointer * * Assumes the plist is _not_ empty. */ static inline struct plist_node *plist_last(const struct plist_head *head) { return list_entry(head->node_list.prev, struct plist_node, node_list); } #endif |
| 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/block_validity.c * * Copyright (C) 2009 * Theodore Ts'o (tytso@mit.edu) * * Track which blocks in the filesystem are metadata blocks that * should never be used as data blocks by files or directories. */ #include <linux/time.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/swap.h> #include <linux/pagemap.h> #include <linux/blkdev.h> #include <linux/slab.h> #include "ext4.h" struct ext4_system_zone { struct rb_node node; ext4_fsblk_t start_blk; unsigned int count; u32 ino; }; static struct kmem_cache *ext4_system_zone_cachep; int __init ext4_init_system_zone(void) { ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0); if (ext4_system_zone_cachep == NULL) return -ENOMEM; return 0; } void ext4_exit_system_zone(void) { rcu_barrier(); kmem_cache_destroy(ext4_system_zone_cachep); } static inline int can_merge(struct ext4_system_zone *entry1, struct ext4_system_zone *entry2) { if ((entry1->start_blk + entry1->count) == entry2->start_blk && entry1->ino == entry2->ino) return 1; return 0; } static void release_system_zone(struct ext4_system_blocks *system_blks) { struct ext4_system_zone *entry, *n; rbtree_postorder_for_each_entry_safe(entry, n, &system_blks->root, node) kmem_cache_free(ext4_system_zone_cachep, entry); } /* * Mark a range of blocks as belonging to the "system zone" --- that * is, filesystem metadata blocks which should never be used by * inodes. */ static int add_system_zone(struct ext4_system_blocks *system_blks, ext4_fsblk_t start_blk, unsigned int count, u32 ino) { struct ext4_system_zone *new_entry, *entry; struct rb_node **n = &system_blks->root.rb_node, *node; struct rb_node *parent = NULL, *new_node = NULL; while (*n) { parent = *n; entry = rb_entry(parent, struct ext4_system_zone, node); if (start_blk < entry->start_blk) n = &(*n)->rb_left; else if (start_blk >= (entry->start_blk + entry->count)) n = &(*n)->rb_right; else /* Unexpected overlap of system zones. */ return -EFSCORRUPTED; } new_entry = kmem_cache_alloc(ext4_system_zone_cachep, GFP_KERNEL); if (!new_entry) return -ENOMEM; new_entry->start_blk = start_blk; new_entry->count = count; new_entry->ino = ino; new_node = &new_entry->node; rb_link_node(new_node, parent, n); rb_insert_color(new_node, &system_blks->root); /* Can we merge to the left? */ node = rb_prev(new_node); if (node) { entry = rb_entry(node, struct ext4_system_zone, node); if (can_merge(entry, new_entry)) { new_entry->start_blk = entry->start_blk; new_entry->count += entry->count; rb_erase(node, &system_blks->root); kmem_cache_free(ext4_system_zone_cachep, entry); } } /* Can we merge to the right? */ node = rb_next(new_node); if (node) { entry = rb_entry(node, struct ext4_system_zone, node); if (can_merge(new_entry, entry)) { new_entry->count += entry->count; rb_erase(node, &system_blks->root); kmem_cache_free(ext4_system_zone_cachep, entry); } } return 0; } static void debug_print_tree(struct ext4_sb_info *sbi) { struct rb_node *node; struct ext4_system_zone *entry; struct ext4_system_blocks *system_blks; int first = 1; printk(KERN_INFO "System zones: "); rcu_read_lock(); system_blks = rcu_dereference(sbi->s_system_blks); node = rb_first(&system_blks->root); while (node) { entry = rb_entry(node, struct ext4_system_zone, node); printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ", entry->start_blk, entry->start_blk + entry->count - 1); first = 0; node = rb_next(node); } rcu_read_unlock(); printk(KERN_CONT "\n"); } static int ext4_protect_reserved_inode(struct super_block *sb, struct ext4_system_blocks *system_blks, u32 ino) { struct inode *inode; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_map_blocks map; u32 i = 0, num; int err = 0, n; if ((ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(sbi->s_es->s_inodes_count))) return -EINVAL; inode = ext4_iget(sb, ino, EXT4_IGET_SPECIAL); if (IS_ERR(inode)) return PTR_ERR(inode); num = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; while (i < num) { cond_resched(); map.m_lblk = i; map.m_len = num - i; n = ext4_map_blocks(NULL, inode, &map, 0); if (n < 0) { err = n; break; } if (n == 0) { i++; } else { err = add_system_zone(system_blks, map.m_pblk, n, ino); if (err < 0) { if (err == -EFSCORRUPTED) { EXT4_ERROR_INODE_ERR(inode, -err, "blocks %llu-%llu from inode overlap system zone", map.m_pblk, map.m_pblk + map.m_len - 1); } break; } i += n; } } iput(inode); return err; } static void ext4_destroy_system_zone(struct rcu_head *rcu) { struct ext4_system_blocks *system_blks; system_blks = container_of(rcu, struct ext4_system_blocks, rcu); release_system_zone(system_blks); kfree(system_blks); } /* * Build system zone rbtree which is used for block validity checking. * * The update of system_blks pointer in this function is protected by * sb->s_umount semaphore. However we have to be careful as we can be * racing with ext4_inode_block_valid() calls reading system_blks rbtree * protected only by RCU. That's why we first build the rbtree and then * swap it in place. */ int ext4_setup_system_zone(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_system_blocks *system_blks; struct ext4_group_desc *gdp; ext4_group_t i; int ret; system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL); if (!system_blks) return -ENOMEM; for (i=0; i < ngroups; i++) { unsigned int meta_blks = ext4_num_base_meta_blocks(sb, i); cond_resched(); if (meta_blks != 0) { ret = add_system_zone(system_blks, ext4_group_first_block_no(sb, i), meta_blks, 0); if (ret) goto err; } gdp = ext4_get_group_desc(sb, i, NULL); ret = add_system_zone(system_blks, ext4_block_bitmap(sb, gdp), 1, 0); if (ret) goto err; ret = add_system_zone(system_blks, ext4_inode_bitmap(sb, gdp), 1, 0); if (ret) goto err; ret = add_system_zone(system_blks, ext4_inode_table(sb, gdp), sbi->s_itb_per_group, 0); if (ret) goto err; } if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) { ret = ext4_protect_reserved_inode(sb, system_blks, le32_to_cpu(sbi->s_es->s_journal_inum)); if (ret) goto err; } /* * System blks rbtree complete, announce it once to prevent racing * with ext4_inode_block_valid() accessing the rbtree at the same * time. */ rcu_assign_pointer(sbi->s_system_blks, system_blks); if (test_opt(sb, DEBUG)) debug_print_tree(sbi); return 0; err: release_system_zone(system_blks); kfree(system_blks); return ret; } /* * Called when the filesystem is unmounted or when remounting it with * noblock_validity specified. * * The update of system_blks pointer in this function is protected by * sb->s_umount semaphore. However we have to be careful as we can be * racing with ext4_inode_block_valid() calls reading system_blks rbtree * protected only by RCU. So we first clear the system_blks pointer and * then free the rbtree only after RCU grace period expires. */ void ext4_release_system_zone(struct super_block *sb) { struct ext4_system_blocks *system_blks; system_blks = rcu_dereference_protected(EXT4_SB(sb)->s_system_blks, lockdep_is_held(&sb->s_umount)); rcu_assign_pointer(EXT4_SB(sb)->s_system_blks, NULL); if (system_blks) call_rcu(&system_blks->rcu, ext4_destroy_system_zone); } int ext4_sb_block_valid(struct super_block *sb, struct inode *inode, ext4_fsblk_t start_blk, unsigned int count) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_system_blocks *system_blks; struct ext4_system_zone *entry; struct rb_node *n; int ret = 1; if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || (start_blk + count < start_blk) || (start_blk + count > ext4_blocks_count(sbi->s_es))) return 0; /* * Lock the system zone to prevent it being released concurrently * when doing a remount which inverse current "[no]block_validity" * mount option. */ rcu_read_lock(); system_blks = rcu_dereference(sbi->s_system_blks); if (system_blks == NULL) goto out_rcu; n = system_blks->root.rb_node; while (n) { entry = rb_entry(n, struct ext4_system_zone, node); if (start_blk + count - 1 < entry->start_blk) n = n->rb_left; else if (start_blk >= (entry->start_blk + entry->count)) n = n->rb_right; else { ret = 0; if (inode) ret = (entry->ino == inode->i_ino); break; } } out_rcu: rcu_read_unlock(); return ret; } /* * Returns 1 if the passed-in block region (start_blk, * start_blk+count) is valid; 0 if some part of the block region * overlaps with some other filesystem metadata blocks. */ int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk, unsigned int count) { return ext4_sb_block_valid(inode->i_sb, inode, start_blk, count); } int ext4_check_blockref(const char *function, unsigned int line, struct inode *inode, __le32 *p, unsigned int max) { __le32 *bref = p; unsigned int blk; if (ext4_has_feature_journal(inode->i_sb) && (inode->i_ino == le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) return 0; while (bref < p+max) { blk = le32_to_cpu(*bref++); if (blk && unlikely(!ext4_inode_block_valid(inode, blk, 1))) { ext4_error_inode(inode, function, line, blk, "invalid block"); return -EFSCORRUPTED; } } return 0; } |
| 33 30 33 33 33 33 33 33 33 33 33 11 11 11 9 1 1 10 1 1 1 1 20 20 20 17 17 20 20 20 20 19 20 20 20 20 20 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/io_uring.h> #include <linux/io_uring_types.h> #include <asm/shmparam.h> #include "memmap.h" #include "kbuf.h" static void *io_mem_alloc_compound(struct page **pages, int nr_pages, size_t size, gfp_t gfp) { struct page *page; int i, order; order = get_order(size); if (order > MAX_PAGE_ORDER) return ERR_PTR(-ENOMEM); else if (order) gfp |= __GFP_COMP; page = alloc_pages(gfp, order); if (!page) return ERR_PTR(-ENOMEM); for (i = 0; i < nr_pages; i++) pages[i] = page + i; return page_address(page); } static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size, gfp_t gfp) { void *ret; int i; for (i = 0; i < nr_pages; i++) { pages[i] = alloc_page(gfp); if (!pages[i]) goto err; } ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); if (ret) return ret; err: while (i--) put_page(pages[i]); return ERR_PTR(-ENOMEM); } void *io_pages_map(struct page ***out_pages, unsigned short *npages, size_t size) { gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; struct page **pages; int nr_pages; void *ret; nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp); if (!pages) return ERR_PTR(-ENOMEM); ret = io_mem_alloc_compound(pages, nr_pages, size, gfp); if (!IS_ERR(ret)) goto done; ret = io_mem_alloc_single(pages, nr_pages, size, gfp); if (!IS_ERR(ret)) { done: *out_pages = pages; *npages = nr_pages; return ret; } kvfree(pages); *out_pages = NULL; *npages = 0; return ret; } void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, bool put_pages) { bool do_vunmap = false; if (!ptr) return; if (put_pages && *npages) { struct page **to_free = *pages; int i; /* * Only did vmap for the non-compound multiple page case. * For the compound page, we just need to put the head. */ if (PageCompound(to_free[0])) *npages = 1; else if (*npages > 1) do_vunmap = true; for (i = 0; i < *npages; i++) put_page(to_free[i]); } if (do_vunmap) vunmap(ptr); kvfree(*pages); *pages = NULL; *npages = 0; } void io_pages_free(struct page ***pages, int npages) { struct page **page_array = *pages; if (!page_array) return; unpin_user_pages(page_array, npages); kvfree(page_array); *pages = NULL; } struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) { unsigned long start, end, nr_pages; struct page **pages; int ret; end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; start = uaddr >> PAGE_SHIFT; nr_pages = end - start; if (WARN_ON_ONCE(!nr_pages)) return ERR_PTR(-EINVAL); pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) return ERR_PTR(-ENOMEM); ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages); /* success, mapped all pages */ if (ret == nr_pages) { *npages = nr_pages; return pages; } /* partial map, or didn't map anything */ if (ret >= 0) { /* if we did partial map, release any pages we did get */ if (ret) unpin_user_pages(pages, ret); ret = -EFAULT; } kvfree(pages); return ERR_PTR(ret); } void *__io_uaddr_map(struct page ***pages, unsigned short *npages, unsigned long uaddr, size_t size) { struct page **page_array; unsigned int nr_pages; void *page_addr; *npages = 0; if (uaddr & (PAGE_SIZE - 1) || !size) return ERR_PTR(-EINVAL); nr_pages = 0; page_array = io_pin_pages(uaddr, size, &nr_pages); if (IS_ERR(page_array)) return page_array; page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL); if (page_addr) { *pages = page_array; *npages = nr_pages; return page_addr; } io_pages_free(&page_array, nr_pages); return ERR_PTR(-ENOMEM); } static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, size_t sz) { struct io_ring_ctx *ctx = file->private_data; loff_t offset = pgoff << PAGE_SHIFT; switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: /* Don't allow mmap if the ring was setup without it */ if (ctx->flags & IORING_SETUP_NO_MMAP) return ERR_PTR(-EINVAL); return ctx->rings; case IORING_OFF_SQES: /* Don't allow mmap if the ring was setup without it */ if (ctx->flags & IORING_SETUP_NO_MMAP) return ERR_PTR(-EINVAL); return ctx->sq_sqes; case IORING_OFF_PBUF_RING: { struct io_buffer_list *bl; unsigned int bgid; void *ptr; bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; bl = io_pbuf_get_bl(ctx, bgid); if (IS_ERR(bl)) return bl; ptr = bl->buf_ring; io_put_bl(ctx, bl); return ptr; } } return ERR_PTR(-EINVAL); } int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, struct page **pages, int npages) { unsigned long nr_pages = npages; vm_flags_set(vma, VM_DONTEXPAND); return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); } #ifdef CONFIG_MMU __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { struct io_ring_ctx *ctx = file->private_data; size_t sz = vma->vm_end - vma->vm_start; long offset = vma->vm_pgoff << PAGE_SHIFT; unsigned int npages; void *ptr; ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); if (IS_ERR(ptr)) return PTR_ERR(ptr); switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT); return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages); case IORING_OFF_SQES: return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, ctx->n_sqe_pages); case IORING_OFF_PBUF_RING: return io_pbuf_mmap(file, vma); } return -EINVAL; } unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { void *ptr; /* * Do not allow to map to user-provided address to avoid breaking the * aliasing rules. Userspace is not able to guess the offset address of * kernel kmalloc()ed memory area. */ if (addr) return -EINVAL; ptr = io_uring_validate_mmap_request(filp, pgoff, len); if (IS_ERR(ptr)) return -ENOMEM; /* * Some architectures have strong cache aliasing requirements. * For such architectures we need a coherent mapping which aliases * kernel memory *and* userspace memory. To achieve that: * - use a NULL file pointer to reference physical memory, and * - use the kernel virtual address of the shared io_uring context * (instead of the userspace-provided address, which has to be 0UL * anyway). * - use the same pgoff which the get_unmapped_area() uses to * calculate the page colouring. * For architectures without such aliasing requirements, the * architecture will return any suitable mapping because addr is 0. */ filp = NULL; flags |= MAP_SHARED; pgoff = 0; /* has been translated to ptr above */ #ifdef SHM_COLOUR addr = (uintptr_t) ptr; pgoff = addr >> PAGE_SHIFT; #else addr = 0UL; #endif return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); } #else /* !CONFIG_MMU */ int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL; } unsigned int io_uring_nommu_mmap_capabilities(struct file *file) { return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; } unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { void *ptr; ptr = io_uring_validate_mmap_request(file, pgoff, len); if (IS_ERR(ptr)) return PTR_ERR(ptr); return (unsigned long) ptr; } #endif /* !CONFIG_MMU */ |
| 1 1480 20 1448 1324 1324 1284 43 1324 1005 462 461 461 463 463 151 152 151 33 136 142 136 604 604 572 43 3 757 519 2 72 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Percpu refcounts: * (C) 2012 Google, Inc. * Author: Kent Overstreet <koverstreet@google.com> * * This implements a refcount with similar semantics to atomic_t - atomic_inc(), * atomic_dec_and_test() - but percpu. * * There's one important difference between percpu refs and normal atomic_t * refcounts; you have to keep track of your initial refcount, and then when you * start shutting down you call percpu_ref_kill() _before_ dropping the initial * refcount. * * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less * than an atomic_t - this is because of the way shutdown works, see * percpu_ref_kill()/PERCPU_COUNT_BIAS. * * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill() * puts the ref back in single atomic_t mode, collecting the per cpu refs and * issuing the appropriate barriers, and then marks the ref as shutting down so * that percpu_ref_put() will check for the ref hitting 0. After it returns, * it's safe to drop the initial ref. * * USAGE: * * See fs/aio.c for some example usage; it's used there for struct kioctx, which * is created when userspaces calls io_setup(), and destroyed when userspace * calls io_destroy() or the process exits. * * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it * removes the kioctx from the proccess's table of kioctxs and kills percpu_ref. * After that, there can't be any new users of the kioctx (from lookup_ioctx()) * and it's then safe to drop the initial ref with percpu_ref_put(). * * Note that the free path, free_ioctx(), needs to go through explicit call_rcu() * to synchronize with RCU protected lookup_ioctx(). percpu_ref operations don't * imply RCU grace periods of any kind and if a user wants to combine percpu_ref * with RCU protection, it must be done explicitly. * * Code that does a two stage shutdown like this often needs some kind of * explicit synchronization to ensure the initial refcount can only be dropped * once - percpu_ref_kill() does this for you, it returns true once and false if * someone else already called it. The aio code uses it this way, but it's not * necessary if the code has some other mechanism to synchronize teardown. * around. */ #ifndef _LINUX_PERCPU_REFCOUNT_H #define _LINUX_PERCPU_REFCOUNT_H #include <linux/atomic.h> #include <linux/percpu.h> #include <linux/rcupdate.h> #include <linux/types.h> #include <linux/gfp.h> struct percpu_ref; typedef void (percpu_ref_func_t)(struct percpu_ref *); /* flags set in the lower bits of percpu_ref->percpu_count_ptr */ enum { __PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */ __PERCPU_REF_DEAD = 1LU << 1, /* (being) killed */ __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD, __PERCPU_REF_FLAG_BITS = 2, }; /* @flags for percpu_ref_init() */ enum { /* * Start w/ ref == 1 in atomic mode. Can be switched to percpu * operation using percpu_ref_switch_to_percpu(). If initialized * with this flag, the ref will stay in atomic mode until * percpu_ref_switch_to_percpu() is invoked on it. * Implies ALLOW_REINIT. */ PERCPU_REF_INIT_ATOMIC = 1 << 0, /* * Start dead w/ ref == 0 in atomic mode. Must be revived with * percpu_ref_reinit() before used. Implies INIT_ATOMIC and * ALLOW_REINIT. */ PERCPU_REF_INIT_DEAD = 1 << 1, /* * Allow switching from atomic mode to percpu mode. */ PERCPU_REF_ALLOW_REINIT = 1 << 2, }; struct percpu_ref_data { atomic_long_t count; percpu_ref_func_t *release; percpu_ref_func_t *confirm_switch; bool force_atomic:1; bool allow_reinit:1; struct rcu_head rcu; struct percpu_ref *ref; }; struct percpu_ref { /* * The low bit of the pointer indicates whether the ref is in percpu * mode; if set, then get/put will manipulate the atomic_t. */ unsigned long percpu_count_ptr; /* * 'percpu_ref' is often embedded into user structure, and only * 'percpu_count_ptr' is required in fast path, move other fields * into 'percpu_ref_data', so we can reduce memory footprint in * fast path. */ struct percpu_ref_data *data; }; int __must_check percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, unsigned int flags, gfp_t gfp); void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch); void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref); void percpu_ref_switch_to_percpu(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); void percpu_ref_resurrect(struct percpu_ref *ref); void percpu_ref_reinit(struct percpu_ref *ref); bool percpu_ref_is_zero(struct percpu_ref *ref); /** * percpu_ref_kill - drop the initial ref * @ref: percpu_ref to kill * * Must be used to drop the initial ref on a percpu refcount; must be called * precisely once before shutdown. * * Switches @ref into atomic mode before gathering up the percpu counters * and dropping the initial ref. * * There are no implied RCU grace periods between kill and release. */ static inline void percpu_ref_kill(struct percpu_ref *ref) { percpu_ref_kill_and_confirm(ref, NULL); } /* * Internal helper. Don't use outside percpu-refcount proper. The * function doesn't return the pointer and let the caller test it for NULL * because doing so forces the compiler to generate two conditional * branches as it can't assume that @ref->percpu_count is not NULL. */ static inline bool __ref_is_percpu(struct percpu_ref *ref, unsigned long __percpu **percpu_countp) { unsigned long percpu_ptr; /* * The value of @ref->percpu_count_ptr is tested for * !__PERCPU_REF_ATOMIC, which may be set asynchronously, and then * used as a pointer. If the compiler generates a separate fetch * when using it as a pointer, __PERCPU_REF_ATOMIC may be set in * between contaminating the pointer value, meaning that * READ_ONCE() is required when fetching it. * * The dependency ordering from the READ_ONCE() pairs * with smp_store_release() in __percpu_ref_switch_to_percpu(). */ percpu_ptr = READ_ONCE(ref->percpu_count_ptr); /* * Theoretically, the following could test just ATOMIC; however, * then we'd have to mask off DEAD separately as DEAD may be * visible without ATOMIC if we race with percpu_ref_kill(). DEAD * implies ATOMIC anyway. Test them together. */ if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC_DEAD)) return false; *percpu_countp = (unsigned long __percpu *)percpu_ptr; return true; } /** * percpu_ref_get_many - increment a percpu refcount * @ref: percpu_ref to get * @nr: number of references to get * * Analogous to atomic_long_add(). * * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr) { unsigned long __percpu *percpu_count; rcu_read_lock(); if (__ref_is_percpu(ref, &percpu_count)) this_cpu_add(*percpu_count, nr); else atomic_long_add(nr, &ref->data->count); rcu_read_unlock(); } /** * percpu_ref_get - increment a percpu refcount * @ref: percpu_ref to get * * Analogous to atomic_long_inc(). * * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_get(struct percpu_ref *ref) { percpu_ref_get_many(ref, 1); } /** * percpu_ref_tryget_many - try to increment a percpu refcount * @ref: percpu_ref to try-get * @nr: number of references to get * * Increment a percpu refcount by @nr unless its count already reached zero. * Returns %true on success; %false on failure. * * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget_many(struct percpu_ref *ref, unsigned long nr) { unsigned long __percpu *percpu_count; bool ret; rcu_read_lock(); if (__ref_is_percpu(ref, &percpu_count)) { this_cpu_add(*percpu_count, nr); ret = true; } else { ret = atomic_long_add_unless(&ref->data->count, nr, 0); } rcu_read_unlock(); return ret; } /** * percpu_ref_tryget - try to increment a percpu refcount * @ref: percpu_ref to try-get * * Increment a percpu refcount unless its count already reached zero. * Returns %true on success; %false on failure. * * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget(struct percpu_ref *ref) { return percpu_ref_tryget_many(ref, 1); } /** * percpu_ref_tryget_live_rcu - same as percpu_ref_tryget_live() but the * caller is responsible for taking RCU. * * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget_live_rcu(struct percpu_ref *ref) { unsigned long __percpu *percpu_count; bool ret = false; WARN_ON_ONCE(!rcu_read_lock_held()); if (likely(__ref_is_percpu(ref, &percpu_count))) { this_cpu_inc(*percpu_count); ret = true; } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) { ret = atomic_long_inc_not_zero(&ref->data->count); } return ret; } /** * percpu_ref_tryget_live - try to increment a live percpu refcount * @ref: percpu_ref to try-get * * Increment a percpu refcount unless it has already been killed. Returns * %true on success; %false on failure. * * Completion of percpu_ref_kill() in itself doesn't guarantee that this * function will fail. For such guarantee, percpu_ref_kill_and_confirm() * should be used. After the confirm_kill callback is invoked, it's * guaranteed that no new reference will be given out by * percpu_ref_tryget_live(). * * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) { bool ret = false; rcu_read_lock(); ret = percpu_ref_tryget_live_rcu(ref); rcu_read_unlock(); return ret; } /** * percpu_ref_put_many - decrement a percpu refcount * @ref: percpu_ref to put * @nr: number of references to put * * Decrement the refcount, and if 0, call the release function (which was passed * to percpu_ref_init()) * * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr) { unsigned long __percpu *percpu_count; rcu_read_lock(); if (__ref_is_percpu(ref, &percpu_count)) this_cpu_sub(*percpu_count, nr); else if (unlikely(atomic_long_sub_and_test(nr, &ref->data->count))) ref->data->release(ref); rcu_read_unlock(); } /** * percpu_ref_put - decrement a percpu refcount * @ref: percpu_ref to put * * Decrement the refcount, and if 0, call the release function (which was passed * to percpu_ref_init()) * * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_put(struct percpu_ref *ref) { percpu_ref_put_many(ref, 1); } /** * percpu_ref_is_dying - test whether a percpu refcount is dying or dead * @ref: percpu_ref to test * * Returns %true if @ref is dying or dead. * * This function is safe to call as long as @ref is between init and exit * and the caller is responsible for synchronizing against state changes. */ static inline bool percpu_ref_is_dying(struct percpu_ref *ref) { return ref->percpu_count_ptr & __PERCPU_REF_DEAD; } #endif |
| 98 84 17 17 17 17 1 331 18 18 17 17 17 337 1 332 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2008 IBM Corporation * * Authors: * Mimi Zohar <zohar@us.ibm.com> * * File: ima_iint.c * - implements the IMA hook: ima_inode_free * - cache integrity information in the inode security blob */ #include <linux/slab.h> #include "ima.h" static struct kmem_cache *ima_iint_cache __ro_after_init; /** * ima_iint_find - Return the iint associated with an inode * @inode: Pointer to the inode * * Return the IMA integrity information (iint) associated with an inode, if the * inode was processed by IMA. * * Return: Found iint or NULL. */ struct ima_iint_cache *ima_iint_find(struct inode *inode) { if (!IS_IMA(inode)) return NULL; return ima_inode_get_iint(inode); } #define IMA_MAX_NESTING (FILESYSTEM_MAX_STACK_DEPTH + 1) /* * It is not clear that IMA should be nested at all, but as long is it measures * files both on overlayfs and on underlying fs, we need to annotate the iint * mutex to avoid lockdep false positives related to IMA + overlayfs. * See ovl_lockdep_annotate_inode_mutex_key() for more details. */ static inline void ima_iint_lockdep_annotate(struct ima_iint_cache *iint, struct inode *inode) { #ifdef CONFIG_LOCKDEP static struct lock_class_key ima_iint_mutex_key[IMA_MAX_NESTING]; int depth = inode->i_sb->s_stack_depth; if (WARN_ON_ONCE(depth < 0 || depth >= IMA_MAX_NESTING)) depth = 0; lockdep_set_class(&iint->mutex, &ima_iint_mutex_key[depth]); #endif } static void ima_iint_init_always(struct ima_iint_cache *iint, struct inode *inode) { iint->ima_hash = NULL; iint->real_inode.version = 0; iint->flags = 0UL; iint->atomic_flags = 0UL; iint->ima_file_status = INTEGRITY_UNKNOWN; iint->ima_mmap_status = INTEGRITY_UNKNOWN; iint->ima_bprm_status = INTEGRITY_UNKNOWN; iint->ima_read_status = INTEGRITY_UNKNOWN; iint->ima_creds_status = INTEGRITY_UNKNOWN; iint->measured_pcrs = 0; mutex_init(&iint->mutex); ima_iint_lockdep_annotate(iint, inode); } static void ima_iint_free(struct ima_iint_cache *iint) { kfree(iint->ima_hash); mutex_destroy(&iint->mutex); kmem_cache_free(ima_iint_cache, iint); } /** * ima_inode_get - Find or allocate an iint associated with an inode * @inode: Pointer to the inode * * Find an iint associated with an inode, and allocate a new one if not found. * Caller must lock i_mutex. * * Return: An iint on success, NULL on error. */ struct ima_iint_cache *ima_inode_get(struct inode *inode) { struct ima_iint_cache *iint; iint = ima_iint_find(inode); if (iint) return iint; iint = kmem_cache_alloc(ima_iint_cache, GFP_NOFS); if (!iint) return NULL; ima_iint_init_always(iint, inode); inode->i_flags |= S_IMA; ima_inode_set_iint(inode, iint); return iint; } /** * ima_inode_free - Called on inode free * @inode: Pointer to the inode * * Free the iint associated with an inode. */ void ima_inode_free(struct inode *inode) { struct ima_iint_cache *iint; if (!IS_IMA(inode)) return; iint = ima_iint_find(inode); ima_inode_set_iint(inode, NULL); ima_iint_free(iint); } static void ima_iint_init_once(void *foo) { struct ima_iint_cache *iint = (struct ima_iint_cache *)foo; memset(iint, 0, sizeof(*iint)); } void __init ima_iintcache_init(void) { ima_iint_cache = kmem_cache_create("ima_iint_cache", sizeof(struct ima_iint_cache), 0, SLAB_PANIC, ima_iint_init_once); } |
| 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 | /* gf128mul.h - GF(2^128) multiplication functions * * Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. * Copyright (c) 2006 Rik Snel <rsnel@cube.dyndns.org> * * Based on Dr Brian Gladman's (GPL'd) work published at * http://fp.gladman.plus.com/cryptography_technology/index.htm * See the original copyright notice below. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. */ /* --------------------------------------------------------------------------- Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. LICENSE TERMS The free distribution and use of this software in both source and binary form is allowed (with or without changes) provided that: 1. distributions of this source code include the above copyright notice, this list of conditions and the following disclaimer; 2. distributions in binary form include the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other associated materials; 3. the copyright holder's name is not used to endorse products built using this software without specific written permission. ALTERNATIVELY, provided that this notice is retained in full, this product may be distributed under the terms of the GNU General Public License (GPL), in which case the provisions of the GPL apply INSTEAD OF those given above. DISCLAIMER This software is provided 'as is' with no explicit or implied warranties in respect of its properties, including, but not limited to, correctness and/or fitness for purpose. --------------------------------------------------------------------------- Issue Date: 31/01/2006 An implementation of field multiplication in Galois Field GF(2^128) */ #ifndef _CRYPTO_GF128MUL_H #define _CRYPTO_GF128MUL_H #include <asm/byteorder.h> #include <crypto/b128ops.h> #include <linux/slab.h> /* Comment by Rik: * * For some background on GF(2^128) see for example: * http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf * * The elements of GF(2^128) := GF(2)[X]/(X^128-X^7-X^2-X^1-1) can * be mapped to computer memory in a variety of ways. Let's examine * three common cases. * * Take a look at the 16 binary octets below in memory order. The msb's * are left and the lsb's are right. char b[16] is an array and b[0] is * the first octet. * * 10000000 00000000 00000000 00000000 .... 00000000 00000000 00000000 * b[0] b[1] b[2] b[3] b[13] b[14] b[15] * * Every bit is a coefficient of some power of X. We can store the bits * in every byte in little-endian order and the bytes themselves also in * little endian order. I will call this lle (little-little-endian). * The above buffer represents the polynomial 1, and X^7+X^2+X^1+1 looks * like 11100001 00000000 .... 00000000 = { 0xE1, 0x00, }. * This format was originally implemented in gf128mul and is used * in GCM (Galois/Counter mode) and in ABL (Arbitrary Block Length). * * Another convention says: store the bits in bigendian order and the * bytes also. This is bbe (big-big-endian). Now the buffer above * represents X^127. X^7+X^2+X^1+1 looks like 00000000 .... 10000111, * b[15] = 0x87 and the rest is 0. LRW uses this convention and bbe * is partly implemented. * * Both of the above formats are easy to implement on big-endian * machines. * * XTS and EME (the latter of which is patent encumbered) use the ble * format (bits are stored in big endian order and the bytes in little * endian). The above buffer represents X^7 in this case and the * primitive polynomial is b[0] = 0x87. * * The common machine word-size is smaller than 128 bits, so to make * an efficient implementation we must split into machine word sizes. * This implementation uses 64-bit words for the moment. Machine * endianness comes into play. The lle format in relation to machine * endianness is discussed below by the original author of gf128mul Dr * Brian Gladman. * * Let's look at the bbe and ble format on a little endian machine. * * bbe on a little endian machine u32 x[4]: * * MS x[0] LS MS x[1] LS * ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls * 103..96 111.104 119.112 127.120 71...64 79...72 87...80 95...88 * * MS x[2] LS MS x[3] LS * ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls * 39...32 47...40 55...48 63...56 07...00 15...08 23...16 31...24 * * ble on a little endian machine * * MS x[0] LS MS x[1] LS * ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls * 31...24 23...16 15...08 07...00 63...56 55...48 47...40 39...32 * * MS x[2] LS MS x[3] LS * ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls * 95...88 87...80 79...72 71...64 127.120 199.112 111.104 103..96 * * Multiplications in GF(2^128) are mostly bit-shifts, so you see why * ble (and lbe also) are easier to implement on a little-endian * machine than on a big-endian machine. The converse holds for bbe * and lle. * * Note: to have good alignment, it seems to me that it is sufficient * to keep elements of GF(2^128) in type u64[2]. On 32-bit wordsize * machines this will automatically aligned to wordsize and on a 64-bit * machine also. */ /* Multiply a GF(2^128) field element by x. Field elements are held in arrays of bytes in which field bits 8n..8n + 7 are held in byte[n], with lower indexed bits placed in the more numerically significant bit positions within bytes. On little endian machines the bit indexes translate into the bit positions within four 32-bit words in the following way MS x[0] LS MS x[1] LS ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls 24...31 16...23 08...15 00...07 56...63 48...55 40...47 32...39 MS x[2] LS MS x[3] LS ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls 88...95 80...87 72...79 64...71 120.127 112.119 104.111 96..103 On big endian machines the bit indexes translate into the bit positions within four 32-bit words in the following way MS x[0] LS MS x[1] LS ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls 00...07 08...15 16...23 24...31 32...39 40...47 48...55 56...63 MS x[2] LS MS x[3] LS ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls 64...71 72...79 80...87 88...95 96..103 104.111 112.119 120.127 */ /* A slow generic version of gf_mul, implemented for lle and bbe * It multiplies a and b and puts the result in a */ void gf128mul_lle(be128 *a, const be128 *b); void gf128mul_bbe(be128 *a, const be128 *b); /* * The following functions multiply a field element by x in * the polynomial field representation. They use 64-bit word operations * to gain speed but compensate for machine endianness and hence work * correctly on both styles of machine. * * They are defined here for performance. */ static inline u64 gf128mul_mask_from_bit(u64 x, int which) { /* a constant-time version of 'x & ((u64)1 << which) ? (u64)-1 : 0' */ return ((s64)(x << (63 - which)) >> 63); } static inline void gf128mul_x_lle(be128 *r, const be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); /* equivalent to gf128mul_table_le[(b << 7) & 0xff] << 48 * (see crypto/gf128mul.c): */ u64 _tt = gf128mul_mask_from_bit(b, 0) & ((u64)0xe1 << 56); r->b = cpu_to_be64((b >> 1) | (a << 63)); r->a = cpu_to_be64((a >> 1) ^ _tt); } static inline void gf128mul_x_bbe(be128 *r, const be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); /* equivalent to gf128mul_table_be[a >> 63] (see crypto/gf128mul.c): */ u64 _tt = gf128mul_mask_from_bit(a, 63) & 0x87; r->a = cpu_to_be64((a << 1) | (b >> 63)); r->b = cpu_to_be64((b << 1) ^ _tt); } /* needed by XTS */ static inline void gf128mul_x_ble(le128 *r, const le128 *x) { u64 a = le64_to_cpu(x->a); u64 b = le64_to_cpu(x->b); /* equivalent to gf128mul_table_be[b >> 63] (see crypto/gf128mul.c): */ u64 _tt = gf128mul_mask_from_bit(a, 63) & 0x87; r->a = cpu_to_le64((a << 1) | (b >> 63)); r->b = cpu_to_le64((b << 1) ^ _tt); } /* 4k table optimization */ struct gf128mul_4k { be128 t[256]; }; struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g); struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g); void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t); void gf128mul_4k_bbe(be128 *a, const struct gf128mul_4k *t); void gf128mul_x8_ble(le128 *r, const le128 *x); static inline void gf128mul_free_4k(struct gf128mul_4k *t) { kfree_sensitive(t); } /* 64k table optimization, implemented for bbe */ struct gf128mul_64k { struct gf128mul_4k *t[16]; }; /* First initialize with the constant factor with which you * want to multiply and then call gf128mul_64k_bbe with the other * factor in the first argument, and the table in the second. * Afterwards, the result is stored in *a. */ struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g); void gf128mul_free_64k(struct gf128mul_64k *t); void gf128mul_64k_bbe(be128 *a, const struct gf128mul_64k *t); #endif /* _CRYPTO_GF128MUL_H */ |
| 9 9 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 | // SPDX-License-Identifier: GPL-2.0-or-later /* mpihelp-mul_3.c - MPI helper functions * Copyright (C) 1994, 1996, 1997, 1998, 2001 Free Software Foundation, Inc. * * This file is part of GnuPG. * * Note: This code is heavily based on the GNU MP Library. * Actually it's the same code with only minor changes in the * way the data is stored; this is to support the abstraction * of an optional secure memory allocation which may be used * to avoid revealing of sensitive data due to paging etc. * The GNU MP Library itself is published under the LGPL; * however I decided to publish this code under the plain GPL. */ #include "mpi-internal.h" #include "longlong.h" mpi_limb_t mpihelp_submul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, mpi_limb_t s2_limb) { mpi_limb_t cy_limb; mpi_size_t j; mpi_limb_t prod_high, prod_low; mpi_limb_t x; /* The loop counter and index J goes from -SIZE to -1. This way * the loop becomes faster. */ j = -s1_size; res_ptr -= j; s1_ptr -= j; cy_limb = 0; do { umul_ppmm(prod_high, prod_low, s1_ptr[j], s2_limb); prod_low += cy_limb; cy_limb = (prod_low < cy_limb ? 1 : 0) + prod_high; x = res_ptr[j]; prod_low = x - prod_low; cy_limb += prod_low > x ? 1 : 0; res_ptr[j] = prod_low; } while (++j); return cy_limb; } |
| 6884 7834 7835 4564 7710 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_PREEMPT_H #define __ASM_PREEMPT_H #include <linux/thread_info.h> #define PREEMPT_ENABLED (0) static __always_inline int preempt_count(void) { return READ_ONCE(current_thread_info()->preempt_count); } static __always_inline volatile int *preempt_count_ptr(void) { return ¤t_thread_info()->preempt_count; } static __always_inline void preempt_count_set(int pc) { *preempt_count_ptr() = pc; } /* * must be macros to avoid header recursion hell */ #define init_task_preempt_count(p) do { \ task_thread_info(p)->preempt_count = FORK_PREEMPT_COUNT; \ } while (0) #define init_idle_preempt_count(p, cpu) do { \ task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \ } while (0) static __always_inline void set_preempt_need_resched(void) { } static __always_inline void clear_preempt_need_resched(void) { } static __always_inline bool test_preempt_need_resched(void) { return false; } /* * The various preempt_count add/sub methods */ static __always_inline void __preempt_count_add(int val) { *preempt_count_ptr() += val; } static __always_inline void __preempt_count_sub(int val) { *preempt_count_ptr() -= val; } static __always_inline bool __preempt_count_dec_and_test(void) { /* * Because of load-store architectures cannot do per-cpu atomic * operations; we cannot use PREEMPT_NEED_RESCHED because it might get * lost. */ return !--*preempt_count_ptr() && tif_need_resched(); } /* * Returns true when we need to resched and can (barring IRQ state). */ static __always_inline bool should_resched(int preempt_offset) { return unlikely(preempt_count() == preempt_offset && tif_need_resched()); } #ifdef CONFIG_PREEMPTION extern asmlinkage void preempt_schedule(void); extern asmlinkage void preempt_schedule_notrace(void); #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) void dynamic_preempt_schedule(void); void dynamic_preempt_schedule_notrace(void); #define __preempt_schedule() dynamic_preempt_schedule() #define __preempt_schedule_notrace() dynamic_preempt_schedule_notrace() #else /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_KEY*/ #define __preempt_schedule() preempt_schedule() #define __preempt_schedule_notrace() preempt_schedule_notrace() #endif /* CONFIG_PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_KEY*/ #endif /* CONFIG_PREEMPTION */ #endif /* __ASM_PREEMPT_H */ |
| 8 46 48 2 46 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NFNETLINK_H #define _NFNETLINK_H #include <linux/netlink.h> #include <linux/capability.h> #include <net/netlink.h> #include <uapi/linux/netfilter/nfnetlink.h> struct nfnl_info { struct net *net; struct sock *sk; const struct nlmsghdr *nlh; const struct nfgenmsg *nfmsg; struct netlink_ext_ack *extack; }; enum nfnl_callback_type { NFNL_CB_UNSPEC = 0, NFNL_CB_MUTEX, NFNL_CB_RCU, NFNL_CB_BATCH, }; struct nfnl_callback { int (*call)(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]); const struct nla_policy *policy; enum nfnl_callback_type type; __u16 attr_count; }; enum nfnl_abort_action { NFNL_ABORT_NONE = 0, NFNL_ABORT_AUTOLOAD, NFNL_ABORT_VALIDATE, }; struct nfnetlink_subsystem { const char *name; __u8 subsys_id; /* nfnetlink subsystem ID */ __u8 cb_count; /* number of callbacks */ const struct nfnl_callback *cb; /* callback for individual types */ struct module *owner; int (*commit)(struct net *net, struct sk_buff *skb); int (*abort)(struct net *net, struct sk_buff *skb, enum nfnl_abort_action action); bool (*valid_genid)(struct net *net, u32 genid); }; int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n); int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n); int nfnetlink_has_listeners(struct net *net, unsigned int group); int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags); int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error); int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid); void nfnetlink_broadcast(struct net *net, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation); static inline u16 nfnl_msg_type(u8 subsys, u8 msg_type) { return subsys << 8 | msg_type; } static inline void nfnl_fill_hdr(struct nlmsghdr *nlh, u8 family, u8 version, __be16 res_id) { struct nfgenmsg *nfmsg; nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = family; nfmsg->version = version; nfmsg->res_id = res_id; } static inline struct nlmsghdr *nfnl_msg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int flags, u8 family, u8 version, __be16 res_id) { struct nlmsghdr *nlh; nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), flags); if (!nlh) return NULL; nfnl_fill_hdr(nlh, family, version, res_id); return nlh; } void nfnl_lock(__u8 subsys_id); void nfnl_unlock(__u8 subsys_id); #ifdef CONFIG_PROVE_LOCKING bool lockdep_nfnl_is_held(__u8 subsys_id); #else static inline bool lockdep_nfnl_is_held(__u8 subsys_id) { return true; } #endif /* CONFIG_PROVE_LOCKING */ #define MODULE_ALIAS_NFNL_SUBSYS(subsys) \ MODULE_ALIAS("nfnetlink-subsys-" __stringify(subsys)) #endif /* _NFNETLINK_H */ |
| 11 11 11 11 11 11 11 11 1 1 11 11 4 3 1 10 10 10 2 10 8 1 2 9 9 8 2 1 3 3 6 9 1 1 8 9 1 9 9 2 9 1 9 9 9 2 2 11 3 1 3 6 6 2 6 1 6 2 2 6 6 1 5 6 6 6 9 9 2 1 1 1 1 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/act_police.c Input police filter * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * J Hadi Salim (action changes) */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/slab.h> #include <net/act_api.h> #include <net/gso.h> #include <net/netlink.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_police.h> #include <net/tc_wrapper.h> /* Each policer is serialized by its individual spinlock */ static struct tc_action_ops act_police_ops; static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { [TCA_POLICE_RATE] = { .len = TC_RTAB_SIZE }, [TCA_POLICE_PEAKRATE] = { .len = TC_RTAB_SIZE }, [TCA_POLICE_AVRATE] = { .type = NLA_U32 }, [TCA_POLICE_RESULT] = { .type = NLA_U32 }, [TCA_POLICE_RATE64] = { .type = NLA_U64 }, [TCA_POLICE_PEAKRATE64] = { .type = NLA_U64 }, [TCA_POLICE_PKTRATE64] = { .type = NLA_U64, .min = 1 }, [TCA_POLICE_PKTBURST64] = { .type = NLA_U64, .min = 1 }, }; static int tcf_police_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { int ret = 0, tcfp_result = TC_ACT_OK, err, size; bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_POLICE_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_police *parm; struct tcf_police *police; struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; struct tc_action_net *tn = net_generic(net, act_police_ops.net_id); struct tcf_police_params *new; bool exists = false; u32 index; u64 rate64, prate64; u64 pps, ppsburst; if (nla == NULL) return -EINVAL; err = nla_parse_nested_deprecated(tb, TCA_POLICE_MAX, nla, police_policy, NULL); if (err < 0) return err; if (tb[TCA_POLICE_TBF] == NULL) return -EINVAL; size = nla_len(tb[TCA_POLICE_TBF]); if (size != sizeof(*parm) && size != sizeof(struct tc_police_compat)) return -EINVAL; parm = nla_data(tb[TCA_POLICE_TBF]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return ACT_P_BOUND; if (!exists) { ret = tcf_idr_create(tn, index, NULL, a, &act_police_ops, bind, true, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; spin_lock_init(&(to_police(*a)->tcfp_lock)); } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; police = to_police(*a); if (parm->rate.rate) { err = -ENOMEM; R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE], NULL); if (R_tab == NULL) goto failure; if (parm->peakrate.rate) { P_tab = qdisc_get_rtab(&parm->peakrate, tb[TCA_POLICE_PEAKRATE], NULL); if (P_tab == NULL) goto failure; } } if (est) { err = gen_replace_estimator(&police->tcf_bstats, police->common.cpu_bstats, &police->tcf_rate_est, &police->tcf_lock, false, est); if (err) goto failure; } else if (tb[TCA_POLICE_AVRATE] && (ret == ACT_P_CREATED || !gen_estimator_active(&police->tcf_rate_est))) { err = -EINVAL; goto failure; } if (tb[TCA_POLICE_RESULT]) { tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); if (TC_ACT_EXT_CMP(tcfp_result, TC_ACT_GOTO_CHAIN)) { NL_SET_ERR_MSG(extack, "goto chain not allowed on fallback"); err = -EINVAL; goto failure; } } if ((tb[TCA_POLICE_PKTRATE64] && !tb[TCA_POLICE_PKTBURST64]) || (!tb[TCA_POLICE_PKTRATE64] && tb[TCA_POLICE_PKTBURST64])) { NL_SET_ERR_MSG(extack, "Both or neither packet-per-second burst and rate must be provided"); err = -EINVAL; goto failure; } if (tb[TCA_POLICE_PKTRATE64] && R_tab) { NL_SET_ERR_MSG(extack, "packet-per-second and byte-per-second rate limits not allowed in same action"); err = -EINVAL; goto failure; } new = kzalloc(sizeof(*new), GFP_KERNEL); if (unlikely(!new)) { err = -ENOMEM; goto failure; } /* No failure allowed after this point */ new->tcfp_result = tcfp_result; new->tcfp_mtu = parm->mtu; if (!new->tcfp_mtu) { new->tcfp_mtu = ~0; if (R_tab) new->tcfp_mtu = 255 << R_tab->rate.cell_log; } if (R_tab) { new->rate_present = true; rate64 = tb[TCA_POLICE_RATE64] ? nla_get_u64(tb[TCA_POLICE_RATE64]) : 0; psched_ratecfg_precompute(&new->rate, &R_tab->rate, rate64); qdisc_put_rtab(R_tab); } else { new->rate_present = false; } if (P_tab) { new->peak_present = true; prate64 = tb[TCA_POLICE_PEAKRATE64] ? nla_get_u64(tb[TCA_POLICE_PEAKRATE64]) : 0; psched_ratecfg_precompute(&new->peak, &P_tab->rate, prate64); qdisc_put_rtab(P_tab); } else { new->peak_present = false; } new->tcfp_burst = PSCHED_TICKS2NS(parm->burst); if (new->peak_present) new->tcfp_mtu_ptoks = (s64)psched_l2t_ns(&new->peak, new->tcfp_mtu); if (tb[TCA_POLICE_AVRATE]) new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]); if (tb[TCA_POLICE_PKTRATE64]) { pps = nla_get_u64(tb[TCA_POLICE_PKTRATE64]); ppsburst = nla_get_u64(tb[TCA_POLICE_PKTBURST64]); new->pps_present = true; new->tcfp_pkt_burst = PSCHED_TICKS2NS(ppsburst); psched_ppscfg_precompute(&new->ppsrate, pps); } spin_lock_bh(&police->tcf_lock); spin_lock_bh(&police->tcfp_lock); police->tcfp_t_c = ktime_get_ns(); police->tcfp_toks = new->tcfp_burst; if (new->peak_present) police->tcfp_ptoks = new->tcfp_mtu_ptoks; spin_unlock_bh(&police->tcfp_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); new = rcu_replace_pointer(police->params, new, lockdep_is_held(&police->tcf_lock)); spin_unlock_bh(&police->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); if (new) kfree_rcu(new, rcu); return ret; failure: qdisc_put_rtab(P_tab); qdisc_put_rtab(R_tab); if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*a, bind); return err; } static bool tcf_police_mtu_check(struct sk_buff *skb, u32 limit) { u32 len; if (skb_is_gso(skb)) return skb_gso_validate_mac_len(skb, limit); len = qdisc_pkt_len(skb); if (skb_at_tc_ingress(skb)) len += skb->mac_len; return len <= limit; } TC_INDIRECT_SCOPE int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_police *police = to_police(a); s64 now, toks, ppstoks = 0, ptoks = 0; struct tcf_police_params *p; int ret; tcf_lastuse_update(&police->tcf_tm); bstats_update(this_cpu_ptr(police->common.cpu_bstats), skb); ret = READ_ONCE(police->tcf_action); p = rcu_dereference_bh(police->params); if (p->tcfp_ewma_rate) { struct gnet_stats_rate_est64 sample; if (!gen_estimator_read(&police->tcf_rate_est, &sample) || sample.bps >= p->tcfp_ewma_rate) goto inc_overlimits; } if (tcf_police_mtu_check(skb, p->tcfp_mtu)) { if (!p->rate_present && !p->pps_present) { ret = p->tcfp_result; goto end; } now = ktime_get_ns(); spin_lock_bh(&police->tcfp_lock); toks = min_t(s64, now - police->tcfp_t_c, p->tcfp_burst); if (p->peak_present) { ptoks = toks + police->tcfp_ptoks; if (ptoks > p->tcfp_mtu_ptoks) ptoks = p->tcfp_mtu_ptoks; ptoks -= (s64)psched_l2t_ns(&p->peak, qdisc_pkt_len(skb)); } if (p->rate_present) { toks += police->tcfp_toks; if (toks > p->tcfp_burst) toks = p->tcfp_burst; toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb)); } else if (p->pps_present) { ppstoks = min_t(s64, now - police->tcfp_t_c, p->tcfp_pkt_burst); ppstoks += police->tcfp_pkttoks; if (ppstoks > p->tcfp_pkt_burst) ppstoks = p->tcfp_pkt_burst; ppstoks -= (s64)psched_pkt2t_ns(&p->ppsrate, 1); } if ((toks | ptoks | ppstoks) >= 0) { police->tcfp_t_c = now; police->tcfp_toks = toks; police->tcfp_ptoks = ptoks; police->tcfp_pkttoks = ppstoks; spin_unlock_bh(&police->tcfp_lock); ret = p->tcfp_result; goto inc_drops; } spin_unlock_bh(&police->tcfp_lock); } inc_overlimits: qstats_overlimit_inc(this_cpu_ptr(police->common.cpu_qstats)); inc_drops: if (ret == TC_ACT_SHOT) qstats_drop_inc(this_cpu_ptr(police->common.cpu_qstats)); end: return ret; } static void tcf_police_cleanup(struct tc_action *a) { struct tcf_police *police = to_police(a); struct tcf_police_params *p; p = rcu_dereference_protected(police->params, 1); if (p) kfree_rcu(p, rcu); } static void tcf_police_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_police *police = to_police(a); struct tcf_t *tm = &police->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_police *police = to_police(a); struct tcf_police_params *p; struct tc_police opt = { .index = police->tcf_index, .refcnt = refcount_read(&police->tcf_refcnt) - ref, .bindcnt = atomic_read(&police->tcf_bindcnt) - bind, }; struct tcf_t t; spin_lock_bh(&police->tcf_lock); opt.action = police->tcf_action; p = rcu_dereference_protected(police->params, lockdep_is_held(&police->tcf_lock)); opt.mtu = p->tcfp_mtu; opt.burst = PSCHED_NS2TICKS(p->tcfp_burst); if (p->rate_present) { psched_ratecfg_getrate(&opt.rate, &p->rate); if ((p->rate.rate_bytes_ps >= (1ULL << 32)) && nla_put_u64_64bit(skb, TCA_POLICE_RATE64, p->rate.rate_bytes_ps, TCA_POLICE_PAD)) goto nla_put_failure; } if (p->peak_present) { psched_ratecfg_getrate(&opt.peakrate, &p->peak); if ((p->peak.rate_bytes_ps >= (1ULL << 32)) && nla_put_u64_64bit(skb, TCA_POLICE_PEAKRATE64, p->peak.rate_bytes_ps, TCA_POLICE_PAD)) goto nla_put_failure; } if (p->pps_present) { if (nla_put_u64_64bit(skb, TCA_POLICE_PKTRATE64, p->ppsrate.rate_pkts_ps, TCA_POLICE_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_POLICE_PKTBURST64, PSCHED_NS2TICKS(p->tcfp_pkt_burst), TCA_POLICE_PAD)) goto nla_put_failure; } if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt)) goto nla_put_failure; if (p->tcfp_result && nla_put_u32(skb, TCA_POLICE_RESULT, p->tcfp_result)) goto nla_put_failure; if (p->tcfp_ewma_rate && nla_put_u32(skb, TCA_POLICE_AVRATE, p->tcfp_ewma_rate)) goto nla_put_failure; tcf_tm_dump(&t, &police->tcf_tm); if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD)) goto nla_put_failure; spin_unlock_bh(&police->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&police->tcf_lock); nlmsg_trim(skb, b); return -1; } static int tcf_police_act_to_flow_act(int tc_act, u32 *extval, struct netlink_ext_ack *extack) { int act_id = -EOPNOTSUPP; if (!TC_ACT_EXT_OPCODE(tc_act)) { if (tc_act == TC_ACT_OK) act_id = FLOW_ACTION_ACCEPT; else if (tc_act == TC_ACT_SHOT) act_id = FLOW_ACTION_DROP; else if (tc_act == TC_ACT_PIPE) act_id = FLOW_ACTION_PIPE; else if (tc_act == TC_ACT_RECLASSIFY) NL_SET_ERR_MSG_MOD(extack, "Offload not supported when conform/exceed action is \"reclassify\""); else NL_SET_ERR_MSG_MOD(extack, "Unsupported conform/exceed action offload"); } else if (TC_ACT_EXT_CMP(tc_act, TC_ACT_GOTO_CHAIN)) { act_id = FLOW_ACTION_GOTO; *extval = tc_act & TC_ACT_EXT_VAL_MASK; } else if (TC_ACT_EXT_CMP(tc_act, TC_ACT_JUMP)) { act_id = FLOW_ACTION_JUMP; *extval = tc_act & TC_ACT_EXT_VAL_MASK; } else if (tc_act == TC_ACT_UNSPEC) { act_id = FLOW_ACTION_CONTINUE; } else { NL_SET_ERR_MSG_MOD(extack, "Unsupported conform/exceed action offload"); } return act_id; } static int tcf_police_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { if (bind) { struct flow_action_entry *entry = entry_data; struct tcf_police *police = to_police(act); struct tcf_police_params *p; int act_id; p = rcu_dereference_protected(police->params, lockdep_is_held(&police->tcf_lock)); entry->id = FLOW_ACTION_POLICE; entry->police.burst = tcf_police_burst(act); entry->police.rate_bytes_ps = tcf_police_rate_bytes_ps(act); entry->police.peakrate_bytes_ps = tcf_police_peakrate_bytes_ps(act); entry->police.avrate = tcf_police_tcfp_ewma_rate(act); entry->police.overhead = tcf_police_rate_overhead(act); entry->police.burst_pkt = tcf_police_burst_pkt(act); entry->police.rate_pkt_ps = tcf_police_rate_pkt_ps(act); entry->police.mtu = tcf_police_tcfp_mtu(act); act_id = tcf_police_act_to_flow_act(police->tcf_action, &entry->police.exceed.extval, extack); if (act_id < 0) return act_id; entry->police.exceed.act_id = act_id; act_id = tcf_police_act_to_flow_act(p->tcfp_result, &entry->police.notexceed.extval, extack); if (act_id < 0) return act_id; entry->police.notexceed.act_id = act_id; *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; fl_action->id = FLOW_ACTION_POLICE; } return 0; } MODULE_AUTHOR("Alexey Kuznetsov"); MODULE_DESCRIPTION("Policing actions"); MODULE_LICENSE("GPL"); static struct tc_action_ops act_police_ops = { .kind = "police", .id = TCA_ID_POLICE, .owner = THIS_MODULE, .stats_update = tcf_police_stats_update, .act = tcf_police_act, .dump = tcf_police_dump, .init = tcf_police_init, .cleanup = tcf_police_cleanup, .offload_act_setup = tcf_police_offload_act_setup, .size = sizeof(struct tcf_police), }; MODULE_ALIAS_NET_ACT("police"); static __net_init int police_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_police_ops.net_id); return tc_action_net_init(net, tn, &act_police_ops); } static void __net_exit police_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_police_ops.net_id); } static struct pernet_operations police_net_ops = { .init = police_init_net, .exit_batch = police_exit_net, .id = &act_police_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init police_init_module(void) { return tcf_register_action(&act_police_ops, &police_net_ops); } static void __exit police_cleanup_module(void) { tcf_unregister_action(&act_police_ops, &police_net_ops); } module_init(police_init_module); module_exit(police_cleanup_module); |
| 1 1 1 1 1 1 1 1 1 1 18 17 19 18 18 18 18 1 17 2 2 2 2 18 17 19 12 4 11 19 19 18 18 16 17 15 18 18 18 18 18 18 17 17 18 1 17 18 1 1 1 3 3 3 1 3 6 4 4 1 1 6 4 4 4 6 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 | // SPDX-License-Identifier: GPL-2.0 /* * Block device elevator/IO-scheduler. * * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE * * 30042000 Jens Axboe <axboe@kernel.dk> : * * Split the elevator a bit so that it is possible to choose a different * one or even write a new "plug in". There are three pieces: * - elevator_fn, inserts a new request in the queue list * - elevator_merge_fn, decides whether a new buffer can be merged with * an existing request * - elevator_dequeue_fn, called when a request is taken off the active list * * 20082000 Dave Jones <davej@suse.de> : * Removed tests for max-bomb-segments, which was breaking elvtune * when run without -bN * * Jens: * - Rework again to work with bio instead of buffer_heads * - loose bi_dev comparisons, partition handling is right now * - completely modularize elevator setup and teardown * */ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/compiler.h> #include <linux/blktrace_api.h> #include <linux/hash.h> #include <linux/uaccess.h> #include <linux/pm_runtime.h> #include <trace/events/block.h> #include "elevator.h" #include "blk.h" #include "blk-mq-sched.h" #include "blk-pm.h" #include "blk-wbt.h" #include "blk-cgroup.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); /* * Merge hash stuff. */ #define rq_hash_key(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq)) /* * Query io scheduler to see if the current process issuing bio may be * merged with rq. */ static bool elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; if (e->type->ops.allow_merge) return e->type->ops.allow_merge(q, rq, bio); return true; } /* * can we safely merge with this request? */ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) { if (!blk_rq_merge_ok(rq, bio)) return false; if (!elv_iosched_allow_bio_merge(rq, bio)) return false; return true; } EXPORT_SYMBOL(elv_bio_merge_ok); /** * elevator_match - Check whether @e's name or alias matches @name * @e: Scheduler to test * @name: Elevator name to test * * Return true if the elevator @e's name or alias matches @name. */ static bool elevator_match(const struct elevator_type *e, const char *name) { return !strcmp(e->elevator_name, name) || (e->elevator_alias && !strcmp(e->elevator_alias, name)); } static struct elevator_type *__elevator_find(const char *name) { struct elevator_type *e; list_for_each_entry(e, &elv_list, list) if (elevator_match(e, name)) return e; return NULL; } static struct elevator_type *elevator_find_get(struct request_queue *q, const char *name) { struct elevator_type *e; spin_lock(&elv_list_lock); e = __elevator_find(name); if (e && (!elevator_tryget(e))) e = NULL; spin_unlock(&elv_list_lock); return e; } static const struct kobj_type elv_ktype; struct elevator_queue *elevator_alloc(struct request_queue *q, struct elevator_type *e) { struct elevator_queue *eq; eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node); if (unlikely(!eq)) return NULL; __elevator_get(e); eq->type = e; kobject_init(&eq->kobj, &elv_ktype); mutex_init(&eq->sysfs_lock); hash_init(eq->hash); return eq; } EXPORT_SYMBOL(elevator_alloc); static void elevator_release(struct kobject *kobj) { struct elevator_queue *e; e = container_of(kobj, struct elevator_queue, kobj); elevator_put(e->type); kfree(e); } void elevator_exit(struct request_queue *q) { struct elevator_queue *e = q->elevator; ioc_clear_queue(q); blk_mq_sched_free_rqs(q); mutex_lock(&e->sysfs_lock); blk_mq_exit_sched(q, e); mutex_unlock(&e->sysfs_lock); kobject_put(&e->kobj); } static inline void __elv_rqhash_del(struct request *rq) { hash_del(&rq->hash); rq->rq_flags &= ~RQF_HASHED; } void elv_rqhash_del(struct request_queue *q, struct request *rq) { if (ELV_ON_HASH(rq)) __elv_rqhash_del(rq); } EXPORT_SYMBOL_GPL(elv_rqhash_del); void elv_rqhash_add(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; BUG_ON(ELV_ON_HASH(rq)); hash_add(e->hash, &rq->hash, rq_hash_key(rq)); rq->rq_flags |= RQF_HASHED; } EXPORT_SYMBOL_GPL(elv_rqhash_add); void elv_rqhash_reposition(struct request_queue *q, struct request *rq) { __elv_rqhash_del(rq); elv_rqhash_add(q, rq); } struct request *elv_rqhash_find(struct request_queue *q, sector_t offset) { struct elevator_queue *e = q->elevator; struct hlist_node *next; struct request *rq; hash_for_each_possible_safe(e->hash, rq, next, hash, offset) { BUG_ON(!ELV_ON_HASH(rq)); if (unlikely(!rq_mergeable(rq))) { __elv_rqhash_del(rq); continue; } if (rq_hash_key(rq) == offset) return rq; } return NULL; } /* * RB-tree support functions for inserting/lookup/removal of requests * in a sorted RB tree. */ void elv_rb_add(struct rb_root *root, struct request *rq) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct request *__rq; while (*p) { parent = *p; __rq = rb_entry(parent, struct request, rb_node); if (blk_rq_pos(rq) < blk_rq_pos(__rq)) p = &(*p)->rb_left; else if (blk_rq_pos(rq) >= blk_rq_pos(__rq)) p = &(*p)->rb_right; } rb_link_node(&rq->rb_node, parent, p); rb_insert_color(&rq->rb_node, root); } EXPORT_SYMBOL(elv_rb_add); void elv_rb_del(struct rb_root *root, struct request *rq) { BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); rb_erase(&rq->rb_node, root); RB_CLEAR_NODE(&rq->rb_node); } EXPORT_SYMBOL(elv_rb_del); struct request *elv_rb_find(struct rb_root *root, sector_t sector) { struct rb_node *n = root->rb_node; struct request *rq; while (n) { rq = rb_entry(n, struct request, rb_node); if (sector < blk_rq_pos(rq)) n = n->rb_left; else if (sector > blk_rq_pos(rq)) n = n->rb_right; else return rq; } return NULL; } EXPORT_SYMBOL(elv_rb_find); enum elv_merge elv_merge(struct request_queue *q, struct request **req, struct bio *bio) { struct elevator_queue *e = q->elevator; struct request *__rq; /* * Levels of merges: * nomerges: No merges at all attempted * noxmerges: Only simple one-hit cache try * merges: All merge tries attempted */ if (blk_queue_nomerges(q) || !bio_mergeable(bio)) return ELEVATOR_NO_MERGE; /* * First try one-hit cache. */ if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) { enum elv_merge ret = blk_try_merge(q->last_merge, bio); if (ret != ELEVATOR_NO_MERGE) { *req = q->last_merge; return ret; } } if (blk_queue_noxmerges(q)) return ELEVATOR_NO_MERGE; /* * See if our hash lookup can find a potential backmerge. */ __rq = elv_rqhash_find(q, bio->bi_iter.bi_sector); if (__rq && elv_bio_merge_ok(__rq, bio)) { *req = __rq; if (blk_discard_mergable(__rq)) return ELEVATOR_DISCARD_MERGE; return ELEVATOR_BACK_MERGE; } if (e->type->ops.request_merge) return e->type->ops.request_merge(q, req, bio); return ELEVATOR_NO_MERGE; } /* * Attempt to do an insertion back merge. Only check for the case where * we can append 'rq' to an existing request, so we can throw 'rq' away * afterwards. * * Returns true if we merged, false otherwise. 'free' will contain all * requests that need to be freed. */ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq, struct list_head *free) { struct request *__rq; bool ret; if (blk_queue_nomerges(q)) return false; /* * First try one-hit cache. */ if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) { list_add(&rq->queuelist, free); return true; } if (blk_queue_noxmerges(q)) return false; ret = false; /* * See if our hash lookup can find a potential backmerge. */ while (1) { __rq = elv_rqhash_find(q, blk_rq_pos(rq)); if (!__rq || !blk_attempt_req_merge(q, __rq, rq)) break; list_add(&rq->queuelist, free); /* The merged request could be merged with others, try again */ ret = true; rq = __rq; } return ret; } void elv_merged_request(struct request_queue *q, struct request *rq, enum elv_merge type) { struct elevator_queue *e = q->elevator; if (e->type->ops.request_merged) e->type->ops.request_merged(q, rq, type); if (type == ELEVATOR_BACK_MERGE) elv_rqhash_reposition(q, rq); q->last_merge = rq; } void elv_merge_requests(struct request_queue *q, struct request *rq, struct request *next) { struct elevator_queue *e = q->elevator; if (e->type->ops.requests_merged) e->type->ops.requests_merged(q, rq, next); elv_rqhash_reposition(q, rq); q->last_merge = rq; } struct request *elv_latter_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; if (e->type->ops.next_request) return e->type->ops.next_request(q, rq); return NULL; } struct request *elv_former_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; if (e->type->ops.former_request) return e->type->ops.former_request(q, rq); return NULL; } #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) static ssize_t elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct elv_fs_entry *entry = to_elv(attr); struct elevator_queue *e; ssize_t error; if (!entry->show) return -EIO; e = container_of(kobj, struct elevator_queue, kobj); mutex_lock(&e->sysfs_lock); error = e->type ? entry->show(e, page) : -ENOENT; mutex_unlock(&e->sysfs_lock); return error; } static ssize_t elv_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { struct elv_fs_entry *entry = to_elv(attr); struct elevator_queue *e; ssize_t error; if (!entry->store) return -EIO; e = container_of(kobj, struct elevator_queue, kobj); mutex_lock(&e->sysfs_lock); error = e->type ? entry->store(e, page, length) : -ENOENT; mutex_unlock(&e->sysfs_lock); return error; } static const struct sysfs_ops elv_sysfs_ops = { .show = elv_attr_show, .store = elv_attr_store, }; static const struct kobj_type elv_ktype = { .sysfs_ops = &elv_sysfs_ops, .release = elevator_release, }; int elv_register_queue(struct request_queue *q, bool uevent) { struct elevator_queue *e = q->elevator; int error; lockdep_assert_held(&q->sysfs_lock); error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched"); if (!error) { struct elv_fs_entry *attr = e->type->elevator_attrs; if (attr) { while (attr->attr.name) { if (sysfs_create_file(&e->kobj, &attr->attr)) break; attr++; } } if (uevent) kobject_uevent(&e->kobj, KOBJ_ADD); set_bit(ELEVATOR_FLAG_REGISTERED, &e->flags); } return error; } void elv_unregister_queue(struct request_queue *q) { struct elevator_queue *e = q->elevator; lockdep_assert_held(&q->sysfs_lock); if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) { kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj); } } int elv_register(struct elevator_type *e) { /* finish request is mandatory */ if (WARN_ON_ONCE(!e->ops.finish_request)) return -EINVAL; /* insert_requests and dispatch_request are mandatory */ if (WARN_ON_ONCE(!e->ops.insert_requests || !e->ops.dispatch_request)) return -EINVAL; /* create icq_cache if requested */ if (e->icq_size) { if (WARN_ON(e->icq_size < sizeof(struct io_cq)) || WARN_ON(e->icq_align < __alignof__(struct io_cq))) return -EINVAL; snprintf(e->icq_cache_name, sizeof(e->icq_cache_name), "%s_io_cq", e->elevator_name); e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size, e->icq_align, 0, NULL); if (!e->icq_cache) return -ENOMEM; } /* register, don't allow duplicate names */ spin_lock(&elv_list_lock); if (__elevator_find(e->elevator_name)) { spin_unlock(&elv_list_lock); kmem_cache_destroy(e->icq_cache); return -EBUSY; } list_add_tail(&e->list, &elv_list); spin_unlock(&elv_list_lock); printk(KERN_INFO "io scheduler %s registered\n", e->elevator_name); return 0; } EXPORT_SYMBOL_GPL(elv_register); void elv_unregister(struct elevator_type *e) { /* unregister */ spin_lock(&elv_list_lock); list_del_init(&e->list); spin_unlock(&elv_list_lock); /* * Destroy icq_cache if it exists. icq's are RCU managed. Make * sure all RCU operations are complete before proceeding. */ if (e->icq_cache) { rcu_barrier(); kmem_cache_destroy(e->icq_cache); e->icq_cache = NULL; } } EXPORT_SYMBOL_GPL(elv_unregister); static inline bool elv_support_iosched(struct request_queue *q) { if (!queue_is_mq(q) || (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))) return false; return true; } /* * For single queue devices, default to using mq-deadline. If we have multiple * queues or mq-deadline is not available, default to "none". */ static struct elevator_type *elevator_get_default(struct request_queue *q) { if (q->tag_set && q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT) return NULL; if (q->nr_hw_queues != 1 && !blk_mq_is_shared_tags(q->tag_set->flags)) return NULL; return elevator_find_get(q, "mq-deadline"); } /* * Use the default elevator settings. If the chosen elevator initialization * fails, fall back to the "none" elevator (no elevator). */ void elevator_init_mq(struct request_queue *q) { struct elevator_type *e; int err; if (!elv_support_iosched(q)) return; WARN_ON_ONCE(blk_queue_registered(q)); if (unlikely(q->elevator)) return; e = elevator_get_default(q); if (!e) return; /* * We are called before adding disk, when there isn't any FS I/O, * so freezing queue plus canceling dispatch work is enough to * drain any dispatch activities originated from passthrough * requests, then no need to quiesce queue which may add long boot * latency, especially when lots of disks are involved. */ blk_mq_freeze_queue(q); blk_mq_cancel_work_sync(q); err = blk_mq_init_sched(q, e); blk_mq_unfreeze_queue(q); if (err) { pr_warn("\"%s\" elevator initialization failed, " "falling back to \"none\"\n", e->elevator_name); } elevator_put(e); } /* * Switch to new_e io scheduler. * * If switching fails, we are most likely running out of memory and not able * to restore the old io scheduler, so leaving the io scheduler being none. */ int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { int ret; lockdep_assert_held(&q->sysfs_lock); blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); if (q->elevator) { elv_unregister_queue(q); elevator_exit(q); } ret = blk_mq_init_sched(q, new_e); if (ret) goto out_unfreeze; ret = elv_register_queue(q, true); if (ret) { elevator_exit(q); goto out_unfreeze; } blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); out_unfreeze: blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); if (ret) { pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n", new_e->elevator_name); } return ret; } void elevator_disable(struct request_queue *q) { lockdep_assert_held(&q->sysfs_lock); blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); elv_unregister_queue(q); elevator_exit(q); blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); q->elevator = NULL; q->nr_requests = q->tag_set->queue_depth; blk_add_trace_msg(q, "elv switch: none"); blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); } /* * Switch this queue to the given IO scheduler. */ static int elevator_change(struct request_queue *q, const char *elevator_name) { struct elevator_type *e; int ret; /* Make sure queue is not in the middle of being removed */ if (!blk_queue_registered(q)) return -ENOENT; if (!strncmp(elevator_name, "none", 4)) { if (q->elevator) elevator_disable(q); return 0; } if (q->elevator && elevator_match(q->elevator->type, elevator_name)) return 0; e = elevator_find_get(q, elevator_name); if (!e) { request_module("%s-iosched", elevator_name); e = elevator_find_get(q, elevator_name); if (!e) return -EINVAL; } ret = elevator_switch(q, e); elevator_put(e); return ret; } ssize_t elv_iosched_store(struct request_queue *q, const char *buf, size_t count) { char elevator_name[ELV_NAME_MAX]; int ret; if (!elv_support_iosched(q)) return count; strscpy(elevator_name, buf, sizeof(elevator_name)); ret = elevator_change(q, strstrip(elevator_name)); if (!ret) return count; return ret; } ssize_t elv_iosched_show(struct request_queue *q, char *name) { struct elevator_queue *eq = q->elevator; struct elevator_type *cur = NULL, *e; int len = 0; if (!elv_support_iosched(q)) return sprintf(name, "none\n"); if (!q->elevator) { len += sprintf(name+len, "[none] "); } else { len += sprintf(name+len, "none "); cur = eq->type; } spin_lock(&elv_list_lock); list_for_each_entry(e, &elv_list, list) { if (e == cur) len += sprintf(name+len, "[%s] ", e->elevator_name); else len += sprintf(name+len, "%s ", e->elevator_name); } spin_unlock(&elv_list_lock); len += sprintf(name+len, "\n"); return len; } struct request *elv_rb_former_request(struct request_queue *q, struct request *rq) { struct rb_node *rbprev = rb_prev(&rq->rb_node); if (rbprev) return rb_entry_rq(rbprev); return NULL; } EXPORT_SYMBOL(elv_rb_former_request); struct request *elv_rb_latter_request(struct request_queue *q, struct request *rq) { struct rb_node *rbnext = rb_next(&rq->rb_node); if (rbnext) return rb_entry_rq(rbnext); return NULL; } EXPORT_SYMBOL(elv_rb_latter_request); static int __init elevator_setup(char *str) { pr_warn("Kernel parameter elevator= does not have any effect anymore.\n" "Please use sysfs to set IO scheduler for individual devices.\n"); return 1; } __setup("elevator=", elevator_setup); |
| 3 22 30 351 270 44 39 385 1017 1 1 1 1 28 22 41 5 4 11 2 17 2 18 34 34 257 44 1 44 33 33 141 323 229 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CGROUP_H #define _LINUX_CGROUP_H /* * cgroup interface * * Copyright (C) 2003 BULL SA * Copyright (C) 2004-2006 Silicon Graphics, Inc. * */ #include <linux/sched.h> #include <linux/cpumask.h> #include <linux/nodemask.h> #include <linux/rculist.h> #include <linux/cgroupstats.h> #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/kernfs.h> #include <linux/jump_label.h> #include <linux/types.h> #include <linux/ns_common.h> #include <linux/nsproxy.h> #include <linux/user_namespace.h> #include <linux/refcount.h> #include <linux/kernel_stat.h> #include <linux/cgroup-defs.h> struct kernel_clone_args; #ifdef CONFIG_CGROUPS /* * All weight knobs on the default hierarchy should use the following min, * default and max values. The default value is the logarithmic center of * MIN and MAX and allows 100x to be expressed in both directions. */ #define CGROUP_WEIGHT_MIN 1 #define CGROUP_WEIGHT_DFL 100 #define CGROUP_WEIGHT_MAX 10000 enum { CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ CSS_TASK_ITER_SKIPPED = (1U << 16), /* internal flags */ }; /* a css_task_iter should be treated as an opaque object */ struct css_task_iter { struct cgroup_subsys *ss; unsigned int flags; struct list_head *cset_pos; struct list_head *cset_head; struct list_head *tcset_pos; struct list_head *tcset_head; struct list_head *task_pos; struct list_head *cur_tasks_head; struct css_set *cur_cset; struct css_set *cur_dcset; struct task_struct *cur_task; struct list_head iters_node; /* css_set->task_iters */ }; extern struct file_system_type cgroup_fs_type; extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; extern spinlock_t css_set_lock; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; #include <linux/cgroup_subsys.h> #undef SUBSYS #define SUBSYS(_x) \ extern struct static_key_true _x ## _cgrp_subsys_enabled_key; \ extern struct static_key_true _x ## _cgrp_subsys_on_dfl_key; #include <linux/cgroup_subsys.h> #undef SUBSYS /** * cgroup_subsys_enabled - fast test on whether a subsys is enabled * @ss: subsystem in question */ #define cgroup_subsys_enabled(ss) \ static_branch_likely(&ss ## _enabled_key) /** * cgroup_subsys_on_dfl - fast test on whether a subsys is on default hierarchy * @ss: subsystem in question */ #define cgroup_subsys_on_dfl(ss) \ static_branch_likely(&ss ## _on_dfl_key) bool css_has_online_children(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); struct cgroup *cgroup_get_from_path(const char *path); struct cgroup *cgroup_get_from_fd(int fd); struct cgroup *cgroup_v1v2_get_from_fd(int fd); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); void cgroup_file_notify(struct cgroup_file *cfile); void cgroup_file_show(struct cgroup_file *cfile, bool show); int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); void cgroup_fork(struct task_struct *p); extern int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs); void cgroup_exit(struct task_struct *p); void cgroup_release(struct task_struct *p); void cgroup_free(struct task_struct *p); int cgroup_init_early(void); int cgroup_init(void); int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v); /* * Iteration helpers and macros. */ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *parent); struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos); struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it); struct task_struct *css_task_iter_next(struct css_task_iter *it); void css_task_iter_end(struct css_task_iter *it); /** * css_for_each_child - iterate through children of a css * @pos: the css * to use as the loop cursor * @parent: css whose children to walk * * Walk @parent's children. Must be called under rcu_read_lock(). * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ #define css_for_each_child(pos, parent) \ for ((pos) = css_next_child(NULL, (parent)); (pos); \ (pos) = css_next_child((pos), (parent))) /** * css_for_each_descendant_pre - pre-order walk of a css's descendants * @pos: the css * to use as the loop cursor * @root: css whose descendants to walk * * Walk @root's descendants. @root is included in the iteration and the * first node to be visited. Must be called under rcu_read_lock(). * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * For example, the following guarantees that a descendant can't escape * state updates of its ancestors. * * my_online(@css) * { * Lock @css's parent and @css; * Inherit state from the parent; * Unlock both. * } * * my_update_state(@css) * { * css_for_each_descendant_pre(@pos, @css) { * Lock @pos; * if (@pos == @css) * Update @css's state; * else * Verify @pos is alive and inherit state from its parent; * Unlock @pos; * } * } * * As long as the inheriting step, including checking the parent state, is * enclosed inside @pos locking, double-locking the parent isn't necessary * while inheriting. The state update to the parent is guaranteed to be * visible by walking order and, as long as inheriting operations to the * same @pos are atomic to each other, multiple updates racing each other * still result in the correct state. It's guaranateed that at least one * inheritance happens for any css after the latest update to its parent. * * If checking parent's state requires locking the parent, each inheriting * iteration should lock and unlock both @pos->parent and @pos. * * Alternatively, a subsystem may choose to use a single global lock to * synchronize ->css_online() and ->css_offline() against tree-walking * operations. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ #define css_for_each_descendant_pre(pos, css) \ for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ (pos) = css_next_descendant_pre((pos), (css))) /** * css_for_each_descendant_post - post-order walk of a css's descendants * @pos: the css * to use as the loop cursor * @css: css whose descendants to walk * * Similar to css_for_each_descendant_pre() but performs post-order * traversal instead. @root is included in the iteration and the last * node to be visited. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * Note that the walk visibility guarantee example described in pre-order * walk doesn't apply the same to post-order walks. */ #define css_for_each_descendant_post(pos, css) \ for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ (pos) = css_next_descendant_post((pos), (css))) /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor * @dst_css: the destination css * @tset: taskset to iterate * * @tset may contain multiple tasks and they may belong to multiple * processes. * * On the v2 hierarchy, there may be tasks from multiple processes and they * may not share the source or destination csses. * * On traditional hierarchies, when there are multiple tasks in @tset, if a * task of a process is in @tset, all tasks of the process are in @tset. * Also, all are guaranteed to share the same source and destination csses. * * Iteration is not in any specific order. */ #define cgroup_taskset_for_each(task, dst_css, tset) \ for ((task) = cgroup_taskset_first((tset), &(dst_css)); \ (task); \ (task) = cgroup_taskset_next((tset), &(dst_css))) /** * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset * @leader: the loop cursor * @dst_css: the destination css * @tset: taskset to iterate * * Iterate threadgroup leaders of @tset. For single-task migrations, @tset * may not contain any. */ #define cgroup_taskset_for_each_leader(leader, dst_css, tset) \ for ((leader) = cgroup_taskset_first((tset), &(dst_css)); \ (leader); \ (leader) = cgroup_taskset_next((tset), &(dst_css))) \ if ((leader) != (leader)->group_leader) \ ; \ else /* * Inline functions. */ #ifdef CONFIG_DEBUG_CGROUP_REF void css_get(struct cgroup_subsys_state *css); void css_get_many(struct cgroup_subsys_state *css, unsigned int n); bool css_tryget(struct cgroup_subsys_state *css); bool css_tryget_online(struct cgroup_subsys_state *css); void css_put(struct cgroup_subsys_state *css); void css_put_many(struct cgroup_subsys_state *css, unsigned int n); #else #define CGROUP_REF_FN_ATTRS static inline #define CGROUP_REF_EXPORT(fn) #include <linux/cgroup_refcnt.h> #endif static inline u64 cgroup_id(const struct cgroup *cgrp) { return cgrp->kn->id; } /** * css_is_dying - test whether the specified css is dying * @css: target css * * Test whether @css is in the process of offlining or already offline. In * most cases, ->css_online() and ->css_offline() callbacks should be * enough; however, the actual offline operations are RCU delayed and this * test returns %true also when @css is scheduled to be offlined. * * This is useful, for example, when the use case requires synchronous * behavior with respect to cgroup removal. cgroup removal schedules css * offlining but the css can seem alive while the operation is being * delayed. If the delay affects user visible semantics, this test can be * used to resolve the situation. */ static inline bool css_is_dying(struct cgroup_subsys_state *css) { return !(css->flags & CSS_NO_REF) && percpu_ref_is_dying(&css->refcnt); } static inline void cgroup_get(struct cgroup *cgrp) { css_get(&cgrp->self); } static inline bool cgroup_tryget(struct cgroup *cgrp) { return css_tryget(&cgrp->self); } static inline void cgroup_put(struct cgroup *cgrp) { css_put(&cgrp->self); } extern struct mutex cgroup_mutex; static inline void cgroup_lock(void) { mutex_lock(&cgroup_mutex); } static inline void cgroup_unlock(void) { mutex_unlock(&cgroup_mutex); } /** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for * @__c: extra condition expression to be passed to rcu_dereference_check() * * A task's css_set is RCU protected, initialized and exited while holding * task_lock(), and can only be modified while holding both cgroup_mutex * and task_lock() while the task is alive. This macro verifies that the * caller is inside proper critical section and returns @task's css_set. * * The caller can also specify additional allowed conditions via @__c, such * as locks used during the cgroup_subsys::attach() methods. */ #ifdef CONFIG_PROVE_RCU #define task_css_set_check(task, __c) \ rcu_dereference_check((task)->cgroups, \ rcu_read_lock_sched_held() || \ lockdep_is_held(&cgroup_mutex) || \ lockdep_is_held(&css_set_lock) || \ ((task)->flags & PF_EXITING) || (__c)) #else #define task_css_set_check(task, __c) \ rcu_dereference((task)->cgroups) #endif /** * task_css_check - obtain css for (task, subsys) w/ extra access conds * @task: the target task * @subsys_id: the target subsystem ID * @__c: extra condition expression to be passed to rcu_dereference_check() * * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The * synchronization rules are the same as task_css_set_check(). */ #define task_css_check(task, subsys_id, __c) \ task_css_set_check((task), (__c))->subsys[(subsys_id)] /** * task_css_set - obtain a task's css_set * @task: the task to obtain css_set for * * See task_css_set_check(). */ static inline struct css_set *task_css_set(struct task_struct *task) { return task_css_set_check(task, false); } /** * task_css - obtain css for (task, subsys) * @task: the target task * @subsys_id: the target subsystem ID * * See task_css_check(). */ static inline struct cgroup_subsys_state *task_css(struct task_struct *task, int subsys_id) { return task_css_check(task, subsys_id, false); } /** * task_get_css - find and get the css for (task, subsys) * @task: the target task * @subsys_id: the target subsystem ID * * Find the css for the (@task, @subsys_id) combination, increment a * reference on and return it. This function is guaranteed to return a * valid css. The returned css may already have been offlined. */ static inline struct cgroup_subsys_state * task_get_css(struct task_struct *task, int subsys_id) { struct cgroup_subsys_state *css; rcu_read_lock(); while (true) { css = task_css(task, subsys_id); /* * Can't use css_tryget_online() here. A task which has * PF_EXITING set may stay associated with an offline css. * If such task calls this function, css_tryget_online() * will keep failing. */ if (likely(css_tryget(css))) break; cpu_relax(); } rcu_read_unlock(); return css; } /** * task_css_is_root - test whether a task belongs to the root css * @task: the target task * @subsys_id: the target subsystem ID * * Test whether @task belongs to the root css on the specified subsystem. * May be invoked in any context. */ static inline bool task_css_is_root(struct task_struct *task, int subsys_id) { return task_css_check(task, subsys_id, true) == init_css_set.subsys[subsys_id]; } static inline struct cgroup *task_cgroup(struct task_struct *task, int subsys_id) { return task_css(task, subsys_id)->cgroup; } static inline struct cgroup *task_dfl_cgroup(struct task_struct *task) { return task_css_set(task)->dfl_cgrp; } static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { struct cgroup_subsys_state *parent_css = cgrp->self.parent; if (parent_css) return container_of(parent_css, struct cgroup, self); return NULL; } /** * cgroup_is_descendant - test ancestry * @cgrp: the cgroup to be tested * @ancestor: possible ancestor of @cgrp * * Test whether @cgrp is a descendant of @ancestor. It also returns %true * if @cgrp == @ancestor. This function is safe to call as long as @cgrp * and @ancestor are accessible. */ static inline bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) { if (cgrp->root != ancestor->root || cgrp->level < ancestor->level) return false; return cgrp->ancestors[ancestor->level] == ancestor; } /** * cgroup_ancestor - find ancestor of cgroup * @cgrp: cgroup to find ancestor of * @ancestor_level: level of ancestor to find starting from root * * Find ancestor of cgroup at specified level starting from root if it exists * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at * @ancestor_level. * * This function is safe to call as long as @cgrp is accessible. */ static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, int ancestor_level) { if (ancestor_level < 0 || ancestor_level > cgrp->level) return NULL; return cgrp->ancestors[ancestor_level]; } /** * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry * @task: the task to be tested * @ancestor: possible ancestor of @task's cgroup * * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor. * It follows all the same rules as cgroup_is_descendant, and only applies * to the default hierarchy. */ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { struct css_set *cset = task_css_set(task); return cgroup_is_descendant(cset->dfl_cgrp, ancestor); } /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { return cgrp->nr_populated_csets + cgrp->nr_populated_domain_children + cgrp->nr_populated_threaded_children; } /* returns ino associated with a cgroup */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { return kernfs_ino(cgrp->kn); } /* cft/css accessors for cftype->write() operation */ static inline struct cftype *of_cft(struct kernfs_open_file *of) { return of->kn->priv; } struct cgroup_subsys_state *of_css(struct kernfs_open_file *of); /* cft/css accessors for cftype->seq_*() operations */ static inline struct cftype *seq_cft(struct seq_file *seq) { return of_cft(seq->private); } static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) { return of_css(seq->private); } /* * Name / path handling functions. All are thin wrappers around the kernfs * counterparts and can be called under any context. */ static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) { return kernfs_name(cgrp->kn, buf, buflen); } static inline int cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen) { return kernfs_path(cgrp->kn, buf, buflen); } static inline void pr_cont_cgroup_name(struct cgroup *cgrp) { pr_cont_kernfs_name(cgrp->kn); } static inline void pr_cont_cgroup_path(struct cgroup *cgrp) { pr_cont_kernfs_path(cgrp->kn); } bool cgroup_psi_enabled(void); static inline void cgroup_init_kthreadd(void) { /* * kthreadd is inherited by all kthreads, keep it in the root so * that the new kthreads are guaranteed to stay in the root until * initialization is finished. */ current->no_cgroup_migration = 1; } static inline void cgroup_kthread_ready(void) { /* * This kthread finished initialization. The creator should have * set PF_NO_SETAFFINITY if this kthread should stay in the root. */ current->no_cgroup_migration = 0; } void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); struct cgroup *cgroup_get_from_id(u64 id); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; struct cgroup; static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } static inline void css_get(struct cgroup_subsys_state *css) {} static inline void css_put(struct cgroup_subsys_state *css) {} static inline void cgroup_lock(void) {} static inline void cgroup_unlock(void) {} static inline int cgroup_attach_task_all(struct task_struct *from, struct task_struct *t) { return 0; } static inline int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { return -EINVAL; } static inline void cgroup_fork(struct task_struct *p) {} static inline int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs) { return 0; } static inline void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_exit(struct task_struct *p) {} static inline void cgroup_release(struct task_struct *p) {} static inline void cgroup_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_kthread_ready(void) {} static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { return NULL; } static inline bool cgroup_psi_enabled(void) { return false; } static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { return true; } static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) {} #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUPS /* * cgroup scalable recursive statistics. */ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu); void cgroup_rstat_flush(struct cgroup *cgrp); void cgroup_rstat_flush_hold(struct cgroup *cgrp); void cgroup_rstat_flush_release(struct cgroup *cgrp); /* * Basic resource stats. */ #ifdef CONFIG_CGROUP_CPUACCT void cpuacct_charge(struct task_struct *tsk, u64 cputime); void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); #else static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} static inline void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) {} #endif void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec); void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec); static inline void cgroup_account_cputime(struct task_struct *task, u64 delta_exec) { struct cgroup *cgrp; cpuacct_charge(task, delta_exec); cgrp = task_dfl_cgroup(task); if (cgroup_parent(cgrp)) __cgroup_account_cputime(cgrp, delta_exec); } static inline void cgroup_account_cputime_field(struct task_struct *task, enum cpu_usage_stat index, u64 delta_exec) { struct cgroup *cgrp; cpuacct_account_field(task, index, delta_exec); cgrp = task_dfl_cgroup(task); if (cgroup_parent(cgrp)) __cgroup_account_cputime_field(cgrp, index, delta_exec); } #else /* CONFIG_CGROUPS */ static inline void cgroup_account_cputime(struct task_struct *task, u64 delta_exec) {} static inline void cgroup_account_cputime_field(struct task_struct *task, enum cpu_usage_stat index, u64 delta_exec) {} #endif /* CONFIG_CGROUPS */ /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. */ #ifdef CONFIG_SOCK_CGROUP_DATA void cgroup_sk_alloc(struct sock_cgroup_data *skcd); void cgroup_sk_clone(struct sock_cgroup_data *skcd); void cgroup_sk_free(struct sock_cgroup_data *skcd); static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) { return skcd->cgroup; } #else /* CONFIG_CGROUP_DATA */ static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_clone(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ struct cgroup_namespace { struct ns_common ns; struct user_namespace *user_ns; struct ucounts *ucounts; struct css_set *root_cset; }; extern struct cgroup_namespace init_cgroup_ns; #ifdef CONFIG_CGROUPS void free_cgroup_ns(struct cgroup_namespace *ns); struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns); int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); #else /* !CONFIG_CGROUPS */ static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } static inline struct cgroup_namespace * copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { return old_ns; } #endif /* !CONFIG_CGROUPS */ static inline void get_cgroup_ns(struct cgroup_namespace *ns) { if (ns) refcount_inc(&ns->ns.count); } static inline void put_cgroup_ns(struct cgroup_namespace *ns) { if (ns && refcount_dec_and_test(&ns->ns.count)) free_cgroup_ns(ns); } #ifdef CONFIG_CGROUPS void cgroup_enter_frozen(void); void cgroup_leave_frozen(bool always_leave); void cgroup_update_frozen(struct cgroup *cgrp); void cgroup_freeze(struct cgroup *cgrp, bool freeze); void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src, struct cgroup *dst); static inline bool cgroup_task_frozen(struct task_struct *task) { return task->frozen; } #else /* !CONFIG_CGROUPS */ static inline void cgroup_enter_frozen(void) { } static inline void cgroup_leave_frozen(bool always_leave) { } static inline bool cgroup_task_frozen(struct task_struct *task) { return false; } #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUP_BPF static inline void cgroup_bpf_get(struct cgroup *cgrp) { percpu_ref_get(&cgrp->bpf.refcnt); } static inline void cgroup_bpf_put(struct cgroup *cgrp) { percpu_ref_put(&cgrp->bpf.refcnt); } #else /* CONFIG_CGROUP_BPF */ static inline void cgroup_bpf_get(struct cgroup *cgrp) {} static inline void cgroup_bpf_put(struct cgroup *cgrp) {} #endif /* CONFIG_CGROUP_BPF */ struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id); #endif /* _LINUX_CGROUP_H */ |
| 123 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 | /* * Copyright (C) 2016 Red Hat * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. * * Authors: * Rob Clark <robdclark@gmail.com> */ #ifndef DRM_PRINT_H_ #define DRM_PRINT_H_ #include <linux/compiler.h> #include <linux/printk.h> #include <linux/device.h> #include <linux/dynamic_debug.h> #include <drm/drm.h> struct debugfs_regset32; struct drm_device; struct seq_file; /* Do *not* use outside of drm_print.[ch]! */ extern unsigned long __drm_debug; /** * DOC: print * * A simple wrapper for dev_printk(), seq_printf(), etc. Allows same * debug code to be used for both debugfs and printk logging. * * For example:: * * void log_some_info(struct drm_printer *p) * { * drm_printf(p, "foo=%d\n", foo); * drm_printf(p, "bar=%d\n", bar); * } * * #ifdef CONFIG_DEBUG_FS * void debugfs_show(struct seq_file *f) * { * struct drm_printer p = drm_seq_file_printer(f); * log_some_info(&p); * } * #endif * * void some_other_function(...) * { * struct drm_printer p = drm_info_printer(drm->dev); * log_some_info(&p); * } */ /** * enum drm_debug_category - The DRM debug categories * * Each of the DRM debug logging macros use a specific category, and the logging * is filtered by the drm.debug module parameter. This enum specifies the values * for the interface. * * Each DRM_DEBUG_<CATEGORY> macro logs to DRM_UT_<CATEGORY> category, except * DRM_DEBUG() logs to DRM_UT_CORE. * * Enabling verbose debug messages is done through the drm.debug parameter, each * category being enabled by a bit: * * - drm.debug=0x1 will enable CORE messages * - drm.debug=0x2 will enable DRIVER messages * - drm.debug=0x3 will enable CORE and DRIVER messages * - ... * - drm.debug=0x1ff will enable all messages * * An interesting feature is that it's possible to enable verbose logging at * run-time by echoing the debug value in its sysfs node:: * * # echo 0xf > /sys/module/drm/parameters/debug * */ enum drm_debug_category { /* These names must match those in DYNAMIC_DEBUG_CLASSBITS */ /** * @DRM_UT_CORE: Used in the generic drm code: drm_ioctl.c, drm_mm.c, * drm_memory.c, ... */ DRM_UT_CORE, /** * @DRM_UT_DRIVER: Used in the vendor specific part of the driver: i915, * radeon, ... macro. */ DRM_UT_DRIVER, /** * @DRM_UT_KMS: Used in the modesetting code. */ DRM_UT_KMS, /** * @DRM_UT_PRIME: Used in the prime code. */ DRM_UT_PRIME, /** * @DRM_UT_ATOMIC: Used in the atomic code. */ DRM_UT_ATOMIC, /** * @DRM_UT_VBL: Used for verbose debug message in the vblank code. */ DRM_UT_VBL, /** * @DRM_UT_STATE: Used for verbose atomic state debugging. */ DRM_UT_STATE, /** * @DRM_UT_LEASE: Used in the lease code. */ DRM_UT_LEASE, /** * @DRM_UT_DP: Used in the DP code. */ DRM_UT_DP, /** * @DRM_UT_DRMRES: Used in the drm managed resources code. */ DRM_UT_DRMRES }; static inline bool drm_debug_enabled_raw(enum drm_debug_category category) { return unlikely(__drm_debug & BIT(category)); } #define drm_debug_enabled_instrumented(category) \ ({ \ pr_debug("todo: is this frequent enough to optimize ?\n"); \ drm_debug_enabled_raw(category); \ }) #if defined(CONFIG_DRM_USE_DYNAMIC_DEBUG) /* * the drm.debug API uses dyndbg, so each drm_*dbg macro/callsite gets * a descriptor, and only enabled callsites are reachable. They use * the private macro to avoid re-testing the enable-bit. */ #define __drm_debug_enabled(category) true #define drm_debug_enabled(category) drm_debug_enabled_instrumented(category) #else #define __drm_debug_enabled(category) drm_debug_enabled_raw(category) #define drm_debug_enabled(category) drm_debug_enabled_raw(category) #endif /** * struct drm_printer - drm output "stream" * * Do not use struct members directly. Use drm_printer_seq_file(), * drm_printer_info(), etc to initialize. And drm_printf() for output. */ struct drm_printer { /* private: */ void (*printfn)(struct drm_printer *p, struct va_format *vaf); void (*puts)(struct drm_printer *p, const char *str); void *arg; const char *prefix; enum drm_debug_category category; }; void __drm_printfn_coredump(struct drm_printer *p, struct va_format *vaf); void __drm_puts_coredump(struct drm_printer *p, const char *str); void __drm_printfn_seq_file(struct drm_printer *p, struct va_format *vaf); void __drm_puts_seq_file(struct drm_printer *p, const char *str); void __drm_printfn_info(struct drm_printer *p, struct va_format *vaf); void __drm_printfn_dbg(struct drm_printer *p, struct va_format *vaf); void __drm_printfn_err(struct drm_printer *p, struct va_format *vaf); __printf(2, 3) void drm_printf(struct drm_printer *p, const char *f, ...); void drm_puts(struct drm_printer *p, const char *str); void drm_print_regset32(struct drm_printer *p, struct debugfs_regset32 *regset); void drm_print_bits(struct drm_printer *p, unsigned long value, const char * const bits[], unsigned int nbits); __printf(2, 0) /** * drm_vprintf - print to a &drm_printer stream * @p: the &drm_printer * @fmt: format string * @va: the va_list */ static inline void drm_vprintf(struct drm_printer *p, const char *fmt, va_list *va) { struct va_format vaf = { .fmt = fmt, .va = va }; p->printfn(p, &vaf); } /** * drm_printf_indent - Print to a &drm_printer stream with indentation * @printer: DRM printer * @indent: Tab indentation level (max 5) * @fmt: Format string */ #define drm_printf_indent(printer, indent, fmt, ...) \ drm_printf((printer), "%.*s" fmt, (indent), "\t\t\t\t\tX", ##__VA_ARGS__) /** * struct drm_print_iterator - local struct used with drm_printer_coredump * @data: Pointer to the devcoredump output buffer * @start: The offset within the buffer to start writing * @remain: The number of bytes to write for this iteration */ struct drm_print_iterator { void *data; ssize_t start; ssize_t remain; /* private: */ ssize_t offset; }; /** * drm_coredump_printer - construct a &drm_printer that can output to a buffer * from the read function for devcoredump * @iter: A pointer to a struct drm_print_iterator for the read instance * * This wrapper extends drm_printf() to work with a dev_coredumpm() callback * function. The passed in drm_print_iterator struct contains the buffer * pointer, size and offset as passed in from devcoredump. * * For example:: * * void coredump_read(char *buffer, loff_t offset, size_t count, * void *data, size_t datalen) * { * struct drm_print_iterator iter; * struct drm_printer p; * * iter.data = buffer; * iter.start = offset; * iter.remain = count; * * p = drm_coredump_printer(&iter); * * drm_printf(p, "foo=%d\n", foo); * } * * void makecoredump(...) * { * ... * dev_coredumpm(dev, THIS_MODULE, data, 0, GFP_KERNEL, * coredump_read, ...) * } * * RETURNS: * The &drm_printer object */ static inline struct drm_printer drm_coredump_printer(struct drm_print_iterator *iter) { struct drm_printer p = { .printfn = __drm_printfn_coredump, .puts = __drm_puts_coredump, .arg = iter, }; /* Set the internal offset of the iterator to zero */ iter->offset = 0; return p; } /** * drm_seq_file_printer - construct a &drm_printer that outputs to &seq_file * @f: the &struct seq_file to output to * * RETURNS: * The &drm_printer object */ static inline struct drm_printer drm_seq_file_printer(struct seq_file *f) { struct drm_printer p = { .printfn = __drm_printfn_seq_file, .puts = __drm_puts_seq_file, .arg = f, }; return p; } /** * drm_info_printer - construct a &drm_printer that outputs to dev_printk() * @dev: the &struct device pointer * * RETURNS: * The &drm_printer object */ static inline struct drm_printer drm_info_printer(struct device *dev) { struct drm_printer p = { .printfn = __drm_printfn_info, .arg = dev, }; return p; } /** * drm_dbg_printer - construct a &drm_printer for drm device specific output * @drm: the &struct drm_device pointer, or NULL * @category: the debug category to use * @prefix: debug output prefix, or NULL for no prefix * * RETURNS: * The &drm_printer object */ static inline struct drm_printer drm_dbg_printer(struct drm_device *drm, enum drm_debug_category category, const char *prefix) { struct drm_printer p = { .printfn = __drm_printfn_dbg, .arg = drm, .prefix = prefix, .category = category, }; return p; } /** * drm_err_printer - construct a &drm_printer that outputs to drm_err() * @drm: the &struct drm_device pointer * @prefix: debug output prefix, or NULL for no prefix * * RETURNS: * The &drm_printer object */ static inline struct drm_printer drm_err_printer(struct drm_device *drm, const char *prefix) { struct drm_printer p = { .printfn = __drm_printfn_err, .arg = drm, .prefix = prefix }; return p; } /* * struct device based logging * * Prefer drm_device based logging over device or printk based logging. */ __printf(3, 4) void drm_dev_printk(const struct device *dev, const char *level, const char *format, ...); struct _ddebug; __printf(4, 5) void __drm_dev_dbg(struct _ddebug *desc, const struct device *dev, enum drm_debug_category category, const char *format, ...); /** * DRM_DEV_ERROR() - Error output. * * NOTE: this is deprecated in favor of drm_err() or dev_err(). * * @dev: device pointer * @fmt: printf() like format string. */ #define DRM_DEV_ERROR(dev, fmt, ...) \ drm_dev_printk(dev, KERN_ERR, "*ERROR* " fmt, ##__VA_ARGS__) /** * DRM_DEV_ERROR_RATELIMITED() - Rate limited error output. * * NOTE: this is deprecated in favor of drm_err_ratelimited() or * dev_err_ratelimited(). * * @dev: device pointer * @fmt: printf() like format string. * * Like DRM_ERROR() but won't flood the log. */ #define DRM_DEV_ERROR_RATELIMITED(dev, fmt, ...) \ ({ \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ \ if (__ratelimit(&_rs)) \ DRM_DEV_ERROR(dev, fmt, ##__VA_ARGS__); \ }) /* NOTE: this is deprecated in favor of drm_info() or dev_info(). */ #define DRM_DEV_INFO(dev, fmt, ...) \ drm_dev_printk(dev, KERN_INFO, fmt, ##__VA_ARGS__) /* NOTE: this is deprecated in favor of drm_info_once() or dev_info_once(). */ #define DRM_DEV_INFO_ONCE(dev, fmt, ...) \ ({ \ static bool __print_once __read_mostly; \ if (!__print_once) { \ __print_once = true; \ DRM_DEV_INFO(dev, fmt, ##__VA_ARGS__); \ } \ }) #if !defined(CONFIG_DRM_USE_DYNAMIC_DEBUG) #define drm_dev_dbg(dev, cat, fmt, ...) \ __drm_dev_dbg(NULL, dev, cat, fmt, ##__VA_ARGS__) #else #define drm_dev_dbg(dev, cat, fmt, ...) \ _dynamic_func_call_cls(cat, fmt, __drm_dev_dbg, \ dev, cat, fmt, ##__VA_ARGS__) #endif /** * DRM_DEV_DEBUG() - Debug output for generic drm code * * NOTE: this is deprecated in favor of drm_dbg_core(). * * @dev: device pointer * @fmt: printf() like format string. */ #define DRM_DEV_DEBUG(dev, fmt, ...) \ drm_dev_dbg(dev, DRM_UT_CORE, fmt, ##__VA_ARGS__) /** * DRM_DEV_DEBUG_DRIVER() - Debug output for vendor specific part of the driver * * NOTE: this is deprecated in favor of drm_dbg() or dev_dbg(). * * @dev: device pointer * @fmt: printf() like format string. */ #define DRM_DEV_DEBUG_DRIVER(dev, fmt, ...) \ drm_dev_dbg(dev, DRM_UT_DRIVER, fmt, ##__VA_ARGS__) /** * DRM_DEV_DEBUG_KMS() - Debug output for modesetting code * * NOTE: this is deprecated in favor of drm_dbg_kms(). * * @dev: device pointer * @fmt: printf() like format string. */ #define DRM_DEV_DEBUG_KMS(dev, fmt, ...) \ drm_dev_dbg(dev, DRM_UT_KMS, fmt, ##__VA_ARGS__) /* * struct drm_device based logging * * Prefer drm_device based logging over device or prink based logging. */ /* Helper for struct drm_device based logging. */ #define __drm_printk(drm, level, type, fmt, ...) \ dev_##level##type((drm) ? (drm)->dev : NULL, "[drm] " fmt, ##__VA_ARGS__) #define drm_info(drm, fmt, ...) \ __drm_printk((drm), info,, fmt, ##__VA_ARGS__) #define drm_notice(drm, fmt, ...) \ __drm_printk((drm), notice,, fmt, ##__VA_ARGS__) #define drm_warn(drm, fmt, ...) \ __drm_printk((drm), warn,, fmt, ##__VA_ARGS__) #define drm_err(drm, fmt, ...) \ __drm_printk((drm), err,, "*ERROR* " fmt, ##__VA_ARGS__) #define drm_info_once(drm, fmt, ...) \ __drm_printk((drm), info, _once, fmt, ##__VA_ARGS__) #define drm_notice_once(drm, fmt, ...) \ __drm_printk((drm), notice, _once, fmt, ##__VA_ARGS__) #define drm_warn_once(drm, fmt, ...) \ __drm_printk((drm), warn, _once, fmt, ##__VA_ARGS__) #define drm_err_once(drm, fmt, ...) \ __drm_printk((drm), err, _once, "*ERROR* " fmt, ##__VA_ARGS__) #define drm_err_ratelimited(drm, fmt, ...) \ __drm_printk((drm), err, _ratelimited, "*ERROR* " fmt, ##__VA_ARGS__) #define drm_dbg_core(drm, fmt, ...) \ drm_dev_dbg((drm) ? (drm)->dev : NULL, DRM_UT_CORE, fmt, ##__VA_ARGS__) #define drm_dbg_driver(drm, fmt, ...) \ drm_dev_dbg((drm) ? (drm)->dev : NULL, DRM_UT_DRIVER, fmt, ##__VA_ARGS__) #define drm_dbg_kms(drm, fmt, ...) \ drm_dev_dbg((drm) ? (drm)->dev : NULL, DRM_UT_KMS, fmt, ##__VA_ARGS__) #define drm_dbg_prime(drm, fmt, ...) \ drm_dev_dbg((drm) ? (drm)->dev : NULL, DRM_UT_PRIME, fmt, ##__VA_ARGS__) #define drm_dbg_atomic(drm, fmt, ...) \ drm_dev_dbg((drm) ? (drm)->dev : NULL, DRM_UT_ATOMIC, fmt, ##__VA_ARGS__) #define drm_dbg_vbl(drm, fmt, ...) \ drm_dev_dbg((drm) ? (drm)->dev : NULL, DRM_UT_VBL, fmt, ##__VA_ARGS__) #define drm_dbg_state(drm, fmt, ...) \ drm_dev_dbg((drm) ? (drm)->dev : NULL, DRM_UT_STATE, fmt, ##__VA_ARGS__) #define drm_dbg_lease(drm, fmt, ...) \ drm_dev_dbg((drm) ? (drm)->dev : NULL, DRM_UT_LEASE, fmt, ##__VA_ARGS__) #define drm_dbg_dp(drm, fmt, ...) \ drm_dev_dbg((drm) ? (drm)->dev : NULL, DRM_UT_DP, fmt, ##__VA_ARGS__) #define drm_dbg_drmres(drm, fmt, ...) \ drm_dev_dbg((drm) ? (drm)->dev : NULL, DRM_UT_DRMRES, fmt, ##__VA_ARGS__) #define drm_dbg(drm, fmt, ...) drm_dbg_driver(drm, fmt, ##__VA_ARGS__) /* * printk based logging * * Prefer drm_device based logging over device or prink based logging. */ __printf(3, 4) void ___drm_dbg(struct _ddebug *desc, enum drm_debug_category category, const char *format, ...); __printf(1, 2) void __drm_err(const char *format, ...); #if !defined(CONFIG_DRM_USE_DYNAMIC_DEBUG) #define __drm_dbg(cat, fmt, ...) ___drm_dbg(NULL, cat, fmt, ##__VA_ARGS__) #else #define __drm_dbg(cat, fmt, ...) \ _dynamic_func_call_cls(cat, fmt, ___drm_dbg, \ cat, fmt, ##__VA_ARGS__) #endif /* Macros to make printk easier */ #define _DRM_PRINTK(once, level, fmt, ...) \ printk##once(KERN_##level "[" DRM_NAME "] " fmt, ##__VA_ARGS__) /* NOTE: this is deprecated in favor of pr_info(). */ #define DRM_INFO(fmt, ...) \ _DRM_PRINTK(, INFO, fmt, ##__VA_ARGS__) /* NOTE: this is deprecated in favor of pr_notice(). */ #define DRM_NOTE(fmt, ...) \ _DRM_PRINTK(, NOTICE, fmt, ##__VA_ARGS__) /* NOTE: this is deprecated in favor of pr_warn(). */ #define DRM_WARN(fmt, ...) \ _DRM_PRINTK(, WARNING, fmt, ##__VA_ARGS__) /* NOTE: this is deprecated in favor of pr_info_once(). */ #define DRM_INFO_ONCE(fmt, ...) \ _DRM_PRINTK(_once, INFO, fmt, ##__VA_ARGS__) /* NOTE: this is deprecated in favor of pr_notice_once(). */ #define DRM_NOTE_ONCE(fmt, ...) \ _DRM_PRINTK(_once, NOTICE, fmt, ##__VA_ARGS__) /* NOTE: this is deprecated in favor of pr_warn_once(). */ #define DRM_WARN_ONCE(fmt, ...) \ _DRM_PRINTK(_once, WARNING, fmt, ##__VA_ARGS__) /* NOTE: this is deprecated in favor of pr_err(). */ #define DRM_ERROR(fmt, ...) \ __drm_err(fmt, ##__VA_ARGS__) /* NOTE: this is deprecated in favor of pr_err_ratelimited(). */ #define DRM_ERROR_RATELIMITED(fmt, ...) \ DRM_DEV_ERROR_RATELIMITED(NULL, fmt, ##__VA_ARGS__) /* NOTE: this is deprecated in favor of drm_dbg_core(NULL, ...). */ #define DRM_DEBUG(fmt, ...) \ __drm_dbg(DRM_UT_CORE, fmt, ##__VA_ARGS__) /* NOTE: this is deprecated in favor of drm_dbg(NULL, ...). */ #define DRM_DEBUG_DRIVER(fmt, ...) \ __drm_dbg(DRM_UT_DRIVER, fmt, ##__VA_ARGS__) /* NOTE: this is deprecated in favor of drm_dbg_kms(NULL, ...). */ #define DRM_DEBUG_KMS(fmt, ...) \ __drm_dbg(DRM_UT_KMS, fmt, ##__VA_ARGS__) /* NOTE: this is deprecated in favor of drm_dbg_prime(NULL, ...). */ #define DRM_DEBUG_PRIME(fmt, ...) \ __drm_dbg(DRM_UT_PRIME, fmt, ##__VA_ARGS__) /* NOTE: this is deprecated in favor of drm_dbg_atomic(NULL, ...). */ #define DRM_DEBUG_ATOMIC(fmt, ...) \ __drm_dbg(DRM_UT_ATOMIC, fmt, ##__VA_ARGS__) /* NOTE: this is deprecated in favor of drm_dbg_vbl(NULL, ...). */ #define DRM_DEBUG_VBL(fmt, ...) \ __drm_dbg(DRM_UT_VBL, fmt, ##__VA_ARGS__) /* NOTE: this is deprecated in favor of drm_dbg_lease(NULL, ...). */ #define DRM_DEBUG_LEASE(fmt, ...) \ __drm_dbg(DRM_UT_LEASE, fmt, ##__VA_ARGS__) /* NOTE: this is deprecated in favor of drm_dbg_dp(NULL, ...). */ #define DRM_DEBUG_DP(fmt, ...) \ __drm_dbg(DRM_UT_DP, fmt, ## __VA_ARGS__) #define __DRM_DEFINE_DBG_RATELIMITED(category, drm, fmt, ...) \ ({ \ static DEFINE_RATELIMIT_STATE(rs_, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);\ const struct drm_device *drm_ = (drm); \ \ if (drm_debug_enabled(DRM_UT_ ## category) && __ratelimit(&rs_)) \ drm_dev_printk(drm_ ? drm_->dev : NULL, KERN_DEBUG, fmt, ## __VA_ARGS__); \ }) #define drm_dbg_ratelimited(drm, fmt, ...) \ __DRM_DEFINE_DBG_RATELIMITED(DRIVER, drm, fmt, ## __VA_ARGS__) #define drm_dbg_kms_ratelimited(drm, fmt, ...) \ __DRM_DEFINE_DBG_RATELIMITED(KMS, drm, fmt, ## __VA_ARGS__) /* * struct drm_device based WARNs * * drm_WARN*() acts like WARN*(), but with the key difference of * using device specific information so that we know from which device * warning is originating from. * * Prefer drm_device based drm_WARN* over regular WARN* */ /* Helper for struct drm_device based WARNs */ #define drm_WARN(drm, condition, format, arg...) \ WARN(condition, "%s %s: " format, \ dev_driver_string((drm)->dev), \ dev_name((drm)->dev), ## arg) #define drm_WARN_ONCE(drm, condition, format, arg...) \ WARN_ONCE(condition, "%s %s: " format, \ dev_driver_string((drm)->dev), \ dev_name((drm)->dev), ## arg) #define drm_WARN_ON(drm, x) \ drm_WARN((drm), (x), "%s", \ "drm_WARN_ON(" __stringify(x) ")") #define drm_WARN_ON_ONCE(drm, x) \ drm_WARN_ONCE((drm), (x), "%s", \ "drm_WARN_ON_ONCE(" __stringify(x) ")") #endif /* DRM_PRINT_H_ */ |
| 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 | /* * Copyright © 2006 Keith Packard * Copyright © 2007-2008 Dave Airlie * Copyright © 2007-2008 Intel Corporation * Jesse Barnes <jesse.barnes@intel.com> * Copyright © 2014 Intel Corporation * Daniel Vetter <daniel.vetter@ffwll.ch> * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #ifndef __DRM_MODES_H__ #define __DRM_MODES_H__ #include <linux/hdmi.h> #include <drm/drm_mode_object.h> #include <drm/drm_connector.h> struct videomode; /* * Note on terminology: here, for brevity and convenience, we refer to connector * control chips as 'CRTCs'. They can control any type of connector, VGA, LVDS, * DVI, etc. And 'screen' refers to the whole of the visible display, which * may span multiple monitors (and therefore multiple CRTC and connector * structures). */ /** * enum drm_mode_status - hardware support status of a mode * @MODE_OK: Mode OK * @MODE_HSYNC: hsync out of range * @MODE_VSYNC: vsync out of range * @MODE_H_ILLEGAL: mode has illegal horizontal timings * @MODE_V_ILLEGAL: mode has illegal vertical timings * @MODE_BAD_WIDTH: requires an unsupported linepitch * @MODE_NOMODE: no mode with a matching name * @MODE_NO_INTERLACE: interlaced mode not supported * @MODE_NO_DBLESCAN: doublescan mode not supported * @MODE_NO_VSCAN: multiscan mode not supported * @MODE_MEM: insufficient video memory * @MODE_VIRTUAL_X: mode width too large for specified virtual size * @MODE_VIRTUAL_Y: mode height too large for specified virtual size * @MODE_MEM_VIRT: insufficient video memory given virtual size * @MODE_NOCLOCK: no fixed clock available * @MODE_CLOCK_HIGH: clock required is too high * @MODE_CLOCK_LOW: clock required is too low * @MODE_CLOCK_RANGE: clock/mode isn't in a ClockRange * @MODE_BAD_HVALUE: horizontal timing was out of range * @MODE_BAD_VVALUE: vertical timing was out of range * @MODE_BAD_VSCAN: VScan value out of range * @MODE_HSYNC_NARROW: horizontal sync too narrow * @MODE_HSYNC_WIDE: horizontal sync too wide * @MODE_HBLANK_NARROW: horizontal blanking too narrow * @MODE_HBLANK_WIDE: horizontal blanking too wide * @MODE_VSYNC_NARROW: vertical sync too narrow * @MODE_VSYNC_WIDE: vertical sync too wide * @MODE_VBLANK_NARROW: vertical blanking too narrow * @MODE_VBLANK_WIDE: vertical blanking too wide * @MODE_PANEL: exceeds panel dimensions * @MODE_INTERLACE_WIDTH: width too large for interlaced mode * @MODE_ONE_WIDTH: only one width is supported * @MODE_ONE_HEIGHT: only one height is supported * @MODE_ONE_SIZE: only one resolution is supported * @MODE_NO_REDUCED: monitor doesn't accept reduced blanking * @MODE_NO_STEREO: stereo modes not supported * @MODE_NO_420: ycbcr 420 modes not supported * @MODE_STALE: mode has become stale * @MODE_BAD: unspecified reason * @MODE_ERROR: error condition * * This enum is used to filter out modes not supported by the driver/hardware * combination. */ enum drm_mode_status { MODE_OK = 0, MODE_HSYNC, MODE_VSYNC, MODE_H_ILLEGAL, MODE_V_ILLEGAL, MODE_BAD_WIDTH, MODE_NOMODE, MODE_NO_INTERLACE, MODE_NO_DBLESCAN, MODE_NO_VSCAN, MODE_MEM, MODE_VIRTUAL_X, MODE_VIRTUAL_Y, MODE_MEM_VIRT, MODE_NOCLOCK, MODE_CLOCK_HIGH, MODE_CLOCK_LOW, MODE_CLOCK_RANGE, MODE_BAD_HVALUE, MODE_BAD_VVALUE, MODE_BAD_VSCAN, MODE_HSYNC_NARROW, MODE_HSYNC_WIDE, MODE_HBLANK_NARROW, MODE_HBLANK_WIDE, MODE_VSYNC_NARROW, MODE_VSYNC_WIDE, MODE_VBLANK_NARROW, MODE_VBLANK_WIDE, MODE_PANEL, MODE_INTERLACE_WIDTH, MODE_ONE_WIDTH, MODE_ONE_HEIGHT, MODE_ONE_SIZE, MODE_NO_REDUCED, MODE_NO_STEREO, MODE_NO_420, MODE_STALE = -3, MODE_BAD = -2, MODE_ERROR = -1 }; #define DRM_MODE(nm, t, c, hd, hss, hse, ht, hsk, vd, vss, vse, vt, vs, f) \ .name = nm, .status = 0, .type = (t), .clock = (c), \ .hdisplay = (hd), .hsync_start = (hss), .hsync_end = (hse), \ .htotal = (ht), .hskew = (hsk), .vdisplay = (vd), \ .vsync_start = (vss), .vsync_end = (vse), .vtotal = (vt), \ .vscan = (vs), .flags = (f) /** * DRM_MODE_RES_MM - Calculates the display size from resolution and DPI * @res: The resolution in pixel * @dpi: The number of dots per inch */ #define DRM_MODE_RES_MM(res, dpi) \ (((res) * 254ul) / ((dpi) * 10ul)) #define __DRM_MODE_INIT(pix, hd, vd, hd_mm, vd_mm) \ .type = DRM_MODE_TYPE_DRIVER, .clock = (pix), \ .hdisplay = (hd), .hsync_start = (hd), .hsync_end = (hd), \ .htotal = (hd), .vdisplay = (vd), .vsync_start = (vd), \ .vsync_end = (vd), .vtotal = (vd), .width_mm = (hd_mm), \ .height_mm = (vd_mm) /** * DRM_MODE_INIT - Initialize display mode * @hz: Vertical refresh rate in Hertz * @hd: Horizontal resolution, width * @vd: Vertical resolution, height * @hd_mm: Display width in millimeters * @vd_mm: Display height in millimeters * * This macro initializes a &drm_display_mode that contains information about * refresh rate, resolution and physical size. */ #define DRM_MODE_INIT(hz, hd, vd, hd_mm, vd_mm) \ __DRM_MODE_INIT((hd) * (vd) * (hz) / 1000 /* kHz */, hd, vd, hd_mm, vd_mm) /** * DRM_SIMPLE_MODE - Simple display mode * @hd: Horizontal resolution, width * @vd: Vertical resolution, height * @hd_mm: Display width in millimeters * @vd_mm: Display height in millimeters * * This macro initializes a &drm_display_mode that only contains info about * resolution and physical size. */ #define DRM_SIMPLE_MODE(hd, vd, hd_mm, vd_mm) \ __DRM_MODE_INIT(1 /* pass validation */, hd, vd, hd_mm, vd_mm) #define CRTC_INTERLACE_HALVE_V (1 << 0) /* halve V values for interlacing */ #define CRTC_STEREO_DOUBLE (1 << 1) /* adjust timings for stereo modes */ #define CRTC_NO_DBLSCAN (1 << 2) /* don't adjust doublescan */ #define CRTC_NO_VSCAN (1 << 3) /* don't adjust doublescan */ #define CRTC_STEREO_DOUBLE_ONLY (CRTC_STEREO_DOUBLE | CRTC_NO_DBLSCAN | CRTC_NO_VSCAN) #define DRM_MODE_FLAG_3D_MAX DRM_MODE_FLAG_3D_SIDE_BY_SIDE_HALF #define DRM_MODE_MATCH_TIMINGS (1 << 0) #define DRM_MODE_MATCH_CLOCK (1 << 1) #define DRM_MODE_MATCH_FLAGS (1 << 2) #define DRM_MODE_MATCH_3D_FLAGS (1 << 3) #define DRM_MODE_MATCH_ASPECT_RATIO (1 << 4) /** * struct drm_display_mode - DRM kernel-internal display mode structure * @hdisplay: horizontal display size * @hsync_start: horizontal sync start * @hsync_end: horizontal sync end * @htotal: horizontal total size * @hskew: horizontal skew?! * @vdisplay: vertical display size * @vsync_start: vertical sync start * @vsync_end: vertical sync end * @vtotal: vertical total size * @vscan: vertical scan?! * @crtc_hdisplay: hardware mode horizontal display size * @crtc_hblank_start: hardware mode horizontal blank start * @crtc_hblank_end: hardware mode horizontal blank end * @crtc_hsync_start: hardware mode horizontal sync start * @crtc_hsync_end: hardware mode horizontal sync end * @crtc_htotal: hardware mode horizontal total size * @crtc_hskew: hardware mode horizontal skew?! * @crtc_vdisplay: hardware mode vertical display size * @crtc_vblank_start: hardware mode vertical blank start * @crtc_vblank_end: hardware mode vertical blank end * @crtc_vsync_start: hardware mode vertical sync start * @crtc_vsync_end: hardware mode vertical sync end * @crtc_vtotal: hardware mode vertical total size * * This is the kernel API display mode information structure. For the * user-space version see struct drm_mode_modeinfo. * * The horizontal and vertical timings are defined per the following diagram. * * :: * * * Active Front Sync Back * Region Porch Porch * <-----------------------><----------------><-------------><--------------> * //////////////////////| * ////////////////////// | * ////////////////////// |.................. ................ * _______________ * <----- [hv]display -----> * <------------- [hv]sync_start ------------> * <--------------------- [hv]sync_end ---------------------> * <-------------------------------- [hv]total ----------------------------->* * * This structure contains two copies of timings. First are the plain timings, * which specify the logical mode, as it would be for a progressive 1:1 scanout * at the refresh rate userspace can observe through vblank timestamps. Then * there's the hardware timings, which are corrected for interlacing, * double-clocking and similar things. They are provided as a convenience, and * can be appropriately computed using drm_mode_set_crtcinfo(). * * For printing you can use %DRM_MODE_FMT and DRM_MODE_ARG(). */ struct drm_display_mode { /** * @clock: * * Pixel clock in kHz. */ int clock; /* in kHz */ u16 hdisplay; u16 hsync_start; u16 hsync_end; u16 htotal; u16 hskew; u16 vdisplay; u16 vsync_start; u16 vsync_end; u16 vtotal; u16 vscan; /** * @flags: * * Sync and timing flags: * * - DRM_MODE_FLAG_PHSYNC: horizontal sync is active high. * - DRM_MODE_FLAG_NHSYNC: horizontal sync is active low. * - DRM_MODE_FLAG_PVSYNC: vertical sync is active high. * - DRM_MODE_FLAG_NVSYNC: vertical sync is active low. * - DRM_MODE_FLAG_INTERLACE: mode is interlaced. * - DRM_MODE_FLAG_DBLSCAN: mode uses doublescan. * - DRM_MODE_FLAG_CSYNC: mode uses composite sync. * - DRM_MODE_FLAG_PCSYNC: composite sync is active high. * - DRM_MODE_FLAG_NCSYNC: composite sync is active low. * - DRM_MODE_FLAG_HSKEW: hskew provided (not used?). * - DRM_MODE_FLAG_BCAST: <deprecated> * - DRM_MODE_FLAG_PIXMUX: <deprecated> * - DRM_MODE_FLAG_DBLCLK: double-clocked mode. * - DRM_MODE_FLAG_CLKDIV2: half-clocked mode. * * Additionally there's flags to specify how 3D modes are packed: * * - DRM_MODE_FLAG_3D_NONE: normal, non-3D mode. * - DRM_MODE_FLAG_3D_FRAME_PACKING: 2 full frames for left and right. * - DRM_MODE_FLAG_3D_FIELD_ALTERNATIVE: interleaved like fields. * - DRM_MODE_FLAG_3D_LINE_ALTERNATIVE: interleaved lines. * - DRM_MODE_FLAG_3D_SIDE_BY_SIDE_FULL: side-by-side full frames. * - DRM_MODE_FLAG_3D_L_DEPTH: ? * - DRM_MODE_FLAG_3D_L_DEPTH_GFX_GFX_DEPTH: ? * - DRM_MODE_FLAG_3D_TOP_AND_BOTTOM: frame split into top and bottom * parts. * - DRM_MODE_FLAG_3D_SIDE_BY_SIDE_HALF: frame split into left and * right parts. */ u32 flags; /** * @crtc_clock: * * Actual pixel or dot clock in the hardware. This differs from the * logical @clock when e.g. using interlacing, double-clocking, stereo * modes or other fancy stuff that changes the timings and signals * actually sent over the wire. * * This is again in kHz. * * Note that with digital outputs like HDMI or DP there's usually a * massive confusion between the dot clock and the signal clock at the * bit encoding level. Especially when a 8b/10b encoding is used and the * difference is exactly a factor of 10. */ int crtc_clock; u16 crtc_hdisplay; u16 crtc_hblank_start; u16 crtc_hblank_end; u16 crtc_hsync_start; u16 crtc_hsync_end; u16 crtc_htotal; u16 crtc_hskew; u16 crtc_vdisplay; u16 crtc_vblank_start; u16 crtc_vblank_end; u16 crtc_vsync_start; u16 crtc_vsync_end; u16 crtc_vtotal; /** * @width_mm: * * Addressable size of the output in mm, projectors should set this to * 0. */ u16 width_mm; /** * @height_mm: * * Addressable size of the output in mm, projectors should set this to * 0. */ u16 height_mm; /** * @type: * * A bitmask of flags, mostly about the source of a mode. Possible flags * are: * * - DRM_MODE_TYPE_PREFERRED: Preferred mode, usually the native * resolution of an LCD panel. There should only be one preferred * mode per connector at any given time. * - DRM_MODE_TYPE_DRIVER: Mode created by the driver, which is all of * them really. Drivers must set this bit for all modes they create * and expose to userspace. * - DRM_MODE_TYPE_USERDEF: Mode defined or selected via the kernel * command line. * * Plus a big list of flags which shouldn't be used at all, but are * still around since these flags are also used in the userspace ABI. * We no longer accept modes with these types though: * * - DRM_MODE_TYPE_BUILTIN: Meant for hard-coded modes, unused. * Use DRM_MODE_TYPE_DRIVER instead. * - DRM_MODE_TYPE_DEFAULT: Again a leftover, use * DRM_MODE_TYPE_PREFERRED instead. * - DRM_MODE_TYPE_CLOCK_C and DRM_MODE_TYPE_CRTC_C: Define leftovers * which are stuck around for hysterical raisins only. No one has an * idea what they were meant for. Don't use. */ u8 type; /** * @expose_to_userspace: * * Indicates whether the mode is to be exposed to the userspace. * This is to maintain a set of exposed modes while preparing * user-mode's list in drm_mode_getconnector ioctl. The purpose of * this only lies in the ioctl function, and is not to be used * outside the function. */ bool expose_to_userspace; /** * @head: * * struct list_head for mode lists. */ struct list_head head; /** * @name: * * Human-readable name of the mode, filled out with drm_mode_set_name(). */ char name[DRM_DISPLAY_MODE_LEN]; /** * @status: * * Status of the mode, used to filter out modes not supported by the * hardware. See enum &drm_mode_status. */ enum drm_mode_status status; /** * @picture_aspect_ratio: * * Field for setting the HDMI picture aspect ratio of a mode. */ enum hdmi_picture_aspect picture_aspect_ratio; }; /** * DRM_MODE_FMT - printf string for &struct drm_display_mode */ #define DRM_MODE_FMT "\"%s\": %d %d %d %d %d %d %d %d %d %d 0x%x 0x%x" /** * DRM_MODE_ARG - printf arguments for &struct drm_display_mode * @m: display mode */ #define DRM_MODE_ARG(m) \ (m)->name, drm_mode_vrefresh(m), (m)->clock, \ (m)->hdisplay, (m)->hsync_start, (m)->hsync_end, (m)->htotal, \ (m)->vdisplay, (m)->vsync_start, (m)->vsync_end, (m)->vtotal, \ (m)->type, (m)->flags #define obj_to_mode(x) container_of(x, struct drm_display_mode, base) /** * drm_mode_is_stereo - check for stereo mode flags * @mode: drm_display_mode to check * * Returns: * True if the mode is one of the stereo modes (like side-by-side), false if * not. */ static inline bool drm_mode_is_stereo(const struct drm_display_mode *mode) { return mode->flags & DRM_MODE_FLAG_3D_MASK; } struct drm_connector; struct drm_cmdline_mode; struct drm_display_mode *drm_mode_create(struct drm_device *dev); void drm_mode_destroy(struct drm_device *dev, struct drm_display_mode *mode); void drm_mode_convert_to_umode(struct drm_mode_modeinfo *out, const struct drm_display_mode *in); int drm_mode_convert_umode(struct drm_device *dev, struct drm_display_mode *out, const struct drm_mode_modeinfo *in); void drm_mode_probed_add(struct drm_connector *connector, struct drm_display_mode *mode); void drm_mode_debug_printmodeline(const struct drm_display_mode *mode); bool drm_mode_is_420_only(const struct drm_display_info *display, const struct drm_display_mode *mode); bool drm_mode_is_420_also(const struct drm_display_info *display, const struct drm_display_mode *mode); bool drm_mode_is_420(const struct drm_display_info *display, const struct drm_display_mode *mode); void drm_set_preferred_mode(struct drm_connector *connector, int hpref, int vpref); struct drm_display_mode *drm_analog_tv_mode(struct drm_device *dev, enum drm_connector_tv_mode mode, unsigned long pixel_clock_hz, unsigned int hdisplay, unsigned int vdisplay, bool interlace); static inline struct drm_display_mode *drm_mode_analog_ntsc_480i(struct drm_device *dev) { return drm_analog_tv_mode(dev, DRM_MODE_TV_MODE_NTSC, 13500000, 720, 480, true); } static inline struct drm_display_mode *drm_mode_analog_pal_576i(struct drm_device *dev) { return drm_analog_tv_mode(dev, DRM_MODE_TV_MODE_PAL, 13500000, 720, 576, true); } struct drm_display_mode *drm_cvt_mode(struct drm_device *dev, int hdisplay, int vdisplay, int vrefresh, bool reduced, bool interlaced, bool margins); struct drm_display_mode *drm_gtf_mode(struct drm_device *dev, int hdisplay, int vdisplay, int vrefresh, bool interlaced, int margins); struct drm_display_mode *drm_gtf_mode_complex(struct drm_device *dev, int hdisplay, int vdisplay, int vrefresh, bool interlaced, int margins, int GTF_M, int GTF_2C, int GTF_K, int GTF_2J); void drm_display_mode_from_videomode(const struct videomode *vm, struct drm_display_mode *dmode); void drm_display_mode_to_videomode(const struct drm_display_mode *dmode, struct videomode *vm); void drm_bus_flags_from_videomode(const struct videomode *vm, u32 *bus_flags); #if defined(CONFIG_OF) int of_get_drm_display_mode(struct device_node *np, struct drm_display_mode *dmode, u32 *bus_flags, int index); int of_get_drm_panel_display_mode(struct device_node *np, struct drm_display_mode *dmode, u32 *bus_flags); #else static inline int of_get_drm_display_mode(struct device_node *np, struct drm_display_mode *dmode, u32 *bus_flags, int index) { return -EINVAL; } static inline int of_get_drm_panel_display_mode(struct device_node *np, struct drm_display_mode *dmode, u32 *bus_flags) { return -EINVAL; } #endif void drm_mode_set_name(struct drm_display_mode *mode); int drm_mode_vrefresh(const struct drm_display_mode *mode); void drm_mode_get_hv_timing(const struct drm_display_mode *mode, int *hdisplay, int *vdisplay); void drm_mode_set_crtcinfo(struct drm_display_mode *p, int adjust_flags); void drm_mode_copy(struct drm_display_mode *dst, const struct drm_display_mode *src); void drm_mode_init(struct drm_display_mode *dst, const struct drm_display_mode *src); struct drm_display_mode *drm_mode_duplicate(struct drm_device *dev, const struct drm_display_mode *mode); bool drm_mode_match(const struct drm_display_mode *mode1, const struct drm_display_mode *mode2, unsigned int match_flags); bool drm_mode_equal(const struct drm_display_mode *mode1, const struct drm_display_mode *mode2); bool drm_mode_equal_no_clocks(const struct drm_display_mode *mode1, const struct drm_display_mode *mode2); bool drm_mode_equal_no_clocks_no_stereo(const struct drm_display_mode *mode1, const struct drm_display_mode *mode2); /* for use by the crtc helper probe functions */ enum drm_mode_status drm_mode_validate_driver(struct drm_device *dev, const struct drm_display_mode *mode); enum drm_mode_status drm_mode_validate_size(const struct drm_display_mode *mode, int maxX, int maxY); enum drm_mode_status drm_mode_validate_ycbcr420(const struct drm_display_mode *mode, struct drm_connector *connector); void drm_mode_prune_invalid(struct drm_device *dev, struct list_head *mode_list, bool verbose); void drm_mode_sort(struct list_head *mode_list); void drm_connector_list_update(struct drm_connector *connector); /* parsing cmdline modes */ bool drm_mode_parse_command_line_for_connector(const char *mode_option, const struct drm_connector *connector, struct drm_cmdline_mode *mode); struct drm_display_mode * drm_mode_create_from_cmdline_mode(struct drm_device *dev, struct drm_cmdline_mode *cmd); #endif /* __DRM_MODES_H__ */ |
| 4 4 4 4 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 1 1 1 1 3 1 1 1 1 36 36 36 11 11 2 2 2 2 2 1 1 1 1 31 31 34 31 33 34 34 34 34 34 34 34 36 35 1 1 36 36 36 35 36 34 2 2 2 2 1 2 5 5 5 5 5 2 2 5 5 5 5 5 5 5 2 2 2 2 2 2 2 2 2 2 5 5 5 5 5 5 5 5 5 36 3 7 7 15 8 48 48 4 4 43 2 2 2 2 2 2 2 2 1 2 2 2 2 2 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 | // SPDX-License-Identifier: GPL-2.0 /* * drivers/usb/core/driver.c - most of the driver model stuff for usb * * (C) Copyright 2005 Greg Kroah-Hartman <gregkh@suse.de> * * based on drivers/usb/usb.c which had the following copyrights: * (C) Copyright Linus Torvalds 1999 * (C) Copyright Johannes Erdfelt 1999-2001 * (C) Copyright Andreas Gal 1999 * (C) Copyright Gregory P. Smith 1999 * (C) Copyright Deti Fliegl 1999 (new USB architecture) * (C) Copyright Randy Dunlap 2000 * (C) Copyright David Brownell 2000-2004 * (C) Copyright Yggdrasil Computing, Inc. 2000 * (usb_device_id matching changes by Adam J. Richter) * (C) Copyright Greg Kroah-Hartman 2002-2003 * * Released under the GPLv2 only. * * NOTE! This is not actually a driver at all, rather this is * just a collection of helper routines that implement the * matching, probing, releasing, suspending and resuming for * real drivers. * */ #include <linux/device.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/usb.h> #include <linux/usb/quirks.h> #include <linux/usb/hcd.h> #include "usb.h" /* * Adds a new dynamic USBdevice ID to this driver, * and cause the driver to probe for all devices again. */ ssize_t usb_store_new_id(struct usb_dynids *dynids, const struct usb_device_id *id_table, struct device_driver *driver, const char *buf, size_t count) { struct usb_dynid *dynid; u32 idVendor = 0; u32 idProduct = 0; unsigned int bInterfaceClass = 0; u32 refVendor, refProduct; int fields = 0; int retval = 0; fields = sscanf(buf, "%x %x %x %x %x", &idVendor, &idProduct, &bInterfaceClass, &refVendor, &refProduct); if (fields < 2) return -EINVAL; dynid = kzalloc(sizeof(*dynid), GFP_KERNEL); if (!dynid) return -ENOMEM; INIT_LIST_HEAD(&dynid->node); dynid->id.idVendor = idVendor; dynid->id.idProduct = idProduct; dynid->id.match_flags = USB_DEVICE_ID_MATCH_DEVICE; if (fields > 2 && bInterfaceClass) { if (bInterfaceClass > 255) { retval = -EINVAL; goto fail; } dynid->id.bInterfaceClass = (u8)bInterfaceClass; dynid->id.match_flags |= USB_DEVICE_ID_MATCH_INT_CLASS; } if (fields > 4) { const struct usb_device_id *id = id_table; if (!id) { retval = -ENODEV; goto fail; } for (; id->match_flags; id++) if (id->idVendor == refVendor && id->idProduct == refProduct) break; if (id->match_flags) { dynid->id.driver_info = id->driver_info; } else { retval = -ENODEV; goto fail; } } spin_lock(&dynids->lock); list_add_tail(&dynid->node, &dynids->list); spin_unlock(&dynids->lock); retval = driver_attach(driver); if (retval) return retval; return count; fail: kfree(dynid); return retval; } EXPORT_SYMBOL_GPL(usb_store_new_id); ssize_t usb_show_dynids(struct usb_dynids *dynids, char *buf) { struct usb_dynid *dynid; size_t count = 0; list_for_each_entry(dynid, &dynids->list, node) if (dynid->id.bInterfaceClass != 0) count += scnprintf(&buf[count], PAGE_SIZE - count, "%04x %04x %02x\n", dynid->id.idVendor, dynid->id.idProduct, dynid->id.bInterfaceClass); else count += scnprintf(&buf[count], PAGE_SIZE - count, "%04x %04x\n", dynid->id.idVendor, dynid->id.idProduct); return count; } EXPORT_SYMBOL_GPL(usb_show_dynids); static ssize_t new_id_show(struct device_driver *driver, char *buf) { struct usb_driver *usb_drv = to_usb_driver(driver); return usb_show_dynids(&usb_drv->dynids, buf); } static ssize_t new_id_store(struct device_driver *driver, const char *buf, size_t count) { struct usb_driver *usb_drv = to_usb_driver(driver); return usb_store_new_id(&usb_drv->dynids, usb_drv->id_table, driver, buf, count); } static DRIVER_ATTR_RW(new_id); /* * Remove a USB device ID from this driver */ static ssize_t remove_id_store(struct device_driver *driver, const char *buf, size_t count) { struct usb_dynid *dynid, *n; struct usb_driver *usb_driver = to_usb_driver(driver); u32 idVendor; u32 idProduct; int fields; fields = sscanf(buf, "%x %x", &idVendor, &idProduct); if (fields < 2) return -EINVAL; spin_lock(&usb_driver->dynids.lock); list_for_each_entry_safe(dynid, n, &usb_driver->dynids.list, node) { struct usb_device_id *id = &dynid->id; if ((id->idVendor == idVendor) && (id->idProduct == idProduct)) { list_del(&dynid->node); kfree(dynid); break; } } spin_unlock(&usb_driver->dynids.lock); return count; } static ssize_t remove_id_show(struct device_driver *driver, char *buf) { return new_id_show(driver, buf); } static DRIVER_ATTR_RW(remove_id); static int usb_create_newid_files(struct usb_driver *usb_drv) { int error = 0; if (usb_drv->no_dynamic_id) goto exit; if (usb_drv->probe != NULL) { error = driver_create_file(&usb_drv->driver, &driver_attr_new_id); if (error == 0) { error = driver_create_file(&usb_drv->driver, &driver_attr_remove_id); if (error) driver_remove_file(&usb_drv->driver, &driver_attr_new_id); } } exit: return error; } static void usb_remove_newid_files(struct usb_driver *usb_drv) { if (usb_drv->no_dynamic_id) return; if (usb_drv->probe != NULL) { driver_remove_file(&usb_drv->driver, &driver_attr_remove_id); driver_remove_file(&usb_drv->driver, &driver_attr_new_id); } } static void usb_free_dynids(struct usb_driver *usb_drv) { struct usb_dynid *dynid, *n; spin_lock(&usb_drv->dynids.lock); list_for_each_entry_safe(dynid, n, &usb_drv->dynids.list, node) { list_del(&dynid->node); kfree(dynid); } spin_unlock(&usb_drv->dynids.lock); } static const struct usb_device_id *usb_match_dynamic_id(struct usb_interface *intf, struct usb_driver *drv) { struct usb_dynid *dynid; spin_lock(&drv->dynids.lock); list_for_each_entry(dynid, &drv->dynids.list, node) { if (usb_match_one_id(intf, &dynid->id)) { spin_unlock(&drv->dynids.lock); return &dynid->id; } } spin_unlock(&drv->dynids.lock); return NULL; } /* called from driver core with dev locked */ static int usb_probe_device(struct device *dev) { struct usb_device_driver *udriver = to_usb_device_driver(dev->driver); struct usb_device *udev = to_usb_device(dev); int error = 0; dev_dbg(dev, "%s\n", __func__); /* TODO: Add real matching code */ /* The device should always appear to be in use * unless the driver supports autosuspend. */ if (!udriver->supports_autosuspend) error = usb_autoresume_device(udev); if (error) return error; if (udriver->generic_subclass) error = usb_generic_driver_probe(udev); if (error) return error; /* Probe the USB device with the driver in hand, but only * defer to a generic driver in case the current USB * device driver has an id_table or a match function; i.e., * when the device driver was explicitly matched against * a device. * * If the device driver does not have either of these, * then we assume that it can bind to any device and is * not truly a more specialized/non-generic driver, so a * return value of -ENODEV should not force the device * to be handled by the generic USB driver, as there * can still be another, more specialized, device driver. * * This accommodates the usbip driver. * * TODO: What if, in the future, there are multiple * specialized USB device drivers for a particular device? * In such cases, there is a need to try all matching * specialised device drivers prior to setting the * use_generic_driver bit. */ if (udriver->probe) error = udriver->probe(udev); else if (!udriver->generic_subclass) error = -EINVAL; if (error == -ENODEV && udriver != &usb_generic_driver && (udriver->id_table || udriver->match)) { udev->use_generic_driver = 1; return -EPROBE_DEFER; } return error; } /* called from driver core with dev locked */ static int usb_unbind_device(struct device *dev) { struct usb_device *udev = to_usb_device(dev); struct usb_device_driver *udriver = to_usb_device_driver(dev->driver); if (udriver->disconnect) udriver->disconnect(udev); if (udriver->generic_subclass) usb_generic_driver_disconnect(udev); if (!udriver->supports_autosuspend) usb_autosuspend_device(udev); return 0; } /* called from driver core with dev locked */ static int usb_probe_interface(struct device *dev) { struct usb_driver *driver = to_usb_driver(dev->driver); struct usb_interface *intf = to_usb_interface(dev); struct usb_device *udev = interface_to_usbdev(intf); const struct usb_device_id *id; int error = -ENODEV; int lpm_disable_error = -ENODEV; dev_dbg(dev, "%s\n", __func__); intf->needs_binding = 0; if (usb_device_is_owned(udev)) return error; if (udev->authorized == 0) { dev_err(&intf->dev, "Device is not authorized for usage\n"); return error; } else if (intf->authorized == 0) { dev_err(&intf->dev, "Interface %d is not authorized for usage\n", intf->altsetting->desc.bInterfaceNumber); return error; } id = usb_match_dynamic_id(intf, driver); if (!id) id = usb_match_id(intf, driver->id_table); if (!id) return error; dev_dbg(dev, "%s - got id\n", __func__); error = usb_autoresume_device(udev); if (error) return error; intf->condition = USB_INTERFACE_BINDING; /* Probed interfaces are initially active. They are * runtime-PM-enabled only if the driver has autosuspend support. * They are sensitive to their children's power states. */ pm_runtime_set_active(dev); pm_suspend_ignore_children(dev, false); if (driver->supports_autosuspend) pm_runtime_enable(dev); /* If the new driver doesn't allow hub-initiated LPM, and we can't * disable hub-initiated LPM, then fail the probe. * * Otherwise, leaving LPM enabled should be harmless, because the * endpoint intervals should remain the same, and the U1/U2 timeouts * should remain the same. * * If we need to install alt setting 0 before probe, or another alt * setting during probe, that should also be fine. usb_set_interface() * will attempt to disable LPM, and fail if it can't disable it. */ if (driver->disable_hub_initiated_lpm) { lpm_disable_error = usb_unlocked_disable_lpm(udev); if (lpm_disable_error) { dev_err(&intf->dev, "%s Failed to disable LPM for driver %s\n", __func__, driver->name); error = lpm_disable_error; goto err; } } /* Carry out a deferred switch to altsetting 0 */ if (intf->needs_altsetting0) { error = usb_set_interface(udev, intf->altsetting[0]. desc.bInterfaceNumber, 0); if (error < 0) goto err; intf->needs_altsetting0 = 0; } error = driver->probe(intf, id); if (error) goto err; intf->condition = USB_INTERFACE_BOUND; /* If the LPM disable succeeded, balance the ref counts. */ if (!lpm_disable_error) usb_unlocked_enable_lpm(udev); usb_autosuspend_device(udev); return error; err: usb_set_intfdata(intf, NULL); intf->needs_remote_wakeup = 0; intf->condition = USB_INTERFACE_UNBOUND; /* If the LPM disable succeeded, balance the ref counts. */ if (!lpm_disable_error) usb_unlocked_enable_lpm(udev); /* Unbound interfaces are always runtime-PM-disabled and -suspended */ if (driver->supports_autosuspend) pm_runtime_disable(dev); pm_runtime_set_suspended(dev); usb_autosuspend_device(udev); return error; } /* called from driver core with dev locked */ static int usb_unbind_interface(struct device *dev) { struct usb_driver *driver = to_usb_driver(dev->driver); struct usb_interface *intf = to_usb_interface(dev); struct usb_host_endpoint *ep, **eps = NULL; struct usb_device *udev; int i, j, error, r; int lpm_disable_error = -ENODEV; intf->condition = USB_INTERFACE_UNBINDING; /* Autoresume for set_interface call below */ udev = interface_to_usbdev(intf); error = usb_autoresume_device(udev); /* If hub-initiated LPM policy may change, attempt to disable LPM until * the driver is unbound. If LPM isn't disabled, that's fine because it * wouldn't be enabled unless all the bound interfaces supported * hub-initiated LPM. */ if (driver->disable_hub_initiated_lpm) lpm_disable_error = usb_unlocked_disable_lpm(udev); /* * Terminate all URBs for this interface unless the driver * supports "soft" unbinding and the device is still present. */ if (!driver->soft_unbind || udev->state == USB_STATE_NOTATTACHED) usb_disable_interface(udev, intf, false); driver->disconnect(intf); /* Free streams */ for (i = 0, j = 0; i < intf->cur_altsetting->desc.bNumEndpoints; i++) { ep = &intf->cur_altsetting->endpoint[i]; if (ep->streams == 0) continue; if (j == 0) { eps = kmalloc_array(USB_MAXENDPOINTS, sizeof(void *), GFP_KERNEL); if (!eps) break; } eps[j++] = ep; } if (j) { usb_free_streams(intf, eps, j, GFP_KERNEL); kfree(eps); } /* Reset other interface state. * We cannot do a Set-Interface if the device is suspended or * if it is prepared for a system sleep (since installing a new * altsetting means creating new endpoint device entries). * When either of these happens, defer the Set-Interface. */ if (intf->cur_altsetting->desc.bAlternateSetting == 0) { /* Already in altsetting 0 so skip Set-Interface. * Just re-enable it without affecting the endpoint toggles. */ usb_enable_interface(udev, intf, false); } else if (!error && !intf->dev.power.is_prepared) { r = usb_set_interface(udev, intf->altsetting[0]. desc.bInterfaceNumber, 0); if (r < 0) intf->needs_altsetting0 = 1; } else { intf->needs_altsetting0 = 1; } usb_set_intfdata(intf, NULL); intf->condition = USB_INTERFACE_UNBOUND; intf->needs_remote_wakeup = 0; /* Attempt to re-enable USB3 LPM, if the disable succeeded. */ if (!lpm_disable_error) usb_unlocked_enable_lpm(udev); /* Unbound interfaces are always runtime-PM-disabled and -suspended */ if (driver->supports_autosuspend) pm_runtime_disable(dev); pm_runtime_set_suspended(dev); if (!error) usb_autosuspend_device(udev); return 0; } /** * usb_driver_claim_interface - bind a driver to an interface * @driver: the driver to be bound * @iface: the interface to which it will be bound; must be in the * usb device's active configuration * @data: driver data associated with that interface * * This is used by usb device drivers that need to claim more than one * interface on a device when probing (audio and acm are current examples). * No device driver should directly modify internal usb_interface or * usb_device structure members. * * Callers must own the device lock, so driver probe() entries don't need * extra locking, but other call contexts may need to explicitly claim that * lock. * * Return: 0 on success. */ int usb_driver_claim_interface(struct usb_driver *driver, struct usb_interface *iface, void *data) { struct device *dev; int retval = 0; if (!iface) return -ENODEV; dev = &iface->dev; if (dev->driver) return -EBUSY; /* reject claim if interface is not authorized */ if (!iface->authorized) return -ENODEV; dev->driver = &driver->driver; usb_set_intfdata(iface, data); iface->needs_binding = 0; iface->condition = USB_INTERFACE_BOUND; /* Claimed interfaces are initially inactive (suspended) and * runtime-PM-enabled, but only if the driver has autosuspend * support. Otherwise they are marked active, to prevent the * device from being autosuspended, but left disabled. In either * case they are sensitive to their children's power states. */ pm_suspend_ignore_children(dev, false); if (driver->supports_autosuspend) pm_runtime_enable(dev); else pm_runtime_set_active(dev); /* if interface was already added, bind now; else let * the future device_add() bind it, bypassing probe() */ if (device_is_registered(dev)) retval = device_bind_driver(dev); if (retval) { dev->driver = NULL; usb_set_intfdata(iface, NULL); iface->needs_remote_wakeup = 0; iface->condition = USB_INTERFACE_UNBOUND; /* * Unbound interfaces are always runtime-PM-disabled * and runtime-PM-suspended */ if (driver->supports_autosuspend) pm_runtime_disable(dev); pm_runtime_set_suspended(dev); } return retval; } EXPORT_SYMBOL_GPL(usb_driver_claim_interface); /** * usb_driver_release_interface - unbind a driver from an interface * @driver: the driver to be unbound * @iface: the interface from which it will be unbound * * This can be used by drivers to release an interface without waiting * for their disconnect() methods to be called. In typical cases this * also causes the driver disconnect() method to be called. * * This call is synchronous, and may not be used in an interrupt context. * Callers must own the device lock, so driver disconnect() entries don't * need extra locking, but other call contexts may need to explicitly claim * that lock. */ void usb_driver_release_interface(struct usb_driver *driver, struct usb_interface *iface) { struct device *dev = &iface->dev; /* this should never happen, don't release something that's not ours */ if (!dev->driver || dev->driver != &driver->driver) return; /* don't release from within disconnect() */ if (iface->condition != USB_INTERFACE_BOUND) return; iface->condition = USB_INTERFACE_UNBINDING; /* Release via the driver core only if the interface * has already been registered */ if (device_is_registered(dev)) { device_release_driver(dev); } else { device_lock(dev); usb_unbind_interface(dev); dev->driver = NULL; device_unlock(dev); } } EXPORT_SYMBOL_GPL(usb_driver_release_interface); /* returns 0 if no match, 1 if match */ int usb_match_device(struct usb_device *dev, const struct usb_device_id *id) { if ((id->match_flags & USB_DEVICE_ID_MATCH_VENDOR) && id->idVendor != le16_to_cpu(dev->descriptor.idVendor)) return 0; if ((id->match_flags & USB_DEVICE_ID_MATCH_PRODUCT) && id->idProduct != le16_to_cpu(dev->descriptor.idProduct)) return 0; /* No need to test id->bcdDevice_lo != 0, since 0 is never greater than any unsigned number. */ if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_LO) && (id->bcdDevice_lo > le16_to_cpu(dev->descriptor.bcdDevice))) return 0; if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_HI) && (id->bcdDevice_hi < le16_to_cpu(dev->descriptor.bcdDevice))) return 0; if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_CLASS) && (id->bDeviceClass != dev->descriptor.bDeviceClass)) return 0; if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_SUBCLASS) && (id->bDeviceSubClass != dev->descriptor.bDeviceSubClass)) return 0; if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_PROTOCOL) && (id->bDeviceProtocol != dev->descriptor.bDeviceProtocol)) return 0; return 1; } /* returns 0 if no match, 1 if match */ int usb_match_one_id_intf(struct usb_device *dev, struct usb_host_interface *intf, const struct usb_device_id *id) { /* The interface class, subclass, protocol and number should never be * checked for a match if the device class is Vendor Specific, * unless the match record specifies the Vendor ID. */ if (dev->descriptor.bDeviceClass == USB_CLASS_VENDOR_SPEC && !(id->match_flags & USB_DEVICE_ID_MATCH_VENDOR) && (id->match_flags & (USB_DEVICE_ID_MATCH_INT_CLASS | USB_DEVICE_ID_MATCH_INT_SUBCLASS | USB_DEVICE_ID_MATCH_INT_PROTOCOL | USB_DEVICE_ID_MATCH_INT_NUMBER))) return 0; if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_CLASS) && (id->bInterfaceClass != intf->desc.bInterfaceClass)) return 0; if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_SUBCLASS) && (id->bInterfaceSubClass != intf->desc.bInterfaceSubClass)) return 0; if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_PROTOCOL) && (id->bInterfaceProtocol != intf->desc.bInterfaceProtocol)) return 0; if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_NUMBER) && (id->bInterfaceNumber != intf->desc.bInterfaceNumber)) return 0; return 1; } /* returns 0 if no match, 1 if match */ int usb_match_one_id(struct usb_interface *interface, const struct usb_device_id *id) { struct usb_host_interface *intf; struct usb_device *dev; /* proc_connectinfo in devio.c may call us with id == NULL. */ if (id == NULL) return 0; intf = interface->cur_altsetting; dev = interface_to_usbdev(interface); if (!usb_match_device(dev, id)) return 0; return usb_match_one_id_intf(dev, intf, id); } EXPORT_SYMBOL_GPL(usb_match_one_id); /** * usb_match_id - find first usb_device_id matching device or interface * @interface: the interface of interest * @id: array of usb_device_id structures, terminated by zero entry * * usb_match_id searches an array of usb_device_id's and returns * the first one matching the device or interface, or null. * This is used when binding (or rebinding) a driver to an interface. * Most USB device drivers will use this indirectly, through the usb core, * but some layered driver frameworks use it directly. * These device tables are exported with MODULE_DEVICE_TABLE, through * modutils, to support the driver loading functionality of USB hotplugging. * * Return: The first matching usb_device_id, or %NULL. * * What Matches: * * The "match_flags" element in a usb_device_id controls which * members are used. If the corresponding bit is set, the * value in the device_id must match its corresponding member * in the device or interface descriptor, or else the device_id * does not match. * * "driver_info" is normally used only by device drivers, * but you can create a wildcard "matches anything" usb_device_id * as a driver's "modules.usbmap" entry if you provide an id with * only a nonzero "driver_info" field. If you do this, the USB device * driver's probe() routine should use additional intelligence to * decide whether to bind to the specified interface. * * What Makes Good usb_device_id Tables: * * The match algorithm is very simple, so that intelligence in * driver selection must come from smart driver id records. * Unless you have good reasons to use another selection policy, * provide match elements only in related groups, and order match * specifiers from specific to general. Use the macros provided * for that purpose if you can. * * The most specific match specifiers use device descriptor * data. These are commonly used with product-specific matches; * the USB_DEVICE macro lets you provide vendor and product IDs, * and you can also match against ranges of product revisions. * These are widely used for devices with application or vendor * specific bDeviceClass values. * * Matches based on device class/subclass/protocol specifications * are slightly more general; use the USB_DEVICE_INFO macro, or * its siblings. These are used with single-function devices * where bDeviceClass doesn't specify that each interface has * its own class. * * Matches based on interface class/subclass/protocol are the * most general; they let drivers bind to any interface on a * multiple-function device. Use the USB_INTERFACE_INFO * macro, or its siblings, to match class-per-interface style * devices (as recorded in bInterfaceClass). * * Note that an entry created by USB_INTERFACE_INFO won't match * any interface if the device class is set to Vendor-Specific. * This is deliberate; according to the USB spec the meanings of * the interface class/subclass/protocol for these devices are also * vendor-specific, and hence matching against a standard product * class wouldn't work anyway. If you really want to use an * interface-based match for such a device, create a match record * that also specifies the vendor ID. (Unforunately there isn't a * standard macro for creating records like this.) * * Within those groups, remember that not all combinations are * meaningful. For example, don't give a product version range * without vendor and product IDs; or specify a protocol without * its associated class and subclass. */ const struct usb_device_id *usb_match_id(struct usb_interface *interface, const struct usb_device_id *id) { /* proc_connectinfo in devio.c may call us with id == NULL. */ if (id == NULL) return NULL; /* It is important to check that id->driver_info is nonzero, since an entry that is all zeroes except for a nonzero id->driver_info is the way to create an entry that indicates that the driver want to examine every device and interface. */ for (; id->idVendor || id->idProduct || id->bDeviceClass || id->bInterfaceClass || id->driver_info; id++) { if (usb_match_one_id(interface, id)) return id; } return NULL; } EXPORT_SYMBOL_GPL(usb_match_id); const struct usb_device_id *usb_device_match_id(struct usb_device *udev, const struct usb_device_id *id) { if (!id) return NULL; for (; id->idVendor || id->idProduct ; id++) { if (usb_match_device(udev, id)) return id; } return NULL; } EXPORT_SYMBOL_GPL(usb_device_match_id); bool usb_driver_applicable(struct usb_device *udev, struct usb_device_driver *udrv) { if (udrv->id_table && udrv->match) return usb_device_match_id(udev, udrv->id_table) != NULL && udrv->match(udev); if (udrv->id_table) return usb_device_match_id(udev, udrv->id_table) != NULL; if (udrv->match) return udrv->match(udev); return false; } static int usb_device_match(struct device *dev, struct device_driver *drv) { /* devices and interfaces are handled separately */ if (is_usb_device(dev)) { struct usb_device *udev; struct usb_device_driver *udrv; /* interface drivers never match devices */ if (!is_usb_device_driver(drv)) return 0; udev = to_usb_device(dev); udrv = to_usb_device_driver(drv); /* If the device driver under consideration does not have a * id_table or a match function, then let the driver's probe * function decide. */ if (!udrv->id_table && !udrv->match) return 1; return usb_driver_applicable(udev, udrv); } else if (is_usb_interface(dev)) { struct usb_interface *intf; struct usb_driver *usb_drv; const struct usb_device_id *id; /* device drivers never match interfaces */ if (is_usb_device_driver(drv)) return 0; intf = to_usb_interface(dev); usb_drv = to_usb_driver(drv); id = usb_match_id(intf, usb_drv->id_table); if (id) return 1; id = usb_match_dynamic_id(intf, usb_drv); if (id) return 1; } return 0; } static int usb_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct usb_device *usb_dev; if (is_usb_device(dev)) { usb_dev = to_usb_device(dev); } else if (is_usb_interface(dev)) { const struct usb_interface *intf = to_usb_interface(dev); usb_dev = interface_to_usbdev(intf); } else { return 0; } if (usb_dev->devnum < 0) { /* driver is often null here; dev_dbg() would oops */ pr_debug("usb %s: already deleted?\n", dev_name(dev)); return -ENODEV; } if (!usb_dev->bus) { pr_debug("usb %s: bus removed?\n", dev_name(dev)); return -ENODEV; } /* per-device configurations are common */ if (add_uevent_var(env, "PRODUCT=%x/%x/%x", le16_to_cpu(usb_dev->descriptor.idVendor), le16_to_cpu(usb_dev->descriptor.idProduct), le16_to_cpu(usb_dev->descriptor.bcdDevice))) return -ENOMEM; /* class-based driver binding models */ if (add_uevent_var(env, "TYPE=%d/%d/%d", usb_dev->descriptor.bDeviceClass, usb_dev->descriptor.bDeviceSubClass, usb_dev->descriptor.bDeviceProtocol)) return -ENOMEM; return 0; } static int __usb_bus_reprobe_drivers(struct device *dev, void *data) { struct usb_device_driver *new_udriver = data; struct usb_device *udev; int ret; /* Don't reprobe if current driver isn't usb_generic_driver */ if (dev->driver != &usb_generic_driver.driver) return 0; udev = to_usb_device(dev); if (!usb_driver_applicable(udev, new_udriver)) return 0; ret = device_reprobe(dev); if (ret && ret != -EPROBE_DEFER) dev_err(dev, "Failed to reprobe device (error %d)\n", ret); return 0; } bool is_usb_device_driver(const struct device_driver *drv) { return drv->probe == usb_probe_device; } /** * usb_register_device_driver - register a USB device (not interface) driver * @new_udriver: USB operations for the device driver * @owner: module owner of this driver. * * Registers a USB device driver with the USB core. The list of * unattached devices will be rescanned whenever a new driver is * added, allowing the new driver to attach to any recognized devices. * * Return: A negative error code on failure and 0 on success. */ int usb_register_device_driver(struct usb_device_driver *new_udriver, struct module *owner) { int retval = 0; if (usb_disabled()) return -ENODEV; new_udriver->driver.name = new_udriver->name; new_udriver->driver.bus = &usb_bus_type; new_udriver->driver.probe = usb_probe_device; new_udriver->driver.remove = usb_unbind_device; new_udriver->driver.owner = owner; new_udriver->driver.dev_groups = new_udriver->dev_groups; retval = driver_register(&new_udriver->driver); if (!retval) { pr_info("%s: registered new device driver %s\n", usbcore_name, new_udriver->name); /* * Check whether any device could be better served with * this new driver */ bus_for_each_dev(&usb_bus_type, NULL, new_udriver, __usb_bus_reprobe_drivers); } else { pr_err("%s: error %d registering device driver %s\n", usbcore_name, retval, new_udriver->name); } return retval; } EXPORT_SYMBOL_GPL(usb_register_device_driver); /** * usb_deregister_device_driver - unregister a USB device (not interface) driver * @udriver: USB operations of the device driver to unregister * Context: must be able to sleep * * Unlinks the specified driver from the internal USB driver list. */ void usb_deregister_device_driver(struct usb_device_driver *udriver) { pr_info("%s: deregistering device driver %s\n", usbcore_name, udriver->name); driver_unregister(&udriver->driver); } EXPORT_SYMBOL_GPL(usb_deregister_device_driver); /** * usb_register_driver - register a USB interface driver * @new_driver: USB operations for the interface driver * @owner: module owner of this driver. * @mod_name: module name string * * Registers a USB interface driver with the USB core. The list of * unattached interfaces will be rescanned whenever a new driver is * added, allowing the new driver to attach to any recognized interfaces. * * Return: A negative error code on failure and 0 on success. * * NOTE: if you want your driver to use the USB major number, you must call * usb_register_dev() to enable that functionality. This function no longer * takes care of that. */ int usb_register_driver(struct usb_driver *new_driver, struct module *owner, const char *mod_name) { int retval = 0; if (usb_disabled()) return -ENODEV; new_driver->driver.name = new_driver->name; new_driver->driver.bus = &usb_bus_type; new_driver->driver.probe = usb_probe_interface; new_driver->driver.remove = usb_unbind_interface; new_driver->driver.owner = owner; new_driver->driver.mod_name = mod_name; new_driver->driver.dev_groups = new_driver->dev_groups; spin_lock_init(&new_driver->dynids.lock); INIT_LIST_HEAD(&new_driver->dynids.list); retval = driver_register(&new_driver->driver); if (retval) goto out; retval = usb_create_newid_files(new_driver); if (retval) goto out_newid; pr_info("%s: registered new interface driver %s\n", usbcore_name, new_driver->name); out: return retval; out_newid: driver_unregister(&new_driver->driver); pr_err("%s: error %d registering interface driver %s\n", usbcore_name, retval, new_driver->name); goto out; } EXPORT_SYMBOL_GPL(usb_register_driver); /** * usb_deregister - unregister a USB interface driver * @driver: USB operations of the interface driver to unregister * Context: must be able to sleep * * Unlinks the specified driver from the internal USB driver list. * * NOTE: If you called usb_register_dev(), you still need to call * usb_deregister_dev() to clean up your driver's allocated minor numbers, * this * call will no longer do it for you. */ void usb_deregister(struct usb_driver *driver) { pr_info("%s: deregistering interface driver %s\n", usbcore_name, driver->name); usb_remove_newid_files(driver); driver_unregister(&driver->driver); usb_free_dynids(driver); } EXPORT_SYMBOL_GPL(usb_deregister); /* Forced unbinding of a USB interface driver, either because * it doesn't support pre_reset/post_reset/reset_resume or * because it doesn't support suspend/resume. * * The caller must hold @intf's device's lock, but not @intf's lock. */ void usb_forced_unbind_intf(struct usb_interface *intf) { struct usb_driver *driver = to_usb_driver(intf->dev.driver); dev_dbg(&intf->dev, "forced unbind\n"); usb_driver_release_interface(driver, intf); /* Mark the interface for later rebinding */ intf->needs_binding = 1; } /* * Unbind drivers for @udev's marked interfaces. These interfaces have * the needs_binding flag set, for example by usb_resume_interface(). * * The caller must hold @udev's device lock. */ static void unbind_marked_interfaces(struct usb_device *udev) { struct usb_host_config *config; int i; struct usb_interface *intf; config = udev->actconfig; if (config) { for (i = 0; i < config->desc.bNumInterfaces; ++i) { intf = config->interface[i]; if (intf->dev.driver && intf->needs_binding) usb_forced_unbind_intf(intf); } } } /* Delayed forced unbinding of a USB interface driver and scan * for rebinding. * * The caller must hold @intf's device's lock, but not @intf's lock. * * Note: Rebinds will be skipped if a system sleep transition is in * progress and the PM "complete" callback hasn't occurred yet. */ static void usb_rebind_intf(struct usb_interface *intf) { int rc; /* Delayed unbind of an existing driver */ if (intf->dev.driver) usb_forced_unbind_intf(intf); /* Try to rebind the interface */ if (!intf->dev.power.is_prepared) { intf->needs_binding = 0; rc = device_attach(&intf->dev); if (rc < 0 && rc != -EPROBE_DEFER) dev_warn(&intf->dev, "rebind failed: %d\n", rc); } } /* * Rebind drivers to @udev's marked interfaces. These interfaces have * the needs_binding flag set. * * The caller must hold @udev's device lock. */ static void rebind_marked_interfaces(struct usb_device *udev) { struct usb_host_config *config; int i; struct usb_interface *intf; config = udev->actconfig; if (config) { for (i = 0; i < config->desc.bNumInterfaces; ++i) { intf = config->interface[i]; if (intf->needs_binding) usb_rebind_intf(intf); } } } /* * Unbind all of @udev's marked interfaces and then rebind all of them. * This ordering is necessary because some drivers claim several interfaces * when they are first probed. * * The caller must hold @udev's device lock. */ void usb_unbind_and_rebind_marked_interfaces(struct usb_device *udev) { unbind_marked_interfaces(udev); rebind_marked_interfaces(udev); } #ifdef CONFIG_PM /* Unbind drivers for @udev's interfaces that don't support suspend/resume * There is no check for reset_resume here because it can be determined * only during resume whether reset_resume is needed. * * The caller must hold @udev's device lock. */ static void unbind_no_pm_drivers_interfaces(struct usb_device *udev) { struct usb_host_config *config; int i; struct usb_interface *intf; struct usb_driver *drv; config = udev->actconfig; if (config) { for (i = 0; i < config->desc.bNumInterfaces; ++i) { intf = config->interface[i]; if (intf->dev.driver) { drv = to_usb_driver(intf->dev.driver); if (!drv->suspend || !drv->resume) usb_forced_unbind_intf(intf); } } } } static int usb_suspend_device(struct usb_device *udev, pm_message_t msg) { struct usb_device_driver *udriver; int status = 0; if (udev->state == USB_STATE_NOTATTACHED || udev->state == USB_STATE_SUSPENDED) goto done; /* For devices that don't have a driver, we do a generic suspend. */ if (udev->dev.driver) udriver = to_usb_device_driver(udev->dev.driver); else { udev->do_remote_wakeup = 0; udriver = &usb_generic_driver; } if (udriver->suspend) status = udriver->suspend(udev, msg); if (status == 0 && udriver->generic_subclass) status = usb_generic_driver_suspend(udev, msg); done: dev_vdbg(&udev->dev, "%s: status %d\n", __func__, status); return status; } static int usb_resume_device(struct usb_device *udev, pm_message_t msg) { struct usb_device_driver *udriver; int status = 0; if (udev->state == USB_STATE_NOTATTACHED) goto done; /* Can't resume it if it doesn't have a driver. */ if (udev->dev.driver == NULL) { status = -ENOTCONN; goto done; } /* Non-root devices on a full/low-speed bus must wait for their * companion high-speed root hub, in case a handoff is needed. */ if (!PMSG_IS_AUTO(msg) && udev->parent && udev->bus->hs_companion) device_pm_wait_for_dev(&udev->dev, &udev->bus->hs_companion->root_hub->dev); if (udev->quirks & USB_QUIRK_RESET_RESUME) udev->reset_resume = 1; udriver = to_usb_device_driver(udev->dev.driver); if (udriver->generic_subclass) status = usb_generic_driver_resume(udev, msg); if (status == 0 && udriver->resume) status = udriver->resume(udev, msg); done: dev_vdbg(&udev->dev, "%s: status %d\n", __func__, status); return status; } static int usb_suspend_interface(struct usb_device *udev, struct usb_interface *intf, pm_message_t msg) { struct usb_driver *driver; int status = 0; if (udev->state == USB_STATE_NOTATTACHED || intf->condition == USB_INTERFACE_UNBOUND) goto done; driver = to_usb_driver(intf->dev.driver); /* at this time we know the driver supports suspend */ status = driver->suspend(intf, msg); if (status && !PMSG_IS_AUTO(msg)) dev_err(&intf->dev, "suspend error %d\n", status); done: dev_vdbg(&intf->dev, "%s: status %d\n", __func__, status); return status; } static int usb_resume_interface(struct usb_device *udev, struct usb_interface *intf, pm_message_t msg, int reset_resume) { struct usb_driver *driver; int status = 0; if (udev->state == USB_STATE_NOTATTACHED) goto done; /* Don't let autoresume interfere with unbinding */ if (intf->condition == USB_INTERFACE_UNBINDING) goto done; /* Can't resume it if it doesn't have a driver. */ if (intf->condition == USB_INTERFACE_UNBOUND) { /* Carry out a deferred switch to altsetting 0 */ if (intf->needs_altsetting0 && !intf->dev.power.is_prepared) { usb_set_interface(udev, intf->altsetting[0]. desc.bInterfaceNumber, 0); intf->needs_altsetting0 = 0; } goto done; } /* Don't resume if the interface is marked for rebinding */ if (intf->needs_binding) goto done; driver = to_usb_driver(intf->dev.driver); if (reset_resume) { if (driver->reset_resume) { status = driver->reset_resume(intf); if (status) dev_err(&intf->dev, "%s error %d\n", "reset_resume", status); } else { intf->needs_binding = 1; dev_dbg(&intf->dev, "no reset_resume for driver %s?\n", driver->name); } } else { status = driver->resume(intf); if (status) dev_err(&intf->dev, "resume error %d\n", status); } done: dev_vdbg(&intf->dev, "%s: status %d\n", __func__, status); /* Later we will unbind the driver and/or reprobe, if necessary */ return status; } /** * usb_suspend_both - suspend a USB device and its interfaces * @udev: the usb_device to suspend * @msg: Power Management message describing this state transition * * This is the central routine for suspending USB devices. It calls the * suspend methods for all the interface drivers in @udev and then calls * the suspend method for @udev itself. When the routine is called in * autosuspend, if an error occurs at any stage, all the interfaces * which were suspended are resumed so that they remain in the same * state as the device, but when called from system sleep, all error * from suspend methods of interfaces and the non-root-hub device itself * are simply ignored, so all suspended interfaces are only resumed * to the device's state when @udev is root-hub and its suspend method * returns failure. * * Autosuspend requests originating from a child device or an interface * driver may be made without the protection of @udev's device lock, but * all other suspend calls will hold the lock. Usbcore will insure that * method calls do not arrive during bind, unbind, or reset operations. * However drivers must be prepared to handle suspend calls arriving at * unpredictable times. * * This routine can run only in process context. * * Return: 0 if the suspend succeeded. */ static int usb_suspend_both(struct usb_device *udev, pm_message_t msg) { int status = 0; int i = 0, n = 0; struct usb_interface *intf; if (udev->state == USB_STATE_NOTATTACHED || udev->state == USB_STATE_SUSPENDED) goto done; /* Suspend all the interfaces and then udev itself */ if (udev->actconfig) { n = udev->actconfig->desc.bNumInterfaces; for (i = n - 1; i >= 0; --i) { intf = udev->actconfig->interface[i]; status = usb_suspend_interface(udev, intf, msg); /* Ignore errors during system sleep transitions */ if (!PMSG_IS_AUTO(msg)) status = 0; if (status != 0) break; } } if (status == 0) { status = usb_suspend_device(udev, msg); /* * Ignore errors from non-root-hub devices during * system sleep transitions. For the most part, * these devices should go to low power anyway when * the entire bus is suspended. */ if (udev->parent && !PMSG_IS_AUTO(msg)) status = 0; /* * If the device is inaccessible, don't try to resume * suspended interfaces and just return the error. */ if (status && status != -EBUSY) { int err; u16 devstat; err = usb_get_std_status(udev, USB_RECIP_DEVICE, 0, &devstat); if (err) { dev_err(&udev->dev, "Failed to suspend device, error %d\n", status); goto done; } } } /* If the suspend failed, resume interfaces that did get suspended */ if (status != 0) { if (udev->actconfig) { msg.event ^= (PM_EVENT_SUSPEND | PM_EVENT_RESUME); while (++i < n) { intf = udev->actconfig->interface[i]; usb_resume_interface(udev, intf, msg, 0); } } /* If the suspend succeeded then prevent any more URB submissions * and flush any outstanding URBs. */ } else { udev->can_submit = 0; for (i = 0; i < 16; ++i) { usb_hcd_flush_endpoint(udev, udev->ep_out[i]); usb_hcd_flush_endpoint(udev, udev->ep_in[i]); } } done: dev_vdbg(&udev->dev, "%s: status %d\n", __func__, status); return status; } /** * usb_resume_both - resume a USB device and its interfaces * @udev: the usb_device to resume * @msg: Power Management message describing this state transition * * This is the central routine for resuming USB devices. It calls the * resume method for @udev and then calls the resume methods for all * the interface drivers in @udev. * * Autoresume requests originating from a child device or an interface * driver may be made without the protection of @udev's device lock, but * all other resume calls will hold the lock. Usbcore will insure that * method calls do not arrive during bind, unbind, or reset operations. * However drivers must be prepared to handle resume calls arriving at * unpredictable times. * * This routine can run only in process context. * * Return: 0 on success. */ static int usb_resume_both(struct usb_device *udev, pm_message_t msg) { int status = 0; int i; struct usb_interface *intf; if (udev->state == USB_STATE_NOTATTACHED) { status = -ENODEV; goto done; } udev->can_submit = 1; /* Resume the device */ if (udev->state == USB_STATE_SUSPENDED || udev->reset_resume) status = usb_resume_device(udev, msg); /* Resume the interfaces */ if (status == 0 && udev->actconfig) { for (i = 0; i < udev->actconfig->desc.bNumInterfaces; i++) { intf = udev->actconfig->interface[i]; usb_resume_interface(udev, intf, msg, udev->reset_resume); } } usb_mark_last_busy(udev); done: dev_vdbg(&udev->dev, "%s: status %d\n", __func__, status); if (!status) udev->reset_resume = 0; return status; } static void choose_wakeup(struct usb_device *udev, pm_message_t msg) { int w; /* * For FREEZE/QUIESCE, disable remote wakeups so no interrupts get * generated. */ if (msg.event == PM_EVENT_FREEZE || msg.event == PM_EVENT_QUIESCE) { w = 0; } else { /* * Enable remote wakeup if it is allowed, even if no interface * drivers actually want it. */ w = device_may_wakeup(&udev->dev); } /* * If the device is autosuspended with the wrong wakeup setting, * autoresume now so the setting can be changed. */ if (udev->state == USB_STATE_SUSPENDED && w != udev->do_remote_wakeup) pm_runtime_resume(&udev->dev); udev->do_remote_wakeup = w; } /* The device lock is held by the PM core */ int usb_suspend(struct device *dev, pm_message_t msg) { struct usb_device *udev = to_usb_device(dev); int r; unbind_no_pm_drivers_interfaces(udev); /* From now on we are sure all drivers support suspend/resume * but not necessarily reset_resume() * so we may still need to unbind and rebind upon resume */ choose_wakeup(udev, msg); r = usb_suspend_both(udev, msg); if (r) return r; if (udev->quirks & USB_QUIRK_DISCONNECT_SUSPEND) usb_port_disable(udev); return 0; } /* The device lock is held by the PM core */ int usb_resume_complete(struct device *dev) { struct usb_device *udev = to_usb_device(dev); /* For PM complete calls, all we do is rebind interfaces * whose needs_binding flag is set */ if (udev->state != USB_STATE_NOTATTACHED) rebind_marked_interfaces(udev); return 0; } /* The device lock is held by the PM core */ int usb_resume(struct device *dev, pm_message_t msg) { struct usb_device *udev = to_usb_device(dev); int status; /* For all calls, take the device back to full power and * tell the PM core in case it was autosuspended previously. * Unbind the interfaces that will need rebinding later, * because they fail to support reset_resume. * (This can't be done in usb_resume_interface() * above because it doesn't own the right set of locks.) */ status = usb_resume_both(udev, msg); if (status == 0) { pm_runtime_disable(dev); pm_runtime_set_active(dev); pm_runtime_enable(dev); unbind_marked_interfaces(udev); } /* Avoid PM error messages for devices disconnected while suspended * as we'll display regular disconnect messages just a bit later. */ if (status == -ENODEV || status == -ESHUTDOWN) status = 0; return status; } /** * usb_enable_autosuspend - allow a USB device to be autosuspended * @udev: the USB device which may be autosuspended * * This routine allows @udev to be autosuspended. An autosuspend won't * take place until the autosuspend_delay has elapsed and all the other * necessary conditions are satisfied. * * The caller must hold @udev's device lock. */ void usb_enable_autosuspend(struct usb_device *udev) { pm_runtime_allow(&udev->dev); } EXPORT_SYMBOL_GPL(usb_enable_autosuspend); /** * usb_disable_autosuspend - prevent a USB device from being autosuspended * @udev: the USB device which may not be autosuspended * * This routine prevents @udev from being autosuspended and wakes it up * if it is already autosuspended. * * The caller must hold @udev's device lock. */ void usb_disable_autosuspend(struct usb_device *udev) { pm_runtime_forbid(&udev->dev); } EXPORT_SYMBOL_GPL(usb_disable_autosuspend); /** * usb_autosuspend_device - delayed autosuspend of a USB device and its interfaces * @udev: the usb_device to autosuspend * * This routine should be called when a core subsystem is finished using * @udev and wants to allow it to autosuspend. Examples would be when * @udev's device file in usbfs is closed or after a configuration change. * * @udev's usage counter is decremented; if it drops to 0 and all the * interfaces are inactive then a delayed autosuspend will be attempted. * The attempt may fail (see autosuspend_check()). * * The caller must hold @udev's device lock. * * This routine can run only in process context. */ void usb_autosuspend_device(struct usb_device *udev) { int status; usb_mark_last_busy(udev); status = pm_runtime_put_sync_autosuspend(&udev->dev); dev_vdbg(&udev->dev, "%s: cnt %d -> %d\n", __func__, atomic_read(&udev->dev.power.usage_count), status); } /** * usb_autoresume_device - immediately autoresume a USB device and its interfaces * @udev: the usb_device to autoresume * * This routine should be called when a core subsystem wants to use @udev * and needs to guarantee that it is not suspended. No autosuspend will * occur until usb_autosuspend_device() is called. (Note that this will * not prevent suspend events originating in the PM core.) Examples would * be when @udev's device file in usbfs is opened or when a remote-wakeup * request is received. * * @udev's usage counter is incremented to prevent subsequent autosuspends. * However if the autoresume fails then the usage counter is re-decremented. * * The caller must hold @udev's device lock. * * This routine can run only in process context. * * Return: 0 on success. A negative error code otherwise. */ int usb_autoresume_device(struct usb_device *udev) { int status; status = pm_runtime_resume_and_get(&udev->dev); dev_vdbg(&udev->dev, "%s: cnt %d -> %d\n", __func__, atomic_read(&udev->dev.power.usage_count), status); if (status > 0) status = 0; return status; } /** * usb_autopm_put_interface - decrement a USB interface's PM-usage counter * @intf: the usb_interface whose counter should be decremented * * This routine should be called by an interface driver when it is * finished using @intf and wants to allow it to autosuspend. A typical * example would be a character-device driver when its device file is * closed. * * The routine decrements @intf's usage counter. When the counter reaches * 0, a delayed autosuspend request for @intf's device is attempted. The * attempt may fail (see autosuspend_check()). * * This routine can run only in process context. */ void usb_autopm_put_interface(struct usb_interface *intf) { struct usb_device *udev = interface_to_usbdev(intf); int status; usb_mark_last_busy(udev); status = pm_runtime_put_sync(&intf->dev); dev_vdbg(&intf->dev, "%s: cnt %d -> %d\n", __func__, atomic_read(&intf->dev.power.usage_count), status); } EXPORT_SYMBOL_GPL(usb_autopm_put_interface); /** * usb_autopm_put_interface_async - decrement a USB interface's PM-usage counter * @intf: the usb_interface whose counter should be decremented * * This routine does much the same thing as usb_autopm_put_interface(): * It decrements @intf's usage counter and schedules a delayed * autosuspend request if the counter is <= 0. The difference is that it * does not perform any synchronization; callers should hold a private * lock and handle all synchronization issues themselves. * * Typically a driver would call this routine during an URB's completion * handler, if no more URBs were pending. * * This routine can run in atomic context. */ void usb_autopm_put_interface_async(struct usb_interface *intf) { struct usb_device *udev = interface_to_usbdev(intf); int status; usb_mark_last_busy(udev); status = pm_runtime_put(&intf->dev); dev_vdbg(&intf->dev, "%s: cnt %d -> %d\n", __func__, atomic_read(&intf->dev.power.usage_count), status); } EXPORT_SYMBOL_GPL(usb_autopm_put_interface_async); /** * usb_autopm_put_interface_no_suspend - decrement a USB interface's PM-usage counter * @intf: the usb_interface whose counter should be decremented * * This routine decrements @intf's usage counter but does not carry out an * autosuspend. * * This routine can run in atomic context. */ void usb_autopm_put_interface_no_suspend(struct usb_interface *intf) { struct usb_device *udev = interface_to_usbdev(intf); usb_mark_last_busy(udev); pm_runtime_put_noidle(&intf->dev); } EXPORT_SYMBOL_GPL(usb_autopm_put_interface_no_suspend); /** * usb_autopm_get_interface - increment a USB interface's PM-usage counter * @intf: the usb_interface whose counter should be incremented * * This routine should be called by an interface driver when it wants to * use @intf and needs to guarantee that it is not suspended. In addition, * the routine prevents @intf from being autosuspended subsequently. (Note * that this will not prevent suspend events originating in the PM core.) * This prevention will persist until usb_autopm_put_interface() is called * or @intf is unbound. A typical example would be a character-device * driver when its device file is opened. * * @intf's usage counter is incremented to prevent subsequent autosuspends. * However if the autoresume fails then the counter is re-decremented. * * This routine can run only in process context. * * Return: 0 on success. */ int usb_autopm_get_interface(struct usb_interface *intf) { int status; status = pm_runtime_resume_and_get(&intf->dev); dev_vdbg(&intf->dev, "%s: cnt %d -> %d\n", __func__, atomic_read(&intf->dev.power.usage_count), status); if (status > 0) status = 0; return status; } EXPORT_SYMBOL_GPL(usb_autopm_get_interface); /** * usb_autopm_get_interface_async - increment a USB interface's PM-usage counter * @intf: the usb_interface whose counter should be incremented * * This routine does much the same thing as * usb_autopm_get_interface(): It increments @intf's usage counter and * queues an autoresume request if the device is suspended. The * differences are that it does not perform any synchronization (callers * should hold a private lock and handle all synchronization issues * themselves), and it does not autoresume the device directly (it only * queues a request). After a successful call, the device may not yet be * resumed. * * This routine can run in atomic context. * * Return: 0 on success. A negative error code otherwise. */ int usb_autopm_get_interface_async(struct usb_interface *intf) { int status; status = pm_runtime_get(&intf->dev); if (status < 0 && status != -EINPROGRESS) pm_runtime_put_noidle(&intf->dev); dev_vdbg(&intf->dev, "%s: cnt %d -> %d\n", __func__, atomic_read(&intf->dev.power.usage_count), status); if (status > 0 || status == -EINPROGRESS) status = 0; return status; } EXPORT_SYMBOL_GPL(usb_autopm_get_interface_async); /** * usb_autopm_get_interface_no_resume - increment a USB interface's PM-usage counter * @intf: the usb_interface whose counter should be incremented * * This routine increments @intf's usage counter but does not carry out an * autoresume. * * This routine can run in atomic context. */ void usb_autopm_get_interface_no_resume(struct usb_interface *intf) { struct usb_device *udev = interface_to_usbdev(intf); usb_mark_last_busy(udev); pm_runtime_get_noresume(&intf->dev); } EXPORT_SYMBOL_GPL(usb_autopm_get_interface_no_resume); /* Internal routine to check whether we may autosuspend a device. */ static int autosuspend_check(struct usb_device *udev) { int w, i; struct usb_interface *intf; if (udev->state == USB_STATE_NOTATTACHED) return -ENODEV; /* Fail if autosuspend is disabled, or any interfaces are in use, or * any interface drivers require remote wakeup but it isn't available. */ w = 0; if (udev->actconfig) { for (i = 0; i < udev->actconfig->desc.bNumInterfaces; i++) { intf = udev->actconfig->interface[i]; /* We don't need to check interfaces that are * disabled for runtime PM. Either they are unbound * or else their drivers don't support autosuspend * and so they are permanently active. */ if (intf->dev.power.disable_depth) continue; if (atomic_read(&intf->dev.power.usage_count) > 0) return -EBUSY; w |= intf->needs_remote_wakeup; /* Don't allow autosuspend if the device will need * a reset-resume and any of its interface drivers * doesn't include support or needs remote wakeup. */ if (udev->quirks & USB_QUIRK_RESET_RESUME) { struct usb_driver *driver; driver = to_usb_driver(intf->dev.driver); if (!driver->reset_resume || intf->needs_remote_wakeup) return -EOPNOTSUPP; } } } if (w && !device_can_wakeup(&udev->dev)) { dev_dbg(&udev->dev, "remote wakeup needed for autosuspend\n"); return -EOPNOTSUPP; } /* * If the device is a direct child of the root hub and the HCD * doesn't handle wakeup requests, don't allow autosuspend when * wakeup is needed. */ if (w && udev->parent == udev->bus->root_hub && bus_to_hcd(udev->bus)->cant_recv_wakeups) { dev_dbg(&udev->dev, "HCD doesn't handle wakeup requests\n"); return -EOPNOTSUPP; } udev->do_remote_wakeup = w; return 0; } int usb_runtime_suspend(struct device *dev) { struct usb_device *udev = to_usb_device(dev); int status; /* A USB device can be suspended if it passes the various autosuspend * checks. Runtime suspend for a USB device means suspending all the * interfaces and then the device itself. */ if (autosuspend_check(udev) != 0) return -EAGAIN; status = usb_suspend_both(udev, PMSG_AUTO_SUSPEND); /* Allow a retry if autosuspend failed temporarily */ if (status == -EAGAIN || status == -EBUSY) usb_mark_last_busy(udev); /* * The PM core reacts badly unless the return code is 0, * -EAGAIN, or -EBUSY, so always return -EBUSY on an error * (except for root hubs, because they don't suspend through * an upstream port like other USB devices). */ if (status != 0 && udev->parent) return -EBUSY; return status; } int usb_runtime_resume(struct device *dev) { struct usb_device *udev = to_usb_device(dev); int status; /* Runtime resume for a USB device means resuming both the device * and all its interfaces. */ status = usb_resume_both(udev, PMSG_AUTO_RESUME); return status; } int usb_runtime_idle(struct device *dev) { struct usb_device *udev = to_usb_device(dev); /* An idle USB device can be suspended if it passes the various * autosuspend checks. */ if (autosuspend_check(udev) == 0) pm_runtime_autosuspend(dev); /* Tell the core not to suspend it, though. */ return -EBUSY; } static int usb_set_usb2_hardware_lpm(struct usb_device *udev, int enable) { struct usb_hcd *hcd = bus_to_hcd(udev->bus); int ret = -EPERM; if (hcd->driver->set_usb2_hw_lpm) { ret = hcd->driver->set_usb2_hw_lpm(hcd, udev, enable); if (!ret) udev->usb2_hw_lpm_enabled = enable; } return ret; } int usb_enable_usb2_hardware_lpm(struct usb_device *udev) { if (!udev->usb2_hw_lpm_capable || !udev->usb2_hw_lpm_allowed || udev->usb2_hw_lpm_enabled) return 0; return usb_set_usb2_hardware_lpm(udev, 1); } int usb_disable_usb2_hardware_lpm(struct usb_device *udev) { if (!udev->usb2_hw_lpm_enabled) return 0; return usb_set_usb2_hardware_lpm(udev, 0); } #endif /* CONFIG_PM */ const struct bus_type usb_bus_type = { .name = "usb", .match = usb_device_match, .uevent = usb_uevent, .need_parent_lock = true, }; |
| 2 2 2 2 2 2 2 2 1 1 1 1 1 1 41 41 32 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 | /* SPDX-License-Identifier: GPL-2.0 * * page_pool/helpers.h * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> * Copyright (C) 2016 Red Hat, Inc. */ /** * DOC: page_pool allocator * * The page_pool allocator is optimized for recycling page or page fragment used * by skb packet and xdp frame. * * Basic use involves replacing any alloc_pages() calls with page_pool_alloc(), * which allocate memory with or without page splitting depending on the * requested memory size. * * If the driver knows that it always requires full pages or its allocations are * always smaller than half a page, it can use one of the more specific API * calls: * * 1. page_pool_alloc_pages(): allocate memory without page splitting when * driver knows that the memory it need is always bigger than half of the page * allocated from page pool. There is no cache line dirtying for 'struct page' * when a page is recycled back to the page pool. * * 2. page_pool_alloc_frag(): allocate memory with page splitting when driver * knows that the memory it need is always smaller than or equal to half of the * page allocated from page pool. Page splitting enables memory saving and thus * avoids TLB/cache miss for data access, but there also is some cost to * implement page splitting, mainly some cache line dirtying/bouncing for * 'struct page' and atomic operation for page->pp_ref_count. * * The API keeps track of in-flight pages, in order to let API users know when * it is safe to free a page_pool object, the API users must call * page_pool_put_page() or page_pool_free_va() to free the page_pool object, or * attach the page_pool object to a page_pool-aware object like skbs marked with * skb_mark_for_recycle(). * * page_pool_put_page() may be called multiple times on the same page if a page * is split into multiple fragments. For the last fragment, it will either * recycle the page, or in case of page->_refcount > 1, it will release the DMA * mapping and in-flight state accounting. * * dma_sync_single_range_for_device() is only called for the last fragment when * page_pool is created with PP_FLAG_DMA_SYNC_DEV flag, so it depends on the * last freed fragment to do the sync_for_device operation for all fragments in * the same page when a page is split. The API user must setup pool->p.max_len * and pool->p.offset correctly and ensure that page_pool_put_page() is called * with dma_sync_size being -1 for fragment API. */ #ifndef _NET_PAGE_POOL_HELPERS_H #define _NET_PAGE_POOL_HELPERS_H #include <linux/dma-mapping.h> #include <net/page_pool/types.h> #ifdef CONFIG_PAGE_POOL_STATS /* Deprecated driver-facing API, use netlink instead */ int page_pool_ethtool_stats_get_count(void); u8 *page_pool_ethtool_stats_get_strings(u8 *data); u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats); bool page_pool_get_stats(const struct page_pool *pool, struct page_pool_stats *stats); #else static inline int page_pool_ethtool_stats_get_count(void) { return 0; } static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data) { return data; } static inline u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) { return data; } #endif /** * page_pool_dev_alloc_pages() - allocate a page. * @pool: pool from which to allocate * * Get a page from the page allocator or page_pool caches. */ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_pages(pool, gfp); } /** * page_pool_dev_alloc_frag() - allocate a page fragment. * @pool: pool from which to allocate * @offset: offset to the allocated page * @size: requested size * * Get a page fragment from the page allocator or page_pool caches. * * Return: * Return allocated page fragment, otherwise return NULL. */ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_frag(pool, offset, size, gfp); } static inline struct page *page_pool_alloc(struct page_pool *pool, unsigned int *offset, unsigned int *size, gfp_t gfp) { unsigned int max_size = PAGE_SIZE << pool->p.order; struct page *page; if ((*size << 1) > max_size) { *size = max_size; *offset = 0; return page_pool_alloc_pages(pool, gfp); } page = page_pool_alloc_frag(pool, offset, *size, gfp); if (unlikely(!page)) return NULL; /* There is very likely not enough space for another fragment, so append * the remaining size to the current fragment to avoid truesize * underestimate problem. */ if (pool->frag_offset + *size > max_size) { *size = max_size - *offset; pool->frag_offset = max_size; } return page; } /** * page_pool_dev_alloc() - allocate a page or a page fragment. * @pool: pool from which to allocate * @offset: offset to the allocated page * @size: in as the requested size, out as the allocated size * * Get a page or a page fragment from the page allocator or page_pool caches * depending on the requested size in order to allocate memory with least memory * utilization and performance penalty. * * Return: * Return allocated page or page fragment, otherwise return NULL. */ static inline struct page *page_pool_dev_alloc(struct page_pool *pool, unsigned int *offset, unsigned int *size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc(pool, offset, size, gfp); } static inline void *page_pool_alloc_va(struct page_pool *pool, unsigned int *size, gfp_t gfp) { unsigned int offset; struct page *page; /* Mask off __GFP_HIGHMEM to ensure we can use page_address() */ page = page_pool_alloc(pool, &offset, size, gfp & ~__GFP_HIGHMEM); if (unlikely(!page)) return NULL; return page_address(page) + offset; } /** * page_pool_dev_alloc_va() - allocate a page or a page fragment and return its * va. * @pool: pool from which to allocate * @size: in as the requested size, out as the allocated size * * This is just a thin wrapper around the page_pool_alloc() API, and * it returns va of the allocated page or page fragment. * * Return: * Return the va for the allocated page or page fragment, otherwise return NULL. */ static inline void *page_pool_dev_alloc_va(struct page_pool *pool, unsigned int *size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_va(pool, size, gfp); } /** * page_pool_get_dma_dir() - Retrieve the stored DMA direction. * @pool: pool from which page was allocated * * Get the stored dma direction. A driver might decide to store this locally * and avoid the extra cache line from page_pool to determine the direction. */ static inline enum dma_data_direction page_pool_get_dma_dir(const struct page_pool *pool) { return pool->p.dma_dir; } /** * page_pool_fragment_page() - split a fresh page into fragments * @page: page to split * @nr: references to set * * pp_ref_count represents the number of outstanding references to the page, * which will be freed using page_pool APIs (rather than page allocator APIs * like put_page()). Such references are usually held by page_pool-aware * objects like skbs marked for page pool recycling. * * This helper allows the caller to take (set) multiple references to a * freshly allocated page. The page must be freshly allocated (have a * pp_ref_count of 1). This is commonly done by drivers and * "fragment allocators" to save atomic operations - either when they know * upfront how many references they will need; or to take MAX references and * return the unused ones with a single atomic dec(), instead of performing * multiple atomic inc() operations. */ static inline void page_pool_fragment_page(struct page *page, long nr) { atomic_long_set(&page->pp_ref_count, nr); } static inline long page_pool_unref_page(struct page *page, long nr) { long ret; /* If nr == pp_ref_count then we have cleared all remaining * references to the page: * 1. 'n == 1': no need to actually overwrite it. * 2. 'n != 1': overwrite it with one, which is the rare case * for pp_ref_count draining. * * The main advantage to doing this is that not only we avoid a atomic * update, as an atomic_read is generally a much cheaper operation than * an atomic update, especially when dealing with a page that may be * referenced by only 2 or 3 users; but also unify the pp_ref_count * handling by ensuring all pages have partitioned into only 1 piece * initially, and only overwrite it when the page is partitioned into * more than one piece. */ if (atomic_long_read(&page->pp_ref_count) == nr) { /* As we have ensured nr is always one for constant case using * the BUILD_BUG_ON(), only need to handle the non-constant case * here for pp_ref_count draining, which is a rare case. */ BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1); if (!__builtin_constant_p(nr)) atomic_long_set(&page->pp_ref_count, 1); return 0; } ret = atomic_long_sub_return(nr, &page->pp_ref_count); WARN_ON(ret < 0); /* We are the last user here too, reset pp_ref_count back to 1 to * ensure all pages have been partitioned into 1 piece initially, * this should be the rare case when the last two fragment users call * page_pool_unref_page() currently. */ if (unlikely(!ret)) atomic_long_set(&page->pp_ref_count, 1); return ret; } static inline void page_pool_ref_page(struct page *page) { atomic_long_inc(&page->pp_ref_count); } static inline bool page_pool_is_last_ref(struct page *page) { /* If page_pool_unref_page() returns 0, we were the last user */ return page_pool_unref_page(page, 1) == 0; } /** * page_pool_put_page() - release a reference to a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * @dma_sync_size: how much of the page may have been touched by the device * @allow_direct: released by the consumer, allow lockless caching * * The outcome of this depends on the page refcnt. If the driver bumps * the refcnt > 1 this will unmap the page. If the page refcnt is 1 * the allocator owns the page and will try to recycle it in one of the pool * caches. If PP_FLAG_DMA_SYNC_DEV is set, the page will be synced for_device * using dma_sync_single_range_for_device(). */ static inline void page_pool_put_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { /* When page_pool isn't compiled-in, net/core/xdp.c doesn't * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL if (!page_pool_is_last_ref(page)) return; page_pool_put_unrefed_page(pool, page, dma_sync_size, allow_direct); #endif } /** * page_pool_put_full_page() - release a reference on a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * @allow_direct: released by the consumer, allow lockless caching * * Similar to page_pool_put_page(), but will DMA sync the entire memory area * as configured in &page_pool_params.max_len. */ static inline void page_pool_put_full_page(struct page_pool *pool, struct page *page, bool allow_direct) { page_pool_put_page(pool, page, -1, allow_direct); } /** * page_pool_recycle_direct() - release a reference on a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * * Similar to page_pool_put_full_page() but caller must guarantee safe context * (e.g NAPI), since it will recycle the page directly into the pool fast cache. */ static inline void page_pool_recycle_direct(struct page_pool *pool, struct page *page) { page_pool_put_full_page(pool, page, true); } #define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA \ (sizeof(dma_addr_t) > sizeof(unsigned long)) /** * page_pool_free_va() - free a va into the page_pool * @pool: pool from which va was allocated * @va: va to be freed * @allow_direct: freed by the consumer, allow lockless caching * * Free a va allocated from page_pool_allo_va(). */ static inline void page_pool_free_va(struct page_pool *pool, void *va, bool allow_direct) { page_pool_put_page(pool, virt_to_head_page(va), -1, allow_direct); } /** * page_pool_get_dma_addr() - Retrieve the stored DMA address. * @page: page allocated from a page pool * * Fetch the DMA address of the page. The page pool to which the page belongs * must had been created with PP_FLAG_DMA_MAP. */ static inline dma_addr_t page_pool_get_dma_addr(const struct page *page) { dma_addr_t ret = page->dma_addr; if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) ret <<= PAGE_SHIFT; return ret; } static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr) { if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) { page->dma_addr = addr >> PAGE_SHIFT; /* We assume page alignment to shave off bottom bits, * if this "compression" doesn't work we need to drop. */ return addr != (dma_addr_t)page->dma_addr << PAGE_SHIFT; } page->dma_addr = addr; return false; } /** * page_pool_dma_sync_for_cpu - sync Rx page for CPU after it's written by HW * @pool: &page_pool the @page belongs to * @page: page to sync * @offset: offset from page start to "hard" start if using PP frags * @dma_sync_size: size of the data written to the page * * Can be used as a shorthand to sync Rx pages before accessing them in the * driver. Caller must ensure the pool was created with ``PP_FLAG_DMA_MAP``. * Note that this version performs DMA sync unconditionally, even if the * associated PP doesn't perform sync-for-device. */ static inline void page_pool_dma_sync_for_cpu(const struct page_pool *pool, const struct page *page, u32 offset, u32 dma_sync_size) { dma_sync_single_range_for_cpu(pool->p.dev, page_pool_get_dma_addr(page), offset + pool->p.offset, dma_sync_size, page_pool_get_dma_dir(pool)); } static inline bool page_pool_put(struct page_pool *pool) { return refcount_dec_and_test(&pool->user_cnt); } static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) { if (unlikely(pool->p.nid != new_nid)) page_pool_update_nid(pool, new_nid); } #endif /* _NET_PAGE_POOL_HELPERS_H */ |
| 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 | /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _TCP_AO_H #define _TCP_AO_H #define TCP_AO_KEY_ALIGN 1 #define __tcp_ao_key_align __aligned(TCP_AO_KEY_ALIGN) union tcp_ao_addr { struct in_addr a4; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr a6; #endif }; struct tcp_ao_hdr { u8 kind; u8 length; u8 keyid; u8 rnext_keyid; }; struct tcp_ao_counters { atomic64_t pkt_good; atomic64_t pkt_bad; atomic64_t key_not_found; atomic64_t ao_required; atomic64_t dropped_icmp; }; struct tcp_ao_key { struct hlist_node node; union tcp_ao_addr addr; u8 key[TCP_AO_MAXKEYLEN] __tcp_ao_key_align; unsigned int tcp_sigpool_id; unsigned int digest_size; int l3index; u8 prefixlen; u8 family; u8 keylen; u8 keyflags; u8 sndid; u8 rcvid; u8 maclen; struct rcu_head rcu; atomic64_t pkt_good; atomic64_t pkt_bad; u8 traffic_keys[]; }; static inline u8 *rcv_other_key(struct tcp_ao_key *key) { return key->traffic_keys; } static inline u8 *snd_other_key(struct tcp_ao_key *key) { return key->traffic_keys + key->digest_size; } static inline int tcp_ao_maclen(const struct tcp_ao_key *key) { return key->maclen; } /* Use tcp_ao_len_aligned() for TCP header calculations */ static inline int tcp_ao_len(const struct tcp_ao_key *key) { return tcp_ao_maclen(key) + sizeof(struct tcp_ao_hdr); } static inline int tcp_ao_len_aligned(const struct tcp_ao_key *key) { return round_up(tcp_ao_len(key), 4); } static inline unsigned int tcp_ao_digest_size(struct tcp_ao_key *key) { return key->digest_size; } static inline int tcp_ao_sizeof_key(const struct tcp_ao_key *key) { return sizeof(struct tcp_ao_key) + (key->digest_size << 1); } struct tcp_ao_info { /* List of tcp_ao_key's */ struct hlist_head head; /* current_key and rnext_key are maintained on sockets * in TCP_AO_ESTABLISHED states. * Their purpose is to cache keys on established connections, * saving needless lookups. Never dereference any of them from * listen sockets. * ::current_key may change in RX to the key that was requested by * the peer, please use READ_ONCE()/WRITE_ONCE() in order to avoid * load/store tearing. * Do the same for ::rnext_key, if you don't hold socket lock * (it's changed only by userspace request in setsockopt()). */ struct tcp_ao_key *current_key; struct tcp_ao_key *rnext_key; struct tcp_ao_counters counters; u32 ao_required :1, accept_icmps :1, __unused :30; __be32 lisn; __be32 risn; /* Sequence Number Extension (SNE) are upper 4 bytes for SEQ, * that protect TCP-AO connection from replayed old TCP segments. * See RFC5925 (6.2). * In order to get correct SNE, there's a helper tcp_ao_compute_sne(). * It needs SEQ basis to understand whereabouts are lower SEQ numbers. * According to that basis vector, it can provide incremented SNE * when SEQ rolls over or provide decremented SNE when there's * a retransmitted segment from before-rolling over. * - for request sockets such basis is rcv_isn/snt_isn, which seems * good enough as it's unexpected to receive 4 Gbytes on reqsk. * - for full sockets the basis is rcv_nxt/snd_una. snd_una is * taken instead of snd_nxt as currently it's easier to track * in tcp_snd_una_update(), rather than updating SNE in all * WRITE_ONCE(tp->snd_nxt, ...) * - for time-wait sockets the basis is tw_rcv_nxt/tw_snd_nxt. * tw_snd_nxt is not expected to change, while tw_rcv_nxt may. */ u32 snd_sne; u32 rcv_sne; refcount_t refcnt; /* Protects twsk destruction */ struct rcu_head rcu; }; #ifdef CONFIG_TCP_MD5SIG #include <linux/jump_label.h> extern struct static_key_false_deferred tcp_md5_needed; #define static_branch_tcp_md5() static_branch_unlikely(&tcp_md5_needed.key) #else #define static_branch_tcp_md5() false #endif #ifdef CONFIG_TCP_AO /* TCP-AO structures and functions */ #include <linux/jump_label.h> extern struct static_key_false_deferred tcp_ao_needed; #define static_branch_tcp_ao() static_branch_unlikely(&tcp_ao_needed.key) #else #define static_branch_tcp_ao() false #endif static inline bool tcp_hash_should_produce_warnings(void) { return static_branch_tcp_md5() || static_branch_tcp_ao(); } #define tcp_hash_fail(msg, family, skb, fmt, ...) \ do { \ const struct tcphdr *th = tcp_hdr(skb); \ char hdr_flags[6]; \ char *f = hdr_flags; \ \ if (!tcp_hash_should_produce_warnings()) \ break; \ if (th->fin) \ *f++ = 'F'; \ if (th->syn) \ *f++ = 'S'; \ if (th->rst) \ *f++ = 'R'; \ if (th->psh) \ *f++ = 'P'; \ if (th->ack) \ *f++ = '.'; \ *f = 0; \ if ((family) == AF_INET) { \ net_info_ratelimited("%s for %pI4.%d->%pI4.%d [%s] " fmt "\n", \ msg, &ip_hdr(skb)->saddr, ntohs(th->source), \ &ip_hdr(skb)->daddr, ntohs(th->dest), \ hdr_flags, ##__VA_ARGS__); \ } else { \ net_info_ratelimited("%s for [%pI6c].%d->[%pI6c].%d [%s]" fmt "\n", \ msg, &ipv6_hdr(skb)->saddr, ntohs(th->source), \ &ipv6_hdr(skb)->daddr, ntohs(th->dest), \ hdr_flags, ##__VA_ARGS__); \ } \ } while (0) #ifdef CONFIG_TCP_AO /* TCP-AO structures and functions */ struct tcp4_ao_context { __be32 saddr; __be32 daddr; __be16 sport; __be16 dport; __be32 sisn; __be32 disn; }; struct tcp6_ao_context { struct in6_addr saddr; struct in6_addr daddr; __be16 sport; __be16 dport; __be32 sisn; __be32 disn; }; struct tcp_sigpool; /* Established states are fast-path and there always is current_key/rnext_key */ #define TCP_AO_ESTABLISHED (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | \ TCPF_CLOSE_WAIT | TCPF_LAST_ACK | TCPF_CLOSING) int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb, struct tcp_ao_key *key, struct tcphdr *th, __u8 *hash_location); int tcp_ao_hash_skb(unsigned short int family, char *ao_hash, struct tcp_ao_key *key, const struct sock *sk, const struct sk_buff *skb, const u8 *tkey, int hash_offset, u32 sne); int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family, sockptr_t optval, int optlen); struct tcp_ao_key *tcp_ao_established_key(struct tcp_ao_info *ao, int sndid, int rcvid); int tcp_ao_copy_all_matching(const struct sock *sk, struct sock *newsk, struct request_sock *req, struct sk_buff *skb, int family); int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx, unsigned int len, struct tcp_sigpool *hp); void tcp_ao_destroy_sock(struct sock *sk, bool twsk); void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp); bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code); int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen); int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen); int tcp_ao_get_repair(struct sock *sk, sockptr_t optval, sockptr_t optlen); int tcp_ao_set_repair(struct sock *sk, sockptr_t optval, unsigned int optlen); enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, unsigned short int family, const struct request_sock *req, int l3index, const struct tcp_ao_hdr *aoh); u32 tcp_ao_compute_sne(u32 next_sne, u32 next_seq, u32 seq); struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk, int l3index, const union tcp_ao_addr *addr, int family, int sndid, int rcvid); int tcp_ao_hash_hdr(unsigned short family, char *ao_hash, struct tcp_ao_key *key, const u8 *tkey, const union tcp_ao_addr *daddr, const union tcp_ao_addr *saddr, const struct tcphdr *th, u32 sne); int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb, const struct tcp_ao_hdr *aoh, int l3index, u32 seq, struct tcp_ao_key **key, char **traffic_key, bool *allocated_traffic_key, u8 *keyid, u32 *sne); /* ipv4 specific functions */ int tcp_v4_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen); struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk, int sndid, int rcvid); int tcp_v4_ao_synack_hash(char *ao_hash, struct tcp_ao_key *mkt, struct request_sock *req, const struct sk_buff *skb, int hash_offset, u32 sne); int tcp_v4_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key, const struct sock *sk, __be32 sisn, __be32 disn, bool send); int tcp_v4_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key, struct request_sock *req); struct tcp_ao_key *tcp_v4_ao_lookup_rsk(const struct sock *sk, struct request_sock *req, int sndid, int rcvid); int tcp_v4_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key, const struct sock *sk, const struct sk_buff *skb, const u8 *tkey, int hash_offset, u32 sne); /* ipv6 specific functions */ int tcp_v6_ao_hash_pseudoheader(struct tcp_sigpool *hp, const struct in6_addr *daddr, const struct in6_addr *saddr, int nbytes); int tcp_v6_ao_calc_key_skb(struct tcp_ao_key *mkt, u8 *key, const struct sk_buff *skb, __be32 sisn, __be32 disn); int tcp_v6_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key, const struct sock *sk, __be32 sisn, __be32 disn, bool send); int tcp_v6_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key, struct request_sock *req); struct tcp_ao_key *tcp_v6_ao_lookup(const struct sock *sk, struct sock *addr_sk, int sndid, int rcvid); struct tcp_ao_key *tcp_v6_ao_lookup_rsk(const struct sock *sk, struct request_sock *req, int sndid, int rcvid); int tcp_v6_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key, const struct sock *sk, const struct sk_buff *skb, const u8 *tkey, int hash_offset, u32 sne); int tcp_v6_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen); int tcp_v6_ao_synack_hash(char *ao_hash, struct tcp_ao_key *ao_key, struct request_sock *req, const struct sk_buff *skb, int hash_offset, u32 sne); void tcp_ao_established(struct sock *sk); void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb); void tcp_ao_connect_init(struct sock *sk); void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb, struct request_sock *req, unsigned short int family); #else /* CONFIG_TCP_AO */ static inline int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb, struct tcp_ao_key *key, struct tcphdr *th, __u8 *hash_location) { return 0; } static inline void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb, struct request_sock *req, unsigned short int family) { } static inline bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code) { return false; } static inline enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, unsigned short int family, const struct request_sock *req, int l3index, const struct tcp_ao_hdr *aoh) { return SKB_NOT_DROPPED_YET; } static inline struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk, int l3index, const union tcp_ao_addr *addr, int family, int sndid, int rcvid) { return NULL; } static inline void tcp_ao_destroy_sock(struct sock *sk, bool twsk) { } static inline void tcp_ao_established(struct sock *sk) { } static inline void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb) { } static inline void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp) { } static inline void tcp_ao_connect_init(struct sock *sk) { } static inline int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } static inline int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } static inline int tcp_ao_get_repair(struct sock *sk, sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } static inline int tcp_ao_set_repair(struct sock *sk, sockptr_t optval, unsigned int optlen) { return -ENOPROTOOPT; } #endif #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) int tcp_do_parse_auth_options(const struct tcphdr *th, const u8 **md5_hash, const u8 **ao_hash); #else static inline int tcp_do_parse_auth_options(const struct tcphdr *th, const u8 **md5_hash, const u8 **ao_hash) { *md5_hash = NULL; *ao_hash = NULL; return 0; } #endif #endif /* _TCP_AO_H */ |
| 1 1 789 787 792 770 763 43 44 35 37 15 15 44 197 197 195 3 286 2 1 2 2 1 1 1 115 66 116 116 116 3 60 61 263 4019 45 2 45 44 3 3911 3867 3869 3891 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 194 197 195 213 195 169 9 176 214 215 54 198 54 24 17 6 6 17 9 17 13 17 2616 2538 2630 2636 35 34 6 5 4 10 4 4 4 3 1 2 1 1 10 3 10 10 4 4 3 1 2 3 1 6 1 5 6 6 5 6 250 248 251 5 2 5 6 2 6 44 42 13 44 44 38 145 145 8 145 14 144 145 1 1 1 1 204 106 108 103 48 38 202 279 54 52 53 24 25 223 1 1 1 11 11 11 10 2 1 2 2 4 206 273 1 1 1 1 95 93 95 96 2 1 96 5 166 4 4 1 4 4 34 13 9 6 9 1 34 147 147 145 1 148 147 147 148 1 148 148 148 147 147 126 124 126 20 21 1 16 56 1 82 17 80 81 83 17 64 83 17 16 17 17 17 16 12 12 12 7 5 5 5 5 12 17 16 213 211 212 211 212 211 213 32 31 32 32 32 32 33 49 48 49 48 48 48 48 24 2 1 26 25 45 45 45 23 22 45 47 45 44 47 23 24 180 177 178 171 181 178 179 173 172 177 15 15 15 15 15 15 15 11 11 11 4 11 1 11 266 265 264 200 196 263 11 263 242 4 263 62 63 61 60 5 61 39 3 1 59 3 62 103 943 945 944 943 944 946 103 71 882 878 933 950 943 5 914 947 388 386 396 393 46 340 337 383 347 382 387 384 2 390 386 391 31 32 31 33 32 31 33 33 2356 2356 301 300 159 2257 243 2150 2350 2360 2352 2137 2343 2352 2347 2345 2234 2245 136 2238 72 134 132 2236 29 2233 128 19 6 2240 202 89 2163 66 2186 2230 6 2236 2230 2237 2235 2176 3 2005 2004 2008 2003 1949 1974 2012 252 251 251 251 250 250 202 250 203 202 200 197 226 224 229 252 152 6 158 156 125 6 158 49 157 151 128 21 131 129 127 152 154 152 153 151 143 3 53 53 52 52 47 48 53 108 11 106 1 106 94 104 103 93 102 93 90 91 11 92 8 2 6 90 86 76 35 23 17 82 107 94 11 3 110 109 1 2 179 177 178 175 154 65 66 15 2 16 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 | // SPDX-License-Identifier: GPL-2.0-or-later /* * NET An implementation of the SOCKET network access protocol. * * Version: @(#)socket.c 1.1.93 18/02/95 * * Authors: Orest Zborowski, <obz@Kodak.COM> * Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * * Fixes: * Anonymous : NOTSOCK/BADF cleanup. Error fix in * shutdown() * Alan Cox : verify_area() fixes * Alan Cox : Removed DDI * Jonathan Kamens : SOCK_DGRAM reconnect bug * Alan Cox : Moved a load of checks to the very * top level. * Alan Cox : Move address structures to/from user * mode above the protocol layers. * Rob Janssen : Allow 0 length sends. * Alan Cox : Asynchronous I/O support (cribbed from the * tty drivers). * Niibe Yutaka : Asynchronous I/O for writes (4.4BSD style) * Jeff Uphoff : Made max number of sockets command-line * configurable. * Matti Aarnio : Made the number of sockets dynamic, * to be allocated when needed, and mr. * Uphoff's max is used as max to be * allowed to allocate. * Linus : Argh. removed all the socket allocation * altogether: it's in the inode now. * Alan Cox : Made sock_alloc()/sock_release() public * for NetROM and future kernel nfsd type * stuff. * Alan Cox : sendmsg/recvmsg basics. * Tom Dyas : Export net symbols. * Marcin Dalecki : Fixed problems with CONFIG_NET="n". * Alan Cox : Added thread locking to sys_* calls * for sockets. May have errors at the * moment. * Kevin Buhr : Fixed the dumb errors in the above. * Andi Kleen : Some small cleanups, optimizations, * and fixed a copy_from_user() bug. * Tigran Aivazian : sys_send(args) calls sys_sendto(args, NULL, 0) * Tigran Aivazian : Made listen(2) backlog sanity checks * protocol-independent * * This module is effectively the top level interface to the BSD socket * paradigm. * * Based upon Swansea University Computer Society NET3.039 */ #include <linux/bpf-cgroup.h> #include <linux/ethtool.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/file.h> #include <linux/splice.h> #include <linux/net.h> #include <linux/interrupt.h> #include <linux/thread_info.h> #include <linux/rcupdate.h> #include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/mutex.h> #include <linux/if_bridge.h> #include <linux/if_vlan.h> #include <linux/ptp_classify.h> #include <linux/init.h> #include <linux/poll.h> #include <linux/cache.h> #include <linux/module.h> #include <linux/highmem.h> #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/compat.h> #include <linux/kmod.h> #include <linux/audit.h> #include <linux/wireless.h> #include <linux/nsproxy.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/xattr.h> #include <linux/nospec.h> #include <linux/indirect_call_wrapper.h> #include <linux/io_uring/net.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <net/compat.h> #include <net/wext.h> #include <net/cls_cgroup.h> #include <net/sock.h> #include <linux/netfilter.h> #include <linux/if_tun.h> #include <linux/ipv6_route.h> #include <linux/route.h> #include <linux/termios.h> #include <linux/sockios.h> #include <net/busy_poll.h> #include <linux/errqueue.h> #include <linux/ptp_clock_kernel.h> #include <trace/events/sock.h> #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sysctl_net_busy_read __read_mostly; unsigned int sysctl_net_busy_poll __read_mostly; #endif static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to); static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from); static int sock_mmap(struct file *file, struct vm_area_struct *vma); static int sock_close(struct inode *inode, struct file *file); static __poll_t sock_poll(struct file *file, struct poll_table_struct *wait); static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT static long compat_sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #endif static int sock_fasync(int fd, struct file *filp, int on); static ssize_t sock_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); static void sock_splice_eof(struct file *file); #ifdef CONFIG_PROC_FS static void sock_show_fdinfo(struct seq_file *m, struct file *f) { struct socket *sock = f->private_data; const struct proto_ops *ops = READ_ONCE(sock->ops); if (ops->show_fdinfo) ops->show_fdinfo(m, sock); } #else #define sock_show_fdinfo NULL #endif /* * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear * in the operation structures but are done directly via the socketcall() multiplexor. */ static const struct file_operations socket_file_ops = { .owner = THIS_MODULE, .llseek = no_llseek, .read_iter = sock_read_iter, .write_iter = sock_write_iter, .poll = sock_poll, .unlocked_ioctl = sock_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_sock_ioctl, #endif .uring_cmd = io_uring_cmd_sock, .mmap = sock_mmap, .release = sock_close, .fasync = sock_fasync, .splice_write = splice_to_socket, .splice_read = sock_splice_read, .splice_eof = sock_splice_eof, .show_fdinfo = sock_show_fdinfo, }; static const char * const pf_family_names[] = { [PF_UNSPEC] = "PF_UNSPEC", [PF_UNIX] = "PF_UNIX/PF_LOCAL", [PF_INET] = "PF_INET", [PF_AX25] = "PF_AX25", [PF_IPX] = "PF_IPX", [PF_APPLETALK] = "PF_APPLETALK", [PF_NETROM] = "PF_NETROM", [PF_BRIDGE] = "PF_BRIDGE", [PF_ATMPVC] = "PF_ATMPVC", [PF_X25] = "PF_X25", [PF_INET6] = "PF_INET6", [PF_ROSE] = "PF_ROSE", [PF_DECnet] = "PF_DECnet", [PF_NETBEUI] = "PF_NETBEUI", [PF_SECURITY] = "PF_SECURITY", [PF_KEY] = "PF_KEY", [PF_NETLINK] = "PF_NETLINK/PF_ROUTE", [PF_PACKET] = "PF_PACKET", [PF_ASH] = "PF_ASH", [PF_ECONET] = "PF_ECONET", [PF_ATMSVC] = "PF_ATMSVC", [PF_RDS] = "PF_RDS", [PF_SNA] = "PF_SNA", [PF_IRDA] = "PF_IRDA", [PF_PPPOX] = "PF_PPPOX", [PF_WANPIPE] = "PF_WANPIPE", [PF_LLC] = "PF_LLC", [PF_IB] = "PF_IB", [PF_MPLS] = "PF_MPLS", [PF_CAN] = "PF_CAN", [PF_TIPC] = "PF_TIPC", [PF_BLUETOOTH] = "PF_BLUETOOTH", [PF_IUCV] = "PF_IUCV", [PF_RXRPC] = "PF_RXRPC", [PF_ISDN] = "PF_ISDN", [PF_PHONET] = "PF_PHONET", [PF_IEEE802154] = "PF_IEEE802154", [PF_CAIF] = "PF_CAIF", [PF_ALG] = "PF_ALG", [PF_NFC] = "PF_NFC", [PF_VSOCK] = "PF_VSOCK", [PF_KCM] = "PF_KCM", [PF_QIPCRTR] = "PF_QIPCRTR", [PF_SMC] = "PF_SMC", [PF_XDP] = "PF_XDP", [PF_MCTP] = "PF_MCTP", }; /* * The protocol list. Each protocol is registered in here. */ static DEFINE_SPINLOCK(net_family_lock); static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly; /* * Support routines. * Move socket addresses back and forth across the kernel/user * divide and look after the messy bits. */ /** * move_addr_to_kernel - copy a socket address into kernel space * @uaddr: Address in user space * @kaddr: Address in kernel space * @ulen: Length in user space * * The address is copied into kernel space. If the provided address is * too long an error code of -EINVAL is returned. If the copy gives * invalid addresses -EFAULT is returned. On a success 0 is returned. */ int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr) { if (ulen < 0 || ulen > sizeof(struct sockaddr_storage)) return -EINVAL; if (ulen == 0) return 0; if (copy_from_user(kaddr, uaddr, ulen)) return -EFAULT; return audit_sockaddr(ulen, kaddr); } /** * move_addr_to_user - copy an address to user space * @kaddr: kernel space address * @klen: length of address in kernel * @uaddr: user space address * @ulen: pointer to user length field * * The value pointed to by ulen on entry is the buffer length available. * This is overwritten with the buffer space used. -EINVAL is returned * if an overlong buffer is specified or a negative buffer size. -EFAULT * is returned if either the buffer or the length field are not * accessible. * After copying the data up to the limit the user specifies, the true * length of the data is written over the length limit the user * specified. Zero is returned for a success. */ static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen, void __user *uaddr, int __user *ulen) { int err; int len; BUG_ON(klen > sizeof(struct sockaddr_storage)); err = get_user(len, ulen); if (err) return err; if (len > klen) len = klen; if (len < 0) return -EINVAL; if (len) { if (audit_sockaddr(klen, kaddr)) return -ENOMEM; if (copy_to_user(uaddr, kaddr, len)) return -EFAULT; } /* * "fromlen shall refer to the value before truncation.." * 1003.1g */ return __put_user(klen, ulen); } static struct kmem_cache *sock_inode_cachep __ro_after_init; static struct inode *sock_alloc_inode(struct super_block *sb) { struct socket_alloc *ei; ei = alloc_inode_sb(sb, sock_inode_cachep, GFP_KERNEL); if (!ei) return NULL; init_waitqueue_head(&ei->socket.wq.wait); ei->socket.wq.fasync_list = NULL; ei->socket.wq.flags = 0; ei->socket.state = SS_UNCONNECTED; ei->socket.flags = 0; ei->socket.ops = NULL; ei->socket.sk = NULL; ei->socket.file = NULL; return &ei->vfs_inode; } static void sock_free_inode(struct inode *inode) { struct socket_alloc *ei; ei = container_of(inode, struct socket_alloc, vfs_inode); kmem_cache_free(sock_inode_cachep, ei); } static void init_once(void *foo) { struct socket_alloc *ei = (struct socket_alloc *)foo; inode_init_once(&ei->vfs_inode); } static void init_inodecache(void) { sock_inode_cachep = kmem_cache_create("sock_inode_cache", sizeof(struct socket_alloc), 0, (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT), init_once); BUG_ON(sock_inode_cachep == NULL); } static const struct super_operations sockfs_ops = { .alloc_inode = sock_alloc_inode, .free_inode = sock_free_inode, .statfs = simple_statfs, }; /* * sockfs_dname() is called from d_path(). */ static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(buffer, buflen, "socket:[%lu]", d_inode(dentry)->i_ino); } static const struct dentry_operations sockfs_dentry_operations = { .d_dname = sockfs_dname, }; static int sockfs_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *suffix, void *value, size_t size) { if (value) { if (dentry->d_name.len + 1 > size) return -ERANGE; memcpy(value, dentry->d_name.name, dentry->d_name.len + 1); } return dentry->d_name.len + 1; } #define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname" #define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX) #define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1) static const struct xattr_handler sockfs_xattr_handler = { .name = XATTR_NAME_SOCKPROTONAME, .get = sockfs_xattr_get, }; static int sockfs_security_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) { /* Handled by LSM. */ return -EAGAIN; } static const struct xattr_handler sockfs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .set = sockfs_security_xattr_set, }; static const struct xattr_handler * const sockfs_xattr_handlers[] = { &sockfs_xattr_handler, &sockfs_security_xattr_handler, NULL }; static int sockfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, SOCKFS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &sockfs_ops; ctx->dops = &sockfs_dentry_operations; ctx->xattr = sockfs_xattr_handlers; return 0; } static struct vfsmount *sock_mnt __read_mostly; static struct file_system_type sock_fs_type = { .name = "sockfs", .init_fs_context = sockfs_init_fs_context, .kill_sb = kill_anon_super, }; /* * Obtains the first available file descriptor and sets it up for use. * * These functions create file structures and maps them to fd space * of the current process. On success it returns file descriptor * and file struct implicitly stored in sock->file. * Note that another thread may close file descriptor before we return * from this function. We use the fact that now we do not refer * to socket after mapping. If one day we will need it, this * function will increment ref. count on file by 1. * * In any case returned fd MAY BE not valid! * This race condition is unavoidable * with shared fd spaces, we cannot solve it inside kernel, * but we take care of internal coherence yet. */ /** * sock_alloc_file - Bind a &socket to a &file * @sock: socket * @flags: file status flags * @dname: protocol name * * Returns the &file bound with @sock, implicitly storing it * in sock->file. If dname is %NULL, sets to "". * * On failure @sock is released, and an ERR pointer is returned. * * This function uses GFP_KERNEL internally. */ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) { struct file *file; if (!dname) dname = sock->sk ? sock->sk->sk_prot_creator->name : ""; file = alloc_file_pseudo(SOCK_INODE(sock), sock_mnt, dname, O_RDWR | (flags & O_NONBLOCK), &socket_file_ops); if (IS_ERR(file)) { sock_release(sock); return file; } file->f_mode |= FMODE_NOWAIT; sock->file = file; file->private_data = sock; stream_open(SOCK_INODE(sock), file); return file; } EXPORT_SYMBOL(sock_alloc_file); static int sock_map_fd(struct socket *sock, int flags) { struct file *newfile; int fd = get_unused_fd_flags(flags); if (unlikely(fd < 0)) { sock_release(sock); return fd; } newfile = sock_alloc_file(sock, flags, NULL); if (!IS_ERR(newfile)) { fd_install(fd, newfile); return fd; } put_unused_fd(fd); return PTR_ERR(newfile); } /** * sock_from_file - Return the &socket bounded to @file. * @file: file * * On failure returns %NULL. */ struct socket *sock_from_file(struct file *file) { if (file->f_op == &socket_file_ops) return file->private_data; /* set in sock_alloc_file */ return NULL; } EXPORT_SYMBOL(sock_from_file); /** * sockfd_lookup - Go from a file number to its socket slot * @fd: file handle * @err: pointer to an error code return * * The file handle passed in is locked and the socket it is bound * to is returned. If an error occurs the err pointer is overwritten * with a negative errno code and NULL is returned. The function checks * for both invalid handles and passing a handle which is not a socket. * * On a success the socket object pointer is returned. */ struct socket *sockfd_lookup(int fd, int *err) { struct file *file; struct socket *sock; file = fget(fd); if (!file) { *err = -EBADF; return NULL; } sock = sock_from_file(file); if (!sock) { *err = -ENOTSOCK; fput(file); } return sock; } EXPORT_SYMBOL(sockfd_lookup); static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed) { struct fd f = fdget(fd); struct socket *sock; *err = -EBADF; if (f.file) { sock = sock_from_file(f.file); if (likely(sock)) { *fput_needed = f.flags & FDPUT_FPUT; return sock; } *err = -ENOTSOCK; fdput(f); } return NULL; } static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { ssize_t len; ssize_t used = 0; len = security_inode_listsecurity(d_inode(dentry), buffer, size); if (len < 0) return len; used += len; if (buffer) { if (size < used) return -ERANGE; buffer += len; } len = (XATTR_NAME_SOCKPROTONAME_LEN + 1); used += len; if (buffer) { if (size < used) return -ERANGE; memcpy(buffer, XATTR_NAME_SOCKPROTONAME, len); buffer += len; } return used; } static int sockfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { int err = simple_setattr(&nop_mnt_idmap, dentry, iattr); if (!err && (iattr->ia_valid & ATTR_UID)) { struct socket *sock = SOCKET_I(d_inode(dentry)); if (sock->sk) sock->sk->sk_uid = iattr->ia_uid; else err = -ENOENT; } return err; } static const struct inode_operations sockfs_inode_ops = { .listxattr = sockfs_listxattr, .setattr = sockfs_setattr, }; /** * sock_alloc - allocate a socket * * Allocate a new inode and socket object. The two are bound together * and initialised. The socket is then returned. If we are out of inodes * NULL is returned. This functions uses GFP_KERNEL internally. */ struct socket *sock_alloc(void) { struct inode *inode; struct socket *sock; inode = new_inode_pseudo(sock_mnt->mnt_sb); if (!inode) return NULL; sock = SOCKET_I(inode); inode->i_ino = get_next_ino(); inode->i_mode = S_IFSOCK | S_IRWXUGO; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_op = &sockfs_inode_ops; return sock; } EXPORT_SYMBOL(sock_alloc); static void __sock_release(struct socket *sock, struct inode *inode) { const struct proto_ops *ops = READ_ONCE(sock->ops); if (ops) { struct module *owner = ops->owner; if (inode) inode_lock(inode); ops->release(sock); sock->sk = NULL; if (inode) inode_unlock(inode); sock->ops = NULL; module_put(owner); } if (sock->wq.fasync_list) pr_err("%s: fasync list not empty!\n", __func__); if (!sock->file) { iput(SOCK_INODE(sock)); return; } sock->file = NULL; } /** * sock_release - close a socket * @sock: socket to close * * The socket is released from the protocol stack if it has a release * callback, and the inode is then released if the socket is bound to * an inode not a file. */ void sock_release(struct socket *sock) { __sock_release(sock, NULL); } EXPORT_SYMBOL(sock_release); void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags) { u8 flags = *tx_flags; if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE) { flags |= SKBTX_HW_TSTAMP; /* PTP hardware clocks can provide a free running cycle counter * as a time base for virtual clocks. Tell driver to use the * free running cycle counter for timestamp if socket is bound * to virtual clock. */ if (tsflags & SOF_TIMESTAMPING_BIND_PHC) flags |= SKBTX_HW_TSTAMP_USE_CYCLES; } if (tsflags & SOF_TIMESTAMPING_TX_SOFTWARE) flags |= SKBTX_SW_TSTAMP; if (tsflags & SOF_TIMESTAMPING_TX_SCHED) flags |= SKBTX_SCHED_TSTAMP; *tx_flags = flags; } EXPORT_SYMBOL(__sock_tx_timestamp); INDIRECT_CALLABLE_DECLARE(int inet_sendmsg(struct socket *, struct msghdr *, size_t)); INDIRECT_CALLABLE_DECLARE(int inet6_sendmsg(struct socket *, struct msghdr *, size_t)); static noinline void call_trace_sock_send_length(struct sock *sk, int ret, int flags) { trace_sock_send_length(sk, ret, 0); } static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) { int ret = INDIRECT_CALL_INET(READ_ONCE(sock->ops)->sendmsg, inet6_sendmsg, inet_sendmsg, sock, msg, msg_data_left(msg)); BUG_ON(ret == -EIOCBQUEUED); if (trace_sock_send_length_enabled()) call_trace_sock_send_length(sock->sk, ret, 0); return ret; } static int __sock_sendmsg(struct socket *sock, struct msghdr *msg) { int err = security_socket_sendmsg(sock, msg, msg_data_left(msg)); return err ?: sock_sendmsg_nosec(sock, msg); } /** * sock_sendmsg - send a message through @sock * @sock: socket * @msg: message to send * * Sends @msg through @sock, passing through LSM. * Returns the number of bytes sent, or an error code. */ int sock_sendmsg(struct socket *sock, struct msghdr *msg) { struct sockaddr_storage *save_addr = (struct sockaddr_storage *)msg->msg_name; struct sockaddr_storage address; int save_len = msg->msg_namelen; int ret; if (msg->msg_name) { memcpy(&address, msg->msg_name, msg->msg_namelen); msg->msg_name = &address; } ret = __sock_sendmsg(sock, msg); msg->msg_name = save_addr; msg->msg_namelen = save_len; return ret; } EXPORT_SYMBOL(sock_sendmsg); /** * kernel_sendmsg - send a message through @sock (kernel-space) * @sock: socket * @msg: message header * @vec: kernel vec * @num: vec array length * @size: total message data size * * Builds the message data with @vec and sends it through @sock. * Returns the number of bytes sent, or an error code. */ int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size) { iov_iter_kvec(&msg->msg_iter, ITER_SOURCE, vec, num, size); return sock_sendmsg(sock, msg); } EXPORT_SYMBOL(kernel_sendmsg); /** * kernel_sendmsg_locked - send a message through @sock (kernel-space) * @sk: sock * @msg: message header * @vec: output s/g array * @num: output s/g array length * @size: total message data size * * Builds the message data with @vec and sends it through @sock. * Returns the number of bytes sent, or an error code. * Caller must hold @sk. */ int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg, struct kvec *vec, size_t num, size_t size) { struct socket *sock = sk->sk_socket; const struct proto_ops *ops = READ_ONCE(sock->ops); if (!ops->sendmsg_locked) return sock_no_sendmsg_locked(sk, msg, size); iov_iter_kvec(&msg->msg_iter, ITER_SOURCE, vec, num, size); return ops->sendmsg_locked(sk, msg, msg_data_left(msg)); } EXPORT_SYMBOL(kernel_sendmsg_locked); static bool skb_is_err_queue(const struct sk_buff *skb) { /* pkt_type of skbs enqueued on the error queue are set to * PACKET_OUTGOING in skb_set_err_queue(). This is only safe to do * in recvmsg, since skbs received on a local socket will never * have a pkt_type of PACKET_OUTGOING. */ return skb->pkt_type == PACKET_OUTGOING; } /* On transmit, software and hardware timestamps are returned independently. * As the two skb clones share the hardware timestamp, which may be updated * before the software timestamp is received, a hardware TX timestamp may be * returned only if there is no software TX timestamp. Ignore false software * timestamps, which may be made in the __sock_recv_timestamp() call when the * option SO_TIMESTAMP_OLD(NS) is enabled on the socket, even when the skb has a * hardware timestamp. */ static bool skb_is_swtx_tstamp(const struct sk_buff *skb, int false_tstamp) { return skb->tstamp && !false_tstamp && skb_is_err_queue(skb); } static ktime_t get_timestamp(struct sock *sk, struct sk_buff *skb, int *if_index) { bool cycles = READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_BIND_PHC; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); struct net_device *orig_dev; ktime_t hwtstamp; rcu_read_lock(); orig_dev = dev_get_by_napi_id(skb_napi_id(skb)); if (orig_dev) { *if_index = orig_dev->ifindex; hwtstamp = netdev_get_tstamp(orig_dev, shhwtstamps, cycles); } else { hwtstamp = shhwtstamps->hwtstamp; } rcu_read_unlock(); return hwtstamp; } static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb, int if_index) { struct scm_ts_pktinfo ts_pktinfo; struct net_device *orig_dev; if (!skb_mac_header_was_set(skb)) return; memset(&ts_pktinfo, 0, sizeof(ts_pktinfo)); if (!if_index) { rcu_read_lock(); orig_dev = dev_get_by_napi_id(skb_napi_id(skb)); if (orig_dev) if_index = orig_dev->ifindex; rcu_read_unlock(); } ts_pktinfo.if_index = if_index; ts_pktinfo.pkt_length = skb->len - skb_mac_offset(skb); put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_PKTINFO, sizeof(ts_pktinfo), &ts_pktinfo); } /* * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP) */ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP); int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); struct scm_timestamping_internal tss; int empty = 1, false_tstamp = 0; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); int if_index; ktime_t hwtstamp; u32 tsflags; /* Race occurred between timestamp enabling and packet receiving. Fill in the current time for now. */ if (need_software_tstamp && skb->tstamp == 0) { __net_timestamp(skb); false_tstamp = 1; } if (need_software_tstamp) { if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) { if (new_tstamp) { struct __kernel_sock_timeval tv; skb_get_new_timestamp(skb, &tv); put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, sizeof(tv), &tv); } else { struct __kernel_old_timeval tv; skb_get_timestamp(skb, &tv); put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, sizeof(tv), &tv); } } else { if (new_tstamp) { struct __kernel_timespec ts; skb_get_new_timestampns(skb, &ts); put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW, sizeof(ts), &ts); } else { struct __kernel_old_timespec ts; skb_get_timestampns(skb, &ts); put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, sizeof(ts), &ts); } } } memset(&tss, 0, sizeof(tss)); tsflags = READ_ONCE(sk->sk_tsflags); if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) && ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0)) empty = 0; if (shhwtstamps && (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && !skb_is_swtx_tstamp(skb, false_tstamp)) { if_index = 0; if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV) hwtstamp = get_timestamp(sk, skb, &if_index); else hwtstamp = shhwtstamps->hwtstamp; if (tsflags & SOF_TIMESTAMPING_BIND_PHC) hwtstamp = ptp_convert_timestamp(&hwtstamp, READ_ONCE(sk->sk_bind_phc)); if (ktime_to_timespec64_cond(hwtstamp, tss.ts + 2)) { empty = 0; if ((tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && !skb_is_err_queue(skb)) put_ts_pktinfo(msg, skb, if_index); } } if (!empty) { if (sock_flag(sk, SOCK_TSTAMP_NEW)) put_cmsg_scm_timestamping64(msg, &tss); else put_cmsg_scm_timestamping(msg, &tss); if (skb_is_err_queue(skb) && skb->len && SKB_EXT_ERR(skb)->opt_stats) put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS, skb->len, skb->data); } } EXPORT_SYMBOL_GPL(__sock_recv_timestamp); #ifdef CONFIG_WIRELESS void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { int ack; if (!sock_flag(sk, SOCK_WIFI_STATUS)) return; if (!skb->wifi_acked_valid) return; ack = skb->wifi_acked; put_cmsg(msg, SOL_SOCKET, SCM_WIFI_STATUS, sizeof(ack), &ack); } EXPORT_SYMBOL_GPL(__sock_recv_wifi_status); #endif static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { if (sock_flag(sk, SOCK_RXQ_OVFL) && skb && SOCK_SKB_CB(skb)->dropcount) put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL, sizeof(__u32), &SOCK_SKB_CB(skb)->dropcount); } static void sock_recv_mark(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { if (sock_flag(sk, SOCK_RCVMARK) && skb) { /* We must use a bounce buffer for CONFIG_HARDENED_USERCOPY=y */ __u32 mark = skb->mark; put_cmsg(msg, SOL_SOCKET, SO_MARK, sizeof(__u32), &mark); } } void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { sock_recv_timestamp(msg, sk, skb); sock_recv_drops(msg, sk, skb); sock_recv_mark(msg, sk, skb); } EXPORT_SYMBOL_GPL(__sock_recv_cmsgs); INDIRECT_CALLABLE_DECLARE(int inet_recvmsg(struct socket *, struct msghdr *, size_t, int)); INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *, size_t, int)); static noinline void call_trace_sock_recv_length(struct sock *sk, int ret, int flags) { trace_sock_recv_length(sk, ret, flags); } static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, int flags) { int ret = INDIRECT_CALL_INET(READ_ONCE(sock->ops)->recvmsg, inet6_recvmsg, inet_recvmsg, sock, msg, msg_data_left(msg), flags); if (trace_sock_recv_length_enabled()) call_trace_sock_recv_length(sock->sk, ret, flags); return ret; } /** * sock_recvmsg - receive a message from @sock * @sock: socket * @msg: message to receive * @flags: message flags * * Receives @msg from @sock, passing through LSM. Returns the total number * of bytes received, or an error. */ int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags) { int err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags); return err ?: sock_recvmsg_nosec(sock, msg, flags); } EXPORT_SYMBOL(sock_recvmsg); /** * kernel_recvmsg - Receive a message from a socket (kernel space) * @sock: The socket to receive the message from * @msg: Received message * @vec: Input s/g array for message data * @num: Size of input s/g array * @size: Number of bytes to read * @flags: Message flags (MSG_DONTWAIT, etc...) * * On return the msg structure contains the scatter/gather array passed in the * vec argument. The array is modified so that it consists of the unfilled * portion of the original array. * * The returned value is the total number of bytes received, or an error. */ int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size, int flags) { msg->msg_control_is_user = false; iov_iter_kvec(&msg->msg_iter, ITER_DEST, vec, num, size); return sock_recvmsg(sock, msg, flags); } EXPORT_SYMBOL(kernel_recvmsg); static ssize_t sock_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct socket *sock = file->private_data; const struct proto_ops *ops; ops = READ_ONCE(sock->ops); if (unlikely(!ops->splice_read)) return copy_splice_read(file, ppos, pipe, len, flags); return ops->splice_read(sock, ppos, pipe, len, flags); } static void sock_splice_eof(struct file *file) { struct socket *sock = file->private_data; const struct proto_ops *ops; ops = READ_ONCE(sock->ops); if (ops->splice_eof) ops->splice_eof(sock); } static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct socket *sock = file->private_data; struct msghdr msg = {.msg_iter = *to, .msg_iocb = iocb}; ssize_t res; if (file->f_flags & O_NONBLOCK || (iocb->ki_flags & IOCB_NOWAIT)) msg.msg_flags = MSG_DONTWAIT; if (iocb->ki_pos != 0) return -ESPIPE; if (!iov_iter_count(to)) /* Match SYS5 behaviour */ return 0; res = sock_recvmsg(sock, &msg, msg.msg_flags); *to = msg.msg_iter; return res; } static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct socket *sock = file->private_data; struct msghdr msg = {.msg_iter = *from, .msg_iocb = iocb}; ssize_t res; if (iocb->ki_pos != 0) return -ESPIPE; if (file->f_flags & O_NONBLOCK || (iocb->ki_flags & IOCB_NOWAIT)) msg.msg_flags = MSG_DONTWAIT; if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; res = __sock_sendmsg(sock, &msg); *from = msg.msg_iter; return res; } /* * Atomic setting of ioctl hooks to avoid race * with module unload. */ static DEFINE_MUTEX(br_ioctl_mutex); static int (*br_ioctl_hook)(struct net *net, struct net_bridge *br, unsigned int cmd, struct ifreq *ifr, void __user *uarg); void brioctl_set(int (*hook)(struct net *net, struct net_bridge *br, unsigned int cmd, struct ifreq *ifr, void __user *uarg)) { mutex_lock(&br_ioctl_mutex); br_ioctl_hook = hook; mutex_unlock(&br_ioctl_mutex); } EXPORT_SYMBOL(brioctl_set); int br_ioctl_call(struct net *net, struct net_bridge *br, unsigned int cmd, struct ifreq *ifr, void __user *uarg) { int err = -ENOPKG; if (!br_ioctl_hook) request_module("bridge"); mutex_lock(&br_ioctl_mutex); if (br_ioctl_hook) err = br_ioctl_hook(net, br, cmd, ifr, uarg); mutex_unlock(&br_ioctl_mutex); return err; } static DEFINE_MUTEX(vlan_ioctl_mutex); static int (*vlan_ioctl_hook) (struct net *, void __user *arg); void vlan_ioctl_set(int (*hook) (struct net *, void __user *)) { mutex_lock(&vlan_ioctl_mutex); vlan_ioctl_hook = hook; mutex_unlock(&vlan_ioctl_mutex); } EXPORT_SYMBOL(vlan_ioctl_set); static long sock_do_ioctl(struct net *net, struct socket *sock, unsigned int cmd, unsigned long arg) { const struct proto_ops *ops = READ_ONCE(sock->ops); struct ifreq ifr; bool need_copyout; int err; void __user *argp = (void __user *)arg; void __user *data; err = ops->ioctl(sock, cmd, arg); /* * If this ioctl is unknown try to hand it down * to the NIC driver. */ if (err != -ENOIOCTLCMD) return err; if (!is_socket_ioctl_cmd(cmd)) return -ENOTTY; if (get_user_ifreq(&ifr, &data, argp)) return -EFAULT; err = dev_ioctl(net, cmd, &ifr, data, &need_copyout); if (!err && need_copyout) if (put_user_ifreq(&ifr, argp)) return -EFAULT; return err; } /* * With an ioctl, arg may well be a user mode pointer, but we don't know * what to do with it - that's up to the protocol still. */ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) { const struct proto_ops *ops; struct socket *sock; struct sock *sk; void __user *argp = (void __user *)arg; int pid, err; struct net *net; sock = file->private_data; ops = READ_ONCE(sock->ops); sk = sock->sk; net = sock_net(sk); if (unlikely(cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))) { struct ifreq ifr; void __user *data; bool need_copyout; if (get_user_ifreq(&ifr, &data, argp)) return -EFAULT; err = dev_ioctl(net, cmd, &ifr, data, &need_copyout); if (!err && need_copyout) if (put_user_ifreq(&ifr, argp)) return -EFAULT; } else #ifdef CONFIG_WEXT_CORE if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { err = wext_handle_ioctl(net, cmd, argp); } else #endif switch (cmd) { case FIOSETOWN: case SIOCSPGRP: err = -EFAULT; if (get_user(pid, (int __user *)argp)) break; err = f_setown(sock->file, pid, 1); break; case FIOGETOWN: case SIOCGPGRP: err = put_user(f_getown(sock->file), (int __user *)argp); break; case SIOCGIFBR: case SIOCSIFBR: case SIOCBRADDBR: case SIOCBRDELBR: err = br_ioctl_call(net, NULL, cmd, NULL, argp); break; case SIOCGIFVLAN: case SIOCSIFVLAN: err = -ENOPKG; if (!vlan_ioctl_hook) request_module("8021q"); mutex_lock(&vlan_ioctl_mutex); if (vlan_ioctl_hook) err = vlan_ioctl_hook(net, argp); mutex_unlock(&vlan_ioctl_mutex); break; case SIOCGSKNS: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = open_related_ns(&net->ns, get_net_ns); break; case SIOCGSTAMP_OLD: case SIOCGSTAMPNS_OLD: if (!ops->gettstamp) { err = -ENOIOCTLCMD; break; } err = ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD, !IS_ENABLED(CONFIG_64BIT)); break; case SIOCGSTAMP_NEW: case SIOCGSTAMPNS_NEW: if (!ops->gettstamp) { err = -ENOIOCTLCMD; break; } err = ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_NEW, false); break; case SIOCGIFCONF: err = dev_ifconf(net, argp); break; default: err = sock_do_ioctl(net, sock, cmd, arg); break; } return err; } /** * sock_create_lite - creates a socket * @family: protocol family (AF_INET, ...) * @type: communication type (SOCK_STREAM, ...) * @protocol: protocol (0, ...) * @res: new socket * * Creates a new socket and assigns it to @res, passing through LSM. * The new socket initialization is not complete, see kernel_accept(). * Returns 0 or an error. On failure @res is set to %NULL. * This function internally uses GFP_KERNEL. */ int sock_create_lite(int family, int type, int protocol, struct socket **res) { int err; struct socket *sock = NULL; err = security_socket_create(family, type, protocol, 1); if (err) goto out; sock = sock_alloc(); if (!sock) { err = -ENOMEM; goto out; } sock->type = type; err = security_socket_post_create(sock, family, type, protocol, 1); if (err) goto out_release; out: *res = sock; return err; out_release: sock_release(sock); sock = NULL; goto out; } EXPORT_SYMBOL(sock_create_lite); /* No kernel lock held - perfect */ static __poll_t sock_poll(struct file *file, poll_table *wait) { struct socket *sock = file->private_data; const struct proto_ops *ops = READ_ONCE(sock->ops); __poll_t events = poll_requested_events(wait), flag = 0; if (!ops->poll) return 0; if (sk_can_busy_loop(sock->sk)) { /* poll once if requested by the syscall */ if (events & POLL_BUSY_LOOP) sk_busy_loop(sock->sk, 1); /* if this socket can poll_ll, tell the system call */ flag = POLL_BUSY_LOOP; } return ops->poll(file, sock, wait) | flag; } static int sock_mmap(struct file *file, struct vm_area_struct *vma) { struct socket *sock = file->private_data; return READ_ONCE(sock->ops)->mmap(file, sock, vma); } static int sock_close(struct inode *inode, struct file *filp) { __sock_release(SOCKET_I(inode), inode); return 0; } /* * Update the socket async list * * Fasync_list locking strategy. * * 1. fasync_list is modified only under process context socket lock * i.e. under semaphore. * 2. fasync_list is used under read_lock(&sk->sk_callback_lock) * or under socket lock */ static int sock_fasync(int fd, struct file *filp, int on) { struct socket *sock = filp->private_data; struct sock *sk = sock->sk; struct socket_wq *wq = &sock->wq; if (sk == NULL) return -EINVAL; lock_sock(sk); fasync_helper(fd, filp, on, &wq->fasync_list); if (!wq->fasync_list) sock_reset_flag(sk, SOCK_FASYNC); else sock_set_flag(sk, SOCK_FASYNC); release_sock(sk); return 0; } /* This function may be called only under rcu_lock */ int sock_wake_async(struct socket_wq *wq, int how, int band) { if (!wq || !wq->fasync_list) return -1; switch (how) { case SOCK_WAKE_WAITD: if (test_bit(SOCKWQ_ASYNC_WAITDATA, &wq->flags)) break; goto call_kill; case SOCK_WAKE_SPACE: if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags)) break; fallthrough; case SOCK_WAKE_IO: call_kill: kill_fasync(&wq->fasync_list, SIGIO, band); break; case SOCK_WAKE_URG: kill_fasync(&wq->fasync_list, SIGURG, band); } return 0; } EXPORT_SYMBOL(sock_wake_async); /** * __sock_create - creates a socket * @net: net namespace * @family: protocol family (AF_INET, ...) * @type: communication type (SOCK_STREAM, ...) * @protocol: protocol (0, ...) * @res: new socket * @kern: boolean for kernel space sockets * * Creates a new socket and assigns it to @res, passing through LSM. * Returns 0 or an error. On failure @res is set to %NULL. @kern must * be set to true if the socket resides in kernel space. * This function internally uses GFP_KERNEL. */ int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern) { int err; struct socket *sock; const struct net_proto_family *pf; /* * Check protocol is in range */ if (family < 0 || family >= NPROTO) return -EAFNOSUPPORT; if (type < 0 || type >= SOCK_MAX) return -EINVAL; /* Compatibility. This uglymoron is moved from INET layer to here to avoid deadlock in module load. */ if (family == PF_INET && type == SOCK_PACKET) { pr_info_once("%s uses obsolete (PF_INET,SOCK_PACKET)\n", current->comm); family = PF_PACKET; } err = security_socket_create(family, type, protocol, kern); if (err) return err; /* * Allocate the socket and allow the family to set things up. if * the protocol is 0, the family is instructed to select an appropriate * default. */ sock = sock_alloc(); if (!sock) { net_warn_ratelimited("socket: no more sockets\n"); return -ENFILE; /* Not exactly a match, but its the closest posix thing */ } sock->type = type; #ifdef CONFIG_MODULES /* Attempt to load a protocol module if the find failed. * * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user * requested real, full-featured networking support upon configuration. * Otherwise module support will break! */ if (rcu_access_pointer(net_families[family]) == NULL) request_module("net-pf-%d", family); #endif rcu_read_lock(); pf = rcu_dereference(net_families[family]); err = -EAFNOSUPPORT; if (!pf) goto out_release; /* * We will call the ->create function, that possibly is in a loadable * module, so we have to bump that loadable module refcnt first. */ if (!try_module_get(pf->owner)) goto out_release; /* Now protected by module ref count */ rcu_read_unlock(); err = pf->create(net, sock, protocol, kern); if (err < 0) goto out_module_put; /* * Now to bump the refcnt of the [loadable] module that owns this * socket at sock_release time we decrement its refcnt. */ if (!try_module_get(sock->ops->owner)) goto out_module_busy; /* * Now that we're done with the ->create function, the [loadable] * module can have its refcnt decremented */ module_put(pf->owner); err = security_socket_post_create(sock, family, type, protocol, kern); if (err) goto out_sock_release; *res = sock; return 0; out_module_busy: err = -EAFNOSUPPORT; out_module_put: sock->ops = NULL; module_put(pf->owner); out_sock_release: sock_release(sock); return err; out_release: rcu_read_unlock(); goto out_sock_release; } EXPORT_SYMBOL(__sock_create); /** * sock_create - creates a socket * @family: protocol family (AF_INET, ...) * @type: communication type (SOCK_STREAM, ...) * @protocol: protocol (0, ...) * @res: new socket * * A wrapper around __sock_create(). * Returns 0 or an error. This function internally uses GFP_KERNEL. */ int sock_create(int family, int type, int protocol, struct socket **res) { return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0); } EXPORT_SYMBOL(sock_create); /** * sock_create_kern - creates a socket (kernel space) * @net: net namespace * @family: protocol family (AF_INET, ...) * @type: communication type (SOCK_STREAM, ...) * @protocol: protocol (0, ...) * @res: new socket * * A wrapper around __sock_create(). * Returns 0 or an error. This function internally uses GFP_KERNEL. */ int sock_create_kern(struct net *net, int family, int type, int protocol, struct socket **res) { return __sock_create(net, family, type, protocol, res, 1); } EXPORT_SYMBOL(sock_create_kern); static struct socket *__sys_socket_create(int family, int type, int protocol) { struct socket *sock; int retval; /* Check the SOCK_* constants for consistency. */ BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK); BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK); BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK); if ((type & ~SOCK_TYPE_MASK) & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return ERR_PTR(-EINVAL); type &= SOCK_TYPE_MASK; retval = sock_create(family, type, protocol, &sock); if (retval < 0) return ERR_PTR(retval); return sock; } struct file *__sys_socket_file(int family, int type, int protocol) { struct socket *sock; int flags; sock = __sys_socket_create(family, type, protocol); if (IS_ERR(sock)) return ERR_CAST(sock); flags = type & ~SOCK_TYPE_MASK; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; return sock_alloc_file(sock, flags, NULL); } /* A hook for bpf progs to attach to and update socket protocol. * * A static noinline declaration here could cause the compiler to * optimize away the function. A global noinline declaration will * keep the definition, but may optimize away the callsite. * Therefore, __weak is needed to ensure that the call is still * emitted, by telling the compiler that we don't know what the * function might eventually be. */ __bpf_hook_start(); __weak noinline int update_socket_protocol(int family, int type, int protocol) { return protocol; } __bpf_hook_end(); int __sys_socket(int family, int type, int protocol) { struct socket *sock; int flags; sock = __sys_socket_create(family, type, update_socket_protocol(family, type, protocol)); if (IS_ERR(sock)) return PTR_ERR(sock); flags = type & ~SOCK_TYPE_MASK; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); } SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) { return __sys_socket(family, type, protocol); } /* * Create a pair of connected sockets. */ int __sys_socketpair(int family, int type, int protocol, int __user *usockvec) { struct socket *sock1, *sock2; int fd1, fd2, err; struct file *newfile1, *newfile2; int flags; flags = type & ~SOCK_TYPE_MASK; if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; type &= SOCK_TYPE_MASK; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; /* * reserve descriptors and make sure we won't fail * to return them to userland. */ fd1 = get_unused_fd_flags(flags); if (unlikely(fd1 < 0)) return fd1; fd2 = get_unused_fd_flags(flags); if (unlikely(fd2 < 0)) { put_unused_fd(fd1); return fd2; } err = put_user(fd1, &usockvec[0]); if (err) goto out; err = put_user(fd2, &usockvec[1]); if (err) goto out; /* * Obtain the first socket and check if the underlying protocol * supports the socketpair call. */ err = sock_create(family, type, protocol, &sock1); if (unlikely(err < 0)) goto out; err = sock_create(family, type, protocol, &sock2); if (unlikely(err < 0)) { sock_release(sock1); goto out; } err = security_socket_socketpair(sock1, sock2); if (unlikely(err)) { sock_release(sock2); sock_release(sock1); goto out; } err = READ_ONCE(sock1->ops)->socketpair(sock1, sock2); if (unlikely(err < 0)) { sock_release(sock2); sock_release(sock1); goto out; } newfile1 = sock_alloc_file(sock1, flags, NULL); if (IS_ERR(newfile1)) { err = PTR_ERR(newfile1); sock_release(sock2); goto out; } newfile2 = sock_alloc_file(sock2, flags, NULL); if (IS_ERR(newfile2)) { err = PTR_ERR(newfile2); fput(newfile1); goto out; } audit_fd_pair(fd1, fd2); fd_install(fd1, newfile1); fd_install(fd2, newfile2); return 0; out: put_unused_fd(fd2); put_unused_fd(fd1); return err; } SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, int __user *, usockvec) { return __sys_socketpair(family, type, protocol, usockvec); } /* * Bind a name to a socket. Nothing much to do here since it's * the protocol's responsibility to handle the local address. * * We move the socket address to kernel space before we call * the protocol layer (having also checked the address is ok). */ int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen) { struct socket *sock; struct sockaddr_storage address; int err, fput_needed; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock) { err = move_addr_to_kernel(umyaddr, addrlen, &address); if (!err) { err = security_socket_bind(sock, (struct sockaddr *)&address, addrlen); if (!err) err = READ_ONCE(sock->ops)->bind(sock, (struct sockaddr *) &address, addrlen); } fput_light(sock->file, fput_needed); } return err; } SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen) { return __sys_bind(fd, umyaddr, addrlen); } /* * Perform a listen. Basically, we allow the protocol to do anything * necessary for a listen, and if that works, we mark the socket as * ready for listening. */ int __sys_listen(int fd, int backlog) { struct socket *sock; int err, fput_needed; int somaxconn; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock) { somaxconn = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn); if ((unsigned int)backlog > somaxconn) backlog = somaxconn; err = security_socket_listen(sock, backlog); if (!err) err = READ_ONCE(sock->ops)->listen(sock, backlog); fput_light(sock->file, fput_needed); } return err; } SYSCALL_DEFINE2(listen, int, fd, int, backlog) { return __sys_listen(fd, backlog); } struct file *do_accept(struct file *file, struct proto_accept_arg *arg, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags) { struct socket *sock, *newsock; struct file *newfile; int err, len; struct sockaddr_storage address; const struct proto_ops *ops; sock = sock_from_file(file); if (!sock) return ERR_PTR(-ENOTSOCK); newsock = sock_alloc(); if (!newsock) return ERR_PTR(-ENFILE); ops = READ_ONCE(sock->ops); newsock->type = sock->type; newsock->ops = ops; /* * We don't need try_module_get here, as the listening socket (sock) * has the protocol module (sock->ops->owner) held. */ __module_get(ops->owner); newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name); if (IS_ERR(newfile)) return newfile; err = security_socket_accept(sock, newsock); if (err) goto out_fd; arg->flags |= sock->file->f_flags; err = ops->accept(sock, newsock, arg); if (err < 0) goto out_fd; if (upeer_sockaddr) { len = ops->getname(newsock, (struct sockaddr *)&address, 2); if (len < 0) { err = -ECONNABORTED; goto out_fd; } err = move_addr_to_user(&address, len, upeer_sockaddr, upeer_addrlen); if (err < 0) goto out_fd; } /* File flags are not inherited via accept() unlike another OSes. */ return newfile; out_fd: fput(newfile); return ERR_PTR(err); } static int __sys_accept4_file(struct file *file, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags) { struct proto_accept_arg arg = { }; struct file *newfile; int newfd; if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; newfd = get_unused_fd_flags(flags); if (unlikely(newfd < 0)) return newfd; newfile = do_accept(file, &arg, upeer_sockaddr, upeer_addrlen, flags); if (IS_ERR(newfile)) { put_unused_fd(newfd); return PTR_ERR(newfile); } fd_install(newfd, newfile); return newfd; } /* * For accept, we attempt to create a new socket, set up the link * with the client, wake up the client, then return the new * connected fd. We collect the address of the connector in kernel * space and move it to user at the very end. This is unclean because * we open the socket then return an error. * * 1003.1g adds the ability to recvmsg() to query connection pending * status to recvmsg. We need to add that support in a way thats * clean when we restructure accept also. */ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags) { int ret = -EBADF; struct fd f; f = fdget(fd); if (f.file) { ret = __sys_accept4_file(f.file, upeer_sockaddr, upeer_addrlen, flags); fdput(f); } return ret; } SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, int __user *, upeer_addrlen, int, flags) { return __sys_accept4(fd, upeer_sockaddr, upeer_addrlen, flags); } SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr, int __user *, upeer_addrlen) { return __sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0); } /* * Attempt to connect to a socket with the server address. The address * is in user space so we verify it is OK and move it to kernel space. * * For 1003.1g we need to add clean support for a bind to AF_UNSPEC to * break bindings * * NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and * other SEQPACKET protocols that take time to connect() as it doesn't * include the -EINPROGRESS status for such sockets. */ int __sys_connect_file(struct file *file, struct sockaddr_storage *address, int addrlen, int file_flags) { struct socket *sock; int err; sock = sock_from_file(file); if (!sock) { err = -ENOTSOCK; goto out; } err = security_socket_connect(sock, (struct sockaddr *)address, addrlen); if (err) goto out; err = READ_ONCE(sock->ops)->connect(sock, (struct sockaddr *)address, addrlen, sock->file->f_flags | file_flags); out: return err; } int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen) { int ret = -EBADF; struct fd f; f = fdget(fd); if (f.file) { struct sockaddr_storage address; ret = move_addr_to_kernel(uservaddr, addrlen, &address); if (!ret) ret = __sys_connect_file(f.file, &address, addrlen, 0); fdput(f); } return ret; } SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, int, addrlen) { return __sys_connect(fd, uservaddr, addrlen); } /* * Get the local address ('name') of a socket object. Move the obtained * name to user space. */ int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len) { struct socket *sock; struct sockaddr_storage address; int err, fput_needed; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; err = security_socket_getsockname(sock); if (err) goto out_put; err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 0); if (err < 0) goto out_put; /* "err" is actually length in this case */ err = move_addr_to_user(&address, err, usockaddr, usockaddr_len); out_put: fput_light(sock->file, fput_needed); out: return err; } SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr, int __user *, usockaddr_len) { return __sys_getsockname(fd, usockaddr, usockaddr_len); } /* * Get the remote address ('name') of a socket object. Move the obtained * name to user space. */ int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len) { struct socket *sock; struct sockaddr_storage address; int err, fput_needed; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { const struct proto_ops *ops = READ_ONCE(sock->ops); err = security_socket_getpeername(sock); if (err) { fput_light(sock->file, fput_needed); return err; } err = ops->getname(sock, (struct sockaddr *)&address, 1); if (err >= 0) /* "err" is actually length in this case */ err = move_addr_to_user(&address, err, usockaddr, usockaddr_len); fput_light(sock->file, fput_needed); } return err; } SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr, int __user *, usockaddr_len) { return __sys_getpeername(fd, usockaddr, usockaddr_len); } /* * Send a datagram to a given address. We move the address into kernel * space and check the user space data area is readable before invoking * the protocol. */ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, struct sockaddr __user *addr, int addr_len) { struct socket *sock; struct sockaddr_storage address; int err; struct msghdr msg; int fput_needed; err = import_ubuf(ITER_SOURCE, buff, len, &msg.msg_iter); if (unlikely(err)) return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; msg.msg_name = NULL; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_namelen = 0; msg.msg_ubuf = NULL; if (addr) { err = move_addr_to_kernel(addr, addr_len, &address); if (err < 0) goto out_put; msg.msg_name = (struct sockaddr *)&address; msg.msg_namelen = addr_len; } flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; msg.msg_flags = flags; err = __sock_sendmsg(sock, &msg); out_put: fput_light(sock->file, fput_needed); out: return err; } SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, unsigned int, flags, struct sockaddr __user *, addr, int, addr_len) { return __sys_sendto(fd, buff, len, flags, addr, addr_len); } /* * Send a datagram down a socket. */ SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len, unsigned int, flags) { return __sys_sendto(fd, buff, len, flags, NULL, 0); } /* * Receive a frame from the socket and optionally record the address of the * sender. We verify the buffers are writable and if needed move the * sender address from kernel to user space. */ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, struct sockaddr __user *addr, int __user *addr_len) { struct sockaddr_storage address; struct msghdr msg = { /* Save some cycles and don't copy the address if not needed */ .msg_name = addr ? (struct sockaddr *)&address : NULL, }; struct socket *sock; int err, err2; int fput_needed; err = import_ubuf(ITER_DEST, ubuf, size, &msg.msg_iter); if (unlikely(err)) return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; err = sock_recvmsg(sock, &msg, flags); if (err >= 0 && addr != NULL) { err2 = move_addr_to_user(&address, msg.msg_namelen, addr, addr_len); if (err2 < 0) err = err2; } fput_light(sock->file, fput_needed); out: return err; } SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, unsigned int, flags, struct sockaddr __user *, addr, int __user *, addr_len) { return __sys_recvfrom(fd, ubuf, size, flags, addr, addr_len); } /* * Receive a datagram from a socket. */ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, unsigned int, flags) { return __sys_recvfrom(fd, ubuf, size, flags, NULL, NULL); } static bool sock_use_custom_sol_socket(const struct socket *sock) { return test_bit(SOCK_CUSTOM_SOCKOPT, &sock->flags); } int do_sock_setsockopt(struct socket *sock, bool compat, int level, int optname, sockptr_t optval, int optlen) { const struct proto_ops *ops; char *kernel_optval = NULL; int err; if (optlen < 0) return -EINVAL; err = security_socket_setsockopt(sock, level, optname); if (err) goto out_put; if (!compat) err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, &optname, optval, &optlen, &kernel_optval); if (err < 0) goto out_put; if (err > 0) { err = 0; goto out_put; } if (kernel_optval) optval = KERNEL_SOCKPTR(kernel_optval); ops = READ_ONCE(sock->ops); if (level == SOL_SOCKET && !sock_use_custom_sol_socket(sock)) err = sock_setsockopt(sock, level, optname, optval, optlen); else if (unlikely(!ops->setsockopt)) err = -EOPNOTSUPP; else err = ops->setsockopt(sock, level, optname, optval, optlen); kfree(kernel_optval); out_put: return err; } EXPORT_SYMBOL(do_sock_setsockopt); /* Set a socket option. Because we don't know the option lengths we have * to pass the user mode parameter for the protocols to sort out. */ int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval, int optlen) { sockptr_t optval = USER_SOCKPTR(user_optval); bool compat = in_compat_syscall(); int err, fput_needed; struct socket *sock; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) return err; err = do_sock_setsockopt(sock, compat, level, optname, optval, optlen); fput_light(sock->file, fput_needed); return err; } SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, char __user *, optval, int, optlen) { return __sys_setsockopt(fd, level, optname, optval, optlen); } INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level, int optname)); int do_sock_getsockopt(struct socket *sock, bool compat, int level, int optname, sockptr_t optval, sockptr_t optlen) { int max_optlen __maybe_unused; const struct proto_ops *ops; int err; err = security_socket_getsockopt(sock, level, optname); if (err) return err; if (!compat) max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen); ops = READ_ONCE(sock->ops); if (level == SOL_SOCKET) { err = sk_getsockopt(sock->sk, level, optname, optval, optlen); } else if (unlikely(!ops->getsockopt)) { err = -EOPNOTSUPP; } else { if (WARN_ONCE(optval.is_kernel || optlen.is_kernel, "Invalid argument type")) return -EOPNOTSUPP; err = ops->getsockopt(sock, level, optname, optval.user, optlen.user); } if (!compat) err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname, optval, optlen, max_optlen, err); return err; } EXPORT_SYMBOL(do_sock_getsockopt); /* * Get a socket option. Because we don't know the option lengths we have * to pass a user mode parameter for the protocols to sort out. */ int __sys_getsockopt(int fd, int level, int optname, char __user *optval, int __user *optlen) { int err, fput_needed; struct socket *sock; bool compat; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) return err; compat = in_compat_syscall(); err = do_sock_getsockopt(sock, compat, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); fput_light(sock->file, fput_needed); return err; } SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, char __user *, optval, int __user *, optlen) { return __sys_getsockopt(fd, level, optname, optval, optlen); } /* * Shutdown a socket. */ int __sys_shutdown_sock(struct socket *sock, int how) { int err; err = security_socket_shutdown(sock, how); if (!err) err = READ_ONCE(sock->ops)->shutdown(sock, how); return err; } int __sys_shutdown(int fd, int how) { int err, fput_needed; struct socket *sock; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { err = __sys_shutdown_sock(sock, how); fput_light(sock->file, fput_needed); } return err; } SYSCALL_DEFINE2(shutdown, int, fd, int, how) { return __sys_shutdown(fd, how); } /* A couple of helpful macros for getting the address of the 32/64 bit * fields which are the same type (int / unsigned) on our platforms. */ #define COMPAT_MSG(msg, member) ((MSG_CMSG_COMPAT & flags) ? &msg##_compat->member : &msg->member) #define COMPAT_NAMELEN(msg) COMPAT_MSG(msg, msg_namelen) #define COMPAT_FLAGS(msg) COMPAT_MSG(msg, msg_flags) struct used_address { struct sockaddr_storage name; unsigned int name_len; }; int __copy_msghdr(struct msghdr *kmsg, struct user_msghdr *msg, struct sockaddr __user **save_addr) { ssize_t err; kmsg->msg_control_is_user = true; kmsg->msg_get_inq = 0; kmsg->msg_control_user = msg->msg_control; kmsg->msg_controllen = msg->msg_controllen; kmsg->msg_flags = msg->msg_flags; kmsg->msg_namelen = msg->msg_namelen; if (!msg->msg_name) kmsg->msg_namelen = 0; if (kmsg->msg_namelen < 0) return -EINVAL; if (kmsg->msg_namelen > sizeof(struct sockaddr_storage)) kmsg->msg_namelen = sizeof(struct sockaddr_storage); if (save_addr) *save_addr = msg->msg_name; if (msg->msg_name && kmsg->msg_namelen) { if (!save_addr) { err = move_addr_to_kernel(msg->msg_name, kmsg->msg_namelen, kmsg->msg_name); if (err < 0) return err; } } else { kmsg->msg_name = NULL; kmsg->msg_namelen = 0; } if (msg->msg_iovlen > UIO_MAXIOV) return -EMSGSIZE; kmsg->msg_iocb = NULL; kmsg->msg_ubuf = NULL; return 0; } static int copy_msghdr_from_user(struct msghdr *kmsg, struct user_msghdr __user *umsg, struct sockaddr __user **save_addr, struct iovec **iov) { struct user_msghdr msg; ssize_t err; if (copy_from_user(&msg, umsg, sizeof(*umsg))) return -EFAULT; err = __copy_msghdr(kmsg, &msg, save_addr); if (err) return err; err = import_iovec(save_addr ? ITER_DEST : ITER_SOURCE, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV, iov, &kmsg->msg_iter); return err < 0 ? err : 0; } static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys, unsigned int flags, struct used_address *used_address, unsigned int allowed_msghdr_flags) { unsigned char ctl[sizeof(struct cmsghdr) + 20] __aligned(sizeof(__kernel_size_t)); /* 20 is size of ipv6_pktinfo */ unsigned char *ctl_buf = ctl; int ctl_len; ssize_t err; err = -ENOBUFS; if (msg_sys->msg_controllen > INT_MAX) goto out; flags |= (msg_sys->msg_flags & allowed_msghdr_flags); ctl_len = msg_sys->msg_controllen; if ((MSG_CMSG_COMPAT & flags) && ctl_len) { err = cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl, sizeof(ctl)); if (err) goto out; ctl_buf = msg_sys->msg_control; ctl_len = msg_sys->msg_controllen; } else if (ctl_len) { BUILD_BUG_ON(sizeof(struct cmsghdr) != CMSG_ALIGN(sizeof(struct cmsghdr))); if (ctl_len > sizeof(ctl)) { ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL); if (ctl_buf == NULL) goto out; } err = -EFAULT; if (copy_from_user(ctl_buf, msg_sys->msg_control_user, ctl_len)) goto out_freectl; msg_sys->msg_control = ctl_buf; msg_sys->msg_control_is_user = false; } flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; msg_sys->msg_flags = flags; if (sock->file->f_flags & O_NONBLOCK) msg_sys->msg_flags |= MSG_DONTWAIT; /* * If this is sendmmsg() and current destination address is same as * previously succeeded address, omit asking LSM's decision. * used_address->name_len is initialized to UINT_MAX so that the first * destination address never matches. */ if (used_address && msg_sys->msg_name && used_address->name_len == msg_sys->msg_namelen && !memcmp(&used_address->name, msg_sys->msg_name, used_address->name_len)) { err = sock_sendmsg_nosec(sock, msg_sys); goto out_freectl; } err = __sock_sendmsg(sock, msg_sys); /* * If this is sendmmsg() and sending to current destination address was * successful, remember it. */ if (used_address && err >= 0) { used_address->name_len = msg_sys->msg_namelen; if (msg_sys->msg_name) memcpy(&used_address->name, msg_sys->msg_name, used_address->name_len); } out_freectl: if (ctl_buf != ctl) sock_kfree_s(sock->sk, ctl_buf, ctl_len); out: return err; } static int sendmsg_copy_msghdr(struct msghdr *msg, struct user_msghdr __user *umsg, unsigned flags, struct iovec **iov) { int err; if (flags & MSG_CMSG_COMPAT) { struct compat_msghdr __user *msg_compat; msg_compat = (struct compat_msghdr __user *) umsg; err = get_compat_msghdr(msg, msg_compat, NULL, iov); } else { err = copy_msghdr_from_user(msg, umsg, NULL, iov); } if (err < 0) return err; return 0; } static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, struct msghdr *msg_sys, unsigned int flags, struct used_address *used_address, unsigned int allowed_msghdr_flags) { struct sockaddr_storage address; struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; ssize_t err; msg_sys->msg_name = &address; err = sendmsg_copy_msghdr(msg_sys, msg, flags, &iov); if (err < 0) return err; err = ____sys_sendmsg(sock, msg_sys, flags, used_address, allowed_msghdr_flags); kfree(iov); return err; } /* * BSD sendmsg interface */ long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg, unsigned int flags) { return ____sys_sendmsg(sock, msg, flags, NULL, 0); } long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, bool forbid_cmsg_compat) { int fput_needed, err; struct msghdr msg_sys; struct socket *sock; if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT)) return -EINVAL; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0); fput_light(sock->file, fput_needed); out: return err; } SYSCALL_DEFINE3(sendmsg, int, fd, struct user_msghdr __user *, msg, unsigned int, flags) { return __sys_sendmsg(fd, msg, flags, true); } /* * Linux sendmmsg interface */ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, bool forbid_cmsg_compat) { int fput_needed, err, datagrams; struct socket *sock; struct mmsghdr __user *entry; struct compat_mmsghdr __user *compat_entry; struct msghdr msg_sys; struct used_address used_address; unsigned int oflags = flags; if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT)) return -EINVAL; if (vlen > UIO_MAXIOV) vlen = UIO_MAXIOV; datagrams = 0; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) return err; used_address.name_len = UINT_MAX; entry = mmsg; compat_entry = (struct compat_mmsghdr __user *)mmsg; err = 0; flags |= MSG_BATCH; while (datagrams < vlen) { if (datagrams == vlen - 1) flags = oflags; if (MSG_CMSG_COMPAT & flags) { err = ___sys_sendmsg(sock, (struct user_msghdr __user *)compat_entry, &msg_sys, flags, &used_address, MSG_EOR); if (err < 0) break; err = __put_user(err, &compat_entry->msg_len); ++compat_entry; } else { err = ___sys_sendmsg(sock, (struct user_msghdr __user *)entry, &msg_sys, flags, &used_address, MSG_EOR); if (err < 0) break; err = put_user(err, &entry->msg_len); ++entry; } if (err) break; ++datagrams; if (msg_data_left(&msg_sys)) break; cond_resched(); } fput_light(sock->file, fput_needed); /* We only return an error if no datagrams were able to be sent */ if (datagrams != 0) return datagrams; return err; } SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags) { return __sys_sendmmsg(fd, mmsg, vlen, flags, true); } static int recvmsg_copy_msghdr(struct msghdr *msg, struct user_msghdr __user *umsg, unsigned flags, struct sockaddr __user **uaddr, struct iovec **iov) { ssize_t err; if (MSG_CMSG_COMPAT & flags) { struct compat_msghdr __user *msg_compat; msg_compat = (struct compat_msghdr __user *) umsg; err = get_compat_msghdr(msg, msg_compat, uaddr, iov); } else { err = copy_msghdr_from_user(msg, umsg, uaddr, iov); } if (err < 0) return err; return 0; } static int ____sys_recvmsg(struct socket *sock, struct msghdr *msg_sys, struct user_msghdr __user *msg, struct sockaddr __user *uaddr, unsigned int flags, int nosec) { struct compat_msghdr __user *msg_compat = (struct compat_msghdr __user *) msg; int __user *uaddr_len = COMPAT_NAMELEN(msg); struct sockaddr_storage addr; unsigned long cmsg_ptr; int len; ssize_t err; msg_sys->msg_name = &addr; cmsg_ptr = (unsigned long)msg_sys->msg_control; msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT); /* We assume all kernel code knows the size of sockaddr_storage */ msg_sys->msg_namelen = 0; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; if (unlikely(nosec)) err = sock_recvmsg_nosec(sock, msg_sys, flags); else err = sock_recvmsg(sock, msg_sys, flags); if (err < 0) goto out; len = err; if (uaddr != NULL) { err = move_addr_to_user(&addr, msg_sys->msg_namelen, uaddr, uaddr_len); if (err < 0) goto out; } err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT), COMPAT_FLAGS(msg)); if (err) goto out; if (MSG_CMSG_COMPAT & flags) err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr, &msg_compat->msg_controllen); else err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr, &msg->msg_controllen); if (err) goto out; err = len; out: return err; } static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, struct msghdr *msg_sys, unsigned int flags, int nosec) { struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; /* user mode address pointers */ struct sockaddr __user *uaddr; ssize_t err; err = recvmsg_copy_msghdr(msg_sys, msg, flags, &uaddr, &iov); if (err < 0) return err; err = ____sys_recvmsg(sock, msg_sys, msg, uaddr, flags, nosec); kfree(iov); return err; } /* * BSD recvmsg interface */ long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg, struct user_msghdr __user *umsg, struct sockaddr __user *uaddr, unsigned int flags) { return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, 0); } long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, bool forbid_cmsg_compat) { int fput_needed, err; struct msghdr msg_sys; struct socket *sock; if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT)) return -EINVAL; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; err = ___sys_recvmsg(sock, msg, &msg_sys, flags, 0); fput_light(sock->file, fput_needed); out: return err; } SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg, unsigned int, flags) { return __sys_recvmsg(fd, msg, flags, true); } /* * Linux recvmmsg interface */ static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, struct timespec64 *timeout) { int fput_needed, err, datagrams; struct socket *sock; struct mmsghdr __user *entry; struct compat_mmsghdr __user *compat_entry; struct msghdr msg_sys; struct timespec64 end_time; struct timespec64 timeout64; if (timeout && poll_select_set_timeout(&end_time, timeout->tv_sec, timeout->tv_nsec)) return -EINVAL; datagrams = 0; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) return err; if (likely(!(flags & MSG_ERRQUEUE))) { err = sock_error(sock->sk); if (err) { datagrams = err; goto out_put; } } entry = mmsg; compat_entry = (struct compat_mmsghdr __user *)mmsg; while (datagrams < vlen) { /* * No need to ask LSM for more than the first datagram. */ if (MSG_CMSG_COMPAT & flags) { err = ___sys_recvmsg(sock, (struct user_msghdr __user *)compat_entry, &msg_sys, flags & ~MSG_WAITFORONE, datagrams); if (err < 0) break; err = __put_user(err, &compat_entry->msg_len); ++compat_entry; } else { err = ___sys_recvmsg(sock, (struct user_msghdr __user *)entry, &msg_sys, flags & ~MSG_WAITFORONE, datagrams); if (err < 0) break; err = put_user(err, &entry->msg_len); ++entry; } if (err) break; ++datagrams; /* MSG_WAITFORONE turns on MSG_DONTWAIT after one packet */ if (flags & MSG_WAITFORONE) flags |= MSG_DONTWAIT; if (timeout) { ktime_get_ts64(&timeout64); *timeout = timespec64_sub(end_time, timeout64); if (timeout->tv_sec < 0) { timeout->tv_sec = timeout->tv_nsec = 0; break; } /* Timeout, return less than vlen datagrams */ if (timeout->tv_nsec == 0 && timeout->tv_sec == 0) break; } /* Out of band data, return right away */ if (msg_sys.msg_flags & MSG_OOB) break; cond_resched(); } if (err == 0) goto out_put; if (datagrams == 0) { datagrams = err; goto out_put; } /* * We may return less entries than requested (vlen) if the * sock is non block and there aren't enough datagrams... */ if (err != -EAGAIN) { /* * ... or if recvmsg returns an error after we * received some datagrams, where we record the * error to return on the next call or if the * app asks about it using getsockopt(SO_ERROR). */ WRITE_ONCE(sock->sk->sk_err, -err); } out_put: fput_light(sock->file, fput_needed); return datagrams; } int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, struct __kernel_timespec __user *timeout, struct old_timespec32 __user *timeout32) { int datagrams; struct timespec64 timeout_sys; if (timeout && get_timespec64(&timeout_sys, timeout)) return -EFAULT; if (timeout32 && get_old_timespec32(&timeout_sys, timeout32)) return -EFAULT; if (!timeout && !timeout32) return do_recvmmsg(fd, mmsg, vlen, flags, NULL); datagrams = do_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys); if (datagrams <= 0) return datagrams; if (timeout && put_timespec64(&timeout_sys, timeout)) datagrams = -EFAULT; if (timeout32 && put_old_timespec32(&timeout_sys, timeout32)) datagrams = -EFAULT; return datagrams; } SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags, struct __kernel_timespec __user *, timeout) { if (flags & MSG_CMSG_COMPAT) return -EINVAL; return __sys_recvmmsg(fd, mmsg, vlen, flags, timeout, NULL); } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE5(recvmmsg_time32, int, fd, struct mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags, struct old_timespec32 __user *, timeout) { if (flags & MSG_CMSG_COMPAT) return -EINVAL; return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL, timeout); } #endif #ifdef __ARCH_WANT_SYS_SOCKETCALL /* Argument list sizes for sys_socketcall */ #define AL(x) ((x) * sizeof(unsigned long)) static const unsigned char nargs[21] = { AL(0), AL(3), AL(3), AL(3), AL(2), AL(3), AL(3), AL(3), AL(4), AL(4), AL(4), AL(6), AL(6), AL(2), AL(5), AL(5), AL(3), AL(3), AL(4), AL(5), AL(4) }; #undef AL /* * System call vectors. * * Argument checking cleaned up. Saved 20% in size. * This function doesn't need to set the kernel lock because * it is set by the callees. */ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) { unsigned long a[AUDITSC_ARGS]; unsigned long a0, a1; int err; unsigned int len; if (call < 1 || call > SYS_SENDMMSG) return -EINVAL; call = array_index_nospec(call, SYS_SENDMMSG + 1); len = nargs[call]; if (len > sizeof(a)) return -EINVAL; /* copy_from_user should be SMP safe. */ if (copy_from_user(a, args, len)) return -EFAULT; err = audit_socketcall(nargs[call] / sizeof(unsigned long), a); if (err) return err; a0 = a[0]; a1 = a[1]; switch (call) { case SYS_SOCKET: err = __sys_socket(a0, a1, a[2]); break; case SYS_BIND: err = __sys_bind(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_CONNECT: err = __sys_connect(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_LISTEN: err = __sys_listen(a0, a1); break; case SYS_ACCEPT: err = __sys_accept4(a0, (struct sockaddr __user *)a1, (int __user *)a[2], 0); break; case SYS_GETSOCKNAME: err = __sys_getsockname(a0, (struct sockaddr __user *)a1, (int __user *)a[2]); break; case SYS_GETPEERNAME: err = __sys_getpeername(a0, (struct sockaddr __user *)a1, (int __user *)a[2]); break; case SYS_SOCKETPAIR: err = __sys_socketpair(a0, a1, a[2], (int __user *)a[3]); break; case SYS_SEND: err = __sys_sendto(a0, (void __user *)a1, a[2], a[3], NULL, 0); break; case SYS_SENDTO: err = __sys_sendto(a0, (void __user *)a1, a[2], a[3], (struct sockaddr __user *)a[4], a[5]); break; case SYS_RECV: err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3], NULL, NULL); break; case SYS_RECVFROM: err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3], (struct sockaddr __user *)a[4], (int __user *)a[5]); break; case SYS_SHUTDOWN: err = __sys_shutdown(a0, a1); break; case SYS_SETSOCKOPT: err = __sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]); break; case SYS_GETSOCKOPT: err = __sys_getsockopt(a0, a1, a[2], (char __user *)a[3], (int __user *)a[4]); break; case SYS_SENDMSG: err = __sys_sendmsg(a0, (struct user_msghdr __user *)a1, a[2], true); break; case SYS_SENDMMSG: err = __sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], true); break; case SYS_RECVMSG: err = __sys_recvmsg(a0, (struct user_msghdr __user *)a1, a[2], true); break; case SYS_RECVMMSG: if (IS_ENABLED(CONFIG_64BIT)) err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], (struct __kernel_timespec __user *)a[4], NULL); else err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], NULL, (struct old_timespec32 __user *)a[4]); break; case SYS_ACCEPT4: err = __sys_accept4(a0, (struct sockaddr __user *)a1, (int __user *)a[2], a[3]); break; default: err = -EINVAL; break; } return err; } #endif /* __ARCH_WANT_SYS_SOCKETCALL */ /** * sock_register - add a socket protocol handler * @ops: description of protocol * * This function is called by a protocol handler that wants to * advertise its address family, and have it linked into the * socket interface. The value ops->family corresponds to the * socket system call protocol family. */ int sock_register(const struct net_proto_family *ops) { int err; if (ops->family >= NPROTO) { pr_crit("protocol %d >= NPROTO(%d)\n", ops->family, NPROTO); return -ENOBUFS; } spin_lock(&net_family_lock); if (rcu_dereference_protected(net_families[ops->family], lockdep_is_held(&net_family_lock))) err = -EEXIST; else { rcu_assign_pointer(net_families[ops->family], ops); err = 0; } spin_unlock(&net_family_lock); pr_info("NET: Registered %s protocol family\n", pf_family_names[ops->family]); return err; } EXPORT_SYMBOL(sock_register); /** * sock_unregister - remove a protocol handler * @family: protocol family to remove * * This function is called by a protocol handler that wants to * remove its address family, and have it unlinked from the * new socket creation. * * If protocol handler is a module, then it can use module reference * counts to protect against new references. If protocol handler is not * a module then it needs to provide its own protection in * the ops->create routine. */ void sock_unregister(int family) { BUG_ON(family < 0 || family >= NPROTO); spin_lock(&net_family_lock); RCU_INIT_POINTER(net_families[family], NULL); spin_unlock(&net_family_lock); synchronize_rcu(); pr_info("NET: Unregistered %s protocol family\n", pf_family_names[family]); } EXPORT_SYMBOL(sock_unregister); bool sock_is_registered(int family) { return family < NPROTO && rcu_access_pointer(net_families[family]); } static int __init sock_init(void) { int err; /* * Initialize the network sysctl infrastructure. */ err = net_sysctl_init(); if (err) goto out; /* * Initialize skbuff SLAB cache */ skb_init(); /* * Initialize the protocols module. */ init_inodecache(); err = register_filesystem(&sock_fs_type); if (err) goto out; sock_mnt = kern_mount(&sock_fs_type); if (IS_ERR(sock_mnt)) { err = PTR_ERR(sock_mnt); goto out_mount; } /* The real protocol initialization is performed in later initcalls. */ #ifdef CONFIG_NETFILTER err = netfilter_init(); if (err) goto out; #endif ptp_classifier_init(); out: return err; out_mount: unregister_filesystem(&sock_fs_type); goto out; } core_initcall(sock_init); /* early initcall */ #ifdef CONFIG_PROC_FS void socket_seq_show(struct seq_file *seq) { seq_printf(seq, "sockets: used %d\n", sock_inuse_get(seq->private)); } #endif /* CONFIG_PROC_FS */ /* Handle the fact that while struct ifreq has the same *layout* on * 32/64 for everything but ifreq::ifru_ifmap and ifreq::ifru_data, * which are handled elsewhere, it still has different *size* due to * ifreq::ifru_ifmap (which is 16 bytes on 32 bit, 24 bytes on 64-bit, * resulting in struct ifreq being 32 and 40 bytes respectively). * As a result, if the struct happens to be at the end of a page and * the next page isn't readable/writable, we get a fault. To prevent * that, copy back and forth to the full size. */ int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg) { if (in_compat_syscall()) { struct compat_ifreq *ifr32 = (struct compat_ifreq *)ifr; memset(ifr, 0, sizeof(*ifr)); if (copy_from_user(ifr32, arg, sizeof(*ifr32))) return -EFAULT; if (ifrdata) *ifrdata = compat_ptr(ifr32->ifr_data); return 0; } if (copy_from_user(ifr, arg, sizeof(*ifr))) return -EFAULT; if (ifrdata) *ifrdata = ifr->ifr_data; return 0; } EXPORT_SYMBOL(get_user_ifreq); int put_user_ifreq(struct ifreq *ifr, void __user *arg) { size_t size = sizeof(*ifr); if (in_compat_syscall()) size = sizeof(struct compat_ifreq); if (copy_to_user(arg, ifr, size)) return -EFAULT; return 0; } EXPORT_SYMBOL(put_user_ifreq); #ifdef CONFIG_COMPAT static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32) { compat_uptr_t uptr32; struct ifreq ifr; void __user *saved; int err; if (get_user_ifreq(&ifr, NULL, uifr32)) return -EFAULT; if (get_user(uptr32, &uifr32->ifr_settings.ifs_ifsu)) return -EFAULT; saved = ifr.ifr_settings.ifs_ifsu.raw_hdlc; ifr.ifr_settings.ifs_ifsu.raw_hdlc = compat_ptr(uptr32); err = dev_ioctl(net, SIOCWANDEV, &ifr, NULL, NULL); if (!err) { ifr.ifr_settings.ifs_ifsu.raw_hdlc = saved; if (put_user_ifreq(&ifr, uifr32)) err = -EFAULT; } return err; } /* Handle ioctls that use ifreq::ifr_data and just need struct ifreq converted */ static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd, struct compat_ifreq __user *u_ifreq32) { struct ifreq ifreq; void __user *data; if (!is_socket_ioctl_cmd(cmd)) return -ENOTTY; if (get_user_ifreq(&ifreq, &data, u_ifreq32)) return -EFAULT; ifreq.ifr_data = data; return dev_ioctl(net, cmd, &ifreq, data, NULL); } static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); struct sock *sk = sock->sk; struct net *net = sock_net(sk); const struct proto_ops *ops; if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) return sock_ioctl(file, cmd, (unsigned long)argp); switch (cmd) { case SIOCWANDEV: return compat_siocwandev(net, argp); case SIOCGSTAMP_OLD: case SIOCGSTAMPNS_OLD: ops = READ_ONCE(sock->ops); if (!ops->gettstamp) return -ENOIOCTLCMD; return ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD, !COMPAT_USE_64BIT_TIME); case SIOCETHTOOL: case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: case SIOCSHWTSTAMP: case SIOCGHWTSTAMP: return compat_ifr_data_ioctl(net, cmd, argp); case FIOSETOWN: case SIOCSPGRP: case FIOGETOWN: case SIOCGPGRP: case SIOCBRADDBR: case SIOCBRDELBR: case SIOCGIFVLAN: case SIOCSIFVLAN: case SIOCGSKNS: case SIOCGSTAMP_NEW: case SIOCGSTAMPNS_NEW: case SIOCGIFCONF: case SIOCSIFBR: case SIOCGIFBR: return sock_ioctl(file, cmd, arg); case SIOCGIFFLAGS: case SIOCSIFFLAGS: case SIOCGIFMAP: case SIOCSIFMAP: case SIOCGIFMETRIC: case SIOCSIFMETRIC: case SIOCGIFMTU: case SIOCSIFMTU: case SIOCGIFMEM: case SIOCSIFMEM: case SIOCGIFHWADDR: case SIOCSIFHWADDR: case SIOCADDMULTI: case SIOCDELMULTI: case SIOCGIFINDEX: case SIOCGIFADDR: case SIOCSIFADDR: case SIOCSIFHWBROADCAST: case SIOCDIFADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCSIFPFLAGS: case SIOCGIFPFLAGS: case SIOCGIFTXQLEN: case SIOCSIFTXQLEN: case SIOCBRADDIF: case SIOCBRDELIF: case SIOCGIFNAME: case SIOCSIFNAME: case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: case SIOCBONDENSLAVE: case SIOCBONDRELEASE: case SIOCBONDSETHWADDR: case SIOCBONDCHANGEACTIVE: case SIOCSARP: case SIOCGARP: case SIOCDARP: case SIOCOUTQ: case SIOCOUTQNSD: case SIOCATMARK: return sock_do_ioctl(net, sock, cmd, arg); } return -ENOIOCTLCMD; } static long compat_sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct socket *sock = file->private_data; const struct proto_ops *ops = READ_ONCE(sock->ops); int ret = -ENOIOCTLCMD; struct sock *sk; struct net *net; sk = sock->sk; net = sock_net(sk); if (ops->compat_ioctl) ret = ops->compat_ioctl(sock, cmd, arg); if (ret == -ENOIOCTLCMD && (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)) ret = compat_wext_handle_ioctl(net, cmd, arg); if (ret == -ENOIOCTLCMD) ret = compat_sock_ioctl_trans(file, sock, cmd, arg); return ret; } #endif /** * kernel_bind - bind an address to a socket (kernel space) * @sock: socket * @addr: address * @addrlen: length of address * * Returns 0 or an error. */ int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen) { struct sockaddr_storage address; memcpy(&address, addr, addrlen); return READ_ONCE(sock->ops)->bind(sock, (struct sockaddr *)&address, addrlen); } EXPORT_SYMBOL(kernel_bind); /** * kernel_listen - move socket to listening state (kernel space) * @sock: socket * @backlog: pending connections queue size * * Returns 0 or an error. */ int kernel_listen(struct socket *sock, int backlog) { return READ_ONCE(sock->ops)->listen(sock, backlog); } EXPORT_SYMBOL(kernel_listen); /** * kernel_accept - accept a connection (kernel space) * @sock: listening socket * @newsock: new connected socket * @flags: flags * * @flags must be SOCK_CLOEXEC, SOCK_NONBLOCK or 0. * If it fails, @newsock is guaranteed to be %NULL. * Returns 0 or an error. */ int kernel_accept(struct socket *sock, struct socket **newsock, int flags) { struct sock *sk = sock->sk; const struct proto_ops *ops = READ_ONCE(sock->ops); struct proto_accept_arg arg = { .flags = flags, .kern = true, }; int err; err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, newsock); if (err < 0) goto done; err = ops->accept(sock, *newsock, &arg); if (err < 0) { sock_release(*newsock); *newsock = NULL; goto done; } (*newsock)->ops = ops; __module_get(ops->owner); done: return err; } EXPORT_SYMBOL(kernel_accept); /** * kernel_connect - connect a socket (kernel space) * @sock: socket * @addr: address * @addrlen: address length * @flags: flags (O_NONBLOCK, ...) * * For datagram sockets, @addr is the address to which datagrams are sent * by default, and the only address from which datagrams are received. * For stream sockets, attempts to connect to @addr. * Returns 0 or an error code. */ int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags) { struct sockaddr_storage address; memcpy(&address, addr, addrlen); return READ_ONCE(sock->ops)->connect(sock, (struct sockaddr *)&address, addrlen, flags); } EXPORT_SYMBOL(kernel_connect); /** * kernel_getsockname - get the address which the socket is bound (kernel space) * @sock: socket * @addr: address holder * * Fills the @addr pointer with the address which the socket is bound. * Returns the length of the address in bytes or an error code. */ int kernel_getsockname(struct socket *sock, struct sockaddr *addr) { return READ_ONCE(sock->ops)->getname(sock, addr, 0); } EXPORT_SYMBOL(kernel_getsockname); /** * kernel_getpeername - get the address which the socket is connected (kernel space) * @sock: socket * @addr: address holder * * Fills the @addr pointer with the address which the socket is connected. * Returns the length of the address in bytes or an error code. */ int kernel_getpeername(struct socket *sock, struct sockaddr *addr) { return READ_ONCE(sock->ops)->getname(sock, addr, 1); } EXPORT_SYMBOL(kernel_getpeername); /** * kernel_sock_shutdown - shut down part of a full-duplex connection (kernel space) * @sock: socket * @how: connection part * * Returns 0 or an error. */ int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how) { return READ_ONCE(sock->ops)->shutdown(sock, how); } EXPORT_SYMBOL(kernel_sock_shutdown); /** * kernel_sock_ip_overhead - returns the IP overhead imposed by a socket * @sk: socket * * This routine returns the IP overhead imposed by a socket i.e. * the length of the underlying IP header, depending on whether * this is an IPv4 or IPv6 socket and the length from IP options turned * on at the socket. Assumes that the caller has a lock on the socket. */ u32 kernel_sock_ip_overhead(struct sock *sk) { struct inet_sock *inet; struct ip_options_rcu *opt; u32 overhead = 0; #if IS_ENABLED(CONFIG_IPV6) struct ipv6_pinfo *np; struct ipv6_txoptions *optv6 = NULL; #endif /* IS_ENABLED(CONFIG_IPV6) */ if (!sk) return overhead; switch (sk->sk_family) { case AF_INET: inet = inet_sk(sk); overhead += sizeof(struct iphdr); opt = rcu_dereference_protected(inet->inet_opt, sock_owned_by_user(sk)); if (opt) overhead += opt->opt.optlen; return overhead; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: np = inet6_sk(sk); overhead += sizeof(struct ipv6hdr); if (np) optv6 = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); if (optv6) overhead += (optv6->opt_flen + optv6->opt_nflen); return overhead; #endif /* IS_ENABLED(CONFIG_IPV6) */ default: /* Returns 0 overhead if the socket is not ipv4 or ipv6 */ return overhead; } } EXPORT_SYMBOL(kernel_sock_ip_overhead); |
| 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2012-2016 Pablo Neira Ayuso <pablo@netfilter.org> */ #include <linux/init.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #define nft_objref_priv(expr) *((struct nft_object **)nft_expr_priv(expr)) void nft_objref_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_object *obj = nft_objref_priv(expr); obj->ops->eval(obj, regs, pkt); } static int nft_objref_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_object *obj = nft_objref_priv(expr); u8 genmask = nft_genmask_next(ctx->net); u32 objtype; if (!tb[NFTA_OBJREF_IMM_NAME] || !tb[NFTA_OBJREF_IMM_TYPE]) return -EINVAL; objtype = ntohl(nla_get_be32(tb[NFTA_OBJREF_IMM_TYPE])); obj = nft_obj_lookup(ctx->net, ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype, genmask); if (IS_ERR(obj)) return -ENOENT; if (!nft_use_inc(&obj->use)) return -EMFILE; nft_objref_priv(expr) = obj; return 0; } static int nft_objref_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_object *obj = nft_objref_priv(expr); if (nla_put_string(skb, NFTA_OBJREF_IMM_NAME, obj->key.name) || nla_put_be32(skb, NFTA_OBJREF_IMM_TYPE, htonl(obj->ops->type->type))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static void nft_objref_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { struct nft_object *obj = nft_objref_priv(expr); if (phase == NFT_TRANS_COMMIT) return; nft_use_dec(&obj->use); } static void nft_objref_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_object *obj = nft_objref_priv(expr); nft_use_inc_restore(&obj->use); } static const struct nft_expr_ops nft_objref_ops = { .type = &nft_objref_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_object *)), .eval = nft_objref_eval, .init = nft_objref_init, .activate = nft_objref_activate, .deactivate = nft_objref_deactivate, .dump = nft_objref_dump, .reduce = NFT_REDUCE_READONLY, }; struct nft_objref_map { struct nft_set *set; u8 sreg; struct nft_set_binding binding; }; void nft_objref_map_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_objref_map *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; struct net *net = nft_net(pkt); const struct nft_set_ext *ext; struct nft_object *obj; bool found; found = nft_set_do_lookup(net, set, ®s->data[priv->sreg], &ext); if (!found) { ext = nft_set_catchall_lookup(net, set); if (!ext) { regs->verdict.code = NFT_BREAK; return; } } obj = *nft_set_ext_obj(ext); obj->ops->eval(obj, regs, pkt); } static int nft_objref_map_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_objref_map *priv = nft_expr_priv(expr); u8 genmask = nft_genmask_next(ctx->net); struct nft_set *set; int err; set = nft_set_lookup_global(ctx->net, ctx->table, tb[NFTA_OBJREF_SET_NAME], tb[NFTA_OBJREF_SET_ID], genmask); if (IS_ERR(set)) return PTR_ERR(set); if (!(set->flags & NFT_SET_OBJECT)) return -EINVAL; err = nft_parse_register_load(tb[NFTA_OBJREF_SET_SREG], &priv->sreg, set->klen); if (err < 0) return err; priv->binding.flags = set->flags & NFT_SET_OBJECT; err = nf_tables_bind_set(ctx, set, &priv->binding); if (err < 0) return err; priv->set = set; return 0; } static int nft_objref_map_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_objref_map *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_OBJREF_SET_SREG, priv->sreg) || nla_put_string(skb, NFTA_OBJREF_SET_NAME, priv->set->name)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static void nft_objref_map_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { struct nft_objref_map *priv = nft_expr_priv(expr); nf_tables_deactivate_set(ctx, priv->set, &priv->binding, phase); } static void nft_objref_map_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_objref_map *priv = nft_expr_priv(expr); nf_tables_activate_set(ctx, priv->set); } static void nft_objref_map_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_objref_map *priv = nft_expr_priv(expr); nf_tables_destroy_set(ctx, priv->set); } static const struct nft_expr_ops nft_objref_map_ops = { .type = &nft_objref_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)), .eval = nft_objref_map_eval, .init = nft_objref_map_init, .activate = nft_objref_map_activate, .deactivate = nft_objref_map_deactivate, .destroy = nft_objref_map_destroy, .dump = nft_objref_map_dump, .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops * nft_objref_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { if (tb[NFTA_OBJREF_SET_SREG] && (tb[NFTA_OBJREF_SET_NAME] || tb[NFTA_OBJREF_SET_ID])) return &nft_objref_map_ops; else if (tb[NFTA_OBJREF_IMM_NAME] && tb[NFTA_OBJREF_IMM_TYPE]) return &nft_objref_ops; return ERR_PTR(-EOPNOTSUPP); } static const struct nla_policy nft_objref_policy[NFTA_OBJREF_MAX + 1] = { [NFTA_OBJREF_IMM_NAME] = { .type = NLA_STRING, .len = NFT_OBJ_MAXNAMELEN - 1 }, [NFTA_OBJREF_IMM_TYPE] = { .type = NLA_U32 }, [NFTA_OBJREF_SET_SREG] = { .type = NLA_U32 }, [NFTA_OBJREF_SET_NAME] = { .type = NLA_STRING, .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_OBJREF_SET_ID] = { .type = NLA_U32 }, }; struct nft_expr_type nft_objref_type __read_mostly = { .name = "objref", .select_ops = nft_objref_select_ops, .policy = nft_objref_policy, .maxattr = NFTA_OBJREF_MAX, .owner = THIS_MODULE, }; |
| 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 | // SPDX-License-Identifier: GPL-2.0-or-later /* * A generic kernel FIFO implementation * * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net> */ #include <linux/dma-mapping.h> #include <linux/err.h> #include <linux/export.h> #include <linux/kfifo.h> #include <linux/log2.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/uaccess.h> /* * internal helper to calculate the unused elements in a fifo */ static inline unsigned int kfifo_unused(struct __kfifo *fifo) { return (fifo->mask + 1) - (fifo->in - fifo->out); } int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, size_t esize, gfp_t gfp_mask) { /* * round up to the next power of 2, since our 'let the indices * wrap' technique works only in this case. */ size = roundup_pow_of_two(size); fifo->in = 0; fifo->out = 0; fifo->esize = esize; if (size < 2) { fifo->data = NULL; fifo->mask = 0; return -EINVAL; } fifo->data = kmalloc_array(esize, size, gfp_mask); if (!fifo->data) { fifo->mask = 0; return -ENOMEM; } fifo->mask = size - 1; return 0; } EXPORT_SYMBOL(__kfifo_alloc); void __kfifo_free(struct __kfifo *fifo) { kfree(fifo->data); fifo->in = 0; fifo->out = 0; fifo->esize = 0; fifo->data = NULL; fifo->mask = 0; } EXPORT_SYMBOL(__kfifo_free); int __kfifo_init(struct __kfifo *fifo, void *buffer, unsigned int size, size_t esize) { size /= esize; if (!is_power_of_2(size)) size = rounddown_pow_of_two(size); fifo->in = 0; fifo->out = 0; fifo->esize = esize; fifo->data = buffer; if (size < 2) { fifo->mask = 0; return -EINVAL; } fifo->mask = size - 1; return 0; } EXPORT_SYMBOL(__kfifo_init); static void kfifo_copy_in(struct __kfifo *fifo, const void *src, unsigned int len, unsigned int off) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int l; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); memcpy(fifo->data + off, src, l); memcpy(fifo->data, src + l, len - l); /* * make sure that the data in the fifo is up to date before * incrementing the fifo->in index counter */ smp_wmb(); } unsigned int __kfifo_in(struct __kfifo *fifo, const void *buf, unsigned int len) { unsigned int l; l = kfifo_unused(fifo); if (len > l) len = l; kfifo_copy_in(fifo, buf, len, fifo->in); fifo->in += len; return len; } EXPORT_SYMBOL(__kfifo_in); static void kfifo_copy_out(struct __kfifo *fifo, void *dst, unsigned int len, unsigned int off) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int l; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); memcpy(dst, fifo->data + off, l); memcpy(dst + l, fifo->data, len - l); /* * make sure that the data is copied before * incrementing the fifo->out index counter */ smp_wmb(); } unsigned int __kfifo_out_peek(struct __kfifo *fifo, void *buf, unsigned int len) { unsigned int l; l = fifo->in - fifo->out; if (len > l) len = l; kfifo_copy_out(fifo, buf, len, fifo->out); return len; } EXPORT_SYMBOL(__kfifo_out_peek); unsigned int __kfifo_out_linear(struct __kfifo *fifo, unsigned int *tail, unsigned int n) { unsigned int size = fifo->mask + 1; unsigned int off = fifo->out & fifo->mask; if (tail) *tail = off; return min3(n, fifo->in - fifo->out, size - off); } EXPORT_SYMBOL(__kfifo_out_linear); unsigned int __kfifo_out(struct __kfifo *fifo, void *buf, unsigned int len) { len = __kfifo_out_peek(fifo, buf, len); fifo->out += len; return len; } EXPORT_SYMBOL(__kfifo_out); static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, const void __user *from, unsigned int len, unsigned int off, unsigned int *copied) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int l; unsigned long ret; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); ret = copy_from_user(fifo->data + off, from, l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret + len - l, esize); else { ret = copy_from_user(fifo->data, from + l, len - l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret, esize); } /* * make sure that the data in the fifo is up to date before * incrementing the fifo->in index counter */ smp_wmb(); *copied = len - ret * esize; /* return the number of elements which are not copied */ return ret; } int __kfifo_from_user(struct __kfifo *fifo, const void __user *from, unsigned long len, unsigned int *copied) { unsigned int l; unsigned long ret; unsigned int esize = fifo->esize; int err; if (esize != 1) len /= esize; l = kfifo_unused(fifo); if (len > l) len = l; ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied); if (unlikely(ret)) { len -= ret; err = -EFAULT; } else err = 0; fifo->in += len; return err; } EXPORT_SYMBOL(__kfifo_from_user); static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to, unsigned int len, unsigned int off, unsigned int *copied) { unsigned int l; unsigned long ret; unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); ret = copy_to_user(to, fifo->data + off, l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret + len - l, esize); else { ret = copy_to_user(to + l, fifo->data, len - l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret, esize); } /* * make sure that the data is copied before * incrementing the fifo->out index counter */ smp_wmb(); *copied = len - ret * esize; /* return the number of elements which are not copied */ return ret; } int __kfifo_to_user(struct __kfifo *fifo, void __user *to, unsigned long len, unsigned int *copied) { unsigned int l; unsigned long ret; unsigned int esize = fifo->esize; int err; if (esize != 1) len /= esize; l = fifo->in - fifo->out; if (len > l) len = l; ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied); if (unlikely(ret)) { len -= ret; err = -EFAULT; } else err = 0; fifo->out += len; return err; } EXPORT_SYMBOL(__kfifo_to_user); static unsigned int setup_sgl_buf(struct __kfifo *fifo, struct scatterlist *sgl, unsigned int data_offset, int nents, unsigned int len, dma_addr_t dma) { const void *buf = fifo->data + data_offset; if (!nents || !len) return 0; sg_set_buf(sgl, buf, len); if (dma != DMA_MAPPING_ERROR) { sg_dma_address(sgl) = dma + data_offset; sg_dma_len(sgl) = len; } return 1; } static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, unsigned int off, dma_addr_t dma) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int len_to_end; unsigned int n; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } len_to_end = min(len, size - off); n = setup_sgl_buf(fifo, sgl, off, nents, len_to_end, dma); n += setup_sgl_buf(fifo, sgl + n, 0, nents - n, len - len_to_end, dma); return n; } unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, dma_addr_t dma) { unsigned int l; l = kfifo_unused(fifo); if (len > l) len = l; return setup_sgl(fifo, sgl, nents, len, fifo->in, dma); } EXPORT_SYMBOL(__kfifo_dma_in_prepare); unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, dma_addr_t dma) { unsigned int l; l = fifo->in - fifo->out; if (len > l) len = l; return setup_sgl(fifo, sgl, nents, len, fifo->out, dma); } EXPORT_SYMBOL(__kfifo_dma_out_prepare); unsigned int __kfifo_max_r(unsigned int len, size_t recsize) { unsigned int max = (1 << (recsize << 3)) - 1; if (len > max) return max; return len; } EXPORT_SYMBOL(__kfifo_max_r); #define __KFIFO_PEEK(data, out, mask) \ ((data)[(out) & (mask)]) /* * __kfifo_peek_n internal helper function for determinate the length of * the next record in the fifo */ static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize) { unsigned int l; unsigned int mask = fifo->mask; unsigned char *data = fifo->data; l = __KFIFO_PEEK(data, fifo->out, mask); if (--recsize) l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8; return l; } #define __KFIFO_POKE(data, in, mask, val) \ ( \ (data)[(in) & (mask)] = (unsigned char)(val) \ ) /* * __kfifo_poke_n internal helper function for storing the length of * the record into the fifo */ static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize) { unsigned int mask = fifo->mask; unsigned char *data = fifo->data; __KFIFO_POKE(data, fifo->in, mask, n); if (recsize > 1) __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8); } unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize) { return __kfifo_peek_n(fifo, recsize); } EXPORT_SYMBOL(__kfifo_len_r); unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf, unsigned int len, size_t recsize) { if (len + recsize > kfifo_unused(fifo)) return 0; __kfifo_poke_n(fifo, len, recsize); kfifo_copy_in(fifo, buf, len, fifo->in + recsize); fifo->in += len + recsize; return len; } EXPORT_SYMBOL(__kfifo_in_r); static unsigned int kfifo_out_copy_r(struct __kfifo *fifo, void *buf, unsigned int len, size_t recsize, unsigned int *n) { *n = __kfifo_peek_n(fifo, recsize); if (len > *n) len = *n; kfifo_copy_out(fifo, buf, len, fifo->out + recsize); return len; } unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf, unsigned int len, size_t recsize) { unsigned int n; if (fifo->in == fifo->out) return 0; return kfifo_out_copy_r(fifo, buf, len, recsize, &n); } EXPORT_SYMBOL(__kfifo_out_peek_r); unsigned int __kfifo_out_linear_r(struct __kfifo *fifo, unsigned int *tail, unsigned int n, size_t recsize) { if (fifo->in == fifo->out) return 0; if (tail) *tail = fifo->out + recsize; return min(n, __kfifo_peek_n(fifo, recsize)); } EXPORT_SYMBOL(__kfifo_out_linear_r); unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf, unsigned int len, size_t recsize) { unsigned int n; if (fifo->in == fifo->out) return 0; len = kfifo_out_copy_r(fifo, buf, len, recsize, &n); fifo->out += n + recsize; return len; } EXPORT_SYMBOL(__kfifo_out_r); void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize) { unsigned int n; n = __kfifo_peek_n(fifo, recsize); fifo->out += n + recsize; } EXPORT_SYMBOL(__kfifo_skip_r); int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from, unsigned long len, unsigned int *copied, size_t recsize) { unsigned long ret; len = __kfifo_max_r(len, recsize); if (len + recsize > kfifo_unused(fifo)) { *copied = 0; return 0; } __kfifo_poke_n(fifo, len, recsize); ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied); if (unlikely(ret)) { *copied = 0; return -EFAULT; } fifo->in += len + recsize; return 0; } EXPORT_SYMBOL(__kfifo_from_user_r); int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to, unsigned long len, unsigned int *copied, size_t recsize) { unsigned long ret; unsigned int n; if (fifo->in == fifo->out) { *copied = 0; return 0; } n = __kfifo_peek_n(fifo, recsize); if (len > n) len = n; ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied); if (unlikely(ret)) { *copied = 0; return -EFAULT; } fifo->out += n + recsize; return 0; } EXPORT_SYMBOL(__kfifo_to_user_r); unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, size_t recsize, dma_addr_t dma) { BUG_ON(!nents); len = __kfifo_max_r(len, recsize); if (len + recsize > kfifo_unused(fifo)) return 0; return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize, dma); } EXPORT_SYMBOL(__kfifo_dma_in_prepare_r); void __kfifo_dma_in_finish_r(struct __kfifo *fifo, unsigned int len, size_t recsize) { len = __kfifo_max_r(len, recsize); __kfifo_poke_n(fifo, len, recsize); fifo->in += len + recsize; } EXPORT_SYMBOL(__kfifo_dma_in_finish_r); unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, size_t recsize, dma_addr_t dma) { BUG_ON(!nents); len = __kfifo_max_r(len, recsize); if (len + recsize > fifo->in - fifo->out) return 0; return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize, dma); } EXPORT_SYMBOL(__kfifo_dma_out_prepare_r); |
| 1 27 27 27 27 75 653 99 625 316 648 323 324 38 648 648 648 417 416 422 432 101 440 124 17 17 13 4 17 17 17 35 35 35 35 35 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NETFILTER_H #define __LINUX_NETFILTER_H #include <linux/init.h> #include <linux/skbuff.h> #include <linux/net.h> #include <linux/if.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/wait.h> #include <linux/list.h> #include <linux/static_key.h> #include <linux/module.h> #include <linux/netfilter_defs.h> #include <linux/netdevice.h> #include <linux/sockptr.h> #include <net/net_namespace.h> static inline int NF_DROP_GETERR(int verdict) { return -(verdict >> NF_VERDICT_QBITS); } static __always_inline int NF_DROP_REASON(struct sk_buff *skb, enum skb_drop_reason reason, u32 err) { BUILD_BUG_ON(err > 0xffff); kfree_skb_reason(skb, reason); return ((err << 16) | NF_STOLEN); } static inline int nf_inet_addr_cmp(const union nf_inet_addr *a1, const union nf_inet_addr *a2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul1 = (const unsigned long *)a1; const unsigned long *ul2 = (const unsigned long *)a2; return ((ul1[0] ^ ul2[0]) | (ul1[1] ^ ul2[1])) == 0UL; #else return a1->all[0] == a2->all[0] && a1->all[1] == a2->all[1] && a1->all[2] == a2->all[2] && a1->all[3] == a2->all[3]; #endif } static inline void nf_inet_addr_mask(const union nf_inet_addr *a1, union nf_inet_addr *result, const union nf_inet_addr *mask) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ua = (const unsigned long *)a1; unsigned long *ur = (unsigned long *)result; const unsigned long *um = (const unsigned long *)mask; ur[0] = ua[0] & um[0]; ur[1] = ua[1] & um[1]; #else result->all[0] = a1->all[0] & mask->all[0]; result->all[1] = a1->all[1] & mask->all[1]; result->all[2] = a1->all[2] & mask->all[2]; result->all[3] = a1->all[3] & mask->all[3]; #endif } int netfilter_init(void); struct sk_buff; struct nf_hook_ops; struct sock; struct nf_hook_state { u8 hook; u8 pf; struct net_device *in; struct net_device *out; struct sock *sk; struct net *net; int (*okfn)(struct net *, struct sock *, struct sk_buff *); }; typedef unsigned int nf_hookfn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); enum nf_hook_ops_type { NF_HOOK_OP_UNDEFINED, NF_HOOK_OP_NF_TABLES, NF_HOOK_OP_BPF, }; struct nf_hook_ops { /* User fills in from here down. */ nf_hookfn *hook; struct net_device *dev; void *priv; u8 pf; enum nf_hook_ops_type hook_ops_type:8; unsigned int hooknum; /* Hooks are ordered in ascending priority. */ int priority; }; struct nf_hook_entry { nf_hookfn *hook; void *priv; }; struct nf_hook_entries_rcu_head { struct rcu_head head; void *allocation; }; struct nf_hook_entries { u16 num_hook_entries; /* padding */ struct nf_hook_entry hooks[]; /* trailer: pointers to original orig_ops of each hook, * followed by rcu_head and scratch space used for freeing * the structure via call_rcu. * * This is not part of struct nf_hook_entry since its only * needed in slow path (hook register/unregister): * const struct nf_hook_ops *orig_ops[] * * For the same reason, we store this at end -- its * only needed when a hook is deleted, not during * packet path processing: * struct nf_hook_entries_rcu_head head */ }; #ifdef CONFIG_NETFILTER static inline struct nf_hook_ops **nf_hook_entries_get_hook_ops(const struct nf_hook_entries *e) { unsigned int n = e->num_hook_entries; const void *hook_end; hook_end = &e->hooks[n]; /* this is *past* ->hooks[]! */ return (struct nf_hook_ops **)hook_end; } static inline int nf_hook_entry_hookfn(const struct nf_hook_entry *entry, struct sk_buff *skb, struct nf_hook_state *state) { return entry->hook(entry->priv, skb, state); } static inline void nf_hook_state_init(struct nf_hook_state *p, unsigned int hook, u_int8_t pf, struct net_device *indev, struct net_device *outdev, struct sock *sk, struct net *net, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { p->hook = hook; p->pf = pf; p->in = indev; p->out = outdev; p->sk = sk; p->net = net; p->okfn = okfn; } struct nf_sockopt_ops { struct list_head list; u_int8_t pf; /* Non-inclusive ranges: use 0/0/NULL to never get called. */ int set_optmin; int set_optmax; int (*set)(struct sock *sk, int optval, sockptr_t arg, unsigned int len); int get_optmin; int get_optmax; int (*get)(struct sock *sk, int optval, void __user *user, int *len); /* Use the module struct to lock set/get code in place */ struct module *owner; }; /* Function to register/unregister hook points. */ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *ops); void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *ops); int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int n); void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int n); /* Functions to register get/setsockopt ranges (non-inclusive). You need to check permissions yourself! */ int nf_register_sockopt(struct nf_sockopt_ops *reg); void nf_unregister_sockopt(struct nf_sockopt_ops *reg); #ifdef CONFIG_JUMP_LABEL extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; #endif int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, const struct nf_hook_entries *e, unsigned int i); void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state, const struct nf_hook_entries *e); /** * nf_hook - call a netfilter hook * * Returns 1 if the hook has allowed the packet to pass. The function * okfn must be invoked by the caller in this case. Any other return * value indicates the packet has been consumed by the hook. */ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { struct nf_hook_entries *hook_head = NULL; int ret = 1; #ifdef CONFIG_JUMP_LABEL if (__builtin_constant_p(pf) && __builtin_constant_p(hook) && !static_key_false(&nf_hooks_needed[pf][hook])) return 1; #endif rcu_read_lock(); switch (pf) { case NFPROTO_IPV4: hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]); break; case NFPROTO_IPV6: hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]); break; case NFPROTO_ARP: #ifdef CONFIG_NETFILTER_FAMILY_ARP if (WARN_ON_ONCE(hook >= ARRAY_SIZE(net->nf.hooks_arp))) break; hook_head = rcu_dereference(net->nf.hooks_arp[hook]); #endif break; case NFPROTO_BRIDGE: #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); #endif break; default: WARN_ON_ONCE(1); break; } if (hook_head) { struct nf_hook_state state; nf_hook_state_init(&state, hook, pf, indev, outdev, sk, net, okfn); ret = nf_hook_slow(skb, &state, hook_head, 0); } rcu_read_unlock(); return ret; } /* Activate hook; either okfn or kfree_skb called, unless a hook returns NF_STOLEN (in which case, it's up to the hook to deal with the consequences). Returns -ERRNO if packet dropped. Zero means queued, stolen or accepted. */ /* RR: > I don't want nf_hook to return anything because people might forget > about async and trust the return value to mean "packet was ok". AK: Just document it clearly, then you can expect some sense from kernel coders :) */ static inline int NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *), bool cond) { int ret; if (!cond || ((ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn)) == 1)) ret = okfn(net, sk, skb); return ret; } static inline int NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn); if (ret == 1) ret = okfn(net, sk, skb); return ret; } static inline void NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct list_head *head, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { struct nf_hook_entries *hook_head = NULL; #ifdef CONFIG_JUMP_LABEL if (__builtin_constant_p(pf) && __builtin_constant_p(hook) && !static_key_false(&nf_hooks_needed[pf][hook])) return; #endif rcu_read_lock(); switch (pf) { case NFPROTO_IPV4: hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]); break; case NFPROTO_IPV6: hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]); break; default: WARN_ON_ONCE(1); break; } if (hook_head) { struct nf_hook_state state; nf_hook_state_init(&state, hook, pf, in, out, sk, net, okfn); nf_hook_slow_list(head, &state, hook_head); } rcu_read_unlock(); } /* Call setsockopt() */ int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, sockptr_t opt, unsigned int len); int nf_getsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, int *len); struct flowi; struct nf_queue_entry; __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol, unsigned short family); __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, unsigned int len, u_int8_t protocol, unsigned short family); int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict, unsigned short family); #include <net/flow.h> struct nf_conn; enum nf_nat_manip_type; struct nlattr; enum ip_conntrack_dir; struct nf_nat_hook { int (*parse_nat_setup)(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr); void (*decode_session)(struct sk_buff *skb, struct flowi *fl); unsigned int (*manip_pkt)(struct sk_buff *skb, struct nf_conn *ct, enum nf_nat_manip_type mtype, enum ip_conntrack_dir dir); void (*remove_nat_bysrc)(struct nf_conn *ct); }; extern const struct nf_nat_hook __rcu *nf_nat_hook; static inline void nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) { #if IS_ENABLED(CONFIG_NF_NAT) const struct nf_nat_hook *nat_hook; rcu_read_lock(); nat_hook = rcu_dereference(nf_nat_hook); if (nat_hook && nat_hook->decode_session) nat_hook->decode_session(skb, fl); rcu_read_unlock(); #endif } #else /* !CONFIG_NETFILTER */ static inline int NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *), bool cond) { return okfn(net, sk, skb); } static inline int NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { return okfn(net, sk, skb); } static inline void NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct list_head *head, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { /* nothing to do */ } static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { return 1; } struct flowi; static inline void nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) { } #endif /*CONFIG_NETFILTER*/ #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <linux/netfilter/nf_conntrack_zones_common.h> void nf_ct_attach(struct sk_buff *, const struct sk_buff *); void nf_ct_set_closing(struct nf_conntrack *nfct); struct nf_conntrack_tuple; bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb); #else static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} static inline void nf_ct_set_closing(struct nf_conntrack *nfct) {} struct nf_conntrack_tuple; static inline bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb) { return false; } #endif struct nf_conn; enum ip_conntrack_info; struct nf_ct_hook { int (*update)(struct net *net, struct sk_buff *skb); void (*destroy)(struct nf_conntrack *); bool (*get_tuple_skb)(struct nf_conntrack_tuple *, const struct sk_buff *); void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb); void (*set_closing)(struct nf_conntrack *nfct); int (*confirm)(struct sk_buff *skb); }; extern const struct nf_ct_hook __rcu *nf_ct_hook; struct nlattr; struct nfnl_ct_hook { size_t (*build_size)(const struct nf_conn *ct); int (*build)(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, u_int16_t ct_attr, u_int16_t ct_info_attr); int (*parse)(const struct nlattr *attr, struct nf_conn *ct); int (*attach_expect)(const struct nlattr *attr, struct nf_conn *ct, u32 portid, u32 report); void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off); }; extern const struct nfnl_ct_hook __rcu *nfnl_ct_hook; struct nf_defrag_hook { struct module *owner; int (*enable)(struct net *net); void (*disable)(struct net *net); }; extern const struct nf_defrag_hook __rcu *nf_defrag_v4_hook; extern const struct nf_defrag_hook __rcu *nf_defrag_v6_hook; /* * nf_skb_duplicated - TEE target has sent a packet * * When a xtables target sends a packet, the OUTPUT and POSTROUTING * hooks are traversed again, i.e. nft and xtables are invoked recursively. * * This is used by xtables TEE target to prevent the duplicated skb from * being duplicated again. */ DECLARE_PER_CPU(bool, nf_skb_duplicated); /* * Contains bitmask of ctnetlink event subscribers, if any. * Can't be pernet due to NETLINK_LISTEN_ALL_NSID setsockopt flag. */ extern u8 nf_ctnetlink_has_listener; #endif /*__LINUX_NETFILTER_H*/ |
| 7 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 | /* SPDX-License-Identifier: GPL-2.0 */ /* * This is <linux/capability.h> * * Andrew G. Morgan <morgan@kernel.org> * Alexander Kjeldaas <astor@guardian.no> * with help from Aleph1, Roland Buresund and Andrew Main. * * See here for the libcap library ("POSIX draft" compliance): * * ftp://www.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.6/ */ #ifndef _LINUX_CAPABILITY_H #define _LINUX_CAPABILITY_H #include <uapi/linux/capability.h> #include <linux/uidgid.h> #include <linux/bits.h> #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3 extern int file_caps_enabled; typedef struct { u64 val; } kernel_cap_t; /* same as vfs_ns_cap_data but in cpu endian and always filled completely */ struct cpu_vfs_cap_data { __u32 magic_etc; kuid_t rootid; kernel_cap_t permitted; kernel_cap_t inheritable; }; #define _USER_CAP_HEADER_SIZE (sizeof(struct __user_cap_header_struct)) #define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t)) struct file; struct inode; struct dentry; struct task_struct; struct user_namespace; struct mnt_idmap; /* * CAP_FS_MASK and CAP_NFSD_MASKS: * * The fs mask is all the privileges that fsuid==0 historically meant. * At one time in the past, that included CAP_MKNOD and CAP_LINUX_IMMUTABLE. * * It has never meant setting security.* and trusted.* xattrs. * * We could also define fsmask as follows: * 1. CAP_FS_MASK is the privilege to bypass all fs-related DAC permissions * 2. The security.* and trusted.* xattrs are fs-related MAC permissions */ # define CAP_FS_MASK (BIT_ULL(CAP_CHOWN) \ | BIT_ULL(CAP_MKNOD) \ | BIT_ULL(CAP_DAC_OVERRIDE) \ | BIT_ULL(CAP_DAC_READ_SEARCH) \ | BIT_ULL(CAP_FOWNER) \ | BIT_ULL(CAP_FSETID) \ | BIT_ULL(CAP_MAC_OVERRIDE)) #define CAP_VALID_MASK (BIT_ULL(CAP_LAST_CAP+1)-1) # define CAP_EMPTY_SET ((kernel_cap_t) { 0 }) # define CAP_FULL_SET ((kernel_cap_t) { CAP_VALID_MASK }) # define CAP_FS_SET ((kernel_cap_t) { CAP_FS_MASK | BIT_ULL(CAP_LINUX_IMMUTABLE) }) # define CAP_NFSD_SET ((kernel_cap_t) { CAP_FS_MASK | BIT_ULL(CAP_SYS_RESOURCE) }) # define cap_clear(c) do { (c).val = 0; } while (0) #define cap_raise(c, flag) ((c).val |= BIT_ULL(flag)) #define cap_lower(c, flag) ((c).val &= ~BIT_ULL(flag)) #define cap_raised(c, flag) (((c).val & BIT_ULL(flag)) != 0) static inline kernel_cap_t cap_combine(const kernel_cap_t a, const kernel_cap_t b) { return (kernel_cap_t) { a.val | b.val }; } static inline kernel_cap_t cap_intersect(const kernel_cap_t a, const kernel_cap_t b) { return (kernel_cap_t) { a.val & b.val }; } static inline kernel_cap_t cap_drop(const kernel_cap_t a, const kernel_cap_t drop) { return (kernel_cap_t) { a.val &~ drop.val }; } static inline bool cap_isclear(const kernel_cap_t a) { return !a.val; } static inline bool cap_isidentical(const kernel_cap_t a, const kernel_cap_t b) { return a.val == b.val; } /* * Check if "a" is a subset of "set". * return true if ALL of the capabilities in "a" are also in "set" * cap_issubset(0101, 1111) will return true * return false if ANY of the capabilities in "a" are not in "set" * cap_issubset(1111, 0101) will return false */ static inline bool cap_issubset(const kernel_cap_t a, const kernel_cap_t set) { return !(a.val & ~set.val); } /* Used to decide between falling back on the old suser() or fsuser(). */ static inline kernel_cap_t cap_drop_fs_set(const kernel_cap_t a) { return cap_drop(a, CAP_FS_SET); } static inline kernel_cap_t cap_raise_fs_set(const kernel_cap_t a, const kernel_cap_t permitted) { return cap_combine(a, cap_intersect(permitted, CAP_FS_SET)); } static inline kernel_cap_t cap_drop_nfsd_set(const kernel_cap_t a) { return cap_drop(a, CAP_NFSD_SET); } static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a, const kernel_cap_t permitted) { return cap_combine(a, cap_intersect(permitted, CAP_NFSD_SET)); } #ifdef CONFIG_MULTIUSER extern bool has_capability(struct task_struct *t, int cap); extern bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap); extern bool has_capability_noaudit(struct task_struct *t, int cap); extern bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap); extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); extern bool ns_capable_setid(struct user_namespace *ns, int cap); #else static inline bool has_capability(struct task_struct *t, int cap) { return true; } static inline bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap) { return true; } static inline bool has_capability_noaudit(struct task_struct *t, int cap) { return true; } static inline bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap) { return true; } static inline bool capable(int cap) { return true; } static inline bool ns_capable(struct user_namespace *ns, int cap) { return true; } static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap) { return true; } static inline bool ns_capable_setid(struct user_namespace *ns, int cap) { return true; } #endif /* CONFIG_MULTIUSER */ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, struct mnt_idmap *idmap, const struct inode *inode); bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap, const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); static inline bool perfmon_capable(void) { return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN); } static inline bool bpf_capable(void) { return capable(CAP_BPF) || capable(CAP_SYS_ADMIN); } static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns) { return ns_capable(ns, CAP_CHECKPOINT_RESTORE) || ns_capable(ns, CAP_SYS_ADMIN); } /* audit system wants to get cap info from files as well */ int get_vfs_caps_from_disk(struct mnt_idmap *idmap, const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); int cap_convert_nscap(struct mnt_idmap *idmap, struct dentry *dentry, const void **ivalue, size_t size); #endif /* !_LINUX_CAPABILITY_H */ |
| 5 4 4 4 4 1 3 3 2 1 1 5 5 5 2 2 1 1 1 1 2 2 2 1 3 3 1 9 9 5 2 2 1 11 2 2 1 1 1 2 2 2 2 2 2 2 1 1 2 2 2 2 2 1 2 60 2 2 61 1 1 1 1 2 1 1 1 7 7 7 2 2 1 2 11 71 1 5 1 2 1 71 61 27 34 33 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 | // SPDX-License-Identifier: GPL-2.0 #include <linux/capability.h> #include <linux/compat.h> #include <linux/blkdev.h> #include <linux/export.h> #include <linux/gfp.h> #include <linux/blkpg.h> #include <linux/hdreg.h> #include <linux/backing-dev.h> #include <linux/fs.h> #include <linux/blktrace_api.h> #include <linux/pr.h> #include <linux/uaccess.h> #include "blk.h" static int blkpg_do_ioctl(struct block_device *bdev, struct blkpg_partition __user *upart, int op) { struct gendisk *disk = bdev->bd_disk; struct blkpg_partition p; sector_t start, length, capacity, end; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (copy_from_user(&p, upart, sizeof(struct blkpg_partition))) return -EFAULT; if (bdev_is_partition(bdev)) return -EINVAL; if (p.pno <= 0) return -EINVAL; if (op == BLKPG_DEL_PARTITION) return bdev_del_partition(disk, p.pno); if (p.start < 0 || p.length <= 0 || LLONG_MAX - p.length < p.start) return -EINVAL; /* Check that the partition is aligned to the block size */ if (!IS_ALIGNED(p.start | p.length, bdev_logical_block_size(bdev))) return -EINVAL; start = p.start >> SECTOR_SHIFT; length = p.length >> SECTOR_SHIFT; capacity = get_capacity(disk); if (check_add_overflow(start, length, &end)) return -EINVAL; if (start >= capacity || end > capacity) return -EINVAL; switch (op) { case BLKPG_ADD_PARTITION: return bdev_add_partition(disk, p.pno, start, length); case BLKPG_RESIZE_PARTITION: return bdev_resize_partition(disk, p.pno, start, length); default: return -EINVAL; } } static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg) { struct blkpg_partition __user *udata; int op; if (get_user(op, &arg->op) || get_user(udata, &arg->data)) return -EFAULT; return blkpg_do_ioctl(bdev, udata, op); } #ifdef CONFIG_COMPAT struct compat_blkpg_ioctl_arg { compat_int_t op; compat_int_t flags; compat_int_t datalen; compat_caddr_t data; }; static int compat_blkpg_ioctl(struct block_device *bdev, struct compat_blkpg_ioctl_arg __user *arg) { compat_caddr_t udata; int op; if (get_user(op, &arg->op) || get_user(udata, &arg->data)) return -EFAULT; return blkpg_do_ioctl(bdev, compat_ptr(udata), op); } #endif static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { unsigned int bs_mask = bdev_logical_block_size(bdev) - 1; uint64_t range[2], start, len, end; struct bio *prev = NULL, *bio; sector_t sector, nr_sects; struct blk_plug plug; int err; if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; if (bdev_read_only(bdev)) return -EPERM; if (copy_from_user(range, (void __user *)arg, sizeof(range))) return -EFAULT; start = range[0]; len = range[1]; if (!len) return -EINVAL; if ((start | len) & bs_mask) return -EINVAL; if (check_add_overflow(start, len, &end) || end > bdev_nr_bytes(bdev)) return -EINVAL; filemap_invalidate_lock(bdev->bd_mapping); err = truncate_bdev_range(bdev, mode, start, start + len - 1); if (err) goto fail; sector = start >> SECTOR_SHIFT; nr_sects = len >> SECTOR_SHIFT; blk_start_plug(&plug); while (1) { if (fatal_signal_pending(current)) { if (prev) bio_await_chain(prev); err = -EINTR; goto out_unplug; } bio = blk_alloc_discard_bio(bdev, §or, &nr_sects, GFP_KERNEL); if (!bio) break; prev = bio_chain_and_submit(prev, bio); } if (prev) { err = submit_bio_wait(prev); if (err == -EOPNOTSUPP) err = 0; bio_put(prev); } out_unplug: blk_finish_plug(&plug); fail: filemap_invalidate_unlock(bdev->bd_mapping); return err; } static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode, void __user *argp) { uint64_t start, len; uint64_t range[2]; int err; if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (!bdev_max_secure_erase_sectors(bdev)) return -EOPNOTSUPP; if (copy_from_user(range, argp, sizeof(range))) return -EFAULT; start = range[0]; len = range[1]; if ((start & 511) || (len & 511)) return -EINVAL; if (start + len > bdev_nr_bytes(bdev)) return -EINVAL; filemap_invalidate_lock(bdev->bd_mapping); err = truncate_bdev_range(bdev, mode, start, start + len - 1); if (!err) err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9, GFP_KERNEL); filemap_invalidate_unlock(bdev->bd_mapping); return err; } static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { uint64_t range[2]; uint64_t start, end, len; int err; if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (copy_from_user(range, (void __user *)arg, sizeof(range))) return -EFAULT; start = range[0]; len = range[1]; end = start + len - 1; if (start & 511) return -EINVAL; if (len & 511) return -EINVAL; if (end >= (uint64_t)bdev_nr_bytes(bdev)) return -EINVAL; if (end < start) return -EINVAL; /* Invalidate the page cache, including dirty pages */ filemap_invalidate_lock(bdev->bd_mapping); err = truncate_bdev_range(bdev, mode, start, end); if (err) goto fail; err = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); fail: filemap_invalidate_unlock(bdev->bd_mapping); return err; } static int put_ushort(unsigned short __user *argp, unsigned short val) { return put_user(val, argp); } static int put_int(int __user *argp, int val) { return put_user(val, argp); } static int put_uint(unsigned int __user *argp, unsigned int val) { return put_user(val, argp); } static int put_long(long __user *argp, long val) { return put_user(val, argp); } static int put_ulong(unsigned long __user *argp, unsigned long val) { return put_user(val, argp); } static int put_u64(u64 __user *argp, u64 val) { return put_user(val, argp); } #ifdef CONFIG_COMPAT static int compat_put_long(compat_long_t __user *argp, long val) { return put_user(val, argp); } static int compat_put_ulong(compat_ulong_t __user *argp, compat_ulong_t val) { return put_user(val, argp); } #endif #ifdef CONFIG_COMPAT /* * This is the equivalent of compat_ptr_ioctl(), to be used by block * drivers that implement only commands that are completely compatible * between 32-bit and 64-bit user space */ int blkdev_compat_ptr_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned cmd, unsigned long arg) { struct gendisk *disk = bdev->bd_disk; if (disk->fops->ioctl) return disk->fops->ioctl(bdev, mode, cmd, (unsigned long)compat_ptr(arg)); return -ENOIOCTLCMD; } EXPORT_SYMBOL(blkdev_compat_ptr_ioctl); #endif static bool blkdev_pr_allowed(struct block_device *bdev, blk_mode_t mode) { /* no sense to make reservations for partitions */ if (bdev_is_partition(bdev)) return false; if (capable(CAP_SYS_ADMIN)) return true; /* * Only allow unprivileged reservations if the file descriptor is open * for writing. */ return mode & BLK_OPEN_WRITE; } static int blkdev_pr_register(struct block_device *bdev, blk_mode_t mode, struct pr_registration __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_registration reg; if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_register) return -EOPNOTSUPP; if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; if (reg.flags & ~PR_FL_IGNORE_KEY) return -EOPNOTSUPP; return ops->pr_register(bdev, reg.old_key, reg.new_key, reg.flags); } static int blkdev_pr_reserve(struct block_device *bdev, blk_mode_t mode, struct pr_reservation __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_reservation rsv; if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_reserve) return -EOPNOTSUPP; if (copy_from_user(&rsv, arg, sizeof(rsv))) return -EFAULT; if (rsv.flags & ~PR_FL_IGNORE_KEY) return -EOPNOTSUPP; return ops->pr_reserve(bdev, rsv.key, rsv.type, rsv.flags); } static int blkdev_pr_release(struct block_device *bdev, blk_mode_t mode, struct pr_reservation __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_reservation rsv; if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_release) return -EOPNOTSUPP; if (copy_from_user(&rsv, arg, sizeof(rsv))) return -EFAULT; if (rsv.flags) return -EOPNOTSUPP; return ops->pr_release(bdev, rsv.key, rsv.type); } static int blkdev_pr_preempt(struct block_device *bdev, blk_mode_t mode, struct pr_preempt __user *arg, bool abort) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_preempt p; if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_preempt) return -EOPNOTSUPP; if (copy_from_user(&p, arg, sizeof(p))) return -EFAULT; if (p.flags) return -EOPNOTSUPP; return ops->pr_preempt(bdev, p.old_key, p.new_key, p.type, abort); } static int blkdev_pr_clear(struct block_device *bdev, blk_mode_t mode, struct pr_clear __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_clear c; if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_clear) return -EOPNOTSUPP; if (copy_from_user(&c, arg, sizeof(c))) return -EFAULT; if (c.flags) return -EOPNOTSUPP; return ops->pr_clear(bdev, c.key); } static int blkdev_flushbuf(struct block_device *bdev, unsigned cmd, unsigned long arg) { if (!capable(CAP_SYS_ADMIN)) return -EACCES; mutex_lock(&bdev->bd_holder_lock); if (bdev->bd_holder_ops && bdev->bd_holder_ops->sync) bdev->bd_holder_ops->sync(bdev); else { mutex_unlock(&bdev->bd_holder_lock); sync_blockdev(bdev); } invalidate_bdev(bdev); return 0; } static int blkdev_roset(struct block_device *bdev, unsigned cmd, unsigned long arg) { int ret, n; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (get_user(n, (int __user *)arg)) return -EFAULT; if (bdev->bd_disk->fops->set_read_only) { ret = bdev->bd_disk->fops->set_read_only(bdev, n); if (ret) return ret; } if (n) bdev_set_flag(bdev, BD_READ_ONLY); else bdev_clear_flag(bdev, BD_READ_ONLY); return 0; } static int blkdev_getgeo(struct block_device *bdev, struct hd_geometry __user *argp) { struct gendisk *disk = bdev->bd_disk; struct hd_geometry geo; int ret; if (!argp) return -EINVAL; if (!disk->fops->getgeo) return -ENOTTY; /* * We need to set the startsect first, the driver may * want to override it. */ memset(&geo, 0, sizeof(geo)); geo.start = get_start_sect(bdev); ret = disk->fops->getgeo(bdev, &geo); if (ret) return ret; if (copy_to_user(argp, &geo, sizeof(geo))) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT struct compat_hd_geometry { unsigned char heads; unsigned char sectors; unsigned short cylinders; u32 start; }; static int compat_hdio_getgeo(struct block_device *bdev, struct compat_hd_geometry __user *ugeo) { struct gendisk *disk = bdev->bd_disk; struct hd_geometry geo; int ret; if (!ugeo) return -EINVAL; if (!disk->fops->getgeo) return -ENOTTY; memset(&geo, 0, sizeof(geo)); /* * We need to set the startsect first, the driver may * want to override it. */ geo.start = get_start_sect(bdev); ret = disk->fops->getgeo(bdev, &geo); if (ret) return ret; ret = copy_to_user(ugeo, &geo, 4); ret |= put_user(geo.start, &ugeo->start); if (ret) ret = -EFAULT; return ret; } #endif /* set the logical block size */ static int blkdev_bszset(struct file *file, blk_mode_t mode, int __user *argp) { // this one might be file_inode(file)->i_rdev - a rare valid // use of file_inode() for those. dev_t dev = I_BDEV(file->f_mapping->host)->bd_dev; struct file *excl_file; int ret, n; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (!argp) return -EINVAL; if (get_user(n, argp)) return -EFAULT; if (mode & BLK_OPEN_EXCL) return set_blocksize(file, n); excl_file = bdev_file_open_by_dev(dev, mode, &dev, NULL); if (IS_ERR(excl_file)) return -EBUSY; ret = set_blocksize(excl_file, n); fput(excl_file); return ret; } /* * Common commands that are handled the same way on native and compat * user space. Note the separate arg/argp parameters that are needed * to deal with the compat_ptr() conversion. */ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg, void __user *argp) { unsigned int max_sectors; switch (cmd) { case BLKFLSBUF: return blkdev_flushbuf(bdev, cmd, arg); case BLKROSET: return blkdev_roset(bdev, cmd, arg); case BLKDISCARD: return blk_ioctl_discard(bdev, mode, arg); case BLKSECDISCARD: return blk_ioctl_secure_erase(bdev, mode, argp); case BLKZEROOUT: return blk_ioctl_zeroout(bdev, mode, arg); case BLKGETDISKSEQ: return put_u64(argp, bdev->bd_disk->diskseq); case BLKREPORTZONE: return blkdev_report_zones_ioctl(bdev, cmd, arg); case BLKRESETZONE: case BLKOPENZONE: case BLKCLOSEZONE: case BLKFINISHZONE: return blkdev_zone_mgmt_ioctl(bdev, mode, cmd, arg); case BLKGETZONESZ: return put_uint(argp, bdev_zone_sectors(bdev)); case BLKGETNRZONES: return put_uint(argp, bdev_nr_zones(bdev)); case BLKROGET: return put_int(argp, bdev_read_only(bdev) != 0); case BLKSSZGET: /* get block device logical block size */ return put_int(argp, bdev_logical_block_size(bdev)); case BLKPBSZGET: /* get block device physical block size */ return put_uint(argp, bdev_physical_block_size(bdev)); case BLKIOMIN: return put_uint(argp, bdev_io_min(bdev)); case BLKIOOPT: return put_uint(argp, bdev_io_opt(bdev)); case BLKALIGNOFF: return put_int(argp, bdev_alignment_offset(bdev)); case BLKDISCARDZEROES: return put_uint(argp, 0); case BLKSECTGET: max_sectors = min_t(unsigned int, USHRT_MAX, queue_max_sectors(bdev_get_queue(bdev))); return put_ushort(argp, max_sectors); case BLKROTATIONAL: return put_ushort(argp, !bdev_nonrot(bdev)); case BLKRASET: case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) return -EACCES; bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE; return 0; case BLKRRPART: if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (bdev_is_partition(bdev)) return -EINVAL; return disk_scan_partitions(bdev->bd_disk, mode | BLK_OPEN_STRICT_SCAN); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); case IOC_PR_REGISTER: return blkdev_pr_register(bdev, mode, argp); case IOC_PR_RESERVE: return blkdev_pr_reserve(bdev, mode, argp); case IOC_PR_RELEASE: return blkdev_pr_release(bdev, mode, argp); case IOC_PR_PREEMPT: return blkdev_pr_preempt(bdev, mode, argp, false); case IOC_PR_PREEMPT_ABORT: return blkdev_pr_preempt(bdev, mode, argp, true); case IOC_PR_CLEAR: return blkdev_pr_clear(bdev, mode, argp); default: return -ENOIOCTLCMD; } } /* * Always keep this in sync with compat_blkdev_ioctl() * to handle all incompatible commands in both functions. * * New commands must be compatible and go into blkdev_common_ioctl */ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct block_device *bdev = I_BDEV(file->f_mapping->host); void __user *argp = (void __user *)arg; blk_mode_t mode = file_to_blk_mode(file); int ret; switch (cmd) { /* These need separate implementations for the data structure */ case HDIO_GETGEO: return blkdev_getgeo(bdev, argp); case BLKPG: return blkpg_ioctl(bdev, argp); /* Compat mode returns 32-bit data instead of 'long' */ case BLKRAGET: case BLKFRAGET: if (!argp) return -EINVAL; return put_long(argp, (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: if (bdev_nr_sectors(bdev) > ~0UL) return -EFBIG; return put_ulong(argp, bdev_nr_sectors(bdev)); /* The data is compatible, but the command number is different */ case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */ return put_int(argp, block_size(bdev)); case BLKBSZSET: return blkdev_bszset(file, mode, argp); case BLKGETSIZE64: return put_u64(argp, bdev_nr_bytes(bdev)); /* Incompatible alignment on i386 */ case BLKTRACESETUP: return blk_trace_ioctl(bdev, cmd, argp); default: break; } ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); if (ret != -ENOIOCTLCMD) return ret; if (!bdev->bd_disk->fops->ioctl) return -ENOTTY; return bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); } #ifdef CONFIG_COMPAT #define BLKBSZGET_32 _IOR(0x12, 112, int) #define BLKBSZSET_32 _IOW(0x12, 113, int) #define BLKGETSIZE64_32 _IOR(0x12, 114, int) /* Most of the generic ioctls are handled in the normal fallback path. This assumes the blkdev's low level compat_ioctl always returns ENOIOCTLCMD for unknown ioctls. */ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) { int ret; void __user *argp = compat_ptr(arg); struct block_device *bdev = I_BDEV(file->f_mapping->host); struct gendisk *disk = bdev->bd_disk; blk_mode_t mode = file_to_blk_mode(file); switch (cmd) { /* These need separate implementations for the data structure */ case HDIO_GETGEO: return compat_hdio_getgeo(bdev, argp); case BLKPG: return compat_blkpg_ioctl(bdev, argp); /* Compat mode returns 32-bit data instead of 'long' */ case BLKRAGET: case BLKFRAGET: if (!argp) return -EINVAL; return compat_put_long(argp, (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: if (bdev_nr_sectors(bdev) > ~(compat_ulong_t)0) return -EFBIG; return compat_put_ulong(argp, bdev_nr_sectors(bdev)); /* The data is compatible, but the command number is different */ case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */ return put_int(argp, bdev_logical_block_size(bdev)); case BLKBSZSET_32: return blkdev_bszset(file, mode, argp); case BLKGETSIZE64_32: return put_u64(argp, bdev_nr_bytes(bdev)); /* Incompatible alignment on i386 */ case BLKTRACESETUP32: return blk_trace_ioctl(bdev, cmd, argp); default: break; } ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); if (ret == -ENOIOCTLCMD && disk->fops->compat_ioctl) ret = disk->fops->compat_ioctl(bdev, mode, cmd, arg); return ret; } #endif |
| 7 7 7 2 2 2 2 1 2 1 6 5 2 2 5 5 4 4 5 5 5 6 15 7 5 2 1 2 6 6 6 6 6 6 5 5 4 1 1 1 3 3 1 1 1 2 2 2 2 5 5 5 4 4 6 6 6 6 6 6 6 4 4 4 6 6 6 6 6 6 4 4 4 4 4 4 4 4 4 4 4 4 4 10 10 10 10 1 1 7 7 7 5 4 6 6 6 4 6 7 11 12 12 12 12 7 7 7 1 7 12 11 12 7 7 7 7 7 7 7 7 4 12 13 13 13 13 13 13 12 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 | // SPDX-License-Identifier: GPL-2.0-only /* * Landlock LSM - Ruleset management * * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2018-2020 ANSSI */ #include <linux/bits.h> #include <linux/bug.h> #include <linux/compiler_types.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/lockdep.h> #include <linux/overflow.h> #include <linux/rbtree.h> #include <linux/refcount.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/workqueue.h> #include "limits.h" #include "object.h" #include "ruleset.h" static struct landlock_ruleset *create_ruleset(const u32 num_layers) { struct landlock_ruleset *new_ruleset; new_ruleset = kzalloc(struct_size(new_ruleset, access_masks, num_layers), GFP_KERNEL_ACCOUNT); if (!new_ruleset) return ERR_PTR(-ENOMEM); refcount_set(&new_ruleset->usage, 1); mutex_init(&new_ruleset->lock); new_ruleset->root_inode = RB_ROOT; #if IS_ENABLED(CONFIG_INET) new_ruleset->root_net_port = RB_ROOT; #endif /* IS_ENABLED(CONFIG_INET) */ new_ruleset->num_layers = num_layers; /* * hierarchy = NULL * num_rules = 0 * access_masks[] = 0 */ return new_ruleset; } struct landlock_ruleset * landlock_create_ruleset(const access_mask_t fs_access_mask, const access_mask_t net_access_mask) { struct landlock_ruleset *new_ruleset; /* Informs about useless ruleset. */ if (!fs_access_mask && !net_access_mask) return ERR_PTR(-ENOMSG); new_ruleset = create_ruleset(1); if (IS_ERR(new_ruleset)) return new_ruleset; if (fs_access_mask) landlock_add_fs_access_mask(new_ruleset, fs_access_mask, 0); if (net_access_mask) landlock_add_net_access_mask(new_ruleset, net_access_mask, 0); return new_ruleset; } static void build_check_rule(void) { const struct landlock_rule rule = { .num_layers = ~0, }; BUILD_BUG_ON(rule.num_layers < LANDLOCK_MAX_NUM_LAYERS); } static bool is_object_pointer(const enum landlock_key_type key_type) { switch (key_type) { case LANDLOCK_KEY_INODE: return true; #if IS_ENABLED(CONFIG_INET) case LANDLOCK_KEY_NET_PORT: return false; #endif /* IS_ENABLED(CONFIG_INET) */ default: WARN_ON_ONCE(1); return false; } } static struct landlock_rule * create_rule(const struct landlock_id id, const struct landlock_layer (*const layers)[], const u32 num_layers, const struct landlock_layer *const new_layer) { struct landlock_rule *new_rule; u32 new_num_layers; build_check_rule(); if (new_layer) { /* Should already be checked by landlock_merge_ruleset(). */ if (WARN_ON_ONCE(num_layers >= LANDLOCK_MAX_NUM_LAYERS)) return ERR_PTR(-E2BIG); new_num_layers = num_layers + 1; } else { new_num_layers = num_layers; } new_rule = kzalloc(struct_size(new_rule, layers, new_num_layers), GFP_KERNEL_ACCOUNT); if (!new_rule) return ERR_PTR(-ENOMEM); RB_CLEAR_NODE(&new_rule->node); if (is_object_pointer(id.type)) { /* This should be catched by insert_rule(). */ WARN_ON_ONCE(!id.key.object); landlock_get_object(id.key.object); } new_rule->key = id.key; new_rule->num_layers = new_num_layers; /* Copies the original layer stack. */ memcpy(new_rule->layers, layers, flex_array_size(new_rule, layers, num_layers)); if (new_layer) /* Adds a copy of @new_layer on the layer stack. */ new_rule->layers[new_rule->num_layers - 1] = *new_layer; return new_rule; } static struct rb_root *get_root(struct landlock_ruleset *const ruleset, const enum landlock_key_type key_type) { switch (key_type) { case LANDLOCK_KEY_INODE: return &ruleset->root_inode; #if IS_ENABLED(CONFIG_INET) case LANDLOCK_KEY_NET_PORT: return &ruleset->root_net_port; #endif /* IS_ENABLED(CONFIG_INET) */ default: WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } } static void free_rule(struct landlock_rule *const rule, const enum landlock_key_type key_type) { might_sleep(); if (!rule) return; if (is_object_pointer(key_type)) landlock_put_object(rule->key.object); kfree(rule); } static void build_check_ruleset(void) { const struct landlock_ruleset ruleset = { .num_rules = ~0, .num_layers = ~0, }; typeof(ruleset.access_masks[0]) access_masks = ~0; BUILD_BUG_ON(ruleset.num_rules < LANDLOCK_MAX_NUM_RULES); BUILD_BUG_ON(ruleset.num_layers < LANDLOCK_MAX_NUM_LAYERS); BUILD_BUG_ON(access_masks < ((LANDLOCK_MASK_ACCESS_FS << LANDLOCK_SHIFT_ACCESS_FS) | (LANDLOCK_MASK_ACCESS_NET << LANDLOCK_SHIFT_ACCESS_NET))); } /** * insert_rule - Create and insert a rule in a ruleset * * @ruleset: The ruleset to be updated. * @id: The ID to build the new rule with. The underlying kernel object, if * any, must be held by the caller. * @layers: One or multiple layers to be copied into the new rule. * @num_layers: The number of @layers entries. * * When user space requests to add a new rule to a ruleset, @layers only * contains one entry and this entry is not assigned to any level. In this * case, the new rule will extend @ruleset, similarly to a boolean OR between * access rights. * * When merging a ruleset in a domain, or copying a domain, @layers will be * added to @ruleset as new constraints, similarly to a boolean AND between * access rights. */ static int insert_rule(struct landlock_ruleset *const ruleset, const struct landlock_id id, const struct landlock_layer (*const layers)[], const size_t num_layers) { struct rb_node **walker_node; struct rb_node *parent_node = NULL; struct landlock_rule *new_rule; struct rb_root *root; might_sleep(); lockdep_assert_held(&ruleset->lock); if (WARN_ON_ONCE(!layers)) return -ENOENT; if (is_object_pointer(id.type) && WARN_ON_ONCE(!id.key.object)) return -ENOENT; root = get_root(ruleset, id.type); if (IS_ERR(root)) return PTR_ERR(root); walker_node = &root->rb_node; while (*walker_node) { struct landlock_rule *const this = rb_entry(*walker_node, struct landlock_rule, node); if (this->key.data != id.key.data) { parent_node = *walker_node; if (this->key.data < id.key.data) walker_node = &((*walker_node)->rb_right); else walker_node = &((*walker_node)->rb_left); continue; } /* Only a single-level layer should match an existing rule. */ if (WARN_ON_ONCE(num_layers != 1)) return -EINVAL; /* If there is a matching rule, updates it. */ if ((*layers)[0].level == 0) { /* * Extends access rights when the request comes from * landlock_add_rule(2), i.e. @ruleset is not a domain. */ if (WARN_ON_ONCE(this->num_layers != 1)) return -EINVAL; if (WARN_ON_ONCE(this->layers[0].level != 0)) return -EINVAL; this->layers[0].access |= (*layers)[0].access; return 0; } if (WARN_ON_ONCE(this->layers[0].level == 0)) return -EINVAL; /* * Intersects access rights when it is a merge between a * ruleset and a domain. */ new_rule = create_rule(id, &this->layers, this->num_layers, &(*layers)[0]); if (IS_ERR(new_rule)) return PTR_ERR(new_rule); rb_replace_node(&this->node, &new_rule->node, root); free_rule(this, id.type); return 0; } /* There is no match for @id. */ build_check_ruleset(); if (ruleset->num_rules >= LANDLOCK_MAX_NUM_RULES) return -E2BIG; new_rule = create_rule(id, layers, num_layers, NULL); if (IS_ERR(new_rule)) return PTR_ERR(new_rule); rb_link_node(&new_rule->node, parent_node, walker_node); rb_insert_color(&new_rule->node, root); ruleset->num_rules++; return 0; } static void build_check_layer(void) { const struct landlock_layer layer = { .level = ~0, .access = ~0, }; BUILD_BUG_ON(layer.level < LANDLOCK_MAX_NUM_LAYERS); BUILD_BUG_ON(layer.access < LANDLOCK_MASK_ACCESS_FS); } /* @ruleset must be locked by the caller. */ int landlock_insert_rule(struct landlock_ruleset *const ruleset, const struct landlock_id id, const access_mask_t access) { struct landlock_layer layers[] = { { .access = access, /* When @level is zero, insert_rule() extends @ruleset. */ .level = 0, } }; build_check_layer(); return insert_rule(ruleset, id, &layers, ARRAY_SIZE(layers)); } static void get_hierarchy(struct landlock_hierarchy *const hierarchy) { if (hierarchy) refcount_inc(&hierarchy->usage); } static void put_hierarchy(struct landlock_hierarchy *hierarchy) { while (hierarchy && refcount_dec_and_test(&hierarchy->usage)) { const struct landlock_hierarchy *const freeme = hierarchy; hierarchy = hierarchy->parent; kfree(freeme); } } static int merge_tree(struct landlock_ruleset *const dst, struct landlock_ruleset *const src, const enum landlock_key_type key_type) { struct landlock_rule *walker_rule, *next_rule; struct rb_root *src_root; int err = 0; might_sleep(); lockdep_assert_held(&dst->lock); lockdep_assert_held(&src->lock); src_root = get_root(src, key_type); if (IS_ERR(src_root)) return PTR_ERR(src_root); /* Merges the @src tree. */ rbtree_postorder_for_each_entry_safe(walker_rule, next_rule, src_root, node) { struct landlock_layer layers[] = { { .level = dst->num_layers, } }; const struct landlock_id id = { .key = walker_rule->key, .type = key_type, }; if (WARN_ON_ONCE(walker_rule->num_layers != 1)) return -EINVAL; if (WARN_ON_ONCE(walker_rule->layers[0].level != 0)) return -EINVAL; layers[0].access = walker_rule->layers[0].access; err = insert_rule(dst, id, &layers, ARRAY_SIZE(layers)); if (err) return err; } return err; } static int merge_ruleset(struct landlock_ruleset *const dst, struct landlock_ruleset *const src) { int err = 0; might_sleep(); /* Should already be checked by landlock_merge_ruleset() */ if (WARN_ON_ONCE(!src)) return 0; /* Only merge into a domain. */ if (WARN_ON_ONCE(!dst || !dst->hierarchy)) return -EINVAL; /* Locks @dst first because we are its only owner. */ mutex_lock(&dst->lock); mutex_lock_nested(&src->lock, SINGLE_DEPTH_NESTING); /* Stacks the new layer. */ if (WARN_ON_ONCE(src->num_layers != 1 || dst->num_layers < 1)) { err = -EINVAL; goto out_unlock; } dst->access_masks[dst->num_layers - 1] = src->access_masks[0]; /* Merges the @src inode tree. */ err = merge_tree(dst, src, LANDLOCK_KEY_INODE); if (err) goto out_unlock; #if IS_ENABLED(CONFIG_INET) /* Merges the @src network port tree. */ err = merge_tree(dst, src, LANDLOCK_KEY_NET_PORT); if (err) goto out_unlock; #endif /* IS_ENABLED(CONFIG_INET) */ out_unlock: mutex_unlock(&src->lock); mutex_unlock(&dst->lock); return err; } static int inherit_tree(struct landlock_ruleset *const parent, struct landlock_ruleset *const child, const enum landlock_key_type key_type) { struct landlock_rule *walker_rule, *next_rule; struct rb_root *parent_root; int err = 0; might_sleep(); lockdep_assert_held(&parent->lock); lockdep_assert_held(&child->lock); parent_root = get_root(parent, key_type); if (IS_ERR(parent_root)) return PTR_ERR(parent_root); /* Copies the @parent inode or network tree. */ rbtree_postorder_for_each_entry_safe(walker_rule, next_rule, parent_root, node) { const struct landlock_id id = { .key = walker_rule->key, .type = key_type, }; err = insert_rule(child, id, &walker_rule->layers, walker_rule->num_layers); if (err) return err; } return err; } static int inherit_ruleset(struct landlock_ruleset *const parent, struct landlock_ruleset *const child) { int err = 0; might_sleep(); if (!parent) return 0; /* Locks @child first because we are its only owner. */ mutex_lock(&child->lock); mutex_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); /* Copies the @parent inode tree. */ err = inherit_tree(parent, child, LANDLOCK_KEY_INODE); if (err) goto out_unlock; #if IS_ENABLED(CONFIG_INET) /* Copies the @parent network port tree. */ err = inherit_tree(parent, child, LANDLOCK_KEY_NET_PORT); if (err) goto out_unlock; #endif /* IS_ENABLED(CONFIG_INET) */ if (WARN_ON_ONCE(child->num_layers <= parent->num_layers)) { err = -EINVAL; goto out_unlock; } /* Copies the parent layer stack and leaves a space for the new layer. */ memcpy(child->access_masks, parent->access_masks, flex_array_size(parent, access_masks, parent->num_layers)); if (WARN_ON_ONCE(!parent->hierarchy)) { err = -EINVAL; goto out_unlock; } get_hierarchy(parent->hierarchy); child->hierarchy->parent = parent->hierarchy; out_unlock: mutex_unlock(&parent->lock); mutex_unlock(&child->lock); return err; } static void free_ruleset(struct landlock_ruleset *const ruleset) { struct landlock_rule *freeme, *next; might_sleep(); rbtree_postorder_for_each_entry_safe(freeme, next, &ruleset->root_inode, node) free_rule(freeme, LANDLOCK_KEY_INODE); #if IS_ENABLED(CONFIG_INET) rbtree_postorder_for_each_entry_safe(freeme, next, &ruleset->root_net_port, node) free_rule(freeme, LANDLOCK_KEY_NET_PORT); #endif /* IS_ENABLED(CONFIG_INET) */ put_hierarchy(ruleset->hierarchy); kfree(ruleset); } void landlock_put_ruleset(struct landlock_ruleset *const ruleset) { might_sleep(); if (ruleset && refcount_dec_and_test(&ruleset->usage)) free_ruleset(ruleset); } static void free_ruleset_work(struct work_struct *const work) { struct landlock_ruleset *ruleset; ruleset = container_of(work, struct landlock_ruleset, work_free); free_ruleset(ruleset); } void landlock_put_ruleset_deferred(struct landlock_ruleset *const ruleset) { if (ruleset && refcount_dec_and_test(&ruleset->usage)) { INIT_WORK(&ruleset->work_free, free_ruleset_work); schedule_work(&ruleset->work_free); } } /** * landlock_merge_ruleset - Merge a ruleset with a domain * * @parent: Parent domain. * @ruleset: New ruleset to be merged. * * Returns the intersection of @parent and @ruleset, or returns @parent if * @ruleset is empty, or returns a duplicate of @ruleset if @parent is empty. */ struct landlock_ruleset * landlock_merge_ruleset(struct landlock_ruleset *const parent, struct landlock_ruleset *const ruleset) { struct landlock_ruleset *new_dom; u32 num_layers; int err; might_sleep(); if (WARN_ON_ONCE(!ruleset || parent == ruleset)) return ERR_PTR(-EINVAL); if (parent) { if (parent->num_layers >= LANDLOCK_MAX_NUM_LAYERS) return ERR_PTR(-E2BIG); num_layers = parent->num_layers + 1; } else { num_layers = 1; } /* Creates a new domain... */ new_dom = create_ruleset(num_layers); if (IS_ERR(new_dom)) return new_dom; new_dom->hierarchy = kzalloc(sizeof(*new_dom->hierarchy), GFP_KERNEL_ACCOUNT); if (!new_dom->hierarchy) { err = -ENOMEM; goto out_put_dom; } refcount_set(&new_dom->hierarchy->usage, 1); /* ...as a child of @parent... */ err = inherit_ruleset(parent, new_dom); if (err) goto out_put_dom; /* ...and including @ruleset. */ err = merge_ruleset(new_dom, ruleset); if (err) goto out_put_dom; return new_dom; out_put_dom: landlock_put_ruleset(new_dom); return ERR_PTR(err); } /* * The returned access has the same lifetime as @ruleset. */ const struct landlock_rule * landlock_find_rule(const struct landlock_ruleset *const ruleset, const struct landlock_id id) { const struct rb_root *root; const struct rb_node *node; root = get_root((struct landlock_ruleset *)ruleset, id.type); if (IS_ERR(root)) return NULL; node = root->rb_node; while (node) { struct landlock_rule *this = rb_entry(node, struct landlock_rule, node); if (this->key.data == id.key.data) return this; if (this->key.data < id.key.data) node = node->rb_right; else node = node->rb_left; } return NULL; } /* * @layer_masks is read and may be updated according to the access request and * the matching rule. * @masks_array_size must be equal to ARRAY_SIZE(*layer_masks). * * Returns true if the request is allowed (i.e. relevant layer masks for the * request are empty). */ bool landlock_unmask_layers(const struct landlock_rule *const rule, const access_mask_t access_request, layer_mask_t (*const layer_masks)[], const size_t masks_array_size) { size_t layer_level; if (!access_request || !layer_masks) return true; if (!rule) return false; /* * An access is granted if, for each policy layer, at least one rule * encountered on the pathwalk grants the requested access, * regardless of its position in the layer stack. We must then check * the remaining layers for each inode, from the first added layer to * the last one. When there is multiple requested accesses, for each * policy layer, the full set of requested accesses may not be granted * by only one rule, but by the union (binary OR) of multiple rules. * E.g. /a/b <execute> + /a <read> => /a/b <execute + read> */ for (layer_level = 0; layer_level < rule->num_layers; layer_level++) { const struct landlock_layer *const layer = &rule->layers[layer_level]; const layer_mask_t layer_bit = BIT_ULL(layer->level - 1); const unsigned long access_req = access_request; unsigned long access_bit; bool is_empty; /* * Records in @layer_masks which layer grants access to each * requested access. */ is_empty = true; for_each_set_bit(access_bit, &access_req, masks_array_size) { if (layer->access & BIT_ULL(access_bit)) (*layer_masks)[access_bit] &= ~layer_bit; is_empty = is_empty && !(*layer_masks)[access_bit]; } if (is_empty) return true; } return false; } typedef access_mask_t get_access_mask_t(const struct landlock_ruleset *const ruleset, const u16 layer_level); /** * landlock_init_layer_masks - Initialize layer masks from an access request * * Populates @layer_masks such that for each access right in @access_request, * the bits for all the layers are set where this access right is handled. * * @domain: The domain that defines the current restrictions. * @access_request: The requested access rights to check. * @layer_masks: It must contain %LANDLOCK_NUM_ACCESS_FS or * %LANDLOCK_NUM_ACCESS_NET elements according to @key_type. * @key_type: The key type to switch between access masks of different types. * * Returns: An access mask where each access right bit is set which is handled * in any of the active layers in @domain. */ access_mask_t landlock_init_layer_masks(const struct landlock_ruleset *const domain, const access_mask_t access_request, layer_mask_t (*const layer_masks)[], const enum landlock_key_type key_type) { access_mask_t handled_accesses = 0; size_t layer_level, num_access; get_access_mask_t *get_access_mask; switch (key_type) { case LANDLOCK_KEY_INODE: get_access_mask = landlock_get_fs_access_mask; num_access = LANDLOCK_NUM_ACCESS_FS; break; #if IS_ENABLED(CONFIG_INET) case LANDLOCK_KEY_NET_PORT: get_access_mask = landlock_get_net_access_mask; num_access = LANDLOCK_NUM_ACCESS_NET; break; #endif /* IS_ENABLED(CONFIG_INET) */ default: WARN_ON_ONCE(1); return 0; } memset(layer_masks, 0, array_size(sizeof((*layer_masks)[0]), num_access)); /* An empty access request can happen because of O_WRONLY | O_RDWR. */ if (!access_request) return 0; /* Saves all handled accesses per layer. */ for (layer_level = 0; layer_level < domain->num_layers; layer_level++) { const unsigned long access_req = access_request; const access_mask_t access_mask = get_access_mask(domain, layer_level); unsigned long access_bit; for_each_set_bit(access_bit, &access_req, num_access) { if (BIT_ULL(access_bit) & access_mask) { (*layer_masks)[access_bit] |= BIT_ULL(layer_level); handled_accesses |= BIT_ULL(access_bit); } } } return handled_accesses; } |
| 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 | // SPDX-License-Identifier: GPL-2.0 /* * Arch specific cpu topology information * * Copyright (C) 2016, ARM Ltd. * Written by: Juri Lelli, ARM Ltd. */ #include <linux/acpi.h> #include <linux/cacheinfo.h> #include <linux/cpu.h> #include <linux/cpufreq.h> #include <linux/device.h> #include <linux/of.h> #include <linux/slab.h> #include <linux/sched/topology.h> #include <linux/cpuset.h> #include <linux/cpumask.h> #include <linux/init.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/units.h> #define CREATE_TRACE_POINTS #include <trace/events/hw_pressure.h> static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data); static struct cpumask scale_freq_counters_mask; static bool scale_freq_invariant; DEFINE_PER_CPU(unsigned long, capacity_freq_ref) = 1; EXPORT_PER_CPU_SYMBOL_GPL(capacity_freq_ref); static bool supports_scale_freq_counters(const struct cpumask *cpus) { return cpumask_subset(cpus, &scale_freq_counters_mask); } bool topology_scale_freq_invariant(void) { return cpufreq_supports_freq_invariance() || supports_scale_freq_counters(cpu_online_mask); } static void update_scale_freq_invariant(bool status) { if (scale_freq_invariant == status) return; /* * Task scheduler behavior depends on frequency invariance support, * either cpufreq or counter driven. If the support status changes as * a result of counter initialisation and use, retrigger the build of * scheduling domains to ensure the information is propagated properly. */ if (topology_scale_freq_invariant() == status) { scale_freq_invariant = status; rebuild_sched_domains_energy(); } } void topology_set_scale_freq_source(struct scale_freq_data *data, const struct cpumask *cpus) { struct scale_freq_data *sfd; int cpu; /* * Avoid calling rebuild_sched_domains() unnecessarily if FIE is * supported by cpufreq. */ if (cpumask_empty(&scale_freq_counters_mask)) scale_freq_invariant = topology_scale_freq_invariant(); rcu_read_lock(); for_each_cpu(cpu, cpus) { sfd = rcu_dereference(*per_cpu_ptr(&sft_data, cpu)); /* Use ARCH provided counters whenever possible */ if (!sfd || sfd->source != SCALE_FREQ_SOURCE_ARCH) { rcu_assign_pointer(per_cpu(sft_data, cpu), data); cpumask_set_cpu(cpu, &scale_freq_counters_mask); } } rcu_read_unlock(); update_scale_freq_invariant(true); } EXPORT_SYMBOL_GPL(topology_set_scale_freq_source); void topology_clear_scale_freq_source(enum scale_freq_source source, const struct cpumask *cpus) { struct scale_freq_data *sfd; int cpu; rcu_read_lock(); for_each_cpu(cpu, cpus) { sfd = rcu_dereference(*per_cpu_ptr(&sft_data, cpu)); if (sfd && sfd->source == source) { rcu_assign_pointer(per_cpu(sft_data, cpu), NULL); cpumask_clear_cpu(cpu, &scale_freq_counters_mask); } } rcu_read_unlock(); /* * Make sure all references to previous sft_data are dropped to avoid * use-after-free races. */ synchronize_rcu(); update_scale_freq_invariant(false); } EXPORT_SYMBOL_GPL(topology_clear_scale_freq_source); void topology_scale_freq_tick(void) { struct scale_freq_data *sfd = rcu_dereference_sched(*this_cpu_ptr(&sft_data)); if (sfd) sfd->set_freq_scale(); } DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale); void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, unsigned long max_freq) { unsigned long scale; int i; if (WARN_ON_ONCE(!cur_freq || !max_freq)) return; /* * If the use of counters for FIE is enabled, just return as we don't * want to update the scale factor with information from CPUFREQ. * Instead the scale factor will be updated from arch_scale_freq_tick. */ if (supports_scale_freq_counters(cpus)) return; scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq; for_each_cpu(i, cpus) per_cpu(arch_freq_scale, i) = scale; } DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; EXPORT_PER_CPU_SYMBOL_GPL(cpu_scale); void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity) { per_cpu(cpu_scale, cpu) = capacity; } DEFINE_PER_CPU(unsigned long, hw_pressure); /** * topology_update_hw_pressure() - Update HW pressure for CPUs * @cpus : The related CPUs for which capacity has been reduced * @capped_freq : The maximum allowed frequency that CPUs can run at * * Update the value of HW pressure for all @cpus in the mask. The * cpumask should include all (online+offline) affected CPUs, to avoid * operating on stale data when hot-plug is used for some CPUs. The * @capped_freq reflects the currently allowed max CPUs frequency due to * HW capping. It might be also a boost frequency value, which is bigger * than the internal 'capacity_freq_ref' max frequency. In such case the * pressure value should simply be removed, since this is an indication that * there is no HW throttling. The @capped_freq must be provided in kHz. */ void topology_update_hw_pressure(const struct cpumask *cpus, unsigned long capped_freq) { unsigned long max_capacity, capacity, pressure; u32 max_freq; int cpu; cpu = cpumask_first(cpus); max_capacity = arch_scale_cpu_capacity(cpu); max_freq = arch_scale_freq_ref(cpu); /* * Handle properly the boost frequencies, which should simply clean * the HW pressure value. */ if (max_freq <= capped_freq) capacity = max_capacity; else capacity = mult_frac(max_capacity, capped_freq, max_freq); pressure = max_capacity - capacity; trace_hw_pressure_update(cpu, pressure); for_each_cpu(cpu, cpus) WRITE_ONCE(per_cpu(hw_pressure, cpu), pressure); } EXPORT_SYMBOL_GPL(topology_update_hw_pressure); static ssize_t cpu_capacity_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cpu *cpu = container_of(dev, struct cpu, dev); return sysfs_emit(buf, "%lu\n", topology_get_cpu_scale(cpu->dev.id)); } static void update_topology_flags_workfn(struct work_struct *work); static DECLARE_WORK(update_topology_flags_work, update_topology_flags_workfn); static DEVICE_ATTR_RO(cpu_capacity); static int cpu_capacity_sysctl_add(unsigned int cpu) { struct device *cpu_dev = get_cpu_device(cpu); if (!cpu_dev) return -ENOENT; device_create_file(cpu_dev, &dev_attr_cpu_capacity); return 0; } static int cpu_capacity_sysctl_remove(unsigned int cpu) { struct device *cpu_dev = get_cpu_device(cpu); if (!cpu_dev) return -ENOENT; device_remove_file(cpu_dev, &dev_attr_cpu_capacity); return 0; } static int register_cpu_capacity_sysctl(void) { cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "topology/cpu-capacity", cpu_capacity_sysctl_add, cpu_capacity_sysctl_remove); return 0; } subsys_initcall(register_cpu_capacity_sysctl); static int update_topology; int topology_update_cpu_topology(void) { return update_topology; } /* * Updating the sched_domains can't be done directly from cpufreq callbacks * due to locking, so queue the work for later. */ static void update_topology_flags_workfn(struct work_struct *work) { update_topology = 1; rebuild_sched_domains(); pr_debug("sched_domain hierarchy rebuilt, flags updated\n"); update_topology = 0; } static u32 *raw_capacity; static int free_raw_capacity(void) { kfree(raw_capacity); raw_capacity = NULL; return 0; } void topology_normalize_cpu_scale(void) { u64 capacity; u64 capacity_scale; int cpu; if (!raw_capacity) return; capacity_scale = 1; for_each_possible_cpu(cpu) { capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu); capacity_scale = max(capacity, capacity_scale); } pr_debug("cpu_capacity: capacity_scale=%llu\n", capacity_scale); for_each_possible_cpu(cpu) { capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu); capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT, capacity_scale); topology_set_cpu_scale(cpu, capacity); pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n", cpu, topology_get_cpu_scale(cpu)); } } bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu) { struct clk *cpu_clk; static bool cap_parsing_failed; int ret; u32 cpu_capacity; if (cap_parsing_failed) return false; ret = of_property_read_u32(cpu_node, "capacity-dmips-mhz", &cpu_capacity); if (!ret) { if (!raw_capacity) { raw_capacity = kcalloc(num_possible_cpus(), sizeof(*raw_capacity), GFP_KERNEL); if (!raw_capacity) { cap_parsing_failed = true; return false; } } raw_capacity[cpu] = cpu_capacity; pr_debug("cpu_capacity: %pOF cpu_capacity=%u (raw)\n", cpu_node, raw_capacity[cpu]); /* * Update capacity_freq_ref for calculating early boot CPU capacities. * For non-clk CPU DVFS mechanism, there's no way to get the * frequency value now, assuming they are running at the same * frequency (by keeping the initial capacity_freq_ref value). */ cpu_clk = of_clk_get(cpu_node, 0); if (!PTR_ERR_OR_ZERO(cpu_clk)) { per_cpu(capacity_freq_ref, cpu) = clk_get_rate(cpu_clk) / HZ_PER_KHZ; clk_put(cpu_clk); } } else { if (raw_capacity) { pr_err("cpu_capacity: missing %pOF raw capacity\n", cpu_node); pr_err("cpu_capacity: partial information: fallback to 1024 for all CPUs\n"); } cap_parsing_failed = true; free_raw_capacity(); } return !ret; } void __weak freq_inv_set_max_ratio(int cpu, u64 max_rate) { } #ifdef CONFIG_ACPI_CPPC_LIB #include <acpi/cppc_acpi.h> void topology_init_cpu_capacity_cppc(void) { u64 capacity, capacity_scale = 0; struct cppc_perf_caps perf_caps; int cpu; if (likely(!acpi_cpc_valid())) return; raw_capacity = kcalloc(num_possible_cpus(), sizeof(*raw_capacity), GFP_KERNEL); if (!raw_capacity) return; for_each_possible_cpu(cpu) { if (!cppc_get_perf_caps(cpu, &perf_caps) && (perf_caps.highest_perf >= perf_caps.nominal_perf) && (perf_caps.highest_perf >= perf_caps.lowest_perf)) { raw_capacity[cpu] = perf_caps.highest_perf; capacity_scale = max_t(u64, capacity_scale, raw_capacity[cpu]); per_cpu(capacity_freq_ref, cpu) = cppc_perf_to_khz(&perf_caps, raw_capacity[cpu]); pr_debug("cpu_capacity: CPU%d cpu_capacity=%u (raw).\n", cpu, raw_capacity[cpu]); continue; } pr_err("cpu_capacity: CPU%d missing/invalid highest performance.\n", cpu); pr_err("cpu_capacity: partial information: fallback to 1024 for all CPUs\n"); goto exit; } for_each_possible_cpu(cpu) { freq_inv_set_max_ratio(cpu, per_cpu(capacity_freq_ref, cpu) * HZ_PER_KHZ); capacity = raw_capacity[cpu]; capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT, capacity_scale); topology_set_cpu_scale(cpu, capacity); pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n", cpu, topology_get_cpu_scale(cpu)); } schedule_work(&update_topology_flags_work); pr_debug("cpu_capacity: cpu_capacity initialization done\n"); exit: free_raw_capacity(); } #endif #ifdef CONFIG_CPU_FREQ static cpumask_var_t cpus_to_visit; static void parsing_done_workfn(struct work_struct *work); static DECLARE_WORK(parsing_done_work, parsing_done_workfn); static int init_cpu_capacity_callback(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_policy *policy = data; int cpu; if (val != CPUFREQ_CREATE_POLICY) return 0; pr_debug("cpu_capacity: init cpu capacity for CPUs [%*pbl] (to_visit=%*pbl)\n", cpumask_pr_args(policy->related_cpus), cpumask_pr_args(cpus_to_visit)); cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus); for_each_cpu(cpu, policy->related_cpus) { per_cpu(capacity_freq_ref, cpu) = policy->cpuinfo.max_freq; freq_inv_set_max_ratio(cpu, per_cpu(capacity_freq_ref, cpu) * HZ_PER_KHZ); } if (cpumask_empty(cpus_to_visit)) { if (raw_capacity) { topology_normalize_cpu_scale(); schedule_work(&update_topology_flags_work); free_raw_capacity(); } pr_debug("cpu_capacity: parsing done\n"); schedule_work(&parsing_done_work); } return 0; } static struct notifier_block init_cpu_capacity_notifier = { .notifier_call = init_cpu_capacity_callback, }; static int __init register_cpufreq_notifier(void) { int ret; /* * On ACPI-based systems skip registering cpufreq notifier as cpufreq * information is not needed for cpu capacity initialization. */ if (!acpi_disabled) return -EINVAL; if (!alloc_cpumask_var(&cpus_to_visit, GFP_KERNEL)) return -ENOMEM; cpumask_copy(cpus_to_visit, cpu_possible_mask); ret = cpufreq_register_notifier(&init_cpu_capacity_notifier, CPUFREQ_POLICY_NOTIFIER); if (ret) free_cpumask_var(cpus_to_visit); return ret; } core_initcall(register_cpufreq_notifier); static void parsing_done_workfn(struct work_struct *work) { cpufreq_unregister_notifier(&init_cpu_capacity_notifier, CPUFREQ_POLICY_NOTIFIER); free_cpumask_var(cpus_to_visit); } #else core_initcall(free_raw_capacity); #endif #if defined(CONFIG_ARM64) || defined(CONFIG_RISCV) /* * This function returns the logic cpu number of the node. * There are basically three kinds of return values: * (1) logic cpu number which is > 0. * (2) -ENODEV when the device tree(DT) node is valid and found in the DT but * there is no possible logical CPU in the kernel to match. This happens * when CONFIG_NR_CPUS is configure to be smaller than the number of * CPU nodes in DT. We need to just ignore this case. * (3) -1 if the node does not exist in the device tree */ static int __init get_cpu_for_node(struct device_node *node) { struct device_node *cpu_node; int cpu; cpu_node = of_parse_phandle(node, "cpu", 0); if (!cpu_node) return -1; cpu = of_cpu_node_to_id(cpu_node); if (cpu >= 0) topology_parse_cpu_capacity(cpu_node, cpu); else pr_info("CPU node for %pOF exist but the possible cpu range is :%*pbl\n", cpu_node, cpumask_pr_args(cpu_possible_mask)); of_node_put(cpu_node); return cpu; } static int __init parse_core(struct device_node *core, int package_id, int cluster_id, int core_id) { char name[20]; bool leaf = true; int i = 0; int cpu; struct device_node *t; do { snprintf(name, sizeof(name), "thread%d", i); t = of_get_child_by_name(core, name); if (t) { leaf = false; cpu = get_cpu_for_node(t); if (cpu >= 0) { cpu_topology[cpu].package_id = package_id; cpu_topology[cpu].cluster_id = cluster_id; cpu_topology[cpu].core_id = core_id; cpu_topology[cpu].thread_id = i; } else if (cpu != -ENODEV) { pr_err("%pOF: Can't get CPU for thread\n", t); of_node_put(t); return -EINVAL; } of_node_put(t); } i++; } while (t); cpu = get_cpu_for_node(core); if (cpu >= 0) { if (!leaf) { pr_err("%pOF: Core has both threads and CPU\n", core); return -EINVAL; } cpu_topology[cpu].package_id = package_id; cpu_topology[cpu].cluster_id = cluster_id; cpu_topology[cpu].core_id = core_id; } else if (leaf && cpu != -ENODEV) { pr_err("%pOF: Can't get CPU for leaf core\n", core); return -EINVAL; } return 0; } static int __init parse_cluster(struct device_node *cluster, int package_id, int cluster_id, int depth) { char name[20]; bool leaf = true; bool has_cores = false; struct device_node *c; int core_id = 0; int i, ret; /* * First check for child clusters; we currently ignore any * information about the nesting of clusters and present the * scheduler with a flat list of them. */ i = 0; do { snprintf(name, sizeof(name), "cluster%d", i); c = of_get_child_by_name(cluster, name); if (c) { leaf = false; ret = parse_cluster(c, package_id, i, depth + 1); if (depth > 0) pr_warn("Topology for clusters of clusters not yet supported\n"); of_node_put(c); if (ret != 0) return ret; } i++; } while (c); /* Now check for cores */ i = 0; do { snprintf(name, sizeof(name), "core%d", i); c = of_get_child_by_name(cluster, name); if (c) { has_cores = true; if (depth == 0) { pr_err("%pOF: cpu-map children should be clusters\n", c); of_node_put(c); return -EINVAL; } if (leaf) { ret = parse_core(c, package_id, cluster_id, core_id++); } else { pr_err("%pOF: Non-leaf cluster with core %s\n", cluster, name); ret = -EINVAL; } of_node_put(c); if (ret != 0) return ret; } i++; } while (c); if (leaf && !has_cores) pr_warn("%pOF: empty cluster\n", cluster); return 0; } static int __init parse_socket(struct device_node *socket) { char name[20]; struct device_node *c; bool has_socket = false; int package_id = 0, ret; do { snprintf(name, sizeof(name), "socket%d", package_id); c = of_get_child_by_name(socket, name); if (c) { has_socket = true; ret = parse_cluster(c, package_id, -1, 0); of_node_put(c); if (ret != 0) return ret; } package_id++; } while (c); if (!has_socket) ret = parse_cluster(socket, 0, -1, 0); return ret; } static int __init parse_dt_topology(void) { struct device_node *cn, *map; int ret = 0; int cpu; cn = of_find_node_by_path("/cpus"); if (!cn) { pr_err("No CPU information found in DT\n"); return 0; } /* * When topology is provided cpu-map is essentially a root * cluster with restricted subnodes. */ map = of_get_child_by_name(cn, "cpu-map"); if (!map) goto out; ret = parse_socket(map); if (ret != 0) goto out_map; topology_normalize_cpu_scale(); /* * Check that all cores are in the topology; the SMP code will * only mark cores described in the DT as possible. */ for_each_possible_cpu(cpu) if (cpu_topology[cpu].package_id < 0) { ret = -EINVAL; break; } out_map: of_node_put(map); out: of_node_put(cn); return ret; } #endif /* * cpu topology table */ struct cpu_topology cpu_topology[NR_CPUS]; EXPORT_SYMBOL_GPL(cpu_topology); const struct cpumask *cpu_coregroup_mask(int cpu) { const cpumask_t *core_mask = cpumask_of_node(cpu_to_node(cpu)); /* Find the smaller of NUMA, core or LLC siblings */ if (cpumask_subset(&cpu_topology[cpu].core_sibling, core_mask)) { /* not numa in package, lets use the package siblings */ core_mask = &cpu_topology[cpu].core_sibling; } if (last_level_cache_is_valid(cpu)) { if (cpumask_subset(&cpu_topology[cpu].llc_sibling, core_mask)) core_mask = &cpu_topology[cpu].llc_sibling; } /* * For systems with no shared cpu-side LLC but with clusters defined, * extend core_mask to cluster_siblings. The sched domain builder will * then remove MC as redundant with CLS if SCHED_CLUSTER is enabled. */ if (IS_ENABLED(CONFIG_SCHED_CLUSTER) && cpumask_subset(core_mask, &cpu_topology[cpu].cluster_sibling)) core_mask = &cpu_topology[cpu].cluster_sibling; return core_mask; } const struct cpumask *cpu_clustergroup_mask(int cpu) { /* * Forbid cpu_clustergroup_mask() to span more or the same CPUs as * cpu_coregroup_mask(). */ if (cpumask_subset(cpu_coregroup_mask(cpu), &cpu_topology[cpu].cluster_sibling)) return topology_sibling_cpumask(cpu); return &cpu_topology[cpu].cluster_sibling; } void update_siblings_masks(unsigned int cpuid) { struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; int cpu, ret; ret = detect_cache_attributes(cpuid); if (ret && ret != -ENOENT) pr_info("Early cacheinfo allocation failed, ret = %d\n", ret); /* update core and thread sibling masks */ for_each_online_cpu(cpu) { cpu_topo = &cpu_topology[cpu]; if (last_level_cache_is_shared(cpu, cpuid)) { cpumask_set_cpu(cpu, &cpuid_topo->llc_sibling); cpumask_set_cpu(cpuid, &cpu_topo->llc_sibling); } if (cpuid_topo->package_id != cpu_topo->package_id) continue; cpumask_set_cpu(cpuid, &cpu_topo->core_sibling); cpumask_set_cpu(cpu, &cpuid_topo->core_sibling); if (cpuid_topo->cluster_id != cpu_topo->cluster_id) continue; if (cpuid_topo->cluster_id >= 0) { cpumask_set_cpu(cpu, &cpuid_topo->cluster_sibling); cpumask_set_cpu(cpuid, &cpu_topo->cluster_sibling); } if (cpuid_topo->core_id != cpu_topo->core_id) continue; cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling); cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling); } } static void clear_cpu_topology(int cpu) { struct cpu_topology *cpu_topo = &cpu_topology[cpu]; cpumask_clear(&cpu_topo->llc_sibling); cpumask_set_cpu(cpu, &cpu_topo->llc_sibling); cpumask_clear(&cpu_topo->cluster_sibling); cpumask_set_cpu(cpu, &cpu_topo->cluster_sibling); cpumask_clear(&cpu_topo->core_sibling); cpumask_set_cpu(cpu, &cpu_topo->core_sibling); cpumask_clear(&cpu_topo->thread_sibling); cpumask_set_cpu(cpu, &cpu_topo->thread_sibling); } void __init reset_cpu_topology(void) { unsigned int cpu; for_each_possible_cpu(cpu) { struct cpu_topology *cpu_topo = &cpu_topology[cpu]; cpu_topo->thread_id = -1; cpu_topo->core_id = -1; cpu_topo->cluster_id = -1; cpu_topo->package_id = -1; clear_cpu_topology(cpu); } } void remove_cpu_topology(unsigned int cpu) { int sibling; for_each_cpu(sibling, topology_core_cpumask(cpu)) cpumask_clear_cpu(cpu, topology_core_cpumask(sibling)); for_each_cpu(sibling, topology_sibling_cpumask(cpu)) cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling)); for_each_cpu(sibling, topology_cluster_cpumask(cpu)) cpumask_clear_cpu(cpu, topology_cluster_cpumask(sibling)); for_each_cpu(sibling, topology_llc_cpumask(cpu)) cpumask_clear_cpu(cpu, topology_llc_cpumask(sibling)); clear_cpu_topology(cpu); } __weak int __init parse_acpi_topology(void) { return 0; } #if defined(CONFIG_ARM64) || defined(CONFIG_RISCV) void __init init_cpu_topology(void) { int cpu, ret; reset_cpu_topology(); ret = parse_acpi_topology(); if (!ret) ret = of_have_populated_dt() && parse_dt_topology(); if (ret) { /* * Discard anything that was parsed if we hit an error so we * don't use partial information. But do not return yet to give * arch-specific early cache level detection a chance to run. */ reset_cpu_topology(); } for_each_possible_cpu(cpu) { ret = fetch_cache_info(cpu); if (!ret) continue; else if (ret != -ENOENT) pr_err("Early cacheinfo failed, ret = %d\n", ret); return; } } void store_cpu_topology(unsigned int cpuid) { struct cpu_topology *cpuid_topo = &cpu_topology[cpuid]; if (cpuid_topo->package_id != -1) goto topology_populated; cpuid_topo->thread_id = -1; cpuid_topo->core_id = cpuid; cpuid_topo->package_id = cpu_to_node(cpuid); pr_debug("CPU%u: package %d core %d thread %d\n", cpuid, cpuid_topo->package_id, cpuid_topo->core_id, cpuid_topo->thread_id); topology_populated: update_siblings_masks(cpuid); } #endif |
| 3 2 2 3 3 3 40 41 3 41 40 40 31 31 31 30 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 | // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/types.h> #include <linux/ip.h> #include <linux/netfilter.h> #include <linux/module.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <net/netns/generic.h> #include <net/route.h> #include <net/ip.h> #include <linux/netfilter_bridge.h> #include <linux/netfilter_ipv4.h> #include <net/netfilter/ipv4/nf_defrag_ipv4.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #endif #include <net/netfilter/nf_conntrack_zones.h> static DEFINE_MUTEX(defrag4_mutex); static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, u_int32_t user) { int err; local_bh_disable(); err = ip_defrag(net, skb, user); local_bh_enable(); if (!err) skb->ignore_df = 1; return err; } static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, struct sk_buff *skb) { u16 zone_id = NF_CT_DEFAULT_ZONE_ID; #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (skb_nfct(skb)) { enum ip_conntrack_info ctinfo; const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo)); } #endif if (nf_bridge_in_prerouting(skb)) return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id; if (hooknum == NF_INET_PRE_ROUTING) return IP_DEFRAG_CONNTRACK_IN + zone_id; else return IP_DEFRAG_CONNTRACK_OUT + zone_id; } static unsigned int ipv4_conntrack_defrag(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct sock *sk = skb->sk; if (sk && sk_fullsock(sk) && (sk->sk_family == PF_INET) && inet_test_bit(NODEFRAG, sk)) return NF_ACCEPT; #if IS_ENABLED(CONFIG_NF_CONNTRACK) #if !IS_ENABLED(CONFIG_NF_NAT) /* Previously seen (loopback)? Ignore. Do this before fragment check. */ if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb))) return NF_ACCEPT; #endif if (skb->_nfct == IP_CT_UNTRACKED) return NF_ACCEPT; #endif /* Gather fragments. */ if (ip_is_fragment(ip_hdr(skb))) { enum ip_defrag_users user = nf_ct_defrag_user(state->hook, skb); if (nf_ct_ipv4_gather_frags(state->net, skb, user)) return NF_STOLEN; } return NF_ACCEPT; } static const struct nf_hook_ops ipv4_defrag_ops[] = { { .hook = ipv4_conntrack_defrag, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_CONNTRACK_DEFRAG, }, { .hook = ipv4_conntrack_defrag, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_CONNTRACK_DEFRAG, }, }; static void __net_exit defrag4_net_exit(struct net *net) { if (net->nf.defrag_ipv4_users) { nf_unregister_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); net->nf.defrag_ipv4_users = 0; } } static const struct nf_defrag_hook defrag_hook = { .owner = THIS_MODULE, .enable = nf_defrag_ipv4_enable, .disable = nf_defrag_ipv4_disable, }; static struct pernet_operations defrag4_net_ops = { .exit = defrag4_net_exit, }; static int __init nf_defrag_init(void) { int err; err = register_pernet_subsys(&defrag4_net_ops); if (err) return err; rcu_assign_pointer(nf_defrag_v4_hook, &defrag_hook); return err; } static void __exit nf_defrag_fini(void) { rcu_assign_pointer(nf_defrag_v4_hook, NULL); unregister_pernet_subsys(&defrag4_net_ops); } int nf_defrag_ipv4_enable(struct net *net) { int err = 0; mutex_lock(&defrag4_mutex); if (net->nf.defrag_ipv4_users == UINT_MAX) { err = -EOVERFLOW; goto out_unlock; } if (net->nf.defrag_ipv4_users) { net->nf.defrag_ipv4_users++; goto out_unlock; } err = nf_register_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); if (err == 0) net->nf.defrag_ipv4_users = 1; out_unlock: mutex_unlock(&defrag4_mutex); return err; } EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable); void nf_defrag_ipv4_disable(struct net *net) { mutex_lock(&defrag4_mutex); if (net->nf.defrag_ipv4_users) { net->nf.defrag_ipv4_users--; if (net->nf.defrag_ipv4_users == 0) nf_unregister_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); } mutex_unlock(&defrag4_mutex); } EXPORT_SYMBOL_GPL(nf_defrag_ipv4_disable); module_init(nf_defrag_init); module_exit(nf_defrag_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IPv4 defragmentation support"); |
| 3 3 3 3 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/array.c * * Copyright (C) 1992 by Linus Torvalds * based on ideas by Darren Senn * * Fixes: * Michael. K. Johnson: stat,statm extensions. * <johnsonm@stolaf.edu> * * Pauline Middelink : Made cmdline,envline only break at '\0's, to * make sure SET_PROCTITLE works. Also removed * bad '!' which forced address recalculation for * EVERY character on the current page. * <middelin@polyware.iaf.nl> * * Danny ter Haar : added cpuinfo * <dth@cistron.nl> * * Alessandro Rubini : profile extension. * <rubini@ipvvis.unipv.it> * * Jeff Tranter : added BogoMips field to cpuinfo * <Jeff_Tranter@Mitel.COM> * * Bruno Haible : remove 4K limit for the maps file * <haible@ma2s2.mathematik.uni-karlsruhe.de> * * Yves Arrouye : remove removal of trailing spaces in get_array. * <Yves.Arrouye@marin.fdn.fr> * * Jerome Forissier : added per-CPU time information to /proc/stat * and /proc/<pid>/cpu extension * <forissier@isia.cma.fr> * - Incorporation and non-SMP safe operation * of forissier patch in 2.1.78 by * Hans Marcus <crowbar@concepts.nl> * * aeb@cwi.nl : /proc/partitions * * * Alan Cox : security fixes. * <alan@lxorguk.ukuu.org.uk> * * Al Viro : safe handling of mm_struct * * Gerhard Wichert : added BIGMEM support * Siemens AG <Gerhard.Wichert@pdb.siemens.de> * * Al Viro & Jeff Garzik : moved most of the thing into base.c and * : proc_misc.c. The rest may eventually go into * : base.c too. */ #include <linux/types.h> #include <linux/errno.h> #include <linux/time.h> #include <linux/time_namespace.h> #include <linux/kernel.h> #include <linux/kernel_stat.h> #include <linux/tty.h> #include <linux/string.h> #include <linux/mman.h> #include <linux/sched/mm.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task_stack.h> #include <linux/sched/task.h> #include <linux/sched/cputime.h> #include <linux/proc_fs.h> #include <linux/ioport.h> #include <linux/io.h> #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/smp.h> #include <linux/signal.h> #include <linux/highmem.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/times.h> #include <linux/cpuset.h> #include <linux/rcupdate.h> #include <linux/delayacct.h> #include <linux/seq_file.h> #include <linux/pid_namespace.h> #include <linux/prctl.h> #include <linux/ptrace.h> #include <linux/string_helpers.h> #include <linux/user_namespace.h> #include <linux/fs_struct.h> #include <linux/kthread.h> #include <linux/mmu_context.h> #include <asm/processor.h> #include "internal.h" void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape) { char tcomm[64]; /* * Test before PF_KTHREAD because all workqueue worker threads are * kernel threads. */ if (p->flags & PF_WQ_WORKER) wq_worker_comm(tcomm, sizeof(tcomm), p); else if (p->flags & PF_KTHREAD) get_kthread_comm(tcomm, sizeof(tcomm), p); else __get_task_comm(tcomm, sizeof(tcomm), p); if (escape) seq_escape_str(m, tcomm, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); else seq_printf(m, "%.64s", tcomm); } /* * The task state array is a strange "bitmap" of * reasons to sleep. Thus "running" is zero, and * you can test for combinations of others with * simple bit tests. */ static const char * const task_state_array[] = { /* states in TASK_REPORT: */ "R (running)", /* 0x00 */ "S (sleeping)", /* 0x01 */ "D (disk sleep)", /* 0x02 */ "T (stopped)", /* 0x04 */ "t (tracing stop)", /* 0x08 */ "X (dead)", /* 0x10 */ "Z (zombie)", /* 0x20 */ "P (parked)", /* 0x40 */ /* states beyond TASK_REPORT: */ "I (idle)", /* 0x80 */ }; static inline const char *get_task_state(struct task_struct *tsk) { BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array)); return task_state_array[task_state_index(tsk)]; } static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p) { struct user_namespace *user_ns = seq_user_ns(m); struct group_info *group_info; int g, umask = -1; struct task_struct *tracer; const struct cred *cred; pid_t ppid, tpid = 0, tgid, ngid; unsigned int max_fds = 0; rcu_read_lock(); ppid = pid_alive(p) ? task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; tracer = ptrace_parent(p); if (tracer) tpid = task_pid_nr_ns(tracer, ns); tgid = task_tgid_nr_ns(p, ns); ngid = task_numa_group_id(p); cred = get_task_cred(p); task_lock(p); if (p->fs) umask = p->fs->umask; if (p->files) max_fds = files_fdtable(p->files)->max_fds; task_unlock(p); rcu_read_unlock(); if (umask >= 0) seq_printf(m, "Umask:\t%#04o\n", umask); seq_puts(m, "State:\t"); seq_puts(m, get_task_state(p)); seq_put_decimal_ull(m, "\nTgid:\t", tgid); seq_put_decimal_ull(m, "\nNgid:\t", ngid); seq_put_decimal_ull(m, "\nPid:\t", pid_nr_ns(pid, ns)); seq_put_decimal_ull(m, "\nPPid:\t", ppid); seq_put_decimal_ull(m, "\nTracerPid:\t", tpid); seq_put_decimal_ull(m, "\nUid:\t", from_kuid_munged(user_ns, cred->uid)); seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->euid)); seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->suid)); seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->fsuid)); seq_put_decimal_ull(m, "\nGid:\t", from_kgid_munged(user_ns, cred->gid)); seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->egid)); seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->sgid)); seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->fsgid)); seq_put_decimal_ull(m, "\nFDSize:\t", max_fds); seq_puts(m, "\nGroups:\t"); group_info = cred->group_info; for (g = 0; g < group_info->ngroups; g++) seq_put_decimal_ull(m, g ? " " : "", from_kgid_munged(user_ns, group_info->gid[g])); put_cred(cred); /* Trailing space shouldn't have been added in the first place. */ seq_putc(m, ' '); #ifdef CONFIG_PID_NS seq_puts(m, "\nNStgid:"); for (g = ns->level; g <= pid->level; g++) seq_put_decimal_ull(m, "\t", task_tgid_nr_ns(p, pid->numbers[g].ns)); seq_puts(m, "\nNSpid:"); for (g = ns->level; g <= pid->level; g++) seq_put_decimal_ull(m, "\t", task_pid_nr_ns(p, pid->numbers[g].ns)); seq_puts(m, "\nNSpgid:"); for (g = ns->level; g <= pid->level; g++) seq_put_decimal_ull(m, "\t", task_pgrp_nr_ns(p, pid->numbers[g].ns)); seq_puts(m, "\nNSsid:"); for (g = ns->level; g <= pid->level; g++) seq_put_decimal_ull(m, "\t", task_session_nr_ns(p, pid->numbers[g].ns)); #endif seq_putc(m, '\n'); seq_printf(m, "Kthread:\t%c\n", p->flags & PF_KTHREAD ? '1' : '0'); } void render_sigset_t(struct seq_file *m, const char *header, sigset_t *set) { int i; seq_puts(m, header); i = _NSIG; do { int x = 0; i -= 4; if (sigismember(set, i+1)) x |= 1; if (sigismember(set, i+2)) x |= 2; if (sigismember(set, i+3)) x |= 4; if (sigismember(set, i+4)) x |= 8; seq_putc(m, hex_asc[x]); } while (i >= 4); seq_putc(m, '\n'); } static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *sigign, sigset_t *sigcatch) { struct k_sigaction *k; int i; k = p->sighand->action; for (i = 1; i <= _NSIG; ++i, ++k) { if (k->sa.sa_handler == SIG_IGN) sigaddset(sigign, i); else if (k->sa.sa_handler != SIG_DFL) sigaddset(sigcatch, i); } } static inline void task_sig(struct seq_file *m, struct task_struct *p) { unsigned long flags; sigset_t pending, shpending, blocked, ignored, caught; int num_threads = 0; unsigned int qsize = 0; unsigned long qlim = 0; sigemptyset(&pending); sigemptyset(&shpending); sigemptyset(&blocked); sigemptyset(&ignored); sigemptyset(&caught); if (lock_task_sighand(p, &flags)) { pending = p->pending.signal; shpending = p->signal->shared_pending.signal; blocked = p->blocked; collect_sigign_sigcatch(p, &ignored, &caught); num_threads = get_nr_threads(p); rcu_read_lock(); /* FIXME: is this correct? */ qsize = get_rlimit_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING); rcu_read_unlock(); qlim = task_rlimit(p, RLIMIT_SIGPENDING); unlock_task_sighand(p, &flags); } seq_put_decimal_ull(m, "Threads:\t", num_threads); seq_put_decimal_ull(m, "\nSigQ:\t", qsize); seq_put_decimal_ull(m, "/", qlim); /* render them all */ render_sigset_t(m, "\nSigPnd:\t", &pending); render_sigset_t(m, "ShdPnd:\t", &shpending); render_sigset_t(m, "SigBlk:\t", &blocked); render_sigset_t(m, "SigIgn:\t", &ignored); render_sigset_t(m, "SigCgt:\t", &caught); } static void render_cap_t(struct seq_file *m, const char *header, kernel_cap_t *a) { seq_puts(m, header); seq_put_hex_ll(m, NULL, a->val, 16); seq_putc(m, '\n'); } static inline void task_cap(struct seq_file *m, struct task_struct *p) { const struct cred *cred; kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset, cap_ambient; rcu_read_lock(); cred = __task_cred(p); cap_inheritable = cred->cap_inheritable; cap_permitted = cred->cap_permitted; cap_effective = cred->cap_effective; cap_bset = cred->cap_bset; cap_ambient = cred->cap_ambient; rcu_read_unlock(); render_cap_t(m, "CapInh:\t", &cap_inheritable); render_cap_t(m, "CapPrm:\t", &cap_permitted); render_cap_t(m, "CapEff:\t", &cap_effective); render_cap_t(m, "CapBnd:\t", &cap_bset); render_cap_t(m, "CapAmb:\t", &cap_ambient); } static inline void task_seccomp(struct seq_file *m, struct task_struct *p) { seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p)); #ifdef CONFIG_SECCOMP seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); #ifdef CONFIG_SECCOMP_FILTER seq_put_decimal_ull(m, "\nSeccomp_filters:\t", atomic_read(&p->seccomp.filter_count)); #endif #endif seq_puts(m, "\nSpeculation_Store_Bypass:\t"); switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) { case -EINVAL: seq_puts(m, "unknown"); break; case PR_SPEC_NOT_AFFECTED: seq_puts(m, "not vulnerable"); break; case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE: seq_puts(m, "thread force mitigated"); break; case PR_SPEC_PRCTL | PR_SPEC_DISABLE: seq_puts(m, "thread mitigated"); break; case PR_SPEC_PRCTL | PR_SPEC_ENABLE: seq_puts(m, "thread vulnerable"); break; case PR_SPEC_DISABLE: seq_puts(m, "globally mitigated"); break; default: seq_puts(m, "vulnerable"); break; } seq_puts(m, "\nSpeculationIndirectBranch:\t"); switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_INDIRECT_BRANCH)) { case -EINVAL: seq_puts(m, "unsupported"); break; case PR_SPEC_NOT_AFFECTED: seq_puts(m, "not affected"); break; case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE: seq_puts(m, "conditional force disabled"); break; case PR_SPEC_PRCTL | PR_SPEC_DISABLE: seq_puts(m, "conditional disabled"); break; case PR_SPEC_PRCTL | PR_SPEC_ENABLE: seq_puts(m, "conditional enabled"); break; case PR_SPEC_ENABLE: seq_puts(m, "always enabled"); break; case PR_SPEC_DISABLE: seq_puts(m, "always disabled"); break; default: seq_puts(m, "unknown"); break; } seq_putc(m, '\n'); } static inline void task_context_switch_counts(struct seq_file *m, struct task_struct *p) { seq_put_decimal_ull(m, "voluntary_ctxt_switches:\t", p->nvcsw); seq_put_decimal_ull(m, "\nnonvoluntary_ctxt_switches:\t", p->nivcsw); seq_putc(m, '\n'); } static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) { seq_printf(m, "Cpus_allowed:\t%*pb\n", cpumask_pr_args(&task->cpus_mask)); seq_printf(m, "Cpus_allowed_list:\t%*pbl\n", cpumask_pr_args(&task->cpus_mask)); } static inline void task_core_dumping(struct seq_file *m, struct task_struct *task) { seq_put_decimal_ull(m, "CoreDumping:\t", !!task->signal->core_state); seq_putc(m, '\n'); } static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm) { bool thp_enabled = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE); if (thp_enabled) thp_enabled = !test_bit(MMF_DISABLE_THP, &mm->flags); seq_printf(m, "THP_enabled:\t%d\n", thp_enabled); } static inline void task_untag_mask(struct seq_file *m, struct mm_struct *mm) { seq_printf(m, "untag_mask:\t%#lx\n", mm_untag_mask(mm)); } __weak void arch_proc_pid_thread_features(struct seq_file *m, struct task_struct *task) { } int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct mm_struct *mm = get_task_mm(task); seq_puts(m, "Name:\t"); proc_task_name(m, task, true); seq_putc(m, '\n'); task_state(m, ns, pid, task); if (mm) { task_mem(m, mm); task_core_dumping(m, task); task_thp_status(m, mm); task_untag_mask(m, mm); mmput(mm); } task_sig(m, task); task_cap(m, task); task_seccomp(m, task); task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); task_context_switch_counts(m, task); arch_proc_pid_thread_features(m, task); return 0; } static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task, int whole) { unsigned long vsize, eip, esp, wchan = 0; int priority, nice; int tty_pgrp = -1, tty_nr = 0; sigset_t sigign, sigcatch; char state; pid_t ppid = 0, pgid = -1, sid = -1; int num_threads = 0; int permitted; struct mm_struct *mm; unsigned long long start_time; unsigned long cmin_flt, cmaj_flt, min_flt, maj_flt; u64 cutime, cstime, cgtime, utime, stime, gtime; unsigned long rsslim = 0; unsigned long flags; int exit_code = task->exit_code; struct signal_struct *sig = task->signal; unsigned int seq = 1; state = *get_task_state(task); vsize = eip = esp = 0; permitted = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS | PTRACE_MODE_NOAUDIT); mm = get_task_mm(task); if (mm) { vsize = task_vsize(mm); /* * esp and eip are intentionally zeroed out. There is no * non-racy way to read them without freezing the task. * Programs that need reliable values can use ptrace(2). * * The only exception is if the task is core dumping because * a program is not able to use ptrace(2) in that case. It is * safe because the task has stopped executing permanently. */ if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE))) { if (try_get_task_stack(task)) { eip = KSTK_EIP(task); esp = KSTK_ESP(task); put_task_stack(task); } } } sigemptyset(&sigign); sigemptyset(&sigcatch); if (lock_task_sighand(task, &flags)) { if (sig->tty) { struct pid *pgrp = tty_get_pgrp(sig->tty); tty_pgrp = pid_nr_ns(pgrp, ns); put_pid(pgrp); tty_nr = new_encode_dev(tty_devnum(sig->tty)); } num_threads = get_nr_threads(task); collect_sigign_sigcatch(task, &sigign, &sigcatch); rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); if (whole) { if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED)) exit_code = sig->group_exit_code; } sid = task_session_nr_ns(task, ns); ppid = task_tgid_nr_ns(task->real_parent, ns); pgid = task_pgrp_nr_ns(task, ns); unlock_task_sighand(task, &flags); } if (permitted && (!whole || num_threads < 2)) wchan = !task_is_running(task); do { seq++; /* 2 on the 1st/lockless path, otherwise odd */ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); cmin_flt = sig->cmin_flt; cmaj_flt = sig->cmaj_flt; cutime = sig->cutime; cstime = sig->cstime; cgtime = sig->cgtime; if (whole) { struct task_struct *t; min_flt = sig->min_flt; maj_flt = sig->maj_flt; gtime = sig->gtime; rcu_read_lock(); __for_each_thread(sig, t) { min_flt += t->min_flt; maj_flt += t->maj_flt; gtime += task_gtime(t); } rcu_read_unlock(); } } while (need_seqretry(&sig->stats_lock, seq)); done_seqretry_irqrestore(&sig->stats_lock, seq, flags); if (whole) { thread_group_cputime_adjusted(task, &utime, &stime); } else { task_cputime_adjusted(task, &utime, &stime); min_flt = task->min_flt; maj_flt = task->maj_flt; gtime = task_gtime(task); } /* scale priority and nice values from timeslices to -20..20 */ /* to make it look like a "normal" Unix priority/nice value */ priority = task_prio(task); nice = task_nice(task); /* apply timens offset for boottime and convert nsec -> ticks */ start_time = nsec_to_clock_t(timens_add_boottime_ns(task->start_boottime)); seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns)); seq_puts(m, " ("); proc_task_name(m, task, false); seq_puts(m, ") "); seq_putc(m, state); seq_put_decimal_ll(m, " ", ppid); seq_put_decimal_ll(m, " ", pgid); seq_put_decimal_ll(m, " ", sid); seq_put_decimal_ll(m, " ", tty_nr); seq_put_decimal_ll(m, " ", tty_pgrp); seq_put_decimal_ull(m, " ", task->flags); seq_put_decimal_ull(m, " ", min_flt); seq_put_decimal_ull(m, " ", cmin_flt); seq_put_decimal_ull(m, " ", maj_flt); seq_put_decimal_ull(m, " ", cmaj_flt); seq_put_decimal_ull(m, " ", nsec_to_clock_t(utime)); seq_put_decimal_ull(m, " ", nsec_to_clock_t(stime)); seq_put_decimal_ll(m, " ", nsec_to_clock_t(cutime)); seq_put_decimal_ll(m, " ", nsec_to_clock_t(cstime)); seq_put_decimal_ll(m, " ", priority); seq_put_decimal_ll(m, " ", nice); seq_put_decimal_ll(m, " ", num_threads); seq_put_decimal_ull(m, " ", 0); seq_put_decimal_ull(m, " ", start_time); seq_put_decimal_ull(m, " ", vsize); seq_put_decimal_ull(m, " ", mm ? get_mm_rss(mm) : 0); seq_put_decimal_ull(m, " ", rsslim); seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->start_code : 1) : 0); seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->end_code : 1) : 0); seq_put_decimal_ull(m, " ", (permitted && mm) ? mm->start_stack : 0); seq_put_decimal_ull(m, " ", esp); seq_put_decimal_ull(m, " ", eip); /* The signal information here is obsolete. * It must be decimal for Linux 2.0 compatibility. * Use /proc/#/status for real-time signals. */ seq_put_decimal_ull(m, " ", task->pending.signal.sig[0] & 0x7fffffffUL); seq_put_decimal_ull(m, " ", task->blocked.sig[0] & 0x7fffffffUL); seq_put_decimal_ull(m, " ", sigign.sig[0] & 0x7fffffffUL); seq_put_decimal_ull(m, " ", sigcatch.sig[0] & 0x7fffffffUL); /* * We used to output the absolute kernel address, but that's an * information leak - so instead we show a 0/1 flag here, to signal * to user-space whether there's a wchan field in /proc/PID/wchan. * * This works with older implementations of procps as well. */ seq_put_decimal_ull(m, " ", wchan); seq_put_decimal_ull(m, " ", 0); seq_put_decimal_ull(m, " ", 0); seq_put_decimal_ll(m, " ", task->exit_signal); seq_put_decimal_ll(m, " ", task_cpu(task)); seq_put_decimal_ull(m, " ", task->rt_priority); seq_put_decimal_ull(m, " ", task->policy); seq_put_decimal_ull(m, " ", delayacct_blkio_ticks(task)); seq_put_decimal_ull(m, " ", nsec_to_clock_t(gtime)); seq_put_decimal_ll(m, " ", nsec_to_clock_t(cgtime)); if (mm && permitted) { seq_put_decimal_ull(m, " ", mm->start_data); seq_put_decimal_ull(m, " ", mm->end_data); seq_put_decimal_ull(m, " ", mm->start_brk); seq_put_decimal_ull(m, " ", mm->arg_start); seq_put_decimal_ull(m, " ", mm->arg_end); seq_put_decimal_ull(m, " ", mm->env_start); seq_put_decimal_ull(m, " ", mm->env_end); } else seq_puts(m, " 0 0 0 0 0 0 0"); if (permitted) seq_put_decimal_ll(m, " ", exit_code); else seq_puts(m, " 0"); seq_putc(m, '\n'); if (mm) mmput(mm); return 0; } int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { return do_task_stat(m, ns, pid, task, 0); } int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { return do_task_stat(m, ns, pid, task, 1); } int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct mm_struct *mm = get_task_mm(task); if (mm) { unsigned long size; unsigned long resident = 0; unsigned long shared = 0; unsigned long text = 0; unsigned long data = 0; size = task_statm(mm, &shared, &text, &data, &resident); mmput(mm); /* * For quick read, open code by putting numbers directly * expected format is * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", * size, resident, shared, text, data); */ seq_put_decimal_ull(m, "", size); seq_put_decimal_ull(m, " ", resident); seq_put_decimal_ull(m, " ", shared); seq_put_decimal_ull(m, " ", text); seq_put_decimal_ull(m, " ", 0); seq_put_decimal_ull(m, " ", data); seq_put_decimal_ull(m, " ", 0); seq_putc(m, '\n'); } else { seq_write(m, "0 0 0 0 0 0 0\n", 14); } return 0; } #ifdef CONFIG_PROC_CHILDREN static struct pid * get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos) { struct task_struct *start, *task; struct pid *pid = NULL; read_lock(&tasklist_lock); start = pid_task(proc_pid(inode), PIDTYPE_PID); if (!start) goto out; /* * Lets try to continue searching first, this gives * us significant speedup on children-rich processes. */ if (pid_prev) { task = pid_task(pid_prev, PIDTYPE_PID); if (task && task->real_parent == start && !(list_empty(&task->sibling))) { if (list_is_last(&task->sibling, &start->children)) goto out; task = list_first_entry(&task->sibling, struct task_struct, sibling); pid = get_pid(task_pid(task)); goto out; } } /* * Slow search case. * * We might miss some children here if children * are exited while we were not holding the lock, * but it was never promised to be accurate that * much. * * "Just suppose that the parent sleeps, but N children * exit after we printed their tids. Now the slow paths * skips N extra children, we miss N tasks." (c) * * So one need to stop or freeze the leader and all * its children to get a precise result. */ list_for_each_entry(task, &start->children, sibling) { if (pos-- == 0) { pid = get_pid(task_pid(task)); break; } } out: read_unlock(&tasklist_lock); return pid; } static int children_seq_show(struct seq_file *seq, void *v) { struct inode *inode = file_inode(seq->file); seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(inode->i_sb))); return 0; } static void *children_seq_start(struct seq_file *seq, loff_t *pos) { return get_children_pid(file_inode(seq->file), NULL, *pos); } static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct pid *pid; pid = get_children_pid(file_inode(seq->file), v, *pos + 1); put_pid(v); ++*pos; return pid; } static void children_seq_stop(struct seq_file *seq, void *v) { put_pid(v); } static const struct seq_operations children_seq_ops = { .start = children_seq_start, .next = children_seq_next, .stop = children_seq_stop, .show = children_seq_show, }; static int children_seq_open(struct inode *inode, struct file *file) { return seq_open(file, &children_seq_ops); } const struct file_operations proc_tid_children_operations = { .open = children_seq_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; #endif /* CONFIG_PROC_CHILDREN */ |
| 276 123 153 2719 79 2644 4 57 34 57 4 4 4 4 1 126 632 390 393 393 342 332 2797 2745 2741 2731 2695 2746 449 448 94 94 401 31 31 31 31 31 31 10 10 2 379 375 377 379 100 95 87 377 2 7 53 53 53 53 53 53 53 16 16 16 16 16 16 43 44 43 40 42 44 250 190 58 44 16 16 251 69 69 30 30 30 23 7 7 7 7 68 5 38 3 38 54 53 90 91 2 2 1 1 2 2 2 2 8 4 2 4 2 2 2 8 126 126 127 64 99 18 18 18 15 18 65 53 65 65 65 65 64 65 65 6 62 65 65 65 65 10 10 9 9 4 4 51 50 37 14 10 4 4 52 2422 2399 2401 2393 2404 2401 10 196 195 38 191 194 195 3 3 2662 2243 2239 2235 2232 2236 2688 527 357 3 3 186 185 186 2 2 2 2666 182 14 185 524 498 32 2643 2611 2605 2589 10 10 5 10 65 7 7 7 7 7 7 59 59 58 59 2 56 66 59 7 7 7 65 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/bvec.h> #include <linux/fault-inject-usercopy.h> #include <linux/uio.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/splice.h> #include <linux/compat.h> #include <linux/scatterlist.h> #include <linux/instrumented.h> #include <linux/iov_iter.h> static __always_inline size_t copy_to_user_iter(void __user *iter_to, size_t progress, size_t len, void *from, void *priv2) { if (should_fail_usercopy()) return len; if (access_ok(iter_to, len)) { from += progress; instrument_copy_to_user(iter_to, from, len); len = raw_copy_to_user(iter_to, from, len); } return len; } static __always_inline size_t copy_to_user_iter_nofault(void __user *iter_to, size_t progress, size_t len, void *from, void *priv2) { ssize_t res; if (should_fail_usercopy()) return len; from += progress; res = copy_to_user_nofault(iter_to, from, len); return res < 0 ? len : res; } static __always_inline size_t copy_from_user_iter(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { size_t res = len; if (should_fail_usercopy()) return len; if (access_ok(iter_from, len)) { to += progress; instrument_copy_from_user_before(to, iter_from, len); res = raw_copy_from_user(to, iter_from, len); instrument_copy_from_user_after(to, iter_from, len, res); } return res; } static __always_inline size_t memcpy_to_iter(void *iter_to, size_t progress, size_t len, void *from, void *priv2) { memcpy(iter_to, from + progress, len); return 0; } static __always_inline size_t memcpy_from_iter(void *iter_from, size_t progress, size_t len, void *to, void *priv2) { memcpy(to + progress, iter_from, len); return 0; } /* * fault_in_iov_iter_readable - fault in iov iterator for reading * @i: iterator * @size: maximum length * * Fault in one or more iovecs of the given iov_iter, to a maximum length of * @size. For each iovec, fault in each page that constitutes the iovec. * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). * * Always returns 0 for non-userspace iterators. */ size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size) { if (iter_is_ubuf(i)) { size_t n = min(size, iov_iter_count(i)); n -= fault_in_readable(i->ubuf + i->iov_offset, n); return size - n; } else if (iter_is_iovec(i)) { size_t count = min(size, iov_iter_count(i)); const struct iovec *p; size_t skip; size -= count; for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = min(count, p->iov_len - skip); size_t ret; if (unlikely(!len)) continue; ret = fault_in_readable(p->iov_base + skip, len); count -= len - ret; if (ret) break; } return count + size; } return 0; } EXPORT_SYMBOL(fault_in_iov_iter_readable); /* * fault_in_iov_iter_writeable - fault in iov iterator for writing * @i: iterator * @size: maximum length * * Faults in the iterator using get_user_pages(), i.e., without triggering * hardware page faults. This is primarily useful when we already know that * some or all of the pages in @i aren't in memory. * * Returns the number of bytes not faulted in, like copy_to_user() and * copy_from_user(). * * Always returns 0 for non-user-space iterators. */ size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size) { if (iter_is_ubuf(i)) { size_t n = min(size, iov_iter_count(i)); n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n); return size - n; } else if (iter_is_iovec(i)) { size_t count = min(size, iov_iter_count(i)); const struct iovec *p; size_t skip; size -= count; for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = min(count, p->iov_len - skip); size_t ret; if (unlikely(!len)) continue; ret = fault_in_safe_writeable(p->iov_base + skip, len); count -= len - ret; if (ret) break; } return count + size; } return 0; } EXPORT_SYMBOL(fault_in_iov_iter_writeable); void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter) { .iter_type = ITER_IOVEC, .nofault = false, .data_source = direction, .__iov = iov, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_init); size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); return iterate_and_advance(i, bytes, (void *)addr, copy_to_user_iter, memcpy_to_iter); } EXPORT_SYMBOL(_copy_to_iter); #ifdef CONFIG_ARCH_HAS_COPY_MC static __always_inline size_t copy_to_user_iter_mc(void __user *iter_to, size_t progress, size_t len, void *from, void *priv2) { if (access_ok(iter_to, len)) { from += progress; instrument_copy_to_user(iter_to, from, len); len = copy_mc_to_user(iter_to, from, len); } return len; } static __always_inline size_t memcpy_to_iter_mc(void *iter_to, size_t progress, size_t len, void *from, void *priv2) { return copy_mc_to_kernel(iter_to, from + progress, len); } /** * _copy_mc_to_iter - copy to iter with source memory error exception handling * @addr: source kernel address * @bytes: total transfer length * @i: destination iterator * * The pmem driver deploys this for the dax operation * (dax_copy_to_iter()) for dax reads (bypass page-cache and the * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes * successfully copied. * * The main differences between this and typical _copy_to_iter(). * * * Typical tail/residue handling after a fault retries the copy * byte-by-byte until the fault happens again. Re-triggering machine * checks is potentially fatal so the implementation uses source * alignment and poison alignment assumptions to avoid re-triggering * hardware exceptions. * * * ITER_KVEC and ITER_BVEC can return short copies. Compare to * copy_to_iter() where only ITER_IOVEC attempts might return a short copy. * * Return: number of bytes copied (may be %0) */ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); return iterate_and_advance(i, bytes, (void *)addr, copy_to_user_iter_mc, memcpy_to_iter_mc); } EXPORT_SYMBOL_GPL(_copy_mc_to_iter); #endif /* CONFIG_ARCH_HAS_COPY_MC */ static __always_inline size_t __copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { return iterate_and_advance(i, bytes, addr, copy_from_user_iter, memcpy_from_iter); } size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); return __copy_from_iter(addr, bytes, i); } EXPORT_SYMBOL(_copy_from_iter); static __always_inline size_t copy_from_user_iter_nocache(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { return __copy_from_user_inatomic_nocache(to + progress, iter_from, len); } size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; return iterate_and_advance(i, bytes, addr, copy_from_user_iter_nocache, memcpy_from_iter); } EXPORT_SYMBOL(_copy_from_iter_nocache); #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE static __always_inline size_t copy_from_user_iter_flushcache(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { return __copy_from_user_flushcache(to + progress, iter_from, len); } static __always_inline size_t memcpy_from_iter_flushcache(void *iter_from, size_t progress, size_t len, void *to, void *priv2) { memcpy_flushcache(to + progress, iter_from, len); return 0; } /** * _copy_from_iter_flushcache - write destination through cpu cache * @addr: destination kernel address * @bytes: total transfer length * @i: source iterator * * The pmem driver arranges for filesystem-dax to use this facility via * dax_copy_from_iter() for ensuring that writes to persistent memory * are flushed through the CPU cache. It is differentiated from * _copy_from_iter_nocache() in that guarantees all data is flushed for * all iterator types. The _copy_from_iter_nocache() only attempts to * bypass the cache for the ITER_IOVEC case, and on some archs may use * instructions that strand dirty-data in the cache. * * Return: number of bytes copied (may be %0) */ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; return iterate_and_advance(i, bytes, addr, copy_from_user_iter_flushcache, memcpy_from_iter_flushcache); } EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache); #endif static inline bool page_copy_sane(struct page *page, size_t offset, size_t n) { struct page *head; size_t v = n + offset; /* * The general case needs to access the page order in order * to compute the page size. * However, we mostly deal with order-0 pages and thus can * avoid a possible cache line miss for requests that fit all * page orders. */ if (n <= v && v <= PAGE_SIZE) return true; head = compound_head(page); v += (page - head) << PAGE_SHIFT; if (WARN_ON(n > v || v > page_size(head))) return false; return true; } size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; if (WARN_ON_ONCE(i->data_source)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = _copy_to_iter(kaddr + offset, n, i); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_to_iter); size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; if (WARN_ON_ONCE(i->data_source)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = iterate_and_advance(i, n, kaddr + offset, copy_to_user_iter_nofault, memcpy_to_iter); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_to_iter_nofault); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = _copy_from_iter(kaddr + offset, n, i); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_from_iter); static __always_inline size_t zero_to_user_iter(void __user *iter_to, size_t progress, size_t len, void *priv, void *priv2) { return clear_user(iter_to, len); } static __always_inline size_t zero_to_iter(void *iter_to, size_t progress, size_t len, void *priv, void *priv2) { memset(iter_to, 0, len); return 0; } size_t iov_iter_zero(size_t bytes, struct iov_iter *i) { return iterate_and_advance(i, bytes, NULL, zero_to_user_iter, zero_to_iter); } EXPORT_SYMBOL(iov_iter_zero); size_t copy_page_from_iter_atomic(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t n, copied = 0; if (!page_copy_sane(page, offset, bytes)) return 0; if (WARN_ON_ONCE(!i->data_source)) return 0; do { char *p; n = bytes - copied; if (PageHighMem(page)) { page += offset / PAGE_SIZE; offset %= PAGE_SIZE; n = min_t(size_t, n, PAGE_SIZE - offset); } p = kmap_atomic(page) + offset; n = __copy_from_iter(p, n, i); kunmap_atomic(p); copied += n; offset += n; } while (PageHighMem(page) && copied != bytes && n > 0); return copied; } EXPORT_SYMBOL(copy_page_from_iter_atomic); static void iov_iter_bvec_advance(struct iov_iter *i, size_t size) { const struct bio_vec *bvec, *end; if (!i->count) return; i->count -= size; size += i->iov_offset; for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) { if (likely(size < bvec->bv_len)) break; size -= bvec->bv_len; } i->iov_offset = size; i->nr_segs -= bvec - i->bvec; i->bvec = bvec; } static void iov_iter_iovec_advance(struct iov_iter *i, size_t size) { const struct iovec *iov, *end; if (!i->count) return; i->count -= size; size += i->iov_offset; // from beginning of current segment for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) { if (likely(size < iov->iov_len)) break; size -= iov->iov_len; } i->iov_offset = size; i->nr_segs -= iov - iter_iov(i); i->__iov = iov; } void iov_iter_advance(struct iov_iter *i, size_t size) { if (unlikely(i->count < size)) size = i->count; if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) { i->iov_offset += size; i->count -= size; } else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) { /* iovec and kvec have identical layouts */ iov_iter_iovec_advance(i, size); } else if (iov_iter_is_bvec(i)) { iov_iter_bvec_advance(i, size); } else if (iov_iter_is_discard(i)) { i->count -= size; } } EXPORT_SYMBOL(iov_iter_advance); void iov_iter_revert(struct iov_iter *i, size_t unroll) { if (!unroll) return; if (WARN_ON(unroll > MAX_RW_COUNT)) return; i->count += unroll; if (unlikely(iov_iter_is_discard(i))) return; if (unroll <= i->iov_offset) { i->iov_offset -= unroll; return; } unroll -= i->iov_offset; if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) { BUG(); /* We should never go beyond the start of the specified * range since we might then be straying into pages that * aren't pinned. */ } else if (iov_iter_is_bvec(i)) { const struct bio_vec *bvec = i->bvec; while (1) { size_t n = (--bvec)->bv_len; i->nr_segs++; if (unroll <= n) { i->bvec = bvec; i->iov_offset = n - unroll; return; } unroll -= n; } } else { /* same logics for iovec and kvec */ const struct iovec *iov = iter_iov(i); while (1) { size_t n = (--iov)->iov_len; i->nr_segs++; if (unroll <= n) { i->__iov = iov; i->iov_offset = n - unroll; return; } unroll -= n; } } } EXPORT_SYMBOL(iov_iter_revert); /* * Return the count of just the current iov_iter segment. */ size_t iov_iter_single_seg_count(const struct iov_iter *i) { if (i->nr_segs > 1) { if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return min(i->count, iter_iov(i)->iov_len - i->iov_offset); if (iov_iter_is_bvec(i)) return min(i->count, i->bvec->bv_len - i->iov_offset); } return i->count; } EXPORT_SYMBOL(iov_iter_single_seg_count); void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter){ .iter_type = ITER_KVEC, .data_source = direction, .kvec = kvec, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_kvec); void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter){ .iter_type = ITER_BVEC, .data_source = direction, .bvec = bvec, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_bvec); /** * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray * @i: The iterator to initialise. * @direction: The direction of the transfer. * @xarray: The xarray to access. * @start: The start file position. * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator to either draw data out of the pages attached to an * inode or to inject data into those pages. The pages *must* be prevented * from evaporation, either by taking a ref on them or locking them by the * caller. */ void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count) { BUG_ON(direction & ~1); *i = (struct iov_iter) { .iter_type = ITER_XARRAY, .data_source = direction, .xarray = xarray, .xarray_start = start, .count = count, .iov_offset = 0 }; } EXPORT_SYMBOL(iov_iter_xarray); /** * iov_iter_discard - Initialise an I/O iterator that discards data * @i: The iterator to initialise. * @direction: The direction of the transfer. * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator that just discards everything that's written to it. * It's only available as a READ iterator. */ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count) { BUG_ON(direction != READ); *i = (struct iov_iter){ .iter_type = ITER_DISCARD, .data_source = false, .count = count, .iov_offset = 0 }; } EXPORT_SYMBOL(iov_iter_discard); static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { const struct iovec *iov = iter_iov(i); size_t size = i->count; size_t skip = i->iov_offset; do { size_t len = iov->iov_len - skip; if (len > size) len = size; if (len & len_mask) return false; if ((unsigned long)(iov->iov_base + skip) & addr_mask) return false; iov++; size -= len; skip = 0; } while (size); return true; } static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { const struct bio_vec *bvec = i->bvec; unsigned skip = i->iov_offset; size_t size = i->count; do { size_t len = bvec->bv_len; if (len > size) len = size; if (len & len_mask) return false; if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) return false; bvec++; size -= len; skip = 0; } while (size); return true; } /** * iov_iter_is_aligned() - Check if the addresses and lengths of each segments * are aligned to the parameters. * * @i: &struct iov_iter to restore * @addr_mask: bit mask to check against the iov element's addresses * @len_mask: bit mask to check against the iov element's lengths * * Return: false if any addresses or lengths intersect with the provided masks */ bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { if (likely(iter_is_ubuf(i))) { if (i->count & len_mask) return false; if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask) return false; return true; } if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_iter_aligned_iovec(i, addr_mask, len_mask); if (iov_iter_is_bvec(i)) return iov_iter_aligned_bvec(i, addr_mask, len_mask); if (iov_iter_is_xarray(i)) { if (i->count & len_mask) return false; if ((i->xarray_start + i->iov_offset) & addr_mask) return false; } return true; } EXPORT_SYMBOL_GPL(iov_iter_is_aligned); static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) { const struct iovec *iov = iter_iov(i); unsigned long res = 0; size_t size = i->count; size_t skip = i->iov_offset; do { size_t len = iov->iov_len - skip; if (len) { res |= (unsigned long)iov->iov_base + skip; if (len > size) len = size; res |= len; size -= len; } iov++; skip = 0; } while (size); return res; } static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i) { const struct bio_vec *bvec = i->bvec; unsigned res = 0; size_t size = i->count; unsigned skip = i->iov_offset; do { size_t len = bvec->bv_len - skip; res |= (unsigned long)bvec->bv_offset + skip; if (len > size) len = size; res |= len; bvec++; size -= len; skip = 0; } while (size); return res; } unsigned long iov_iter_alignment(const struct iov_iter *i) { if (likely(iter_is_ubuf(i))) { size_t size = i->count; if (size) return ((unsigned long)i->ubuf + i->iov_offset) | size; return 0; } /* iovec and kvec have identical layouts */ if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_iter_alignment_iovec(i); if (iov_iter_is_bvec(i)) return iov_iter_alignment_bvec(i); if (iov_iter_is_xarray(i)) return (i->xarray_start + i->iov_offset) | i->count; return 0; } EXPORT_SYMBOL(iov_iter_alignment); unsigned long iov_iter_gap_alignment(const struct iov_iter *i) { unsigned long res = 0; unsigned long v = 0; size_t size = i->count; unsigned k; if (iter_is_ubuf(i)) return 0; if (WARN_ON(!iter_is_iovec(i))) return ~0U; for (k = 0; k < i->nr_segs; k++) { const struct iovec *iov = iter_iov(i) + k; if (iov->iov_len) { unsigned long base = (unsigned long)iov->iov_base; if (v) // if not the first one res |= base | v; // this start | previous end v = base + iov->iov_len; if (size <= iov->iov_len) break; size -= iov->iov_len; } } return res; } EXPORT_SYMBOL(iov_iter_gap_alignment); static int want_pages_array(struct page ***res, size_t size, size_t start, unsigned int maxpages) { unsigned int count = DIV_ROUND_UP(size + start, PAGE_SIZE); if (count > maxpages) count = maxpages; WARN_ON(!count); // caller should've prevented that if (!*res) { *res = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL); if (!*res) return 0; } return count; } static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa, pgoff_t index, unsigned int nr_pages) { XA_STATE(xas, xa, index); struct page *page; unsigned int ret = 0; rcu_read_lock(); for (page = xas_load(&xas); page; page = xas_next(&xas)) { if (xas_retry(&xas, page)) continue; /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) { xas_reset(&xas); continue; } pages[ret] = find_subpage(page, xas.xa_index); get_page(pages[ret]); if (++ret == nr_pages) break; } rcu_read_unlock(); return ret; } static ssize_t iter_xarray_get_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned maxpages, size_t *_start_offset) { unsigned nr, offset, count; pgoff_t index; loff_t pos; pos = i->xarray_start + i->iov_offset; index = pos >> PAGE_SHIFT; offset = pos & ~PAGE_MASK; *_start_offset = offset; count = want_pages_array(pages, maxsize, offset, maxpages); if (!count) return -ENOMEM; nr = iter_xarray_populate_pages(*pages, i->xarray, index, count); if (nr == 0) return 0; maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize); i->iov_offset += maxsize; i->count -= maxsize; return maxsize; } /* must be done on non-empty ITER_UBUF or ITER_IOVEC one */ static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size) { size_t skip; long k; if (iter_is_ubuf(i)) return (unsigned long)i->ubuf + i->iov_offset; for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) { const struct iovec *iov = iter_iov(i) + k; size_t len = iov->iov_len - skip; if (unlikely(!len)) continue; if (*size > len) *size = len; return (unsigned long)iov->iov_base + skip; } BUG(); // if it had been empty, we wouldn't get called } /* must be done on non-empty ITER_BVEC one */ static struct page *first_bvec_segment(const struct iov_iter *i, size_t *size, size_t *start) { struct page *page; size_t skip = i->iov_offset, len; len = i->bvec->bv_len - skip; if (*size > len) *size = len; skip += i->bvec->bv_offset; page = i->bvec->bv_page + skip / PAGE_SIZE; *start = skip % PAGE_SIZE; return page; } static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, size_t *start) { unsigned int n, gup_flags = 0; if (maxsize > i->count) maxsize = i->count; if (!maxsize) return 0; if (maxsize > MAX_RW_COUNT) maxsize = MAX_RW_COUNT; if (likely(user_backed_iter(i))) { unsigned long addr; int res; if (iov_iter_rw(i) != WRITE) gup_flags |= FOLL_WRITE; if (i->nofault) gup_flags |= FOLL_NOFAULT; addr = first_iovec_segment(i, &maxsize); *start = addr % PAGE_SIZE; addr &= PAGE_MASK; n = want_pages_array(pages, maxsize, *start, maxpages); if (!n) return -ENOMEM; res = get_user_pages_fast(addr, n, gup_flags, *pages); if (unlikely(res <= 0)) return res; maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start); iov_iter_advance(i, maxsize); return maxsize; } if (iov_iter_is_bvec(i)) { struct page **p; struct page *page; page = first_bvec_segment(i, &maxsize, start); n = want_pages_array(pages, maxsize, *start, maxpages); if (!n) return -ENOMEM; p = *pages; for (int k = 0; k < n; k++) get_page(p[k] = page + k); maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start); i->count -= maxsize; i->iov_offset += maxsize; if (i->iov_offset == i->bvec->bv_len) { i->iov_offset = 0; i->bvec++; i->nr_segs--; } return maxsize; } if (iov_iter_is_xarray(i)) return iter_xarray_get_pages(i, pages, maxsize, maxpages, start); return -EFAULT; } ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) { if (!maxpages) return 0; BUG_ON(!pages); return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start); } EXPORT_SYMBOL(iov_iter_get_pages2); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start) { ssize_t len; *pages = NULL; len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start); if (len <= 0) { kvfree(*pages); *pages = NULL; } return len; } EXPORT_SYMBOL(iov_iter_get_pages_alloc2); static int iov_npages(const struct iov_iter *i, int maxpages) { size_t skip = i->iov_offset, size = i->count; const struct iovec *p; int npages = 0; for (p = iter_iov(i); size; skip = 0, p++) { unsigned offs = offset_in_page(p->iov_base + skip); size_t len = min(p->iov_len - skip, size); if (len) { size -= len; npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); if (unlikely(npages > maxpages)) return maxpages; } } return npages; } static int bvec_npages(const struct iov_iter *i, int maxpages) { size_t skip = i->iov_offset, size = i->count; const struct bio_vec *p; int npages = 0; for (p = i->bvec; size; skip = 0, p++) { unsigned offs = (p->bv_offset + skip) % PAGE_SIZE; size_t len = min(p->bv_len - skip, size); size -= len; npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); if (unlikely(npages > maxpages)) return maxpages; } return npages; } int iov_iter_npages(const struct iov_iter *i, int maxpages) { if (unlikely(!i->count)) return 0; if (likely(iter_is_ubuf(i))) { unsigned offs = offset_in_page(i->ubuf + i->iov_offset); int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE); return min(npages, maxpages); } /* iovec and kvec have identical layouts */ if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_npages(i, maxpages); if (iov_iter_is_bvec(i)) return bvec_npages(i, maxpages); if (iov_iter_is_xarray(i)) { unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE; int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); return min(npages, maxpages); } return 0; } EXPORT_SYMBOL(iov_iter_npages); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) { *new = *old; if (iov_iter_is_bvec(new)) return new->bvec = kmemdup(new->bvec, new->nr_segs * sizeof(struct bio_vec), flags); else if (iov_iter_is_kvec(new) || iter_is_iovec(new)) /* iovec and kvec have identical layout */ return new->__iov = kmemdup(new->__iov, new->nr_segs * sizeof(struct iovec), flags); return NULL; } EXPORT_SYMBOL(dup_iter); static __noclone int copy_compat_iovec_from_user(struct iovec *iov, const struct iovec __user *uvec, u32 nr_segs) { const struct compat_iovec __user *uiov = (const struct compat_iovec __user *)uvec; int ret = -EFAULT; u32 i; if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) return -EFAULT; for (i = 0; i < nr_segs; i++) { compat_uptr_t buf; compat_ssize_t len; unsafe_get_user(len, &uiov[i].iov_len, uaccess_end); unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end); /* check for compat_size_t not fitting in compat_ssize_t .. */ if (len < 0) { ret = -EINVAL; goto uaccess_end; } iov[i].iov_base = compat_ptr(buf); iov[i].iov_len = len; } ret = 0; uaccess_end: user_access_end(); return ret; } static __noclone int copy_iovec_from_user(struct iovec *iov, const struct iovec __user *uiov, unsigned long nr_segs) { int ret = -EFAULT; if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) return -EFAULT; do { void __user *buf; ssize_t len; unsafe_get_user(len, &uiov->iov_len, uaccess_end); unsafe_get_user(buf, &uiov->iov_base, uaccess_end); /* check for size_t not fitting in ssize_t .. */ if (unlikely(len < 0)) { ret = -EINVAL; goto uaccess_end; } iov->iov_base = buf; iov->iov_len = len; uiov++; iov++; } while (--nr_segs); ret = 0; uaccess_end: user_access_end(); return ret; } struct iovec *iovec_from_user(const struct iovec __user *uvec, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_iov, bool compat) { struct iovec *iov = fast_iov; int ret; /* * SuS says "The readv() function *may* fail if the iovcnt argument was * less than or equal to 0, or greater than {IOV_MAX}. Linux has * traditionally returned zero for zero segments, so... */ if (nr_segs == 0) return iov; if (nr_segs > UIO_MAXIOV) return ERR_PTR(-EINVAL); if (nr_segs > fast_segs) { iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL); if (!iov) return ERR_PTR(-ENOMEM); } if (unlikely(compat)) ret = copy_compat_iovec_from_user(iov, uvec, nr_segs); else ret = copy_iovec_from_user(iov, uvec, nr_segs); if (ret) { if (iov != fast_iov) kfree(iov); return ERR_PTR(ret); } return iov; } /* * Single segment iovec supplied by the user, import it as ITER_UBUF. */ static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec, struct iovec **iovp, struct iov_iter *i, bool compat) { struct iovec *iov = *iovp; ssize_t ret; if (compat) ret = copy_compat_iovec_from_user(iov, uvec, 1); else ret = copy_iovec_from_user(iov, uvec, 1); if (unlikely(ret)) return ret; ret = import_ubuf(type, iov->iov_base, iov->iov_len, i); if (unlikely(ret)) return ret; *iovp = NULL; return i->count; } ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat) { ssize_t total_len = 0; unsigned long seg; struct iovec *iov; if (nr_segs == 1) return __import_iovec_ubuf(type, uvec, iovp, i, compat); iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat); if (IS_ERR(iov)) { *iovp = NULL; return PTR_ERR(iov); } /* * According to the Single Unix Specification we should return EINVAL if * an element length is < 0 when cast to ssize_t or if the total length * would overflow the ssize_t return value of the system call. * * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the * overflow case. */ for (seg = 0; seg < nr_segs; seg++) { ssize_t len = (ssize_t)iov[seg].iov_len; if (!access_ok(iov[seg].iov_base, len)) { if (iov != *iovp) kfree(iov); *iovp = NULL; return -EFAULT; } if (len > MAX_RW_COUNT - total_len) { len = MAX_RW_COUNT - total_len; iov[seg].iov_len = len; } total_len += len; } iov_iter_init(i, type, iov, nr_segs, total_len); if (iov == *iovp) *iovp = NULL; else *iovp = iov; return total_len; } /** * import_iovec() - Copy an array of &struct iovec from userspace * into the kernel, check that it is valid, and initialize a new * &struct iov_iter iterator to access it. * * @type: One of %READ or %WRITE. * @uvec: Pointer to the userspace array. * @nr_segs: Number of elements in userspace array. * @fast_segs: Number of elements in @iov. * @iovp: (input and output parameter) Pointer to pointer to (usually small * on-stack) kernel array. * @i: Pointer to iterator that will be initialized on success. * * If the array pointed to by *@iov is large enough to hold all @nr_segs, * then this function places %NULL in *@iov on return. Otherwise, a new * array will be allocated and the result placed in *@iov. This means that * the caller may call kfree() on *@iov regardless of whether the small * on-stack array was used or not (and regardless of whether this function * returns an error or not). * * Return: Negative error code on error, bytes imported on success */ ssize_t import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i) { return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i, in_compat_syscall()); } EXPORT_SYMBOL(import_iovec); int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i) { if (len > MAX_RW_COUNT) len = MAX_RW_COUNT; if (unlikely(!access_ok(buf, len))) return -EFAULT; iov_iter_ubuf(i, rw, buf, len); return 0; } EXPORT_SYMBOL_GPL(import_ubuf); /** * iov_iter_restore() - Restore a &struct iov_iter to the same state as when * iov_iter_save_state() was called. * * @i: &struct iov_iter to restore * @state: state to restore from * * Used after iov_iter_save_state() to bring restore @i, if operations may * have advanced it. * * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC */ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) { if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) && !iter_is_ubuf(i)) && !iov_iter_is_kvec(i)) return; i->iov_offset = state->iov_offset; i->count = state->count; if (iter_is_ubuf(i)) return; /* * For the *vec iters, nr_segs + iov is constant - if we increment * the vec, then we also decrement the nr_segs count. Hence we don't * need to track both of these, just one is enough and we can deduct * the other from that. ITER_KVEC and ITER_IOVEC are the same struct * size, so we can just increment the iov pointer as they are unionzed. * ITER_BVEC _may_ be the same size on some archs, but on others it is * not. Be safe and handle it separately. */ BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec)); if (iov_iter_is_bvec(i)) i->bvec -= state->nr_segs - i->nr_segs; else i->__iov -= state->nr_segs - i->nr_segs; i->nr_segs = state->nr_segs; } /* * Extract a list of contiguous pages from an ITER_XARRAY iterator. This does not * get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { struct page *page, **p; unsigned int nr = 0, offset; loff_t pos = i->xarray_start + i->iov_offset; pgoff_t index = pos >> PAGE_SHIFT; XA_STATE(xas, i->xarray, index); offset = pos & ~PAGE_MASK; *offset0 = offset; maxpages = want_pages_array(pages, maxsize, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; rcu_read_lock(); for (page = xas_load(&xas); page; page = xas_next(&xas)) { if (xas_retry(&xas, page)) continue; /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) { xas_reset(&xas); continue; } p[nr++] = find_subpage(page, xas.xa_index); if (nr == maxpages) break; } rcu_read_unlock(); maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize); iov_iter_advance(i, maxsize); return maxsize; } /* * Extract a list of contiguous pages from an ITER_BVEC iterator. This does * not get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { struct page **p, *page; size_t skip = i->iov_offset, offset, size; int k; for (;;) { if (i->nr_segs == 0) return 0; size = min(maxsize, i->bvec->bv_len - skip); if (size) break; i->iov_offset = 0; i->nr_segs--; i->bvec++; skip = 0; } skip += i->bvec->bv_offset; page = i->bvec->bv_page + skip / PAGE_SIZE; offset = skip % PAGE_SIZE; *offset0 = offset; maxpages = want_pages_array(pages, size, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; for (k = 0; k < maxpages; k++) p[k] = page + k; size = min_t(size_t, size, maxpages * PAGE_SIZE - offset); iov_iter_advance(i, size); return size; } /* * Extract a list of virtually contiguous pages from an ITER_KVEC iterator. * This does not get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { struct page **p, *page; const void *kaddr; size_t skip = i->iov_offset, offset, len, size; int k; for (;;) { if (i->nr_segs == 0) return 0; size = min(maxsize, i->kvec->iov_len - skip); if (size) break; i->iov_offset = 0; i->nr_segs--; i->kvec++; skip = 0; } kaddr = i->kvec->iov_base + skip; offset = (unsigned long)kaddr & ~PAGE_MASK; *offset0 = offset; maxpages = want_pages_array(pages, size, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; kaddr -= offset; len = offset + size; for (k = 0; k < maxpages; k++) { size_t seg = min_t(size_t, len, PAGE_SIZE); if (is_vmalloc_or_module_addr(kaddr)) page = vmalloc_to_page(kaddr); else page = virt_to_page(kaddr); p[k] = page; len -= seg; kaddr += PAGE_SIZE; } size = min_t(size_t, size, maxpages * PAGE_SIZE - offset); iov_iter_advance(i, size); return size; } /* * Extract a list of contiguous pages from a user iterator and get a pin on * each of them. This should only be used if the iterator is user-backed * (IOBUF/UBUF). * * It does not get refs on the pages, but the pages must be unpinned by the * caller once the transfer is complete. * * This is safe to be used where background IO/DMA *is* going to be modifying * the buffer; using a pin rather than a ref makes forces fork() to give the * child a copy of the page. */ static ssize_t iov_iter_extract_user_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { unsigned long addr; unsigned int gup_flags = 0; size_t offset; int res; if (i->data_source == ITER_DEST) gup_flags |= FOLL_WRITE; if (extraction_flags & ITER_ALLOW_P2PDMA) gup_flags |= FOLL_PCI_P2PDMA; if (i->nofault) gup_flags |= FOLL_NOFAULT; addr = first_iovec_segment(i, &maxsize); *offset0 = offset = addr % PAGE_SIZE; addr &= PAGE_MASK; maxpages = want_pages_array(pages, maxsize, offset, maxpages); if (!maxpages) return -ENOMEM; res = pin_user_pages_fast(addr, maxpages, gup_flags, *pages); if (unlikely(res <= 0)) return res; maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - offset); iov_iter_advance(i, maxsize); return maxsize; } /** * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator * @i: The iterator to extract from * @pages: Where to return the list of pages * @maxsize: The maximum amount of iterator to extract * @maxpages: The maximum size of the list of pages * @extraction_flags: Flags to qualify request * @offset0: Where to return the starting offset into (*@pages)[0] * * Extract a list of contiguous pages from the current point of the iterator, * advancing the iterator. The maximum number of pages and the maximum amount * of page contents can be set. * * If *@pages is NULL, a page list will be allocated to the required size and * *@pages will be set to its base. If *@pages is not NULL, it will be assumed * that the caller allocated a page list at least @maxpages in size and this * will be filled in. * * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA * be allowed on the pages extracted. * * The iov_iter_extract_will_pin() function can be used to query how cleanup * should be performed. * * Extra refs or pins on the pages may be obtained as follows: * * (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be * added to the pages, but refs will not be taken. * iov_iter_extract_will_pin() will return true. * * (*) If the iterator is ITER_KVEC, ITER_BVEC or ITER_XARRAY, the pages are * merely listed; no extra refs or pins are obtained. * iov_iter_extract_will_pin() will return 0. * * Note also: * * (*) Use with ITER_DISCARD is not supported as that has no content. * * On success, the function sets *@pages to the new pagelist, if allocated, and * sets *offset0 to the offset into the first page. * * It may also return -ENOMEM and -EFAULT. */ ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { maxsize = min_t(size_t, min_t(size_t, maxsize, i->count), MAX_RW_COUNT); if (!maxsize) return 0; if (likely(user_backed_iter(i))) return iov_iter_extract_user_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_kvec(i)) return iov_iter_extract_kvec_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_bvec(i)) return iov_iter_extract_bvec_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_xarray(i)) return iov_iter_extract_xarray_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); return -EFAULT; } EXPORT_SYMBOL_GPL(iov_iter_extract_pages); |
| 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Virtio PCI driver - common functionality for all device versions * * This module allows virtio devices to be used over a virtual PCI device. * This can be used with QEMU based VMMs like KVM or Xen. * * Copyright IBM Corp. 2007 * Copyright Red Hat, Inc. 2014 * * Authors: * Anthony Liguori <aliguori@us.ibm.com> * Rusty Russell <rusty@rustcorp.com.au> * Michael S. Tsirkin <mst@redhat.com> */ #include "virtio_pci_common.h" static bool force_legacy = false; #if IS_ENABLED(CONFIG_VIRTIO_PCI_LEGACY) module_param(force_legacy, bool, 0444); MODULE_PARM_DESC(force_legacy, "Force legacy mode for transitional virtio 1 devices"); #endif /* wait for pending irq handlers */ void vp_synchronize_vectors(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); int i; if (vp_dev->intx_enabled) synchronize_irq(vp_dev->pci_dev->irq); for (i = 0; i < vp_dev->msix_vectors; ++i) synchronize_irq(pci_irq_vector(vp_dev->pci_dev, i)); } /* the notify function used when creating a virt queue */ bool vp_notify(struct virtqueue *vq) { /* we write the queue's selector into the notification register to * signal the other end */ iowrite16(vq->index, (void __iomem *)vq->priv); return true; } /* Handle a configuration change: Tell driver if it wants to know. */ static irqreturn_t vp_config_changed(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; virtio_config_changed(&vp_dev->vdev); return IRQ_HANDLED; } /* Notify all virtqueues on an interrupt. */ static irqreturn_t vp_vring_interrupt(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; struct virtio_pci_vq_info *info; irqreturn_t ret = IRQ_NONE; unsigned long flags; spin_lock_irqsave(&vp_dev->lock, flags); list_for_each_entry(info, &vp_dev->virtqueues, node) { if (vring_interrupt(irq, info->vq) == IRQ_HANDLED) ret = IRQ_HANDLED; } spin_unlock_irqrestore(&vp_dev->lock, flags); return ret; } /* A small wrapper to also acknowledge the interrupt when it's handled. * I really need an EIO hook for the vring so I can ack the interrupt once we * know that we'll be handling the IRQ but before we invoke the callback since * the callback may notify the host which results in the host attempting to * raise an interrupt that we would then mask once we acknowledged the * interrupt. */ static irqreturn_t vp_interrupt(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; u8 isr; /* reading the ISR has the effect of also clearing it so it's very * important to save off the value. */ isr = ioread8(vp_dev->isr); /* It's definitely not us if the ISR was not high */ if (!isr) return IRQ_NONE; /* Configuration change? Tell driver if it wants to know. */ if (isr & VIRTIO_PCI_ISR_CONFIG) vp_config_changed(irq, opaque); return vp_vring_interrupt(irq, opaque); } static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, bool per_vq_vectors, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); const char *name = dev_name(&vp_dev->vdev.dev); unsigned int flags = PCI_IRQ_MSIX; unsigned int i, v; int err = -ENOMEM; vp_dev->msix_vectors = nvectors; vp_dev->msix_names = kmalloc_array(nvectors, sizeof(*vp_dev->msix_names), GFP_KERNEL); if (!vp_dev->msix_names) goto error; vp_dev->msix_affinity_masks = kcalloc(nvectors, sizeof(*vp_dev->msix_affinity_masks), GFP_KERNEL); if (!vp_dev->msix_affinity_masks) goto error; for (i = 0; i < nvectors; ++i) if (!alloc_cpumask_var(&vp_dev->msix_affinity_masks[i], GFP_KERNEL)) goto error; if (desc) { flags |= PCI_IRQ_AFFINITY; desc->pre_vectors++; /* virtio config vector */ } err = pci_alloc_irq_vectors_affinity(vp_dev->pci_dev, nvectors, nvectors, flags, desc); if (err < 0) goto error; vp_dev->msix_enabled = 1; /* Set the vector used for configuration */ v = vp_dev->msix_used_vectors; snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, "%s-config", name); err = request_irq(pci_irq_vector(vp_dev->pci_dev, v), vp_config_changed, 0, vp_dev->msix_names[v], vp_dev); if (err) goto error; ++vp_dev->msix_used_vectors; v = vp_dev->config_vector(vp_dev, v); /* Verify we had enough resources to assign the vector */ if (v == VIRTIO_MSI_NO_VECTOR) { err = -EBUSY; goto error; } if (!per_vq_vectors) { /* Shared vector for all VQs */ v = vp_dev->msix_used_vectors; snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, "%s-virtqueues", name); err = request_irq(pci_irq_vector(vp_dev->pci_dev, v), vp_vring_interrupt, 0, vp_dev->msix_names[v], vp_dev); if (err) goto error; ++vp_dev->msix_used_vectors; } return 0; error: return err; } static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned int index, void (*callback)(struct virtqueue *vq), const char *name, bool ctx, u16 msix_vec) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info = kmalloc(sizeof *info, GFP_KERNEL); struct virtqueue *vq; unsigned long flags; /* fill out our structure that represents an active queue */ if (!info) return ERR_PTR(-ENOMEM); vq = vp_dev->setup_vq(vp_dev, info, index, callback, name, ctx, msix_vec); if (IS_ERR(vq)) goto out_info; info->vq = vq; if (callback) { spin_lock_irqsave(&vp_dev->lock, flags); list_add(&info->node, &vp_dev->virtqueues); spin_unlock_irqrestore(&vp_dev->lock, flags); } else { INIT_LIST_HEAD(&info->node); } vp_dev->vqs[index] = info; return vq; out_info: kfree(info); return vq; } static void vp_del_vq(struct virtqueue *vq) { struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); struct virtio_pci_vq_info *info = vp_dev->vqs[vq->index]; unsigned long flags; /* * If it fails during re-enable reset vq. This way we won't rejoin * info->node to the queue. Prevent unexpected irqs. */ if (!vq->reset) { spin_lock_irqsave(&vp_dev->lock, flags); list_del(&info->node); spin_unlock_irqrestore(&vp_dev->lock, flags); } vp_dev->del_vq(info); kfree(info); } /* the config->del_vqs() implementation */ void vp_del_vqs(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue *vq, *n; int i; list_for_each_entry_safe(vq, n, &vdev->vqs, list) { if (vp_dev->is_avq && vp_dev->is_avq(vdev, vq->index)) continue; if (vp_dev->per_vq_vectors) { int v = vp_dev->vqs[vq->index]->msix_vector; if (v != VIRTIO_MSI_NO_VECTOR) { int irq = pci_irq_vector(vp_dev->pci_dev, v); irq_update_affinity_hint(irq, NULL); free_irq(irq, vq); } } vp_del_vq(vq); } vp_dev->per_vq_vectors = false; if (vp_dev->intx_enabled) { free_irq(vp_dev->pci_dev->irq, vp_dev); vp_dev->intx_enabled = 0; } for (i = 0; i < vp_dev->msix_used_vectors; ++i) free_irq(pci_irq_vector(vp_dev->pci_dev, i), vp_dev); if (vp_dev->msix_affinity_masks) { for (i = 0; i < vp_dev->msix_vectors; i++) free_cpumask_var(vp_dev->msix_affinity_masks[i]); } if (vp_dev->msix_enabled) { /* Disable the vector used for configuration */ vp_dev->config_vector(vp_dev, VIRTIO_MSI_NO_VECTOR); pci_free_irq_vectors(vp_dev->pci_dev); vp_dev->msix_enabled = 0; } vp_dev->msix_vectors = 0; vp_dev->msix_used_vectors = 0; kfree(vp_dev->msix_names); vp_dev->msix_names = NULL; kfree(vp_dev->msix_affinity_masks); vp_dev->msix_affinity_masks = NULL; kfree(vp_dev->vqs); vp_dev->vqs = NULL; } static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], const char * const names[], bool per_vq_vectors, const bool *ctx, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); u16 msix_vec; int i, err, nvectors, allocated_vectors, queue_idx = 0; vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); if (!vp_dev->vqs) return -ENOMEM; if (per_vq_vectors) { /* Best option: one for change interrupt, one per vq. */ nvectors = 1; for (i = 0; i < nvqs; ++i) if (names[i] && callbacks[i]) ++nvectors; } else { /* Second best: one for change, shared for all vqs. */ nvectors = 2; } err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors, per_vq_vectors ? desc : NULL); if (err) goto error_find; vp_dev->per_vq_vectors = per_vq_vectors; allocated_vectors = vp_dev->msix_used_vectors; for (i = 0; i < nvqs; ++i) { if (!names[i]) { vqs[i] = NULL; continue; } if (!callbacks[i]) msix_vec = VIRTIO_MSI_NO_VECTOR; else if (vp_dev->per_vq_vectors) msix_vec = allocated_vectors++; else msix_vec = VP_MSIX_VQ_VECTOR; vqs[i] = vp_setup_vq(vdev, queue_idx++, callbacks[i], names[i], ctx ? ctx[i] : false, msix_vec); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto error_find; } if (!vp_dev->per_vq_vectors || msix_vec == VIRTIO_MSI_NO_VECTOR) continue; /* allocate per-vq irq if available and necessary */ snprintf(vp_dev->msix_names[msix_vec], sizeof *vp_dev->msix_names, "%s-%s", dev_name(&vp_dev->vdev.dev), names[i]); err = request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec), vring_interrupt, 0, vp_dev->msix_names[msix_vec], vqs[i]); if (err) { vp_del_vq(vqs[i]); goto error_find; } } return 0; error_find: vp_del_vqs(vdev); return err; } static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], const char * const names[], const bool *ctx) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); int i, err, queue_idx = 0; vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); if (!vp_dev->vqs) return -ENOMEM; err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED, dev_name(&vdev->dev), vp_dev); if (err) goto out_del_vqs; vp_dev->intx_enabled = 1; vp_dev->per_vq_vectors = false; for (i = 0; i < nvqs; ++i) { if (!names[i]) { vqs[i] = NULL; continue; } vqs[i] = vp_setup_vq(vdev, queue_idx++, callbacks[i], names[i], ctx ? ctx[i] : false, VIRTIO_MSI_NO_VECTOR); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto out_del_vqs; } } return 0; out_del_vqs: vp_del_vqs(vdev); return err; } /* the config->find_vqs() implementation */ int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], const char * const names[], const bool *ctx, struct irq_affinity *desc) { int err; /* Try MSI-X with one vector per queue. */ err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, true, ctx, desc); if (!err) return 0; /* Fallback: MSI-X with one vector for config, one shared for queues. */ err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, false, ctx, desc); if (!err) return 0; /* Is there an interrupt? If not give up. */ if (!(to_vp_device(vdev)->pci_dev->irq)) return err; /* Finally fall back to regular interrupts. */ return vp_find_vqs_intx(vdev, nvqs, vqs, callbacks, names, ctx); } const char *vp_bus_name(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); return pci_name(vp_dev->pci_dev); } /* Setup the affinity for a virtqueue: * - force the affinity for per vq vector * - OR over all affinities for shared MSI * - ignore the affinity request if we're using INTX */ int vp_set_vq_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask) { struct virtio_device *vdev = vq->vdev; struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info = vp_dev->vqs[vq->index]; struct cpumask *mask; unsigned int irq; if (!vq->callback) return -EINVAL; if (vp_dev->msix_enabled) { mask = vp_dev->msix_affinity_masks[info->msix_vector]; irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector); if (!cpu_mask) irq_update_affinity_hint(irq, NULL); else { cpumask_copy(mask, cpu_mask); irq_set_affinity_and_hint(irq, mask); } } return 0; } const struct cpumask *vp_get_vq_affinity(struct virtio_device *vdev, int index) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); if (!vp_dev->per_vq_vectors || vp_dev->vqs[index]->msix_vector == VIRTIO_MSI_NO_VECTOR) return NULL; return pci_irq_get_affinity(vp_dev->pci_dev, vp_dev->vqs[index]->msix_vector); } #ifdef CONFIG_PM_SLEEP static int virtio_pci_freeze(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret; ret = virtio_device_freeze(&vp_dev->vdev); if (!ret) pci_disable_device(pci_dev); return ret; } static int virtio_pci_restore(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret; ret = pci_enable_device(pci_dev); if (ret) return ret; pci_set_master(pci_dev); return virtio_device_restore(&vp_dev->vdev); } static bool vp_supports_pm_no_reset(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); u16 pmcsr; if (!pci_dev->pm_cap) return false; pci_read_config_word(pci_dev, pci_dev->pm_cap + PCI_PM_CTRL, &pmcsr); if (PCI_POSSIBLE_ERROR(pmcsr)) { dev_err(dev, "Unable to query pmcsr"); return false; } return pmcsr & PCI_PM_CTRL_NO_SOFT_RESET; } static int virtio_pci_suspend(struct device *dev) { return vp_supports_pm_no_reset(dev) ? 0 : virtio_pci_freeze(dev); } static int virtio_pci_resume(struct device *dev) { return vp_supports_pm_no_reset(dev) ? 0 : virtio_pci_restore(dev); } static const struct dev_pm_ops virtio_pci_pm_ops = { .suspend = virtio_pci_suspend, .resume = virtio_pci_resume, .freeze = virtio_pci_freeze, .thaw = virtio_pci_restore, .poweroff = virtio_pci_freeze, .restore = virtio_pci_restore, }; #endif /* Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF. */ static const struct pci_device_id virtio_pci_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_REDHAT_QUMRANET, PCI_ANY_ID) }, { 0 } }; MODULE_DEVICE_TABLE(pci, virtio_pci_id_table); static void virtio_pci_release_dev(struct device *_d) { struct virtio_device *vdev = dev_to_virtio(_d); struct virtio_pci_device *vp_dev = to_vp_device(vdev); /* As struct device is a kobject, it's not safe to * free the memory (including the reference counter itself) * until it's release callback. */ kfree(vp_dev); } static int virtio_pci_probe(struct pci_dev *pci_dev, const struct pci_device_id *id) { struct virtio_pci_device *vp_dev, *reg_dev = NULL; int rc; /* allocate our structure and fill it out */ vp_dev = kzalloc(sizeof(struct virtio_pci_device), GFP_KERNEL); if (!vp_dev) return -ENOMEM; pci_set_drvdata(pci_dev, vp_dev); vp_dev->vdev.dev.parent = &pci_dev->dev; vp_dev->vdev.dev.release = virtio_pci_release_dev; vp_dev->pci_dev = pci_dev; INIT_LIST_HEAD(&vp_dev->virtqueues); spin_lock_init(&vp_dev->lock); /* enable the device */ rc = pci_enable_device(pci_dev); if (rc) goto err_enable_device; if (force_legacy) { rc = virtio_pci_legacy_probe(vp_dev); /* Also try modern mode if we can't map BAR0 (no IO space). */ if (rc == -ENODEV || rc == -ENOMEM) rc = virtio_pci_modern_probe(vp_dev); if (rc) goto err_probe; } else { rc = virtio_pci_modern_probe(vp_dev); if (rc == -ENODEV) rc = virtio_pci_legacy_probe(vp_dev); if (rc) goto err_probe; } pci_set_master(pci_dev); rc = register_virtio_device(&vp_dev->vdev); reg_dev = vp_dev; if (rc) goto err_register; return 0; err_register: if (vp_dev->is_legacy) virtio_pci_legacy_remove(vp_dev); else virtio_pci_modern_remove(vp_dev); err_probe: pci_disable_device(pci_dev); err_enable_device: if (reg_dev) put_device(&vp_dev->vdev.dev); else kfree(vp_dev); return rc; } static void virtio_pci_remove(struct pci_dev *pci_dev) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); struct device *dev = get_device(&vp_dev->vdev.dev); /* * Device is marked broken on surprise removal so that virtio upper * layers can abort any ongoing operation. */ if (!pci_device_is_present(pci_dev)) virtio_break_device(&vp_dev->vdev); pci_disable_sriov(pci_dev); unregister_virtio_device(&vp_dev->vdev); if (vp_dev->is_legacy) virtio_pci_legacy_remove(vp_dev); else virtio_pci_modern_remove(vp_dev); pci_disable_device(pci_dev); put_device(dev); } static int virtio_pci_sriov_configure(struct pci_dev *pci_dev, int num_vfs) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); struct virtio_device *vdev = &vp_dev->vdev; int ret; if (!(vdev->config->get_status(vdev) & VIRTIO_CONFIG_S_DRIVER_OK)) return -EBUSY; if (!__virtio_test_bit(vdev, VIRTIO_F_SR_IOV)) return -EINVAL; if (pci_vfs_assigned(pci_dev)) return -EPERM; if (num_vfs == 0) { pci_disable_sriov(pci_dev); return 0; } ret = pci_enable_sriov(pci_dev, num_vfs); if (ret < 0) return ret; return num_vfs; } static struct pci_driver virtio_pci_driver = { .name = "virtio-pci", .id_table = virtio_pci_id_table, .probe = virtio_pci_probe, .remove = virtio_pci_remove, #ifdef CONFIG_PM_SLEEP .driver.pm = &virtio_pci_pm_ops, #endif .sriov_configure = virtio_pci_sriov_configure, }; struct virtio_device *virtio_pci_vf_get_pf_dev(struct pci_dev *pdev) { struct virtio_pci_device *pf_vp_dev; pf_vp_dev = pci_iov_get_pf_drvdata(pdev, &virtio_pci_driver); if (IS_ERR(pf_vp_dev)) return NULL; return &pf_vp_dev->vdev; } module_pci_driver(virtio_pci_driver); MODULE_AUTHOR("Anthony Liguori <aliguori@us.ibm.com>"); MODULE_DESCRIPTION("virtio-pci"); MODULE_LICENSE("GPL"); MODULE_VERSION("1"); |
| 3 157 4 4 1 3 3 3 2 3 1 2 2 4 1 14 14 1 12 1 1 1 3 2 3 1 1 1 1 1 1 3 2 82 52 53 53 85 84 7 4 3 3 1 2 1 1 1 3 2 1 1 2 2 1 1 1 53 1 1 2 3 172 169 172 9 9 8 8 170 154 155 157 149 151 15 14 14 14 51 1 1 3 30 30 151 56 54 53 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kmod.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/net_tstamp.h> #include <linux/phylib_stubs.h> #include <linux/wireless.h> #include <linux/if_bridge.h> #include <net/dsa_stubs.h> #include <net/wext.h> #include "dev.h" /* * Map an interface index to its name (SIOCGIFNAME) */ /* * We need this ioctl for efficient implementation of the * if_indextoname() function required by the IPv6 API. Without * it, we would have to search all the interfaces to find a * match. --pb */ static int dev_ifname(struct net *net, struct ifreq *ifr) { ifr->ifr_name[IFNAMSIZ-1] = 0; return netdev_get_name(net, ifr->ifr_name, ifr->ifr_ifindex); } /* * Perform a SIOCGIFCONF call. This structure will change * size eventually, and there is nothing I can do about it. * Thus we will need a 'compatibility mode'. */ int dev_ifconf(struct net *net, struct ifconf __user *uifc) { struct net_device *dev; void __user *pos; size_t size; int len, total = 0, done; /* both the ifconf and the ifreq structures are slightly different */ if (in_compat_syscall()) { struct compat_ifconf ifc32; if (copy_from_user(&ifc32, uifc, sizeof(struct compat_ifconf))) return -EFAULT; pos = compat_ptr(ifc32.ifcbuf); len = ifc32.ifc_len; size = sizeof(struct compat_ifreq); } else { struct ifconf ifc; if (copy_from_user(&ifc, uifc, sizeof(struct ifconf))) return -EFAULT; pos = ifc.ifc_buf; len = ifc.ifc_len; size = sizeof(struct ifreq); } /* Loop over the interfaces, and write an info block for each. */ rtnl_lock(); for_each_netdev(net, dev) { if (!pos) done = inet_gifconf(dev, NULL, 0, size); else done = inet_gifconf(dev, pos + total, len - total, size); if (done < 0) { rtnl_unlock(); return -EFAULT; } total += done; } rtnl_unlock(); return put_user(total, &uifc->ifc_len); } static int dev_getifmap(struct net_device *dev, struct ifreq *ifr) { struct ifmap *ifmap = &ifr->ifr_map; if (in_compat_syscall()) { struct compat_ifmap *cifmap = (struct compat_ifmap *)ifmap; cifmap->mem_start = dev->mem_start; cifmap->mem_end = dev->mem_end; cifmap->base_addr = dev->base_addr; cifmap->irq = dev->irq; cifmap->dma = dev->dma; cifmap->port = dev->if_port; return 0; } ifmap->mem_start = dev->mem_start; ifmap->mem_end = dev->mem_end; ifmap->base_addr = dev->base_addr; ifmap->irq = dev->irq; ifmap->dma = dev->dma; ifmap->port = dev->if_port; return 0; } static int dev_setifmap(struct net_device *dev, struct ifreq *ifr) { struct compat_ifmap *cifmap = (struct compat_ifmap *)&ifr->ifr_map; if (!dev->netdev_ops->ndo_set_config) return -EOPNOTSUPP; if (in_compat_syscall()) { struct ifmap ifmap = { .mem_start = cifmap->mem_start, .mem_end = cifmap->mem_end, .base_addr = cifmap->base_addr, .irq = cifmap->irq, .dma = cifmap->dma, .port = cifmap->port, }; return dev->netdev_ops->ndo_set_config(dev, &ifmap); } return dev->netdev_ops->ndo_set_config(dev, &ifr->ifr_map); } /* * Perform the SIOCxIFxxx calls, inside rcu_read_lock() */ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) { int err; struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name); if (!dev) return -ENODEV; switch (cmd) { case SIOCGIFFLAGS: /* Get interface flags */ ifr->ifr_flags = (short) dev_get_flags(dev); return 0; case SIOCGIFMETRIC: /* Get the metric on the interface (currently unused) */ ifr->ifr_metric = 0; return 0; case SIOCGIFMTU: /* Get the MTU of a device */ ifr->ifr_mtu = dev->mtu; return 0; case SIOCGIFSLAVE: err = -EINVAL; break; case SIOCGIFMAP: return dev_getifmap(dev, ifr); case SIOCGIFINDEX: ifr->ifr_ifindex = dev->ifindex; return 0; case SIOCGIFTXQLEN: ifr->ifr_qlen = dev->tx_queue_len; return 0; default: /* dev_ioctl() should ensure this case * is never reached */ WARN_ON(1); err = -ENOTTY; break; } return err; } static int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg) { enum hwtstamp_tx_types tx_type; enum hwtstamp_rx_filters rx_filter; int tx_type_valid = 0; int rx_filter_valid = 0; if (cfg->flags & ~HWTSTAMP_FLAG_MASK) return -EINVAL; tx_type = cfg->tx_type; rx_filter = cfg->rx_filter; switch (tx_type) { case HWTSTAMP_TX_OFF: case HWTSTAMP_TX_ON: case HWTSTAMP_TX_ONESTEP_SYNC: case HWTSTAMP_TX_ONESTEP_P2P: tx_type_valid = 1; break; case __HWTSTAMP_TX_CNT: /* not a real value */ break; } switch (rx_filter) { case HWTSTAMP_FILTER_NONE: case HWTSTAMP_FILTER_ALL: case HWTSTAMP_FILTER_SOME: case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: case HWTSTAMP_FILTER_PTP_V2_EVENT: case HWTSTAMP_FILTER_PTP_V2_SYNC: case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: case HWTSTAMP_FILTER_NTP_ALL: rx_filter_valid = 1; break; case __HWTSTAMP_FILTER_CNT: /* not a real value */ break; } if (!tx_type_valid || !rx_filter_valid) return -ERANGE; return 0; } static int dev_eth_ioctl(struct net_device *dev, struct ifreq *ifr, unsigned int cmd) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_eth_ioctl) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; return ops->ndo_eth_ioctl(dev, ifr, cmd); } /** * dev_get_hwtstamp_phylib() - Get hardware timestamping settings of NIC * or of attached phylib PHY * @dev: Network device * @cfg: Timestamping configuration structure * * Helper for enforcing a common policy that phylib timestamping, if available, * should take precedence in front of hardware timestamping provided by the * netdev. * * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), and * there only exists a phydev->mii_ts->hwtstamp() method. So this will return * -EOPNOTSUPP for phylib for now, which is still more accurate than letting * the netdev handle the GET request. */ static int dev_get_hwtstamp_phylib(struct net_device *dev, struct kernel_hwtstamp_config *cfg) { if (phy_has_hwtstamp(dev->phydev)) return phy_hwtstamp_get(dev->phydev, cfg); return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); } static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) { const struct net_device_ops *ops = dev->netdev_ops; struct kernel_hwtstamp_config kernel_cfg = {}; struct hwtstamp_config cfg; int err; if (!ops->ndo_hwtstamp_get) return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP); /* legacy */ if (!netif_device_present(dev)) return -ENODEV; kernel_cfg.ifr = ifr; err = dev_get_hwtstamp_phylib(dev, &kernel_cfg); if (err) return err; /* If the request was resolved through an unconverted driver, omit * the copy_to_user(), since the implementation has already done that */ if (!kernel_cfg.copied_to_user) { hwtstamp_config_from_kernel(&cfg, &kernel_cfg); if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) return -EFAULT; } return 0; } /** * dev_set_hwtstamp_phylib() - Change hardware timestamping of NIC * or of attached phylib PHY * @dev: Network device * @cfg: Timestamping configuration structure * @extack: Netlink extended ack message structure, for error reporting * * Helper for enforcing a common policy that phylib timestamping, if available, * should take precedence in front of hardware timestamping provided by the * netdev. If the netdev driver needs to perform specific actions even for PHY * timestamping to work properly (a switch port must trap the timestamped * frames and not forward them), it must set IFF_SEE_ALL_HWTSTAMP_REQUESTS in * dev->priv_flags. */ int dev_set_hwtstamp_phylib(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; bool phy_ts = phy_has_hwtstamp(dev->phydev); struct kernel_hwtstamp_config old_cfg = {}; bool changed = false; int err; cfg->source = phy_ts ? HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV; if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { err = ops->ndo_hwtstamp_get(dev, &old_cfg); if (err) return err; } if (!phy_ts || (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { err = ops->ndo_hwtstamp_set(dev, cfg, extack); if (err) { if (extack->_msg) netdev_err(dev, "%s\n", extack->_msg); return err; } } if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) changed = kernel_hwtstamp_config_changed(&old_cfg, cfg); if (phy_ts) { err = phy_hwtstamp_set(dev->phydev, cfg, extack); if (err) { if (changed) ops->ndo_hwtstamp_set(dev, &old_cfg, NULL); return err; } } return 0; } EXPORT_SYMBOL_GPL(dev_set_hwtstamp_phylib); static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) { const struct net_device_ops *ops = dev->netdev_ops; struct kernel_hwtstamp_config kernel_cfg = {}; struct netlink_ext_ack extack = {}; struct hwtstamp_config cfg; int err; if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) return -EFAULT; hwtstamp_config_to_kernel(&kernel_cfg, &cfg); kernel_cfg.ifr = ifr; err = net_hwtstamp_validate(&kernel_cfg); if (err) return err; err = dsa_conduit_hwtstamp_validate(dev, &kernel_cfg, &extack); if (err) { if (extack._msg) netdev_err(dev, "%s\n", extack._msg); return err; } if (!ops->ndo_hwtstamp_set) return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP); /* legacy */ if (!netif_device_present(dev)) return -ENODEV; err = dev_set_hwtstamp_phylib(dev, &kernel_cfg, &extack); if (err) return err; /* The driver may have modified the configuration, so copy the * updated version of it back to user space */ if (!kernel_cfg.copied_to_user) { hwtstamp_config_from_kernel(&cfg, &kernel_cfg); if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) return -EFAULT; } return 0; } static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd, struct kernel_hwtstamp_config *kernel_cfg) { struct ifreq ifrr; int err; strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ); ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru; err = dev_eth_ioctl(dev, &ifrr, cmd); if (err) return err; kernel_cfg->ifr->ifr_ifru = ifrr.ifr_ifru; kernel_cfg->copied_to_user = true; return 0; } int generic_hwtstamp_get_lower(struct net_device *dev, struct kernel_hwtstamp_config *kernel_cfg) { const struct net_device_ops *ops = dev->netdev_ops; if (!netif_device_present(dev)) return -ENODEV; if (ops->ndo_hwtstamp_get) return dev_get_hwtstamp_phylib(dev, kernel_cfg); /* Legacy path: unconverted lower driver */ return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg); } EXPORT_SYMBOL(generic_hwtstamp_get_lower); int generic_hwtstamp_set_lower(struct net_device *dev, struct kernel_hwtstamp_config *kernel_cfg, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; if (!netif_device_present(dev)) return -ENODEV; if (ops->ndo_hwtstamp_set) return dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); /* Legacy path: unconverted lower driver */ return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg); } EXPORT_SYMBOL(generic_hwtstamp_set_lower); static int dev_siocbond(struct net_device *dev, struct ifreq *ifr, unsigned int cmd) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_siocbond) { if (netif_device_present(dev)) return ops->ndo_siocbond(dev, ifr, cmd); else return -ENODEV; } return -EOPNOTSUPP; } static int dev_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, unsigned int cmd) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_siocdevprivate) { if (netif_device_present(dev)) return ops->ndo_siocdevprivate(dev, ifr, data, cmd); else return -ENODEV; } return -EOPNOTSUPP; } static int dev_siocwandev(struct net_device *dev, struct if_settings *ifs) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_siocwandev) { if (netif_device_present(dev)) return ops->ndo_siocwandev(dev, ifs); else return -ENODEV; } return -EOPNOTSUPP; } /* * Perform the SIOCxIFxxx calls, inside rtnl_lock() */ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, unsigned int cmd) { int err; struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); const struct net_device_ops *ops; netdevice_tracker dev_tracker; if (!dev) return -ENODEV; ops = dev->netdev_ops; switch (cmd) { case SIOCSIFFLAGS: /* Set interface flags */ return dev_change_flags(dev, ifr->ifr_flags, NULL); case SIOCSIFMETRIC: /* Set the metric on the interface (currently unused) */ return -EOPNOTSUPP; case SIOCSIFMTU: /* Set the MTU of a device */ return dev_set_mtu(dev, ifr->ifr_mtu); case SIOCSIFHWADDR: if (dev->addr_len > sizeof(struct sockaddr)) return -EINVAL; return dev_set_mac_address_user(dev, &ifr->ifr_hwaddr, NULL); case SIOCSIFHWBROADCAST: if (ifr->ifr_hwaddr.sa_family != dev->type) return -EINVAL; memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, min(sizeof(ifr->ifr_hwaddr.sa_data_min), (size_t)dev->addr_len)); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return 0; case SIOCSIFMAP: return dev_setifmap(dev, ifr); case SIOCADDMULTI: if (!ops->ndo_set_rx_mode || ifr->ifr_hwaddr.sa_family != AF_UNSPEC) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); case SIOCDELMULTI: if (!ops->ndo_set_rx_mode || ifr->ifr_hwaddr.sa_family != AF_UNSPEC) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); case SIOCSIFTXQLEN: if (ifr->ifr_qlen < 0) return -EINVAL; return dev_change_tx_queue_len(dev, ifr->ifr_qlen); case SIOCSIFNAME: ifr->ifr_newname[IFNAMSIZ-1] = '\0'; return dev_change_name(dev, ifr->ifr_newname); case SIOCWANDEV: return dev_siocwandev(dev, &ifr->ifr_settings); case SIOCBRADDIF: case SIOCBRDELIF: if (!netif_device_present(dev)) return -ENODEV; if (!netif_is_bridge_master(dev)) return -EOPNOTSUPP; netdev_hold(dev, &dev_tracker, GFP_KERNEL); rtnl_unlock(); err = br_ioctl_call(net, netdev_priv(dev), cmd, ifr, NULL); netdev_put(dev, &dev_tracker); rtnl_lock(); return err; case SIOCDEVPRIVATE ... SIOCDEVPRIVATE + 15: return dev_siocdevprivate(dev, ifr, data, cmd); case SIOCSHWTSTAMP: return dev_set_hwtstamp(dev, ifr); case SIOCGHWTSTAMP: return dev_get_hwtstamp(dev, ifr); case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: return dev_eth_ioctl(dev, ifr, cmd); case SIOCBONDENSLAVE: case SIOCBONDRELEASE: case SIOCBONDSETHWADDR: case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: case SIOCBONDCHANGEACTIVE: return dev_siocbond(dev, ifr, cmd); /* Unknown ioctl */ default: err = -EINVAL; } return err; } /** * dev_load - load a network module * @net: the applicable net namespace * @name: name of interface * * If a network interface is not present and the process has suitable * privileges this function loads the module. If module loading is not * available in this kernel then it becomes a nop. */ void dev_load(struct net *net, const char *name) { struct net_device *dev; int no_module; rcu_read_lock(); dev = dev_get_by_name_rcu(net, name); rcu_read_unlock(); no_module = !dev; if (no_module && capable(CAP_NET_ADMIN)) no_module = request_module("netdev-%s", name); if (no_module && capable(CAP_SYS_MODULE)) request_module("%s", name); } EXPORT_SYMBOL(dev_load); /* * This function handles all "interface"-type I/O control requests. The actual * 'doing' part of this is dev_ifsioc above. */ /** * dev_ioctl - network device ioctl * @net: the applicable net namespace * @cmd: command to issue * @ifr: pointer to a struct ifreq in user space * @data: data exchanged with userspace * @need_copyout: whether or not copy_to_user() should be called * * Issue ioctl functions to devices. This is normally called by the * user space syscall interfaces but can sometimes be useful for * other purposes. The return value is the return from the syscall if * positive or a negative errno code on error. */ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, void __user *data, bool *need_copyout) { int ret; char *colon; if (need_copyout) *need_copyout = true; if (cmd == SIOCGIFNAME) return dev_ifname(net, ifr); ifr->ifr_name[IFNAMSIZ-1] = 0; colon = strchr(ifr->ifr_name, ':'); if (colon) *colon = 0; /* * See which interface the caller is talking about. */ switch (cmd) { case SIOCGIFHWADDR: dev_load(net, ifr->ifr_name); ret = dev_get_mac_address(&ifr->ifr_hwaddr, net, ifr->ifr_name); if (colon) *colon = ':'; return ret; /* * These ioctl calls: * - can be done by all. * - atomic and do not require locking. * - return a value */ case SIOCGIFFLAGS: case SIOCGIFMETRIC: case SIOCGIFMTU: case SIOCGIFSLAVE: case SIOCGIFMAP: case SIOCGIFINDEX: case SIOCGIFTXQLEN: dev_load(net, ifr->ifr_name); rcu_read_lock(); ret = dev_ifsioc_locked(net, ifr, cmd); rcu_read_unlock(); if (colon) *colon = ':'; return ret; case SIOCETHTOOL: dev_load(net, ifr->ifr_name); ret = dev_ethtool(net, ifr, data); if (colon) *colon = ':'; return ret; /* * These ioctl calls: * - require superuser power. * - require strict serialization. * - return a value */ case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSIFNAME: dev_load(net, ifr->ifr_name); if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; rtnl_lock(); ret = dev_ifsioc(net, ifr, data, cmd); rtnl_unlock(); if (colon) *colon = ':'; return ret; /* * These ioctl calls: * - require superuser power. * - require strict serialization. * - do not return a value */ case SIOCSIFMAP: case SIOCSIFTXQLEN: if (!capable(CAP_NET_ADMIN)) return -EPERM; fallthrough; /* * These ioctl calls: * - require local superuser power. * - require strict serialization. * - do not return a value */ case SIOCSIFFLAGS: case SIOCSIFMETRIC: case SIOCSIFMTU: case SIOCSIFHWADDR: case SIOCSIFSLAVE: case SIOCADDMULTI: case SIOCDELMULTI: case SIOCSIFHWBROADCAST: case SIOCSMIIREG: case SIOCBONDENSLAVE: case SIOCBONDRELEASE: case SIOCBONDSETHWADDR: case SIOCBONDCHANGEACTIVE: case SIOCBRADDIF: case SIOCBRDELIF: case SIOCSHWTSTAMP: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; fallthrough; case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: dev_load(net, ifr->ifr_name); rtnl_lock(); ret = dev_ifsioc(net, ifr, data, cmd); rtnl_unlock(); if (need_copyout) *need_copyout = false; return ret; case SIOCGIFMEM: /* Get the per device memory space. We can add this but * currently do not support it */ case SIOCSIFMEM: /* Set the per device memory buffer space. * Not applicable in our case */ case SIOCSIFLINK: return -ENOTTY; /* * Unknown or private ioctl. */ default: if (cmd == SIOCWANDEV || cmd == SIOCGHWTSTAMP || (cmd >= SIOCDEVPRIVATE && cmd <= SIOCDEVPRIVATE + 15)) { dev_load(net, ifr->ifr_name); rtnl_lock(); ret = dev_ifsioc(net, ifr, data, cmd); rtnl_unlock(); return ret; } return -ENOTTY; } } |
| 274 219 274 54 138 143 127 164 303 159 67 224 224 139 170 174 23 17 23 170 96 91 91 91 170 113 170 170 54 139 139 296 308 296 100 151 162 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Red Black Trees (C) 1999 Andrea Arcangeli <andrea@suse.de> (C) 2002 David Woodhouse <dwmw2@infradead.org> (C) 2012 Michel Lespinasse <walken@google.com> linux/include/linux/rbtree_augmented.h */ #ifndef _LINUX_RBTREE_AUGMENTED_H #define _LINUX_RBTREE_AUGMENTED_H #include <linux/compiler.h> #include <linux/rbtree.h> #include <linux/rcupdate.h> /* * Please note - only struct rb_augment_callbacks and the prototypes for * rb_insert_augmented() and rb_erase_augmented() are intended to be public. * The rest are implementation details you are not expected to depend on. * * See Documentation/core-api/rbtree.rst for documentation and samples. */ struct rb_augment_callbacks { void (*propagate)(struct rb_node *node, struct rb_node *stop); void (*copy)(struct rb_node *old, struct rb_node *new); void (*rotate)(struct rb_node *old, struct rb_node *new); }; extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); /* * Fixup the rbtree and update the augmented information when rebalancing. * * On insertion, the user must update the augmented information on the path * leading to the inserted node, then call rb_link_node() as usual and * rb_insert_augmented() instead of the usual rb_insert_color() call. * If rb_insert_augmented() rebalances the rbtree, it will callback into * a user provided function to update the augmented information on the * affected subtrees. */ static inline void rb_insert_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { __rb_insert_augmented(node, root, augment->rotate); } static inline void rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *root, bool newleft, const struct rb_augment_callbacks *augment) { if (newleft) root->rb_leftmost = node; rb_insert_augmented(node, &root->rb_root, augment); } static __always_inline struct rb_node * rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree, bool (*less)(struct rb_node *, const struct rb_node *), const struct rb_augment_callbacks *augment) { struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; bool leftmost = true; while (*link) { parent = *link; if (less(node, parent)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false; } } rb_link_node(node, parent, link); augment->propagate(parent, NULL); /* suboptimal */ rb_insert_augmented_cached(node, tree, leftmost, augment); return leftmost ? node : NULL; } /* * Template for declaring augmented rbtree callbacks (generic case) * * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data */ #define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \ static inline void \ RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \ { \ while (rb != stop) { \ RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \ if (RBCOMPUTE(node, true)) \ break; \ rb = rb_parent(&node->RBFIELD); \ } \ } \ static inline void \ RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ new->RBAUGMENTED = old->RBAUGMENTED; \ } \ static void \ RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ new->RBAUGMENTED = old->RBAUGMENTED; \ RBCOMPUTE(old, false); \ } \ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ .propagate = RBNAME ## _propagate, \ .copy = RBNAME ## _copy, \ .rotate = RBNAME ## _rotate \ }; /* * Template for declaring augmented rbtree callbacks, * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes. * * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT * RBTYPE: type of the RBAUGMENTED field * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that returns the per-node RBTYPE scalar */ #define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ RBTYPE, RBAUGMENTED, RBCOMPUTE) \ static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \ { \ RBSTRUCT *child; \ RBTYPE max = RBCOMPUTE(node); \ if (node->RBFIELD.rb_left) { \ child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ if (node->RBFIELD.rb_right) { \ child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ if (exit && node->RBAUGMENTED == max) \ return true; \ node->RBAUGMENTED = max; \ return false; \ } \ RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max) #define RB_RED 0 #define RB_BLACK 1 #define __rb_parent(pc) ((struct rb_node *)(pc & ~3)) #define __rb_color(pc) ((pc) & 1) #define __rb_is_black(pc) __rb_color(pc) #define __rb_is_red(pc) (!__rb_color(pc)) #define rb_color(rb) __rb_color((rb)->__rb_parent_color) #define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color) #define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color) static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { rb->__rb_parent_color = rb_color(rb) + (unsigned long)p; } static inline void rb_set_parent_color(struct rb_node *rb, struct rb_node *p, int color) { rb->__rb_parent_color = (unsigned long)p + color; } static inline void __rb_change_child(struct rb_node *old, struct rb_node *new, struct rb_node *parent, struct rb_root *root) { if (parent) { if (parent->rb_left == old) WRITE_ONCE(parent->rb_left, new); else WRITE_ONCE(parent->rb_right, new); } else WRITE_ONCE(root->rb_node, new); } static inline void __rb_change_child_rcu(struct rb_node *old, struct rb_node *new, struct rb_node *parent, struct rb_root *root) { if (parent) { if (parent->rb_left == old) rcu_assign_pointer(parent->rb_left, new); else rcu_assign_pointer(parent->rb_right, new); } else rcu_assign_pointer(root->rb_node, new); } extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); static __always_inline struct rb_node * __rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { struct rb_node *child = node->rb_right; struct rb_node *tmp = node->rb_left; struct rb_node *parent, *rebalance; unsigned long pc; if (!tmp) { /* * Case 1: node to erase has no more than 1 child (easy!) * * Note that if there is one child it must be red due to 5) * and node must be black due to 4). We adjust colors locally * so as to bypass __rb_erase_color() later on. */ pc = node->__rb_parent_color; parent = __rb_parent(pc); __rb_change_child(node, child, parent, root); if (child) { child->__rb_parent_color = pc; rebalance = NULL; } else rebalance = __rb_is_black(pc) ? parent : NULL; tmp = parent; } else if (!child) { /* Still case 1, but this time the child is node->rb_left */ tmp->__rb_parent_color = pc = node->__rb_parent_color; parent = __rb_parent(pc); __rb_change_child(node, tmp, parent, root); rebalance = NULL; tmp = parent; } else { struct rb_node *successor = child, *child2; tmp = child->rb_left; if (!tmp) { /* * Case 2: node's successor is its right child * * (n) (s) * / \ / \ * (x) (s) -> (x) (c) * \ * (c) */ parent = successor; child2 = successor->rb_right; augment->copy(node, successor); } else { /* * Case 3: node's successor is leftmost under * node's right child subtree * * (n) (s) * / \ / \ * (x) (y) -> (x) (y) * / / * (p) (p) * / / * (s) (c) * \ * (c) */ do { parent = successor; successor = tmp; tmp = tmp->rb_left; } while (tmp); child2 = successor->rb_right; WRITE_ONCE(parent->rb_left, child2); WRITE_ONCE(successor->rb_right, child); rb_set_parent(child, successor); augment->copy(node, successor); augment->propagate(parent, successor); } tmp = node->rb_left; WRITE_ONCE(successor->rb_left, tmp); rb_set_parent(tmp, successor); pc = node->__rb_parent_color; tmp = __rb_parent(pc); __rb_change_child(node, successor, tmp, root); if (child2) { rb_set_parent_color(child2, parent, RB_BLACK); rebalance = NULL; } else { rebalance = rb_is_black(successor) ? parent : NULL; } successor->__rb_parent_color = pc; tmp = successor; } augment->propagate(tmp, NULL); return rebalance; } static __always_inline void rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); if (rebalance) __rb_erase_color(rebalance, root, augment->rotate); } static __always_inline void rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root, const struct rb_augment_callbacks *augment) { if (root->rb_leftmost == node) root->rb_leftmost = rb_next(node); rb_erase_augmented(node, &root->rb_root, augment); } #endif /* _LINUX_RBTREE_AUGMENTED_H */ |
| 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * NetLabel Network Address Lists * * This file contains network address list functions used to manage ordered * lists of network addresses for use by the NetLabel subsystem. The NetLabel * system manages static and dynamic label mappings for network protocols such * as CIPSO and RIPSO. * * Author: Paul Moore <paul@paul-moore.com> */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2008 */ #ifndef _NETLABEL_ADDRLIST_H #define _NETLABEL_ADDRLIST_H #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/in6.h> #include <linux/audit.h> /** * struct netlbl_af4list - NetLabel IPv4 address list * @addr: IPv4 address * @mask: IPv4 address mask * @valid: valid flag * @list: list structure, used internally */ struct netlbl_af4list { __be32 addr; __be32 mask; u32 valid; struct list_head list; }; /** * struct netlbl_af6list - NetLabel IPv6 address list * @addr: IPv6 address * @mask: IPv6 address mask * @valid: valid flag * @list: list structure, used internally */ struct netlbl_af6list { struct in6_addr addr; struct in6_addr mask; u32 valid; struct list_head list; }; #define __af4list_entry(ptr) container_of(ptr, struct netlbl_af4list, list) static inline struct netlbl_af4list *__af4list_valid(struct list_head *s, struct list_head *h) { struct list_head *i = s; struct netlbl_af4list *n = __af4list_entry(s); while (i != h && !n->valid) { i = i->next; n = __af4list_entry(i); } return n; } static inline struct netlbl_af4list *__af4list_valid_rcu(struct list_head *s, struct list_head *h) { struct list_head *i = s; struct netlbl_af4list *n = __af4list_entry(s); while (i != h && !n->valid) { i = rcu_dereference(list_next_rcu(i)); n = __af4list_entry(i); } return n; } #define netlbl_af4list_foreach(iter, head) \ for (iter = __af4list_valid((head)->next, head); \ &iter->list != (head); \ iter = __af4list_valid(iter->list.next, head)) #define netlbl_af4list_foreach_rcu(iter, head) \ for (iter = __af4list_valid_rcu((head)->next, head); \ &iter->list != (head); \ iter = __af4list_valid_rcu(iter->list.next, head)) #define netlbl_af4list_foreach_safe(iter, tmp, head) \ for (iter = __af4list_valid((head)->next, head), \ tmp = __af4list_valid(iter->list.next, head); \ &iter->list != (head); \ iter = tmp, tmp = __af4list_valid(iter->list.next, head)) int netlbl_af4list_add(struct netlbl_af4list *entry, struct list_head *head); struct netlbl_af4list *netlbl_af4list_remove(__be32 addr, __be32 mask, struct list_head *head); void netlbl_af4list_remove_entry(struct netlbl_af4list *entry); struct netlbl_af4list *netlbl_af4list_search(__be32 addr, struct list_head *head); struct netlbl_af4list *netlbl_af4list_search_exact(__be32 addr, __be32 mask, struct list_head *head); #ifdef CONFIG_AUDIT void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf, int src, const char *dev, __be32 addr, __be32 mask); #else static inline void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf, int src, const char *dev, __be32 addr, __be32 mask) { } #endif #if IS_ENABLED(CONFIG_IPV6) #define __af6list_entry(ptr) container_of(ptr, struct netlbl_af6list, list) static inline struct netlbl_af6list *__af6list_valid(struct list_head *s, struct list_head *h) { struct list_head *i = s; struct netlbl_af6list *n = __af6list_entry(s); while (i != h && !n->valid) { i = i->next; n = __af6list_entry(i); } return n; } static inline struct netlbl_af6list *__af6list_valid_rcu(struct list_head *s, struct list_head *h) { struct list_head *i = s; struct netlbl_af6list *n = __af6list_entry(s); while (i != h && !n->valid) { i = rcu_dereference(list_next_rcu(i)); n = __af6list_entry(i); } return n; } #define netlbl_af6list_foreach(iter, head) \ for (iter = __af6list_valid((head)->next, head); \ &iter->list != (head); \ iter = __af6list_valid(iter->list.next, head)) #define netlbl_af6list_foreach_rcu(iter, head) \ for (iter = __af6list_valid_rcu((head)->next, head); \ &iter->list != (head); \ iter = __af6list_valid_rcu(iter->list.next, head)) #define netlbl_af6list_foreach_safe(iter, tmp, head) \ for (iter = __af6list_valid((head)->next, head), \ tmp = __af6list_valid(iter->list.next, head); \ &iter->list != (head); \ iter = tmp, tmp = __af6list_valid(iter->list.next, head)) int netlbl_af6list_add(struct netlbl_af6list *entry, struct list_head *head); struct netlbl_af6list *netlbl_af6list_remove(const struct in6_addr *addr, const struct in6_addr *mask, struct list_head *head); void netlbl_af6list_remove_entry(struct netlbl_af6list *entry); struct netlbl_af6list *netlbl_af6list_search(const struct in6_addr *addr, struct list_head *head); struct netlbl_af6list *netlbl_af6list_search_exact(const struct in6_addr *addr, const struct in6_addr *mask, struct list_head *head); #ifdef CONFIG_AUDIT void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf, int src, const char *dev, const struct in6_addr *addr, const struct in6_addr *mask); #else static inline void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf, int src, const char *dev, const struct in6_addr *addr, const struct in6_addr *mask) { } #endif #endif /* IPV6 */ #endif |
| 120 121 79 68 79 68 120 82 81 80 79 112 111 121 121 119 121 12 3 3 4 4 82 83 11 11 11 11 11 11 11 11 11 4 4 4 4 4 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/char_dev.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/init.h> #include <linux/fs.h> #include <linux/kdev_t.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/major.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/kobject.h> #include <linux/kobj_map.h> #include <linux/cdev.h> #include <linux/mutex.h> #include <linux/backing-dev.h> #include <linux/tty.h> #include "internal.h" static struct kobj_map *cdev_map __ro_after_init; static DEFINE_MUTEX(chrdevs_lock); #define CHRDEV_MAJOR_HASH_SIZE 255 static struct char_device_struct { struct char_device_struct *next; unsigned int major; unsigned int baseminor; int minorct; char name[64]; struct cdev *cdev; /* will die */ } *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; /* index in the above */ static inline int major_to_index(unsigned major) { return major % CHRDEV_MAJOR_HASH_SIZE; } #ifdef CONFIG_PROC_FS void chrdev_show(struct seq_file *f, off_t offset) { struct char_device_struct *cd; mutex_lock(&chrdevs_lock); for (cd = chrdevs[major_to_index(offset)]; cd; cd = cd->next) { if (cd->major == offset) seq_printf(f, "%3d %s\n", cd->major, cd->name); } mutex_unlock(&chrdevs_lock); } #endif /* CONFIG_PROC_FS */ static int find_dynamic_major(void) { int i; struct char_device_struct *cd; for (i = ARRAY_SIZE(chrdevs)-1; i >= CHRDEV_MAJOR_DYN_END; i--) { if (chrdevs[i] == NULL) return i; } for (i = CHRDEV_MAJOR_DYN_EXT_START; i >= CHRDEV_MAJOR_DYN_EXT_END; i--) { for (cd = chrdevs[major_to_index(i)]; cd; cd = cd->next) if (cd->major == i) break; if (cd == NULL) return i; } return -EBUSY; } /* * Register a single major with a specified minor range. * * If major == 0 this function will dynamically allocate an unused major. * If major > 0 this function will attempt to reserve the range of minors * with given major. * */ static struct char_device_struct * __register_chrdev_region(unsigned int major, unsigned int baseminor, int minorct, const char *name) { struct char_device_struct *cd, *curr, *prev = NULL; int ret; int i; if (major >= CHRDEV_MAJOR_MAX) { pr_err("CHRDEV \"%s\" major requested (%u) is greater than the maximum (%u)\n", name, major, CHRDEV_MAJOR_MAX-1); return ERR_PTR(-EINVAL); } if (minorct > MINORMASK + 1 - baseminor) { pr_err("CHRDEV \"%s\" minor range requested (%u-%u) is out of range of maximum range (%u-%u) for a single major\n", name, baseminor, baseminor + minorct - 1, 0, MINORMASK); return ERR_PTR(-EINVAL); } cd = kzalloc(sizeof(struct char_device_struct), GFP_KERNEL); if (cd == NULL) return ERR_PTR(-ENOMEM); mutex_lock(&chrdevs_lock); if (major == 0) { ret = find_dynamic_major(); if (ret < 0) { pr_err("CHRDEV \"%s\" dynamic allocation region is full\n", name); goto out; } major = ret; } ret = -EBUSY; i = major_to_index(major); for (curr = chrdevs[i]; curr; prev = curr, curr = curr->next) { if (curr->major < major) continue; if (curr->major > major) break; if (curr->baseminor + curr->minorct <= baseminor) continue; if (curr->baseminor >= baseminor + minorct) break; goto out; } cd->major = major; cd->baseminor = baseminor; cd->minorct = minorct; strscpy(cd->name, name, sizeof(cd->name)); if (!prev) { cd->next = curr; chrdevs[i] = cd; } else { cd->next = prev->next; prev->next = cd; } mutex_unlock(&chrdevs_lock); return cd; out: mutex_unlock(&chrdevs_lock); kfree(cd); return ERR_PTR(ret); } static struct char_device_struct * __unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct) { struct char_device_struct *cd = NULL, **cp; int i = major_to_index(major); mutex_lock(&chrdevs_lock); for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) if ((*cp)->major == major && (*cp)->baseminor == baseminor && (*cp)->minorct == minorct) break; if (*cp) { cd = *cp; *cp = cd->next; } mutex_unlock(&chrdevs_lock); return cd; } /** * register_chrdev_region() - register a range of device numbers * @from: the first in the desired range of device numbers; must include * the major number. * @count: the number of consecutive device numbers required * @name: the name of the device or driver. * * Return value is zero on success, a negative error code on failure. */ int register_chrdev_region(dev_t from, unsigned count, const char *name) { struct char_device_struct *cd; dev_t to = from + count; dev_t n, next; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); if (next > to) next = to; cd = __register_chrdev_region(MAJOR(n), MINOR(n), next - n, name); if (IS_ERR(cd)) goto fail; } return 0; fail: to = n; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); } return PTR_ERR(cd); } /** * alloc_chrdev_region() - register a range of char device numbers * @dev: output parameter for first assigned number * @baseminor: first of the requested range of minor numbers * @count: the number of minor numbers required * @name: the name of the associated device or driver * * Allocates a range of char device numbers. The major number will be * chosen dynamically, and returned (along with the first minor number) * in @dev. Returns zero or a negative error code. */ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, const char *name) { struct char_device_struct *cd; cd = __register_chrdev_region(0, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); *dev = MKDEV(cd->major, cd->baseminor); return 0; } /** * __register_chrdev() - create and register a cdev occupying a range of minors * @major: major device number or 0 for dynamic allocation * @baseminor: first of the requested range of minor numbers * @count: the number of minor numbers required * @name: name of this range of devices * @fops: file operations associated with this devices * * If @major == 0 this functions will dynamically allocate a major and return * its number. * * If @major > 0 this function will attempt to reserve a device with the given * major number and will return zero on success. * * Returns a -ve errno on failure. * * The name of this device has nothing to do with the name of the device in * /dev. It only helps to keep track of the different owners of devices. If * your module name has only one type of devices it's ok to use e.g. the name * of the module here. */ int __register_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops) { struct char_device_struct *cd; struct cdev *cdev; int err = -ENOMEM; cd = __register_chrdev_region(major, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); cdev = cdev_alloc(); if (!cdev) goto out2; cdev->owner = fops->owner; cdev->ops = fops; kobject_set_name(&cdev->kobj, "%s", name); err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); if (err) goto out; cd->cdev = cdev; return major ? 0 : cd->major; out: kobject_put(&cdev->kobj); out2: kfree(__unregister_chrdev_region(cd->major, baseminor, count)); return err; } /** * unregister_chrdev_region() - unregister a range of device numbers * @from: the first in the range of numbers to unregister * @count: the number of device numbers to unregister * * This function will unregister a range of @count device numbers, * starting with @from. The caller should normally be the one who * allocated those numbers in the first place... */ void unregister_chrdev_region(dev_t from, unsigned count) { dev_t to = from + count; dev_t n, next; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); if (next > to) next = to; kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); } } /** * __unregister_chrdev - unregister and destroy a cdev * @major: major device number * @baseminor: first of the range of minor numbers * @count: the number of minor numbers this cdev is occupying * @name: name of this range of devices * * Unregister and destroy the cdev occupying the region described by * @major, @baseminor and @count. This function undoes what * __register_chrdev() did. */ void __unregister_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name) { struct char_device_struct *cd; cd = __unregister_chrdev_region(major, baseminor, count); if (cd && cd->cdev) cdev_del(cd->cdev); kfree(cd); } static DEFINE_SPINLOCK(cdev_lock); static struct kobject *cdev_get(struct cdev *p) { struct module *owner = p->owner; struct kobject *kobj; if (!try_module_get(owner)) return NULL; kobj = kobject_get_unless_zero(&p->kobj); if (!kobj) module_put(owner); return kobj; } void cdev_put(struct cdev *p) { if (p) { struct module *owner = p->owner; kobject_put(&p->kobj); module_put(owner); } } /* * Called every time a character special file is opened */ static int chrdev_open(struct inode *inode, struct file *filp) { const struct file_operations *fops; struct cdev *p; struct cdev *new = NULL; int ret = 0; spin_lock(&cdev_lock); p = inode->i_cdev; if (!p) { struct kobject *kobj; int idx; spin_unlock(&cdev_lock); kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); if (!kobj) return -ENXIO; new = container_of(kobj, struct cdev, kobj); spin_lock(&cdev_lock); /* Check i_cdev again in case somebody beat us to it while we dropped the lock. */ p = inode->i_cdev; if (!p) { inode->i_cdev = p = new; list_add(&inode->i_devices, &p->list); new = NULL; } else if (!cdev_get(p)) ret = -ENXIO; } else if (!cdev_get(p)) ret = -ENXIO; spin_unlock(&cdev_lock); cdev_put(new); if (ret) return ret; ret = -ENXIO; fops = fops_get(p->ops); if (!fops) goto out_cdev_put; replace_fops(filp, fops); if (filp->f_op->open) { ret = filp->f_op->open(inode, filp); if (ret) goto out_cdev_put; } return 0; out_cdev_put: cdev_put(p); return ret; } void cd_forget(struct inode *inode) { spin_lock(&cdev_lock); list_del_init(&inode->i_devices); inode->i_cdev = NULL; inode->i_mapping = &inode->i_data; spin_unlock(&cdev_lock); } static void cdev_purge(struct cdev *cdev) { spin_lock(&cdev_lock); while (!list_empty(&cdev->list)) { struct inode *inode; inode = container_of(cdev->list.next, struct inode, i_devices); list_del_init(&inode->i_devices); inode->i_cdev = NULL; } spin_unlock(&cdev_lock); } /* * Dummy default file-operations: the only thing this does * is contain the open that then fills in the correct operations * depending on the special file... */ const struct file_operations def_chr_fops = { .open = chrdev_open, .llseek = noop_llseek, }; static struct kobject *exact_match(dev_t dev, int *part, void *data) { struct cdev *p = data; return &p->kobj; } static int exact_lock(dev_t dev, void *data) { struct cdev *p = data; return cdev_get(p) ? 0 : -1; } /** * cdev_add() - add a char device to the system * @p: the cdev structure for the device * @dev: the first device number for which this device is responsible * @count: the number of consecutive minor numbers corresponding to this * device * * cdev_add() adds the device represented by @p to the system, making it * live immediately. A negative error code is returned on failure. */ int cdev_add(struct cdev *p, dev_t dev, unsigned count) { int error; p->dev = dev; p->count = count; if (WARN_ON(dev == WHITEOUT_DEV)) { error = -EBUSY; goto err; } error = kobj_map(cdev_map, dev, count, NULL, exact_match, exact_lock, p); if (error) goto err; kobject_get(p->kobj.parent); return 0; err: kfree_const(p->kobj.name); p->kobj.name = NULL; return error; } /** * cdev_set_parent() - set the parent kobject for a char device * @p: the cdev structure * @kobj: the kobject to take a reference to * * cdev_set_parent() sets a parent kobject which will be referenced * appropriately so the parent is not freed before the cdev. This * should be called before cdev_add. */ void cdev_set_parent(struct cdev *p, struct kobject *kobj) { WARN_ON(!kobj->state_initialized); p->kobj.parent = kobj; } /** * cdev_device_add() - add a char device and it's corresponding * struct device, linkink * @dev: the device structure * @cdev: the cdev structure * * cdev_device_add() adds the char device represented by @cdev to the system, * just as cdev_add does. It then adds @dev to the system using device_add * The dev_t for the char device will be taken from the struct device which * needs to be initialized first. This helper function correctly takes a * reference to the parent device so the parent will not get released until * all references to the cdev are released. * * This helper uses dev->devt for the device number. If it is not set * it will not add the cdev and it will be equivalent to device_add. * * This function should be used whenever the struct cdev and the * struct device are members of the same structure whose lifetime is * managed by the struct device. * * NOTE: Callers must assume that userspace was able to open the cdev and * can call cdev fops callbacks at any time, even if this function fails. */ int cdev_device_add(struct cdev *cdev, struct device *dev) { int rc = 0; if (dev->devt) { cdev_set_parent(cdev, &dev->kobj); rc = cdev_add(cdev, dev->devt, 1); if (rc) return rc; } rc = device_add(dev); if (rc && dev->devt) cdev_del(cdev); return rc; } /** * cdev_device_del() - inverse of cdev_device_add * @dev: the device structure * @cdev: the cdev structure * * cdev_device_del() is a helper function to call cdev_del and device_del. * It should be used whenever cdev_device_add is used. * * If dev->devt is not set it will not remove the cdev and will be equivalent * to device_del. * * NOTE: This guarantees that associated sysfs callbacks are not running * or runnable, however any cdevs already open will remain and their fops * will still be callable even after this function returns. */ void cdev_device_del(struct cdev *cdev, struct device *dev) { device_del(dev); if (dev->devt) cdev_del(cdev); } static void cdev_unmap(dev_t dev, unsigned count) { kobj_unmap(cdev_map, dev, count); } /** * cdev_del() - remove a cdev from the system * @p: the cdev structure to be removed * * cdev_del() removes @p from the system, possibly freeing the structure * itself. * * NOTE: This guarantees that cdev device will no longer be able to be * opened, however any cdevs already open will remain and their fops will * still be callable even after cdev_del returns. */ void cdev_del(struct cdev *p) { cdev_unmap(p->dev, p->count); kobject_put(&p->kobj); } static void cdev_default_release(struct kobject *kobj) { struct cdev *p = container_of(kobj, struct cdev, kobj); struct kobject *parent = kobj->parent; cdev_purge(p); kobject_put(parent); } static void cdev_dynamic_release(struct kobject *kobj) { struct cdev *p = container_of(kobj, struct cdev, kobj); struct kobject *parent = kobj->parent; cdev_purge(p); kfree(p); kobject_put(parent); } static struct kobj_type ktype_cdev_default = { .release = cdev_default_release, }; static struct kobj_type ktype_cdev_dynamic = { .release = cdev_dynamic_release, }; /** * cdev_alloc() - allocate a cdev structure * * Allocates and returns a cdev structure, or NULL on failure. */ struct cdev *cdev_alloc(void) { struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL); if (p) { INIT_LIST_HEAD(&p->list); kobject_init(&p->kobj, &ktype_cdev_dynamic); } return p; } /** * cdev_init() - initialize a cdev structure * @cdev: the structure to initialize * @fops: the file_operations for this device * * Initializes @cdev, remembering @fops, making it ready to add to the * system with cdev_add(). */ void cdev_init(struct cdev *cdev, const struct file_operations *fops) { memset(cdev, 0, sizeof *cdev); INIT_LIST_HEAD(&cdev->list); kobject_init(&cdev->kobj, &ktype_cdev_default); cdev->ops = fops; } static struct kobject *base_probe(dev_t dev, int *part, void *data) { if (request_module("char-major-%d-%d", MAJOR(dev), MINOR(dev)) > 0) /* Make old-style 2.4 aliases work */ request_module("char-major-%d", MAJOR(dev)); return NULL; } void __init chrdev_init(void) { cdev_map = kobj_map_init(base_probe, &chrdevs_lock); } /* Let modules do char dev stuff */ EXPORT_SYMBOL(register_chrdev_region); EXPORT_SYMBOL(unregister_chrdev_region); EXPORT_SYMBOL(alloc_chrdev_region); EXPORT_SYMBOL(cdev_init); EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); EXPORT_SYMBOL(cdev_set_parent); EXPORT_SYMBOL(cdev_device_add); EXPORT_SYMBOL(cdev_device_del); EXPORT_SYMBOL(__register_chrdev); EXPORT_SYMBOL(__unregister_chrdev); |
| 1553 3620 3813 4278 4856 95 1066 1089 282 1102 3 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Wrapper functions for accessing the file_struct fd array. */ #ifndef __LINUX_FILE_H #define __LINUX_FILE_H #include <linux/compiler.h> #include <linux/types.h> #include <linux/posix_types.h> #include <linux/errno.h> #include <linux/cleanup.h> struct file; extern void fput(struct file *); struct file_operations; struct task_struct; struct vfsmount; struct dentry; struct inode; struct path; extern struct file *alloc_file_pseudo(struct inode *, struct vfsmount *, const char *, int flags, const struct file_operations *); extern struct file *alloc_file_pseudo_noaccount(struct inode *, struct vfsmount *, const char *, int flags, const struct file_operations *); extern struct file *alloc_file_clone(struct file *, int flags, const struct file_operations *); static inline void fput_light(struct file *file, int fput_needed) { if (fput_needed) fput(file); } struct fd { struct file *file; unsigned int flags; }; #define FDPUT_FPUT 1 #define FDPUT_POS_UNLOCK 2 static inline void fdput(struct fd fd) { if (fd.flags & FDPUT_FPUT) fput(fd.file); } extern struct file *fget(unsigned int fd); extern struct file *fget_raw(unsigned int fd); extern struct file *fget_task(struct task_struct *task, unsigned int fd); extern unsigned long __fdget(unsigned int fd); extern unsigned long __fdget_raw(unsigned int fd); extern unsigned long __fdget_pos(unsigned int fd); extern void __f_unlock_pos(struct file *); static inline struct fd __to_fd(unsigned long v) { return (struct fd){(struct file *)(v & ~3),v & 3}; } static inline struct fd fdget(unsigned int fd) { return __to_fd(__fdget(fd)); } static inline struct fd fdget_raw(unsigned int fd) { return __to_fd(__fdget_raw(fd)); } static inline struct fd fdget_pos(int fd) { return __to_fd(__fdget_pos(fd)); } static inline void fdput_pos(struct fd f) { if (f.flags & FDPUT_POS_UNLOCK) __f_unlock_pos(f.file); fdput(f); } DEFINE_CLASS(fd, struct fd, fdput(_T), fdget(fd), int fd) DEFINE_CLASS(fd_raw, struct fd, fdput(_T), fdget_raw(fd), int fd) extern int f_dupfd(unsigned int from, struct file *file, unsigned flags); extern int replace_fd(unsigned fd, struct file *file, unsigned flags); extern void set_close_on_exec(unsigned int fd, int flag); extern bool get_close_on_exec(unsigned int fd); extern int __get_unused_fd_flags(unsigned flags, unsigned long nofile); extern int get_unused_fd_flags(unsigned flags); extern void put_unused_fd(unsigned int fd); DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T), get_unused_fd_flags(flags), unsigned flags) extern void fd_install(unsigned int fd, struct file *file); int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags); extern void flush_delayed_fput(void); extern void __fput_sync(struct file *); extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; #endif /* __LINUX_FILE_H */ |
| 9 7 7 7 7 2 3 2 7 3 4 3 6 1 5 5 9 1 1 1 1 1 2 1 1 1 1 4 2 4 1 2 4 3 1 8 8 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> struct nft_limit { spinlock_t lock; u64 last; u64 tokens; }; struct nft_limit_priv { struct nft_limit *limit; u64 tokens_max; u64 rate; u64 nsecs; u32 burst; bool invert; }; static inline bool nft_limit_eval(struct nft_limit_priv *priv, u64 cost) { u64 now, tokens; s64 delta; spin_lock_bh(&priv->limit->lock); now = ktime_get_ns(); tokens = priv->limit->tokens + now - priv->limit->last; if (tokens > priv->tokens_max) tokens = priv->tokens_max; priv->limit->last = now; delta = tokens - cost; if (delta >= 0) { priv->limit->tokens = delta; spin_unlock_bh(&priv->limit->lock); return priv->invert; } priv->limit->tokens = tokens; spin_unlock_bh(&priv->limit->lock); return !priv->invert; } /* Use same default as in iptables. */ #define NFT_LIMIT_PKT_BURST_DEFAULT 5 static int nft_limit_init(struct nft_limit_priv *priv, const struct nlattr * const tb[], bool pkts) { u64 unit, tokens, rate_with_burst; bool invert = false; if (tb[NFTA_LIMIT_RATE] == NULL || tb[NFTA_LIMIT_UNIT] == NULL) return -EINVAL; priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE])); if (priv->rate == 0) return -EINVAL; unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT])); if (check_mul_overflow(unit, NSEC_PER_SEC, &priv->nsecs)) return -EOVERFLOW; if (tb[NFTA_LIMIT_BURST]) priv->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST])); if (pkts && priv->burst == 0) priv->burst = NFT_LIMIT_PKT_BURST_DEFAULT; if (check_add_overflow(priv->rate, priv->burst, &rate_with_burst)) return -EOVERFLOW; if (pkts) { u64 tmp = div64_u64(priv->nsecs, priv->rate); if (check_mul_overflow(tmp, priv->burst, &tokens)) return -EOVERFLOW; } else { u64 tmp; /* The token bucket size limits the number of tokens can be * accumulated. tokens_max specifies the bucket size. * tokens_max = unit * (rate + burst) / rate. */ if (check_mul_overflow(priv->nsecs, rate_with_burst, &tmp)) return -EOVERFLOW; tokens = div64_u64(tmp, priv->rate); } if (tb[NFTA_LIMIT_FLAGS]) { u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS])); if (flags & ~NFT_LIMIT_F_INV) return -EOPNOTSUPP; if (flags & NFT_LIMIT_F_INV) invert = true; } priv->limit = kmalloc(sizeof(*priv->limit), GFP_KERNEL_ACCOUNT); if (!priv->limit) return -ENOMEM; priv->limit->tokens = tokens; priv->tokens_max = priv->limit->tokens; priv->invert = invert; priv->limit->last = ktime_get_ns(); spin_lock_init(&priv->limit->lock); return 0; } static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit_priv *priv, enum nft_limit_type type) { u32 flags = priv->invert ? NFT_LIMIT_F_INV : 0; u64 secs = div_u64(priv->nsecs, NSEC_PER_SEC); if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate), NFTA_LIMIT_PAD) || nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs), NFTA_LIMIT_PAD) || nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(priv->burst)) || nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type)) || nla_put_be32(skb, NFTA_LIMIT_FLAGS, htonl(flags))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static void nft_limit_destroy(const struct nft_ctx *ctx, const struct nft_limit_priv *priv) { kfree(priv->limit); } static int nft_limit_clone(struct nft_limit_priv *priv_dst, const struct nft_limit_priv *priv_src, gfp_t gfp) { priv_dst->tokens_max = priv_src->tokens_max; priv_dst->rate = priv_src->rate; priv_dst->nsecs = priv_src->nsecs; priv_dst->burst = priv_src->burst; priv_dst->invert = priv_src->invert; priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), gfp); if (!priv_dst->limit) return -ENOMEM; spin_lock_init(&priv_dst->limit->lock); priv_dst->limit->tokens = priv_src->tokens_max; priv_dst->limit->last = ktime_get_ns(); return 0; } struct nft_limit_priv_pkts { struct nft_limit_priv limit; u64 cost; }; static void nft_limit_pkts_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_limit_priv_pkts *priv = nft_expr_priv(expr); if (nft_limit_eval(&priv->limit, priv->cost)) regs->verdict.code = NFT_BREAK; } static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = { [NFTA_LIMIT_RATE] = { .type = NLA_U64 }, [NFTA_LIMIT_UNIT] = { .type = NLA_U64 }, [NFTA_LIMIT_BURST] = { .type = NLA_U32 }, [NFTA_LIMIT_TYPE] = { .type = NLA_U32 }, [NFTA_LIMIT_FLAGS] = { .type = NLA_U32 }, }; static int nft_limit_pkts_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_limit_priv_pkts *priv = nft_expr_priv(expr); int err; err = nft_limit_init(&priv->limit, tb, true); if (err < 0) return err; priv->cost = div64_u64(priv->limit.nsecs, priv->limit.rate); return 0; } static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_limit_priv_pkts *priv = nft_expr_priv(expr); return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS); } static void nft_limit_pkts_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_limit_priv_pkts *priv = nft_expr_priv(expr); nft_limit_destroy(ctx, &priv->limit); } static int nft_limit_pkts_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_limit_priv_pkts *priv_dst = nft_expr_priv(dst); struct nft_limit_priv_pkts *priv_src = nft_expr_priv(src); priv_dst->cost = priv_src->cost; return nft_limit_clone(&priv_dst->limit, &priv_src->limit, gfp); } static struct nft_expr_type nft_limit_type; static const struct nft_expr_ops nft_limit_pkts_ops = { .type = &nft_limit_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv_pkts)), .eval = nft_limit_pkts_eval, .init = nft_limit_pkts_init, .destroy = nft_limit_pkts_destroy, .clone = nft_limit_pkts_clone, .dump = nft_limit_pkts_dump, .reduce = NFT_REDUCE_READONLY, }; static void nft_limit_bytes_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_limit_priv *priv = nft_expr_priv(expr); u64 cost = div64_u64(priv->nsecs * pkt->skb->len, priv->rate); if (nft_limit_eval(priv, cost)) regs->verdict.code = NFT_BREAK; } static int nft_limit_bytes_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_limit_priv *priv = nft_expr_priv(expr); return nft_limit_init(priv, tb, false); } static int nft_limit_bytes_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_limit_priv *priv = nft_expr_priv(expr); return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES); } static void nft_limit_bytes_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_limit_priv *priv = nft_expr_priv(expr); nft_limit_destroy(ctx, priv); } static int nft_limit_bytes_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_limit_priv *priv_dst = nft_expr_priv(dst); struct nft_limit_priv *priv_src = nft_expr_priv(src); return nft_limit_clone(priv_dst, priv_src, gfp); } static const struct nft_expr_ops nft_limit_bytes_ops = { .type = &nft_limit_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv)), .eval = nft_limit_bytes_eval, .init = nft_limit_bytes_init, .dump = nft_limit_bytes_dump, .clone = nft_limit_bytes_clone, .destroy = nft_limit_bytes_destroy, .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops * nft_limit_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { if (tb[NFTA_LIMIT_TYPE] == NULL) return &nft_limit_pkts_ops; switch (ntohl(nla_get_be32(tb[NFTA_LIMIT_TYPE]))) { case NFT_LIMIT_PKTS: return &nft_limit_pkts_ops; case NFT_LIMIT_PKT_BYTES: return &nft_limit_bytes_ops; } return ERR_PTR(-EOPNOTSUPP); } static struct nft_expr_type nft_limit_type __read_mostly = { .name = "limit", .select_ops = nft_limit_select_ops, .policy = nft_limit_policy, .maxattr = NFTA_LIMIT_MAX, .flags = NFT_EXPR_STATEFUL, .owner = THIS_MODULE, }; static void nft_limit_obj_pkts_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_limit_priv_pkts *priv = nft_obj_data(obj); if (nft_limit_eval(&priv->limit, priv->cost)) regs->verdict.code = NFT_BREAK; } static int nft_limit_obj_pkts_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { struct nft_limit_priv_pkts *priv = nft_obj_data(obj); int err; err = nft_limit_init(&priv->limit, tb, true); if (err < 0) return err; priv->cost = div64_u64(priv->limit.nsecs, priv->limit.rate); return 0; } static int nft_limit_obj_pkts_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { const struct nft_limit_priv_pkts *priv = nft_obj_data(obj); return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS); } static void nft_limit_obj_pkts_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { struct nft_limit_priv_pkts *priv = nft_obj_data(obj); nft_limit_destroy(ctx, &priv->limit); } static struct nft_object_type nft_limit_obj_type; static const struct nft_object_ops nft_limit_obj_pkts_ops = { .type = &nft_limit_obj_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv_pkts)), .init = nft_limit_obj_pkts_init, .destroy = nft_limit_obj_pkts_destroy, .eval = nft_limit_obj_pkts_eval, .dump = nft_limit_obj_pkts_dump, }; static void nft_limit_obj_bytes_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_limit_priv *priv = nft_obj_data(obj); u64 cost = div64_u64(priv->nsecs * pkt->skb->len, priv->rate); if (nft_limit_eval(priv, cost)) regs->verdict.code = NFT_BREAK; } static int nft_limit_obj_bytes_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { struct nft_limit_priv *priv = nft_obj_data(obj); return nft_limit_init(priv, tb, false); } static int nft_limit_obj_bytes_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { const struct nft_limit_priv *priv = nft_obj_data(obj); return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES); } static void nft_limit_obj_bytes_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { struct nft_limit_priv *priv = nft_obj_data(obj); nft_limit_destroy(ctx, priv); } static struct nft_object_type nft_limit_obj_type; static const struct nft_object_ops nft_limit_obj_bytes_ops = { .type = &nft_limit_obj_type, .size = sizeof(struct nft_limit_priv), .init = nft_limit_obj_bytes_init, .destroy = nft_limit_obj_bytes_destroy, .eval = nft_limit_obj_bytes_eval, .dump = nft_limit_obj_bytes_dump, }; static const struct nft_object_ops * nft_limit_obj_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { if (!tb[NFTA_LIMIT_TYPE]) return &nft_limit_obj_pkts_ops; switch (ntohl(nla_get_be32(tb[NFTA_LIMIT_TYPE]))) { case NFT_LIMIT_PKTS: return &nft_limit_obj_pkts_ops; case NFT_LIMIT_PKT_BYTES: return &nft_limit_obj_bytes_ops; } return ERR_PTR(-EOPNOTSUPP); } static struct nft_object_type nft_limit_obj_type __read_mostly = { .select_ops = nft_limit_obj_select_ops, .type = NFT_OBJECT_LIMIT, .maxattr = NFTA_LIMIT_MAX, .policy = nft_limit_policy, .owner = THIS_MODULE, }; static int __init nft_limit_module_init(void) { int err; err = nft_register_obj(&nft_limit_obj_type); if (err < 0) return err; err = nft_register_expr(&nft_limit_type); if (err < 0) goto err1; return 0; err1: nft_unregister_obj(&nft_limit_obj_type); return err; } static void __exit nft_limit_module_exit(void) { nft_unregister_expr(&nft_limit_type); nft_unregister_obj(&nft_limit_obj_type); } module_init(nft_limit_module_init); module_exit(nft_limit_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_ALIAS_NFT_EXPR("limit"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_LIMIT); MODULE_DESCRIPTION("nftables limit expression support"); |
| 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | /* Kernel module to match connection tracking byte counter. * GPL (C) 2002 Martin Devera (devik@cdi.cz). */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/bitops.h> #include <linux/skbuff.h> #include <linux/math64.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_connbytes.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_acct.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("Xtables: Number of packets/bytes per connection matching"); MODULE_ALIAS("ipt_connbytes"); MODULE_ALIAS("ip6t_connbytes"); static bool connbytes_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_connbytes_info *sinfo = par->matchinfo; const struct nf_conn *ct; enum ip_conntrack_info ctinfo; u_int64_t what = 0; /* initialize to make gcc happy */ u_int64_t bytes = 0; u_int64_t pkts = 0; const struct nf_conn_acct *acct; const struct nf_conn_counter *counters; ct = nf_ct_get(skb, &ctinfo); if (!ct) return false; acct = nf_conn_acct_find(ct); if (!acct) return false; counters = acct->counter; switch (sinfo->what) { case XT_CONNBYTES_PKTS: switch (sinfo->direction) { case XT_CONNBYTES_DIR_ORIGINAL: what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets); break; case XT_CONNBYTES_DIR_REPLY: what = atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; case XT_CONNBYTES_DIR_BOTH: what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets); what += atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; } break; case XT_CONNBYTES_BYTES: switch (sinfo->direction) { case XT_CONNBYTES_DIR_ORIGINAL: what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); break; case XT_CONNBYTES_DIR_REPLY: what = atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); break; case XT_CONNBYTES_DIR_BOTH: what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); what += atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); break; } break; case XT_CONNBYTES_AVGPKT: switch (sinfo->direction) { case XT_CONNBYTES_DIR_ORIGINAL: bytes = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); pkts = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets); break; case XT_CONNBYTES_DIR_REPLY: bytes = atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); pkts = atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; case XT_CONNBYTES_DIR_BOTH: bytes = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes) + atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); pkts = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets) + atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; } if (pkts != 0) what = div64_u64(bytes, pkts); break; } if (sinfo->count.to >= sinfo->count.from) return what <= sinfo->count.to && what >= sinfo->count.from; else /* inverted */ return what < sinfo->count.to || what > sinfo->count.from; } static int connbytes_mt_check(const struct xt_mtchk_param *par) { const struct xt_connbytes_info *sinfo = par->matchinfo; int ret; if (sinfo->what != XT_CONNBYTES_PKTS && sinfo->what != XT_CONNBYTES_BYTES && sinfo->what != XT_CONNBYTES_AVGPKT) return -EINVAL; if (sinfo->direction != XT_CONNBYTES_DIR_ORIGINAL && sinfo->direction != XT_CONNBYTES_DIR_REPLY && sinfo->direction != XT_CONNBYTES_DIR_BOTH) return -EINVAL; ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); /* * This filter cannot function correctly unless connection tracking * accounting is enabled, so complain in the hope that someone notices. */ if (!nf_ct_acct_enabled(par->net)) { pr_warn("Forcing CT accounting to be enabled\n"); nf_ct_set_acct(par->net, true); } return ret; } static void connbytes_mt_destroy(const struct xt_mtdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static struct xt_match connbytes_mt_reg __read_mostly = { .name = "connbytes", .revision = 0, .family = NFPROTO_UNSPEC, .checkentry = connbytes_mt_check, .match = connbytes_mt, .destroy = connbytes_mt_destroy, .matchsize = sizeof(struct xt_connbytes_info), .me = THIS_MODULE, }; static int __init connbytes_mt_init(void) { return xt_register_match(&connbytes_mt_reg); } static void __exit connbytes_mt_exit(void) { xt_unregister_match(&connbytes_mt_reg); } module_init(connbytes_mt_init); module_exit(connbytes_mt_exit); |
| 1 30 35 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NSPROXY_H #define _LINUX_NSPROXY_H #include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/sched.h> struct mnt_namespace; struct uts_namespace; struct ipc_namespace; struct pid_namespace; struct cgroup_namespace; struct fs_struct; /* * A structure to contain pointers to all per-process * namespaces - fs (mount), uts, network, sysvipc, etc. * * The pid namespace is an exception -- it's accessed using * task_active_pid_ns. The pid namespace here is the * namespace that children will use. * * 'count' is the number of tasks holding a reference. * The count for each namespace, then, will be the number * of nsproxies pointing to it, not the number of tasks. * * The nsproxy is shared by tasks which share all namespaces. * As soon as a single namespace is cloned or unshared, the * nsproxy is copied. */ struct nsproxy { refcount_t count; struct uts_namespace *uts_ns; struct ipc_namespace *ipc_ns; struct mnt_namespace *mnt_ns; struct pid_namespace *pid_ns_for_children; struct net *net_ns; struct time_namespace *time_ns; struct time_namespace *time_ns_for_children; struct cgroup_namespace *cgroup_ns; }; extern struct nsproxy init_nsproxy; /* * A structure to encompass all bits needed to install * a partial or complete new set of namespaces. * * If a new user namespace is requested cred will * point to a modifiable set of credentials. If a pointer * to a modifiable set is needed nsset_cred() must be * used and tested. */ struct nsset { unsigned flags; struct nsproxy *nsproxy; struct fs_struct *fs; const struct cred *cred; }; static inline struct cred *nsset_cred(struct nsset *set) { if (set->flags & CLONE_NEWUSER) return (struct cred *)set->cred; return NULL; } /* * the namespaces access rules are: * * 1. only current task is allowed to change tsk->nsproxy pointer or * any pointer on the nsproxy itself. Current must hold the task_lock * when changing tsk->nsproxy. * * 2. when accessing (i.e. reading) current task's namespaces - no * precautions should be taken - just dereference the pointers * * 3. the access to other task namespaces is performed like this * task_lock(task); * nsproxy = task->nsproxy; * if (nsproxy != NULL) { * / * * * work with the namespaces here * * e.g. get the reference on one of them * * / * } / * * * NULL task->nsproxy means that this task is * * almost dead (zombie) * * / * task_unlock(task); * */ int copy_namespaces(unsigned long flags, struct task_struct *tsk); void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); int exec_task_namespaces(void); void free_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, struct cred *, struct fs_struct *); int __init nsproxy_cache_init(void); static inline void put_nsproxy(struct nsproxy *ns) { if (refcount_dec_and_test(&ns->count)) free_nsproxy(ns); } static inline void get_nsproxy(struct nsproxy *ns) { refcount_inc(&ns->count); } #endif |
| 1 1 1 1 1 36 36 36 2 1 2 36 36 36 36 36 2 2 36 36 35 35 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 | // SPDX-License-Identifier: GPL-2.0 /* * USB device quirk handling logic and table * * Copyright (c) 2007 Oliver Neukum * Copyright (c) 2007 Greg Kroah-Hartman <gregkh@suse.de> */ #include <linux/moduleparam.h> #include <linux/usb.h> #include <linux/usb/quirks.h> #include <linux/usb/hcd.h> #include "usb.h" struct quirk_entry { u16 vid; u16 pid; u32 flags; }; static DEFINE_MUTEX(quirk_mutex); static struct quirk_entry *quirk_list; static unsigned int quirk_count; static char quirks_param[128]; static int quirks_param_set(const char *value, const struct kernel_param *kp) { char *val, *p, *field; u16 vid, pid; u32 flags; size_t i; int err; val = kstrdup(value, GFP_KERNEL); if (!val) return -ENOMEM; err = param_set_copystring(val, kp); if (err) { kfree(val); return err; } mutex_lock(&quirk_mutex); if (!*val) { quirk_count = 0; kfree(quirk_list); quirk_list = NULL; goto unlock; } for (quirk_count = 1, i = 0; val[i]; i++) if (val[i] == ',') quirk_count++; if (quirk_list) { kfree(quirk_list); quirk_list = NULL; } quirk_list = kcalloc(quirk_count, sizeof(struct quirk_entry), GFP_KERNEL); if (!quirk_list) { quirk_count = 0; mutex_unlock(&quirk_mutex); kfree(val); return -ENOMEM; } for (i = 0, p = val; p && *p;) { /* Each entry consists of VID:PID:flags */ field = strsep(&p, ":"); if (!field) break; if (kstrtou16(field, 16, &vid)) break; field = strsep(&p, ":"); if (!field) break; if (kstrtou16(field, 16, &pid)) break; field = strsep(&p, ","); if (!field || !*field) break; /* Collect the flags */ for (flags = 0; *field; field++) { switch (*field) { case 'a': flags |= USB_QUIRK_STRING_FETCH_255; break; case 'b': flags |= USB_QUIRK_RESET_RESUME; break; case 'c': flags |= USB_QUIRK_NO_SET_INTF; break; case 'd': flags |= USB_QUIRK_CONFIG_INTF_STRINGS; break; case 'e': flags |= USB_QUIRK_RESET; break; case 'f': flags |= USB_QUIRK_HONOR_BNUMINTERFACES; break; case 'g': flags |= USB_QUIRK_DELAY_INIT; break; case 'h': flags |= USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL; break; case 'i': flags |= USB_QUIRK_DEVICE_QUALIFIER; break; case 'j': flags |= USB_QUIRK_IGNORE_REMOTE_WAKEUP; break; case 'k': flags |= USB_QUIRK_NO_LPM; break; case 'l': flags |= USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL; break; case 'm': flags |= USB_QUIRK_DISCONNECT_SUSPEND; break; case 'n': flags |= USB_QUIRK_DELAY_CTRL_MSG; break; case 'o': flags |= USB_QUIRK_HUB_SLOW_RESET; break; case 'p': flags |= USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT; break; /* Ignore unrecognized flag characters */ } } quirk_list[i++] = (struct quirk_entry) { .vid = vid, .pid = pid, .flags = flags }; } if (i < quirk_count) quirk_count = i; unlock: mutex_unlock(&quirk_mutex); kfree(val); return 0; } static const struct kernel_param_ops quirks_param_ops = { .set = quirks_param_set, .get = param_get_string, }; static struct kparam_string quirks_param_string = { .maxlen = sizeof(quirks_param), .string = quirks_param, }; device_param_cb(quirks, &quirks_param_ops, &quirks_param_string, 0644); MODULE_PARM_DESC(quirks, "Add/modify USB quirks by specifying quirks=vendorID:productID:quirks"); /* Lists of quirky USB devices, split in device quirks and interface quirks. * Device quirks are applied at the very beginning of the enumeration process, * right after reading the device descriptor. They can thus only match on device * information. * * Interface quirks are applied after reading all the configuration descriptors. * They can match on both device and interface information. * * Note that the DELAY_INIT and HONOR_BNUMINTERFACES quirks do not make sense as * interface quirks, as they only influence the enumeration process which is run * before processing the interface quirks. * * Please keep the lists ordered by: * 1) Vendor ID * 2) Product ID * 3) Class ID */ static const struct usb_device_id usb_quirk_list[] = { /* CBM - Flash disk */ { USB_DEVICE(0x0204, 0x6025), .driver_info = USB_QUIRK_RESET_RESUME }, /* WORLDE Controller KS49 or Prodipe MIDI 49C USB controller */ { USB_DEVICE(0x0218, 0x0201), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* WORLDE easy key (easykey.25) MIDI controller */ { USB_DEVICE(0x0218, 0x0401), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* HP 5300/5370C scanner */ { USB_DEVICE(0x03f0, 0x0701), .driver_info = USB_QUIRK_STRING_FETCH_255 }, /* HP v222w 16GB Mini USB Drive */ { USB_DEVICE(0x03f0, 0x3f40), .driver_info = USB_QUIRK_DELAY_INIT }, /* Creative SB Audigy 2 NX */ { USB_DEVICE(0x041e, 0x3020), .driver_info = USB_QUIRK_RESET_RESUME }, /* USB3503 */ { USB_DEVICE(0x0424, 0x3503), .driver_info = USB_QUIRK_RESET_RESUME }, /* Microsoft Wireless Laser Mouse 6000 Receiver */ { USB_DEVICE(0x045e, 0x00e1), .driver_info = USB_QUIRK_RESET_RESUME }, /* Microsoft LifeCam-VX700 v2.0 */ { USB_DEVICE(0x045e, 0x0770), .driver_info = USB_QUIRK_RESET_RESUME }, /* Microsoft Surface Dock Ethernet (RTL8153 GigE) */ { USB_DEVICE(0x045e, 0x07c6), .driver_info = USB_QUIRK_NO_LPM }, /* Cherry Stream G230 2.0 (G85-231) and 3.0 (G85-232) */ { USB_DEVICE(0x046a, 0x0023), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech HD Webcam C270 */ { USB_DEVICE(0x046d, 0x0825), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech HD Pro Webcams C920, C920-C, C922, C925e and C930e */ { USB_DEVICE(0x046d, 0x082d), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x0841), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x0843), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x085b), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x085c), .driver_info = USB_QUIRK_DELAY_INIT }, /* Logitech ConferenceCam CC3000e */ { USB_DEVICE(0x046d, 0x0847), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x0848), .driver_info = USB_QUIRK_DELAY_INIT }, /* Logitech PTZ Pro Camera */ { USB_DEVICE(0x046d, 0x0853), .driver_info = USB_QUIRK_DELAY_INIT }, /* Logitech Screen Share */ { USB_DEVICE(0x046d, 0x086c), .driver_info = USB_QUIRK_NO_LPM }, /* Logitech Quickcam Fusion */ { USB_DEVICE(0x046d, 0x08c1), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Quickcam Orbit MP */ { USB_DEVICE(0x046d, 0x08c2), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Quickcam Pro for Notebook */ { USB_DEVICE(0x046d, 0x08c3), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Quickcam Pro 5000 */ { USB_DEVICE(0x046d, 0x08c5), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Quickcam OEM Dell Notebook */ { USB_DEVICE(0x046d, 0x08c6), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Quickcam OEM Cisco VT Camera II */ { USB_DEVICE(0x046d, 0x08c7), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Harmony 700-series */ { USB_DEVICE(0x046d, 0xc122), .driver_info = USB_QUIRK_DELAY_INIT }, /* Philips PSC805 audio device */ { USB_DEVICE(0x0471, 0x0155), .driver_info = USB_QUIRK_RESET_RESUME }, /* Plantronic Audio 655 DSP */ { USB_DEVICE(0x047f, 0xc008), .driver_info = USB_QUIRK_RESET_RESUME }, /* Plantronic Audio 648 USB */ { USB_DEVICE(0x047f, 0xc013), .driver_info = USB_QUIRK_RESET_RESUME }, /* Artisman Watchdog Dongle */ { USB_DEVICE(0x04b4, 0x0526), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Microchip Joss Optical infrared touchboard device */ { USB_DEVICE(0x04d8, 0x000c), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* CarrolTouch 4000U */ { USB_DEVICE(0x04e7, 0x0009), .driver_info = USB_QUIRK_RESET_RESUME }, /* CarrolTouch 4500U */ { USB_DEVICE(0x04e7, 0x0030), .driver_info = USB_QUIRK_RESET_RESUME }, /* Samsung Android phone modem - ID conflict with SPH-I500 */ { USB_DEVICE(0x04e8, 0x6601), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Elan Touchscreen */ { USB_DEVICE(0x04f3, 0x0089), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, { USB_DEVICE(0x04f3, 0x009b), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, { USB_DEVICE(0x04f3, 0x010c), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, { USB_DEVICE(0x04f3, 0x0125), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, { USB_DEVICE(0x04f3, 0x016f), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, { USB_DEVICE(0x04f3, 0x0381), .driver_info = USB_QUIRK_NO_LPM }, { USB_DEVICE(0x04f3, 0x21b8), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, /* Roland SC-8820 */ { USB_DEVICE(0x0582, 0x0007), .driver_info = USB_QUIRK_RESET_RESUME }, /* Edirol SD-20 */ { USB_DEVICE(0x0582, 0x0027), .driver_info = USB_QUIRK_RESET_RESUME }, /* Alcor Micro Corp. Hub */ { USB_DEVICE(0x058f, 0x9254), .driver_info = USB_QUIRK_RESET_RESUME }, /* appletouch */ { USB_DEVICE(0x05ac, 0x021a), .driver_info = USB_QUIRK_RESET_RESUME }, /* Genesys Logic hub, internally used by KY-688 USB 3.1 Type-C Hub */ { USB_DEVICE(0x05e3, 0x0612), .driver_info = USB_QUIRK_NO_LPM }, /* ELSA MicroLink 56K */ { USB_DEVICE(0x05cc, 0x2267), .driver_info = USB_QUIRK_RESET_RESUME }, /* Genesys Logic hub, internally used by Moshi USB to Ethernet Adapter */ { USB_DEVICE(0x05e3, 0x0616), .driver_info = USB_QUIRK_NO_LPM }, /* Avision AV600U */ { USB_DEVICE(0x0638, 0x0a13), .driver_info = USB_QUIRK_STRING_FETCH_255 }, /* Saitek Cyborg Gold Joystick */ { USB_DEVICE(0x06a3, 0x0006), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Agfa SNAPSCAN 1212U */ { USB_DEVICE(0x06bd, 0x0001), .driver_info = USB_QUIRK_RESET_RESUME }, /* Guillemot Webcam Hercules Dualpix Exchange (2nd ID) */ { USB_DEVICE(0x06f8, 0x0804), .driver_info = USB_QUIRK_RESET_RESUME }, /* Guillemot Webcam Hercules Dualpix Exchange*/ { USB_DEVICE(0x06f8, 0x3005), .driver_info = USB_QUIRK_RESET_RESUME }, /* Guillemot Hercules DJ Console audio card (BZ 208357) */ { USB_DEVICE(0x06f8, 0xb000), .driver_info = USB_QUIRK_ENDPOINT_IGNORE }, /* Midiman M-Audio Keystation 88es */ { USB_DEVICE(0x0763, 0x0192), .driver_info = USB_QUIRK_RESET_RESUME }, /* SanDisk Ultra Fit and Ultra Flair */ { USB_DEVICE(0x0781, 0x5583), .driver_info = USB_QUIRK_NO_LPM }, { USB_DEVICE(0x0781, 0x5591), .driver_info = USB_QUIRK_NO_LPM }, /* Realforce 87U Keyboard */ { USB_DEVICE(0x0853, 0x011b), .driver_info = USB_QUIRK_NO_LPM }, /* M-Systems Flash Disk Pioneers */ { USB_DEVICE(0x08ec, 0x1000), .driver_info = USB_QUIRK_RESET_RESUME }, /* Baum Vario Ultra */ { USB_DEVICE(0x0904, 0x6101), .driver_info = USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL }, { USB_DEVICE(0x0904, 0x6102), .driver_info = USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL }, { USB_DEVICE(0x0904, 0x6103), .driver_info = USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL }, /* Sound Devices USBPre2 */ { USB_DEVICE(0x0926, 0x0202), .driver_info = USB_QUIRK_ENDPOINT_IGNORE }, /* Sound Devices MixPre-D */ { USB_DEVICE(0x0926, 0x0208), .driver_info = USB_QUIRK_ENDPOINT_IGNORE }, /* Keytouch QWERTY Panel keyboard */ { USB_DEVICE(0x0926, 0x3333), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Kingston DataTraveler 3.0 */ { USB_DEVICE(0x0951, 0x1666), .driver_info = USB_QUIRK_NO_LPM }, /* NVIDIA Jetson devices in Force Recovery mode */ { USB_DEVICE(0x0955, 0x7018), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7019), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7418), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7721), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7c18), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7e19), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7f21), .driver_info = USB_QUIRK_RESET_RESUME }, /* X-Rite/Gretag-Macbeth Eye-One Pro display colorimeter */ { USB_DEVICE(0x0971, 0x2000), .driver_info = USB_QUIRK_NO_SET_INTF }, /* ELMO L-12F document camera */ { USB_DEVICE(0x09a1, 0x0028), .driver_info = USB_QUIRK_DELAY_CTRL_MSG }, /* Broadcom BCM92035DGROM BT dongle */ { USB_DEVICE(0x0a5c, 0x2021), .driver_info = USB_QUIRK_RESET_RESUME }, /* MAYA44USB sound device */ { USB_DEVICE(0x0a92, 0x0091), .driver_info = USB_QUIRK_RESET_RESUME }, /* ASUS Base Station(T100) */ { USB_DEVICE(0x0b05, 0x17e0), .driver_info = USB_QUIRK_IGNORE_REMOTE_WAKEUP }, /* Realtek Semiconductor Corp. Mass Storage Device (Multicard Reader)*/ { USB_DEVICE(0x0bda, 0x0151), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Realtek hub in Dell WD19 (Type-C) */ { USB_DEVICE(0x0bda, 0x0487), .driver_info = USB_QUIRK_NO_LPM }, /* Generic RTL8153 based ethernet adapters */ { USB_DEVICE(0x0bda, 0x8153), .driver_info = USB_QUIRK_NO_LPM }, /* SONiX USB DEVICE Touchpad */ { USB_DEVICE(0x0c45, 0x7056), .driver_info = USB_QUIRK_IGNORE_REMOTE_WAKEUP }, /* Action Semiconductor flash disk */ { USB_DEVICE(0x10d6, 0x2200), .driver_info = USB_QUIRK_STRING_FETCH_255 }, /* novation SoundControl XL */ { USB_DEVICE(0x1235, 0x0061), .driver_info = USB_QUIRK_RESET_RESUME }, /* Focusrite Scarlett Solo USB */ { USB_DEVICE(0x1235, 0x8211), .driver_info = USB_QUIRK_DISCONNECT_SUSPEND }, /* Huawei 4G LTE module */ { USB_DEVICE(0x12d1, 0x15bb), .driver_info = USB_QUIRK_DISCONNECT_SUSPEND }, { USB_DEVICE(0x12d1, 0x15c3), .driver_info = USB_QUIRK_DISCONNECT_SUSPEND }, /* SKYMEDI USB_DRIVE */ { USB_DEVICE(0x1516, 0x8628), .driver_info = USB_QUIRK_RESET_RESUME }, /* Razer - Razer Blade Keyboard */ { USB_DEVICE(0x1532, 0x0116), .driver_info = USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL }, /* Lenovo ThinkPad OneLink+ Dock twin hub controllers (VIA Labs VL812) */ { USB_DEVICE(0x17ef, 0x1018), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x17ef, 0x1019), .driver_info = USB_QUIRK_RESET_RESUME }, /* Lenovo USB-C to Ethernet Adapter RTL8153-04 */ { USB_DEVICE(0x17ef, 0x720c), .driver_info = USB_QUIRK_NO_LPM }, /* Lenovo Powered USB-C Travel Hub (4X90S92381, RTL8153 GigE) */ { USB_DEVICE(0x17ef, 0x721e), .driver_info = USB_QUIRK_NO_LPM }, /* Lenovo ThinkCenter A630Z TI024Gen3 usb-audio */ { USB_DEVICE(0x17ef, 0xa012), .driver_info = USB_QUIRK_DISCONNECT_SUSPEND }, /* Lenovo ThinkPad USB-C Dock Gen2 Ethernet (RTL8153 GigE) */ { USB_DEVICE(0x17ef, 0xa387), .driver_info = USB_QUIRK_NO_LPM }, /* BUILDWIN Photo Frame */ { USB_DEVICE(0x1908, 0x1315), .driver_info = USB_QUIRK_HONOR_BNUMINTERFACES }, /* Protocol and OTG Electrical Test Device */ { USB_DEVICE(0x1a0a, 0x0200), .driver_info = USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL }, /* Terminus Technology Inc. Hub */ { USB_DEVICE(0x1a40, 0x0101), .driver_info = USB_QUIRK_HUB_SLOW_RESET }, /* Corsair K70 RGB */ { USB_DEVICE(0x1b1c, 0x1b13), .driver_info = USB_QUIRK_DELAY_INIT | USB_QUIRK_DELAY_CTRL_MSG }, /* Corsair Strafe */ { USB_DEVICE(0x1b1c, 0x1b15), .driver_info = USB_QUIRK_DELAY_INIT | USB_QUIRK_DELAY_CTRL_MSG }, /* Corsair Strafe RGB */ { USB_DEVICE(0x1b1c, 0x1b20), .driver_info = USB_QUIRK_DELAY_INIT | USB_QUIRK_DELAY_CTRL_MSG }, /* Corsair K70 LUX RGB */ { USB_DEVICE(0x1b1c, 0x1b33), .driver_info = USB_QUIRK_DELAY_INIT }, /* Corsair K70 LUX */ { USB_DEVICE(0x1b1c, 0x1b36), .driver_info = USB_QUIRK_DELAY_INIT }, /* Corsair K70 RGB RAPDIFIRE */ { USB_DEVICE(0x1b1c, 0x1b38), .driver_info = USB_QUIRK_DELAY_INIT | USB_QUIRK_DELAY_CTRL_MSG }, /* MIDI keyboard WORLDE MINI */ { USB_DEVICE(0x1c75, 0x0204), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Acer C120 LED Projector */ { USB_DEVICE(0x1de1, 0xc102), .driver_info = USB_QUIRK_NO_LPM }, /* Blackmagic Design Intensity Shuttle */ { USB_DEVICE(0x1edb, 0xbd3b), .driver_info = USB_QUIRK_NO_LPM }, /* Blackmagic Design UltraStudio SDI */ { USB_DEVICE(0x1edb, 0xbd4f), .driver_info = USB_QUIRK_NO_LPM }, /* Hauppauge HVR-950q */ { USB_DEVICE(0x2040, 0x7200), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Raydium Touchscreen */ { USB_DEVICE(0x2386, 0x3114), .driver_info = USB_QUIRK_NO_LPM }, { USB_DEVICE(0x2386, 0x3119), .driver_info = USB_QUIRK_NO_LPM }, { USB_DEVICE(0x2386, 0x350e), .driver_info = USB_QUIRK_NO_LPM }, /* APTIV AUTOMOTIVE HUB */ { USB_DEVICE(0x2c48, 0x0132), .driver_info = USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT }, /* DJI CineSSD */ { USB_DEVICE(0x2ca3, 0x0031), .driver_info = USB_QUIRK_NO_LPM }, /* Alcor Link AK9563 SC Reader used in 2022 Lenovo ThinkPads */ { USB_DEVICE(0x2ce3, 0x9563), .driver_info = USB_QUIRK_NO_LPM }, /* DELL USB GEN2 */ { USB_DEVICE(0x413c, 0xb062), .driver_info = USB_QUIRK_NO_LPM | USB_QUIRK_RESET_RESUME }, /* VCOM device */ { USB_DEVICE(0x4296, 0x7570), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* INTEL VALUE SSD */ { USB_DEVICE(0x8086, 0xf1a5), .driver_info = USB_QUIRK_RESET_RESUME }, { } /* terminating entry must be last */ }; static const struct usb_device_id usb_interface_quirk_list[] = { /* Logitech UVC Cameras */ { USB_VENDOR_AND_INTERFACE_INFO(0x046d, USB_CLASS_VIDEO, 1, 0), .driver_info = USB_QUIRK_RESET_RESUME }, { } /* terminating entry must be last */ }; static const struct usb_device_id usb_amd_resume_quirk_list[] = { /* Lenovo Mouse with Pixart controller */ { USB_DEVICE(0x17ef, 0x602e), .driver_info = USB_QUIRK_RESET_RESUME }, /* Pixart Mouse */ { USB_DEVICE(0x093a, 0x2500), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x093a, 0x2510), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x093a, 0x2521), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x03f0, 0x2b4a), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Optical Mouse M90/M100 */ { USB_DEVICE(0x046d, 0xc05a), .driver_info = USB_QUIRK_RESET_RESUME }, { } /* terminating entry must be last */ }; /* * Entries for endpoints that should be ignored when parsing configuration * descriptors. * * Matched for devices with USB_QUIRK_ENDPOINT_IGNORE. */ static const struct usb_device_id usb_endpoint_ignore[] = { { USB_DEVICE_INTERFACE_NUMBER(0x06f8, 0xb000, 5), .driver_info = 0x01 }, { USB_DEVICE_INTERFACE_NUMBER(0x06f8, 0xb000, 5), .driver_info = 0x81 }, { USB_DEVICE_INTERFACE_NUMBER(0x0926, 0x0202, 1), .driver_info = 0x85 }, { USB_DEVICE_INTERFACE_NUMBER(0x0926, 0x0208, 1), .driver_info = 0x85 }, { } }; bool usb_endpoint_is_ignored(struct usb_device *udev, struct usb_host_interface *intf, struct usb_endpoint_descriptor *epd) { const struct usb_device_id *id; unsigned int address; for (id = usb_endpoint_ignore; id->match_flags; ++id) { if (!usb_match_device(udev, id)) continue; if (!usb_match_one_id_intf(udev, intf, id)) continue; address = id->driver_info; if (address == epd->bEndpointAddress) return true; } return false; } static bool usb_match_any_interface(struct usb_device *udev, const struct usb_device_id *id) { unsigned int i; for (i = 0; i < udev->descriptor.bNumConfigurations; ++i) { struct usb_host_config *cfg = &udev->config[i]; unsigned int j; for (j = 0; j < cfg->desc.bNumInterfaces; ++j) { struct usb_interface_cache *cache; struct usb_host_interface *intf; cache = cfg->intf_cache[j]; if (cache->num_altsetting == 0) continue; intf = &cache->altsetting[0]; if (usb_match_one_id_intf(udev, intf, id)) return true; } } return false; } static int usb_amd_resume_quirk(struct usb_device *udev) { struct usb_hcd *hcd; hcd = bus_to_hcd(udev->bus); /* The device should be attached directly to root hub */ if (udev->level == 1 && hcd->amd_resume_bug == 1) return 1; return 0; } static u32 usb_detect_static_quirks(struct usb_device *udev, const struct usb_device_id *id) { u32 quirks = 0; for (; id->match_flags; id++) { if (!usb_match_device(udev, id)) continue; if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_INFO) && !usb_match_any_interface(udev, id)) continue; quirks |= (u32)(id->driver_info); } return quirks; } static u32 usb_detect_dynamic_quirks(struct usb_device *udev) { u16 vid = le16_to_cpu(udev->descriptor.idVendor); u16 pid = le16_to_cpu(udev->descriptor.idProduct); int i, flags = 0; mutex_lock(&quirk_mutex); for (i = 0; i < quirk_count; i++) { if (vid == quirk_list[i].vid && pid == quirk_list[i].pid) { flags = quirk_list[i].flags; break; } } mutex_unlock(&quirk_mutex); return flags; } /* * Detect any quirks the device has, and do any housekeeping for it if needed. */ void usb_detect_quirks(struct usb_device *udev) { udev->quirks = usb_detect_static_quirks(udev, usb_quirk_list); /* * Pixart-based mice would trigger remote wakeup issue on AMD * Yangtze chipset, so set them as RESET_RESUME flag. */ if (usb_amd_resume_quirk(udev)) udev->quirks |= usb_detect_static_quirks(udev, usb_amd_resume_quirk_list); udev->quirks ^= usb_detect_dynamic_quirks(udev); if (udev->quirks) dev_dbg(&udev->dev, "USB quirks for this device: %x\n", udev->quirks); #ifdef CONFIG_USB_DEFAULT_PERSIST if (!(udev->quirks & USB_QUIRK_RESET)) udev->persist_enabled = 1; #else /* Hubs are automatically enabled for USB-PERSIST */ if (udev->descriptor.bDeviceClass == USB_CLASS_HUB) udev->persist_enabled = 1; #endif /* CONFIG_USB_DEFAULT_PERSIST */ } void usb_detect_interface_quirks(struct usb_device *udev) { u32 quirks; quirks = usb_detect_static_quirks(udev, usb_interface_quirk_list); if (quirks == 0) return; dev_dbg(&udev->dev, "USB interface quirks for this device: %x\n", quirks); udev->quirks |= quirks; } void usb_release_quirk_list(void) { mutex_lock(&quirk_mutex); kfree(quirk_list); quirk_list = NULL; mutex_unlock(&quirk_mutex); } |
| 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 32 32 32 3 32 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 4 |