Total coverage: 207559 (12%) of 1836104
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Linux NET3: IP/IP protocol decoder modified to support
 *		    virtual tunnel interface
 *
 *	Authors:
 *		Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012
 */

/*
   This version of net/ipv4/ip_vti.c is cloned of net/ipv4/ipip.c

   For comments look at net/ipv4/ip_gre.c --ANK
 */

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include
<linux/in.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/netfilter_ipv4.h> #include <linux/if_ether.h> #include <linux/icmpv6.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/ip_tunnels.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> static struct rtnl_link_ops vti_link_ops __read_mostly; static unsigned int vti_net_id __read_mostly; static int vti_tunnel_init(struct net_device *dev); static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type, bool update_skb_dev) { struct ip_tunnel *tunnel; const struct iphdr *iph = ip_hdr(skb); struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, vti_net_id); IP_TUNNEL_DECLARE_FLAGS(flags) = { }; __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, 0); if (tunnel) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; if (update_skb_dev) skb->dev = tunnel->dev; return xfrm_input(skb, nexthdr, spi, encap_type); } return -EINVAL; drop: kfree_skb(skb); return 0; } static int vti_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { return vti_input(skb, nexthdr, spi, encap_type, false); } static int vti_rcv(struct sk_buff *skb, __be32 spi, bool update_skb_dev) { XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); return vti_input(skb, ip_hdr(skb)->protocol, spi, 0, update_skb_dev); } static int vti_rcv_proto(struct sk_buff *skb) { return vti_rcv(skb, 0, false); } static int vti_rcv_cb(struct sk_buff *skb, int err) { unsigned short family; struct net_device *dev; struct xfrm_state *x; const struct xfrm_mode *inner_mode; struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4; u32 orig_mark = skb->mark; int ret; if (!tunnel) return 1; dev = tunnel->dev; if (err) { DEV_STATS_INC(dev, rx_errors); DEV_STATS_INC(dev, rx_dropped); return 0; } x = xfrm_input_state(skb); inner_mode = &x->inner_mode; if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); if (inner_mode == NULL) { XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINSTATEMODEERROR); return -EINVAL; } } family = inner_mode->family; skb->mark = be32_to_cpu(tunnel->parms.i_key); ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); skb->mark = orig_mark; if (!ret) return -EPERM; skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev))); skb->dev = dev; dev_sw_netstats_rx_add(dev, skb->len); return 0; } static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src) { xfrm_address_t *daddr = (xfrm_address_t *)&dst; xfrm_address_t *saddr = (xfrm_address_t *)&src; /* if there is no transform then this tunnel is not functional. * Or if the xfrm is not mode tunnel. 
*/ if (!x || x->props.mode != XFRM_MODE_TUNNEL || x->props.family != AF_INET) return false; if (!dst) return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET); if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET)) return false; return true; } static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_parm_kern *parms = &tunnel->parms; struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ int pkt_len = skb->len; int err; int mtu; if (!dst) { switch (skb->protocol) { case htons(ETH_P_IP): { struct rtable *rt; fl->u.ip4.flowi4_oif = dev->ifindex; fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } dst = &rt->dst; skb_dst_set(skb, dst); break; } #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): fl->u.ip6.flowi6_oif = dev->ifindex; fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6); if (dst->error) { dst_release(dst); dst = NULL; DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } skb_dst_set(skb, dst); break; #endif default: DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } } dst_hold(dst); dst = xfrm_lookup_route(tunnel->net, dst, fl, NULL, 0); if (IS_ERR(dst)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } if (dst->flags & DST_XFRM_QUEUE) goto xmit; if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) { DEV_STATS_INC(dev, tx_carrier_errors); dst_release(dst); goto tx_error_icmp; } tdev = dst->dev; if (tdev == dev) { dst_release(dst); DEV_STATS_INC(dev, collisions); goto tx_error; } mtu = dst_mtu(dst); if (skb->len > mtu) { skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { if (!(ip_hdr(skb)->frag_off & htons(IP_DF))) goto xmit; icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); } else { if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); } dst_release(dst); goto tx_error; } xmit: skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; err = dst_output(tunnel->net, skb->sk, skb); if (net_xmit_eval(err) == 0) err = pkt_len; iptunnel_xmit_stats(dev, err); return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); tx_error: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return NETDEV_TX_OK; } /* This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. 
*/ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct flowi fl; if (!pskb_inet_may_pull(skb)) goto tx_err; memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { case htons(ETH_P_IP): memset(IPCB(skb), 0, sizeof(*IPCB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET); break; case htons(ETH_P_IPV6): memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6); break; default: goto tx_err; } /* override mark with tunnel output key */ fl.flowi_mark = be32_to_cpu(tunnel->parms.o_key); return vti_xmit(skb, dev, &fl); tx_err: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return NETDEV_TX_OK; } static int vti4_err(struct sk_buff *skb, u32 info) { __be32 spi; __u32 mark; struct xfrm_state *x; struct ip_tunnel *tunnel; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah ; struct ip_comp_hdr *ipch; struct net *net = dev_net(skb->dev); const struct iphdr *iph = (const struct iphdr *)skb->data; int protocol = iph->protocol; struct ip_tunnel_net *itn = net_generic(net, vti_net_id); IP_TUNNEL_DECLARE_FLAGS(flags) = { }; __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->daddr, iph->saddr, 0); if (!tunnel) return -1; mark = be32_to_cpu(tunnel->parms.o_key); switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; break; case ICMP_REDIRECT: break; default: return 0; } x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET); if (!x) return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, protocol); else ipv4_redirect(skb, net, 0, protocol); xfrm_state_put(x); return 0; } static int vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { IP_TUNNEL_DECLARE_FLAGS(flags) = { }; int err = 0; if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p->iph.version != 4 || p->iph.protocol != IPPROTO_IPIP || p->iph.ihl != 5) return -EINVAL; } if (!ip_tunnel_flags_is_be16_compat(p->i_flags) || !ip_tunnel_flags_is_be16_compat(p->o_flags)) return -EOVERFLOW; if (!(ip_tunnel_flags_to_be16(p->i_flags) & GRE_KEY)) p->i_key = 0; if (!(ip_tunnel_flags_to_be16(p->o_flags) & GRE_KEY)) p->o_key = 0; __set_bit(IP_TUNNEL_VTI_BIT, flags); ip_tunnel_flags_copy(p->i_flags, flags); err = ip_tunnel_ctl(dev, p, cmd); if (err) return err; if (cmd != SIOCDELTUNNEL) { ip_tunnel_flags_from_be16(flags, GRE_KEY); ip_tunnel_flags_or(p->i_flags, p->i_flags, flags); ip_tunnel_flags_or(p->o_flags, p->o_flags, flags); } return 0; } static const struct net_device_ops vti_netdev_ops = { .ndo_init = vti_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = vti_tunnel_xmit, .ndo_siocdevprivate = ip_tunnel_siocdevprivate, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = vti_tunnel_ctl, }; static void vti_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &vti_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->type = ARPHRD_TUNNEL; ip_tunnel_setup(dev, vti_net_id); } static int 
vti_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; __dev_addr_set(dev, &iph->saddr, 4); memcpy(dev->broadcast, &iph->daddr, 4); dev->flags = IFF_NOARP; dev->addr_len = 4; dev->lltx = true; netif_keep_dst(dev); return ip_tunnel_init(dev); } static void __net_init vti_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; iph->version = 4; iph->protocol = IPPROTO_IPIP; iph->ihl = 5; } static struct xfrm4_protocol vti_esp4_protocol __read_mostly = { .handler = vti_rcv_proto, .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; static struct xfrm4_protocol vti_ah4_protocol __read_mostly = { .handler = vti_rcv_proto, .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { .handler = vti_rcv_proto, .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) static int vti_rcv_tunnel(struct sk_buff *skb) { XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); return vti_input(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr, 0, false); } static struct xfrm_tunnel vti_ipip_handler __read_mostly = { .handler = vti_rcv_tunnel, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 0, }; #if IS_ENABLED(CONFIG_IPV6) static struct xfrm_tunnel vti_ipip6_handler __read_mostly = { .handler = vti_rcv_tunnel, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 0, }; #endif #endif static int __net_init vti_init_net(struct net *net) { int err; struct ip_tunnel_net *itn; err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0"); if (err) return err; itn = net_generic(net, vti_net_id); if (itn->fb_tunnel_dev) vti_fb_tunnel_init(itn->fb_tunnel_dev); return 0; } static void __net_exit vti_exit_batch_rtnl(struct list_head *list_net, struct list_head *dev_to_kill) { ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops, dev_to_kill); } static struct pernet_operations vti_net_ops = { .init = vti_init_net, .exit_batch_rtnl = vti_exit_batch_rtnl, .id = &vti_net_id, .size = sizeof(struct ip_tunnel_net), }; static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { return 0; } static void vti_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); parms->iph.protocol = IPPROTO_IPIP; if (!data) return; __set_bit(IP_TUNNEL_VTI_BIT, parms->i_flags); if (data[IFLA_VTI_LINK]) parms->link = nla_get_u32(data[IFLA_VTI_LINK]); if (data[IFLA_VTI_IKEY]) parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]); if (data[IFLA_VTI_OKEY]) parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]); if (data[IFLA_VTI_LOCAL]) parms->iph.saddr = nla_get_in_addr(data[IFLA_VTI_LOCAL]); if (data[IFLA_VTI_REMOTE]) parms->iph.daddr = nla_get_in_addr(data[IFLA_VTI_REMOTE]); if (data[IFLA_VTI_FWMARK]) *fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]); } static int vti_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel_parm_kern parms; __u32 fwmark = 0; vti_netlink_parms(data, &parms, &fwmark); return ip_tunnel_newlink(dev, tb, &parms, fwmark); } static int vti_changelink(struct net_device *dev, 
struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm_kern p; __u32 fwmark = t->fwmark; vti_netlink_parms(data, &p, &fwmark); return ip_tunnel_changelink(dev, tb, &p, fwmark); } static size_t vti_get_size(const struct net_device *dev) { return /* IFLA_VTI_LINK */ nla_total_size(4) + /* IFLA_VTI_IKEY */ nla_total_size(4) + /* IFLA_VTI_OKEY */ nla_total_size(4) + /* IFLA_VTI_LOCAL */ nla_total_size(4) + /* IFLA_VTI_REMOTE */ nla_total_size(4) + /* IFLA_VTI_FWMARK */ nla_total_size(4) + 0; } static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm_kern *p = &t->parms; if (nla_put_u32(skb, IFLA_VTI_LINK, p->link) || nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key) || nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key) || nla_put_in_addr(skb, IFLA_VTI_LOCAL, p->iph.saddr) || nla_put_in_addr(skb, IFLA_VTI_REMOTE, p->iph.daddr) || nla_put_u32(skb, IFLA_VTI_FWMARK, t->fwmark)) return -EMSGSIZE; return 0; } static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = { [IFLA_VTI_LINK] = { .type = NLA_U32 }, [IFLA_VTI_IKEY] = { .type = NLA_U32 }, [IFLA_VTI_OKEY] = { .type = NLA_U32 }, [IFLA_VTI_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) }, [IFLA_VTI_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) }, [IFLA_VTI_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops vti_link_ops __read_mostly = { .kind = "vti", .maxtype = IFLA_VTI_MAX, .policy = vti_policy, .priv_size = sizeof(struct ip_tunnel), .setup = vti_tunnel_setup, .validate = vti_tunnel_validate, .newlink = vti_newlink, .changelink = vti_changelink, .dellink = ip_tunnel_dellink, .get_size = vti_get_size, .fill_info = vti_fill_info, .get_link_net = ip_tunnel_get_link_net, }; static int __init vti_init(void) { const char *msg; int err; pr_info("IPv4 over IPsec tunneling driver\n"); msg = "tunnel device"; err = register_pernet_device(&vti_net_ops); if (err < 0) goto pernet_dev_failed; msg = "tunnel protocols"; err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) msg = "ipip tunnel"; err = xfrm4_tunnel_register(&vti_ipip_handler, AF_INET); if (err < 0) goto xfrm_tunnel_ipip_failed; #if IS_ENABLED(CONFIG_IPV6) err = xfrm4_tunnel_register(&vti_ipip6_handler, AF_INET6); if (err < 0) goto xfrm_tunnel_ipip6_failed; #endif #endif msg = "netlink interface"; err = rtnl_link_register(&vti_link_ops); if (err < 0) goto rtnl_link_failed; return err; rtnl_link_failed: #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) #if IS_ENABLED(CONFIG_IPV6) xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6); xfrm_tunnel_ipip6_failed: #endif xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET); xfrm_tunnel_ipip_failed: #endif xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); xfrm_proto_comp_failed: xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: unregister_pernet_device(&vti_net_ops); pernet_dev_failed: pr_err("vti init: failed to register %s\n", msg); return err; } static void __exit vti_fini(void) { rtnl_link_unregister(&vti_link_ops); #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) #if 
IS_ENABLED(CONFIG_IPV6) xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6); #endif xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET); #endif xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); unregister_pernet_device(&vti_net_ops); } module_init(vti_init); module_exit(vti_fini); MODULE_DESCRIPTION("Virtual (secure) IP tunneling library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("vti"); MODULE_ALIAS_NETDEV("ip_vti0");
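The path-MTU handling in vti_xmit() above reduces to a small decision: an oversized IPv4 packet triggers ICMP_DEST_UNREACH/ICMP_FRAG_NEEDED only when DF is set (otherwise it is transmitted anyway), while an oversized IPv6 packet always gets ICMPV6_PKT_TOOBIG with the advertised MTU clamped to IPV6_MIN_MTU. The standalone sketch below models only that decision; the enum and function names are illustrative, not kernel APIs.

#include <stdbool.h>
#include <stdio.h>

#define IPV6_MIN_MTU 1280	/* minimum IPv6 link MTU */

enum pmtu_action {
	PMTU_XMIT,		/* packet fits, or IPv4 without DF: transmit */
	PMTU_ICMP_FRAG_NEEDED,	/* IPv4 with DF: ICMP "fragmentation needed" */
	PMTU_ICMPV6_PKT_TOOBIG,	/* IPv6: "packet too big", clamped MTU */
};

static enum pmtu_action pmtu_check(unsigned int pkt_len, unsigned int mtu,
				   bool is_ipv4, bool df_set,
				   unsigned int *icmp_mtu)
{
	if (pkt_len <= mtu)
		return PMTU_XMIT;
	if (is_ipv4) {
		if (!df_set)
			return PMTU_XMIT;	/* the original code jumps to xmit */
		*icmp_mtu = mtu;
		return PMTU_ICMP_FRAG_NEEDED;
	}
	*icmp_mtu = mtu < IPV6_MIN_MTU ? IPV6_MIN_MTU : mtu;
	return PMTU_ICMPV6_PKT_TOOBIG;
}

int main(void)
{
	unsigned int icmp_mtu = 0;

	/* 1500-byte IPv4 packet with DF across a 1400-byte tunnel path */
	if (pmtu_check(1500, 1400, true, true, &icmp_mtu) == PMTU_ICMP_FRAG_NEEDED)
		printf("send ICMP frag-needed advertising mtu=%u\n", icmp_mtu);
	return 0;
}
/* end of standalone sketch */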
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * OSS compatible sequencer driver
 *
 * registration of device and proc
 *
 * Copyright (C) 1998,99 Takashi Iwai <tiwai@suse.de>
 */

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/compat.h>
#include <sound/core.h>
#include <sound/minors.h>
#include <sound/initval.h>
#include "seq_oss_device.h"
#include "seq_oss_synth.h"

/*
 * module option
 */
MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>");
MODULE_DESCRIPTION("OSS-compatible sequencer module");
MODULE_LICENSE("GPL");

/* Takashi says this is really only for sound-service-0-, but this is OK.
*/ MODULE_ALIAS_SNDRV_MINOR(SNDRV_MINOR_OSS_SEQUENCER); MODULE_ALIAS_SNDRV_MINOR(SNDRV_MINOR_OSS_MUSIC); /* * prototypes */ static int register_device(void); static void unregister_device(void); #ifdef CONFIG_SND_PROC_FS static int register_proc(void); static void unregister_proc(void); #else static inline int register_proc(void) { return 0; } static inline void unregister_proc(void) {} #endif static int odev_open(struct inode *inode, struct file *file); static int odev_release(struct inode *inode, struct file *file); static ssize_t odev_read(struct file *file, char __user *buf, size_t count, loff_t *offset); static ssize_t odev_write(struct file *file, const char __user *buf, size_t count, loff_t *offset); static long odev_ioctl(struct file *file, unsigned int cmd, unsigned long arg); static __poll_t odev_poll(struct file *file, poll_table * wait); /* * module interface */ static struct snd_seq_driver seq_oss_synth_driver = { .driver = { .name = KBUILD_MODNAME, .probe = snd_seq_oss_synth_probe, .remove = snd_seq_oss_synth_remove, }, .id = SNDRV_SEQ_DEV_ID_OSS, .argsize = sizeof(struct snd_seq_oss_reg), }; static int __init alsa_seq_oss_init(void) { int rc; rc = register_device(); if (rc < 0) goto error; rc = register_proc(); if (rc < 0) { unregister_device(); goto error; } rc = snd_seq_oss_create_client(); if (rc < 0) { unregister_proc(); unregister_device(); goto error; } rc = snd_seq_driver_register(&seq_oss_synth_driver); if (rc < 0) { snd_seq_oss_delete_client(); unregister_proc(); unregister_device(); goto error; } /* success */ snd_seq_oss_synth_init(); error: return rc; } static void __exit alsa_seq_oss_exit(void) { snd_seq_driver_unregister(&seq_oss_synth_driver); snd_seq_oss_delete_client(); unregister_proc(); unregister_device(); } module_init(alsa_seq_oss_init) module_exit(alsa_seq_oss_exit) /* * ALSA minor device interface */ static DEFINE_MUTEX(register_mutex); static int odev_open(struct inode *inode, struct file *file) { int level, rc; if (iminor(inode) == SNDRV_MINOR_OSS_MUSIC) level = SNDRV_SEQ_OSS_MODE_MUSIC; else level = SNDRV_SEQ_OSS_MODE_SYNTH; mutex_lock(&register_mutex); rc = snd_seq_oss_open(file, level); mutex_unlock(&register_mutex); return rc; } static int odev_release(struct inode *inode, struct file *file) { struct seq_oss_devinfo *dp; dp = file->private_data; if (!dp) return 0; mutex_lock(&register_mutex); snd_seq_oss_release(dp); mutex_unlock(&register_mutex); return 0; } static ssize_t odev_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { struct seq_oss_devinfo *dp; dp = file->private_data; if (snd_BUG_ON(!dp)) return -ENXIO; return snd_seq_oss_read(dp, buf, count); } static ssize_t odev_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct seq_oss_devinfo *dp; dp = file->private_data; if (snd_BUG_ON(!dp)) return -ENXIO; return snd_seq_oss_write(dp, buf, count, file); } static long odev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct seq_oss_devinfo *dp; long rc; dp = file->private_data; if (snd_BUG_ON(!dp)) return -ENXIO; if (cmd != SNDCTL_SEQ_SYNC && mutex_lock_interruptible(&register_mutex)) return -ERESTARTSYS; rc = snd_seq_oss_ioctl(dp, cmd, arg); if (cmd != SNDCTL_SEQ_SYNC) mutex_unlock(&register_mutex); return rc; } #ifdef CONFIG_COMPAT static long odev_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) { return odev_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); } #else #define odev_ioctl_compat NULL #endif static __poll_t odev_poll(struct file 
*file, poll_table * wait) { struct seq_oss_devinfo *dp; dp = file->private_data; if (snd_BUG_ON(!dp)) return EPOLLERR; return snd_seq_oss_poll(dp, file, wait); } /* * registration of sequencer minor device */ static const struct file_operations seq_oss_f_ops = { .owner = THIS_MODULE, .read = odev_read, .write = odev_write, .open = odev_open, .release = odev_release, .poll = odev_poll, .unlocked_ioctl = odev_ioctl, .compat_ioctl = odev_ioctl_compat, .llseek = noop_llseek, }; static int __init register_device(void) { int rc; mutex_lock(&register_mutex); rc = snd_register_oss_device(SNDRV_OSS_DEVICE_TYPE_SEQUENCER, NULL, 0, &seq_oss_f_ops, NULL); if (rc < 0) { pr_err("ALSA: seq_oss: can't register device seq\n"); mutex_unlock(&register_mutex); return rc; } rc = snd_register_oss_device(SNDRV_OSS_DEVICE_TYPE_MUSIC, NULL, 0, &seq_oss_f_ops, NULL); if (rc < 0) { pr_err("ALSA: seq_oss: can't register device music\n"); snd_unregister_oss_device(SNDRV_OSS_DEVICE_TYPE_SEQUENCER, NULL, 0); mutex_unlock(&register_mutex); return rc; } mutex_unlock(&register_mutex); return 0; } static void unregister_device(void) { mutex_lock(&register_mutex); if (snd_unregister_oss_device(SNDRV_OSS_DEVICE_TYPE_MUSIC, NULL, 0) < 0) pr_err("ALSA: seq_oss: error unregister device music\n"); if (snd_unregister_oss_device(SNDRV_OSS_DEVICE_TYPE_SEQUENCER, NULL, 0) < 0) pr_err("ALSA: seq_oss: error unregister device seq\n"); mutex_unlock(&register_mutex); } /* * /proc interface */ #ifdef CONFIG_SND_PROC_FS static struct snd_info_entry *info_entry; static void info_read(struct snd_info_entry *entry, struct snd_info_buffer *buf) { mutex_lock(&register_mutex); snd_iprintf(buf, "OSS sequencer emulation version %s\n", SNDRV_SEQ_OSS_VERSION_STR); snd_seq_oss_system_info_read(buf); snd_seq_oss_synth_info_read(buf); snd_seq_oss_midi_info_read(buf); mutex_unlock(&register_mutex); } static int __init register_proc(void) { struct snd_info_entry *entry; entry = snd_info_create_module_entry(THIS_MODULE, SNDRV_SEQ_OSS_PROCNAME, snd_seq_root); if (entry == NULL) return -ENOMEM; entry->content = SNDRV_INFO_CONTENT_TEXT; entry->private_data = NULL; entry->c.text.read = info_read; if (snd_info_register(entry) < 0) { snd_info_free_entry(entry); return -ENOMEM; } info_entry = entry; return 0; } static void unregister_proc(void) { snd_info_free_entry(info_entry); info_entry = NULL; } #endif /* CONFIG_SND_PROC_FS */
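alsa_seq_oss_init() above acquires its resources in order (OSS device, proc entry, sequencer client, synth driver) and, on failure, releases only what was already acquired, in reverse order. The standalone sketch below reproduces that unwind shape with stub steps; the step names are placeholders, not ALSA APIs.

#include <stdio.h>

static int step_register(const char *name, int should_fail)
{
	printf("register %s\n", name);
	return should_fail ? -1 : 0;
}

static void step_unregister(const char *name)
{
	printf("unregister %s\n", name);
}

/* Mirrors the error-unwind ordering of alsa_seq_oss_init(). */
static int demo_init(int fail_at)
{
	int rc;

	rc = step_register("device", fail_at == 1);
	if (rc < 0)
		goto error;
	rc = step_register("proc", fail_at == 2);
	if (rc < 0) {
		step_unregister("device");
		goto error;
	}
	rc = step_register("client", fail_at == 3);
	if (rc < 0) {
		step_unregister("proc");
		step_unregister("device");
		goto error;
	}
	rc = step_register("driver", fail_at == 4);
	if (rc < 0) {
		step_unregister("client");
		step_unregister("proc");
		step_unregister("device");
		goto error;
	}
	return 0;
error:
	return rc;
}

int main(void)
{
	return demo_init(3) ? 1 : 0;	/* fail at step 3 to see the unwind */
}
/* end of standalone sketch */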
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  net/dccp/options.c
 *
 *  An implementation of the DCCP protocol
 *  Copyright (c) 2005 Aristeu Sergio Rozanski Filho <aris@cathedrallabs.org>
 *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
 *  Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz>
 */
#include <linux/dccp.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/unaligned.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>

#include "ackvec.h"
#include "ccid.h"
#include "dccp.h"
#include "feat.h"

u64 dccp_decode_value_var(const u8 *bf, const u8 len)
{
	u64 value = 0;

	if (len >= DCCP_OPTVAL_MAXLEN)
		value += ((u64)*bf++) << 40;
	if (len > 4)
		value += ((u64)*bf++) << 32;
	if (len > 3)
		value += ((u64)*bf++) << 24;
	if (len > 2)
		value += ((u64)*bf++) << 16;
	if (len > 1)
		value += ((u64)*bf++) << 8;
	if (len > 0)
		value += *bf;

	return value;
}

/**
 * dccp_parse_options - Parse DCCP options present in @skb
 * @sk: client|server|listening dccp socket (when @dreq != NULL)
 * @dreq: request socket to use during connection
setup, or NULL * @skb: frame to parse */ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); const struct dccp_hdr *dh = dccp_hdr(skb); const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); unsigned char *opt_ptr = options; const unsigned char *opt_end = (unsigned char *)dh + (dh->dccph_doff * 4); struct dccp_options_received *opt_recv = &dp->dccps_options_received; unsigned char opt, len; unsigned char *value; u32 elapsed_time; __be32 opt_val; int rc; int mandatory = 0; memset(opt_recv, 0, sizeof(*opt_recv)); opt = len = 0; while (opt_ptr != opt_end) { opt = *opt_ptr++; len = 0; value = NULL; /* Check if this isn't a single byte option */ if (opt > DCCPO_MAX_RESERVED) { if (opt_ptr == opt_end) goto out_nonsensical_length; len = *opt_ptr++; if (len < 2) goto out_nonsensical_length; /* * Remove the type and len fields, leaving * just the value size */ len -= 2; value = opt_ptr; opt_ptr += len; if (opt_ptr > opt_end) goto out_nonsensical_length; } /* * CCID-specific options are ignored during connection setup, as * negotiation may still be in progress (see RFC 4340, 10.3). * The same applies to Ack Vectors, as these depend on the CCID. */ if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC || opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) goto ignore_option; switch (opt) { case DCCPO_PADDING: break; case DCCPO_MANDATORY: if (mandatory) goto out_invalid_option; if (pkt_type != DCCP_PKT_DATA) mandatory = 1; break; case DCCPO_NDP_COUNT: if (len > 6) goto out_invalid_option; opt_recv->dccpor_ndp = dccp_decode_value_var(value, len); dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk), (unsigned long long)opt_recv->dccpor_ndp); break; case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R: if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */ break; if (len == 0) goto out_invalid_option; rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, *value, value + 1, len - 1); if (rc) goto out_featneg_failed; break; case DCCPO_TIMESTAMP: if (len != 4) goto out_invalid_option; /* * RFC 4340 13.1: "The precise time corresponding to * Timestamp Value zero is not specified". We use * zero to indicate absence of a meaningful timestamp. 
*/ opt_val = get_unaligned((__be32 *)value); if (unlikely(opt_val == 0)) { DCCP_WARN("Timestamp with zero value\n"); break; } if (dreq != NULL) { dreq->dreq_timestamp_echo = ntohl(opt_val); dreq->dreq_timestamp_time = dccp_timestamp(); } else { opt_recv->dccpor_timestamp = dp->dccps_timestamp_echo = ntohl(opt_val); dp->dccps_timestamp_time = dccp_timestamp(); } dccp_pr_debug("%s rx opt: TIMESTAMP=%u, ackno=%llu\n", dccp_role(sk), ntohl(opt_val), (unsigned long long) DCCP_SKB_CB(skb)->dccpd_ack_seq); /* schedule an Ack in case this sender is quiescent */ inet_csk_schedule_ack(sk); break; case DCCPO_TIMESTAMP_ECHO: if (len != 4 && len != 6 && len != 8) goto out_invalid_option; opt_val = get_unaligned((__be32 *)value); opt_recv->dccpor_timestamp_echo = ntohl(opt_val); dccp_pr_debug("%s rx opt: TIMESTAMP_ECHO=%u, len=%d, " "ackno=%llu", dccp_role(sk), opt_recv->dccpor_timestamp_echo, len + 2, (unsigned long long) DCCP_SKB_CB(skb)->dccpd_ack_seq); value += 4; if (len == 4) { /* no elapsed time included */ dccp_pr_debug_cat("\n"); break; } if (len == 6) { /* 2-byte elapsed time */ __be16 opt_val2 = get_unaligned((__be16 *)value); elapsed_time = ntohs(opt_val2); } else { /* 4-byte elapsed time */ opt_val = get_unaligned((__be32 *)value); elapsed_time = ntohl(opt_val); } dccp_pr_debug_cat(", ELAPSED_TIME=%u\n", elapsed_time); /* Give precedence to the biggest ELAPSED_TIME */ if (elapsed_time > opt_recv->dccpor_elapsed_time) opt_recv->dccpor_elapsed_time = elapsed_time; break; case DCCPO_ELAPSED_TIME: if (dccp_packet_without_ack(skb)) /* RFC 4340, 13.2 */ break; if (len == 2) { __be16 opt_val2 = get_unaligned((__be16 *)value); elapsed_time = ntohs(opt_val2); } else if (len == 4) { opt_val = get_unaligned((__be32 *)value); elapsed_time = ntohl(opt_val); } else { goto out_invalid_option; } if (elapsed_time > opt_recv->dccpor_elapsed_time) opt_recv->dccpor_elapsed_time = elapsed_time; dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", dccp_role(sk), elapsed_time); break; case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC: if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, pkt_type, opt, value, len)) goto out_invalid_option; break; case DCCPO_ACK_VECTOR_0: case DCCPO_ACK_VECTOR_1: if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ break; /* * Ack vectors are processed by the TX CCID if it is * interested. The RX CCID need not parse Ack Vectors, * since it is only interested in clearing old state. */ fallthrough; case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC: if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, pkt_type, opt, value, len)) goto out_invalid_option; break; default: DCCP_CRIT("DCCP(%p): option %d(len=%d) not " "implemented, ignoring", sk, opt, len); break; } ignore_option: if (opt != DCCPO_MANDATORY) mandatory = 0; } /* mandatory was the last byte in option list -> reset connection */ if (mandatory) goto out_invalid_option; out_nonsensical_length: /* RFC 4340, 5.8: ignore option and all remaining option space */ return 0; out_invalid_option: DCCP_INC_STATS(DCCP_MIB_INVALIDOPT); rc = DCCP_RESET_CODE_OPTION_ERROR; out_featneg_failed: DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc); DCCP_SKB_CB(skb)->dccpd_reset_code = rc; DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt; DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0; DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? 
value[1] : 0; return -1; } EXPORT_SYMBOL_GPL(dccp_parse_options); void dccp_encode_value_var(const u64 value, u8 *to, const u8 len) { if (len >= DCCP_OPTVAL_MAXLEN) *to++ = (value & 0xFF0000000000ull) >> 40; if (len > 4) *to++ = (value & 0xFF00000000ull) >> 32; if (len > 3) *to++ = (value & 0xFF000000) >> 24; if (len > 2) *to++ = (value & 0xFF0000) >> 16; if (len > 1) *to++ = (value & 0xFF00) >> 8; if (len > 0) *to++ = (value & 0xFF); } static inline u8 dccp_ndp_len(const u64 ndp) { if (likely(ndp <= 0xFF)) return 1; return likely(ndp <= USHRT_MAX) ? 2 : (ndp <= UINT_MAX ? 4 : 6); } int dccp_insert_option(struct sk_buff *skb, const unsigned char option, const void *value, const unsigned char len) { unsigned char *to; if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN) return -1; DCCP_SKB_CB(skb)->dccpd_opt_len += len + 2; to = skb_push(skb, len + 2); *to++ = option; *to++ = len + 2; memcpy(to, value, len); return 0; } EXPORT_SYMBOL_GPL(dccp_insert_option); static int dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); u64 ndp = dp->dccps_ndp_count; if (dccp_non_data_packet(skb)) ++dp->dccps_ndp_count; else dp->dccps_ndp_count = 0; if (ndp > 0) { unsigned char *ptr; const int ndp_len = dccp_ndp_len(ndp); const int len = ndp_len + 2; if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) return -1; DCCP_SKB_CB(skb)->dccpd_opt_len += len; ptr = skb_push(skb, len); *ptr++ = DCCPO_NDP_COUNT; *ptr++ = len; dccp_encode_value_var(ndp, ptr, ndp_len); } return 0; } static inline int dccp_elapsed_time_len(const u32 elapsed_time) { return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 2 : 4; } static int dccp_insert_option_timestamp(struct sk_buff *skb) { __be32 now = htonl(dccp_timestamp()); /* yes this will overflow but that is the point as we want a * 10 usec 32 bit timer which mean it wraps every 11.9 hours */ return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now)); } static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp, struct dccp_request_sock *dreq, struct sk_buff *skb) { __be32 tstamp_echo; unsigned char *to; u32 elapsed_time, elapsed_time_len, len; if (dreq != NULL) { elapsed_time = dccp_timestamp() - dreq->dreq_timestamp_time; tstamp_echo = htonl(dreq->dreq_timestamp_echo); dreq->dreq_timestamp_echo = 0; } else { elapsed_time = dccp_timestamp() - dp->dccps_timestamp_time; tstamp_echo = htonl(dp->dccps_timestamp_echo); dp->dccps_timestamp_echo = 0; } elapsed_time_len = dccp_elapsed_time_len(elapsed_time); len = 6 + elapsed_time_len; if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) return -1; DCCP_SKB_CB(skb)->dccpd_opt_len += len; to = skb_push(skb, len); *to++ = DCCPO_TIMESTAMP_ECHO; *to++ = len; memcpy(to, &tstamp_echo, 4); to += 4; if (elapsed_time_len == 2) { const __be16 var16 = htons((u16)elapsed_time); memcpy(to, &var16, 2); } else if (elapsed_time_len == 4) { const __be32 var32 = htonl(elapsed_time); memcpy(to, &var32, 4); } return 0; } static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); const u16 buflen = dccp_ackvec_buflen(av); /* Figure out how many options do we need to represent the ackvec */ const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN); u16 len = buflen + 2 * nr_opts; u8 i, nonce = 0; const unsigned char *tail, *from; unsigned char *to; if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { 
DCCP_WARN("Lacking space for %u bytes on %s packet\n", len, dccp_packet_name(dcb->dccpd_type)); return -1; } /* * Since Ack Vectors are variable-length, we can not always predict * their size. To catch exception cases where the space is running out * on the skb, a separate Sync is scheduled to carry the Ack Vector. */ if (len > DCCPAV_MIN_OPTLEN && len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) { DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), " "MPS=%u ==> reduce payload size?\n", len, skb->len, dcb->dccpd_opt_len, dp->dccps_mss_cache); dp->dccps_sync_scheduled = 1; return 0; } dcb->dccpd_opt_len += len; to = skb_push(skb, len); len = buflen; from = av->av_buf + av->av_buf_head; tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN; for (i = 0; i < nr_opts; ++i) { int copylen = len; if (len > DCCP_SINGLE_OPT_MAXLEN) copylen = DCCP_SINGLE_OPT_MAXLEN; /* * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via * its type; ack_nonce is the sum of all individual buf_nonce's. */ nonce ^= av->av_buf_nonce[i]; *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i]; *to++ = copylen + 2; /* Check if buf_head wraps */ if (from + copylen > tail) { const u16 tailsize = tail - from; memcpy(to, from, tailsize); to += tailsize; len -= tailsize; copylen -= tailsize; from = av->av_buf; } memcpy(to, from, copylen); from += copylen; to += copylen; len -= copylen; } /* * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340. */ if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce)) return -ENOBUFS; return 0; } /** * dccp_insert_option_mandatory - Mandatory option (5.8.2) * @skb: frame into which to insert option * * Note that since we are using skb_push, this function needs to be called * _after_ inserting the option it is supposed to influence (stack order). */ int dccp_insert_option_mandatory(struct sk_buff *skb) { if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN) return -1; DCCP_SKB_CB(skb)->dccpd_opt_len++; *(u8 *)skb_push(skb, 1) = DCCPO_MANDATORY; return 0; } /** * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb * @skb: frame to insert feature negotiation option into * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R * @feat: one out of %dccp_feature_numbers * @val: NN value or SP array (preferred element first) to copy * @len: true length of @val in bytes (excluding first element repetition) * @repeat_first: whether to copy the first element of @val twice * * The last argument is used to construct Confirm options, where the preferred * value and the preference list appear separately (RFC 4340, 6.3.1). Preference * lists are kept such that the preferred entry is always first, so we only need * to copy twice, and avoid the overhead of cloning into a bigger array. 
*/ int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, u8 *val, u8 len, bool repeat_first) { u8 tot_len, *to; /* take the `Feature' field and possible repetition into account */ if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) { DCCP_WARN("length %u for feature %u too large\n", len, feat); return -1; } if (unlikely(val == NULL || len == 0)) len = repeat_first = false; tot_len = 3 + repeat_first + len; if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) { DCCP_WARN("packet too small for feature %d option!\n", feat); return -1; } DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len; to = skb_push(skb, tot_len); *to++ = type; *to++ = tot_len; *to++ = feat; if (repeat_first) *to++ = *val; if (len) memcpy(to, val, len); return 0; } /* The length of all options needs to be a multiple of 4 (5.8) */ static void dccp_insert_option_padding(struct sk_buff *skb) { int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4; if (padding != 0) { padding = 4 - padding; memset(skb_push(skb, padding), 0, padding); DCCP_SKB_CB(skb)->dccpd_opt_len += padding; } } int dccp_insert_options(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); DCCP_SKB_CB(skb)->dccpd_opt_len = 0; if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb)) return -1; if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { /* Feature Negotiation */ if (dccp_feat_insert_opts(dp, NULL, skb)) return -1; if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) { /* * Obtain RTT sample from Request/Response exchange. * This is currently used for TFRC initialisation. */ if (dccp_insert_option_timestamp(skb)) return -1; } else if (dccp_ackvec_pending(sk) && dccp_insert_option_ackvec(sk, skb)) { return -1; } } if (dp->dccps_hc_rx_insert_options) { if (ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb)) return -1; dp->dccps_hc_rx_insert_options = 0; } if (dp->dccps_timestamp_echo != 0 && dccp_insert_option_timestamp_echo(dp, NULL, skb)) return -1; dccp_insert_option_padding(skb); return 0; } int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb) { DCCP_SKB_CB(skb)->dccpd_opt_len = 0; if (dccp_feat_insert_opts(NULL, dreq, skb)) return -1; /* Obtain RTT sample from Response/Ack exchange (used by TFRC). */ if (dccp_insert_option_timestamp(skb)) return -1; if (dreq->dreq_timestamp_echo != 0 && dccp_insert_option_timestamp_echo(NULL, dreq, skb)) return -1; dccp_insert_option_padding(skb); return 0; }
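dccp_encode_value_var() and dccp_decode_value_var() above implement the big-endian, variable-length (1 to 6 byte) integer encoding used for option values such as the NDP count. The standalone round-trip below copies that logic with fixed-width types; DCCP_OPTVAL_MAXLEN is assumed to be 6 (matching the 48-bit maximum an option value can carry), which is an assumption about the header, not shown in this file.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define DCCP_OPTVAL_MAXLEN 6	/* assumed to match the kernel definition */

static void encode_value_var(uint64_t value, uint8_t *to, uint8_t len)
{
	if (len >= DCCP_OPTVAL_MAXLEN)
		*to++ = (value & 0xFF0000000000ull) >> 40;
	if (len > 4)
		*to++ = (value & 0xFF00000000ull) >> 32;
	if (len > 3)
		*to++ = (value & 0xFF000000) >> 24;
	if (len > 2)
		*to++ = (value & 0xFF0000) >> 16;
	if (len > 1)
		*to++ = (value & 0xFF00) >> 8;
	if (len > 0)
		*to = value & 0xFF;
}

static uint64_t decode_value_var(const uint8_t *bf, uint8_t len)
{
	uint64_t value = 0;

	if (len >= DCCP_OPTVAL_MAXLEN)
		value += ((uint64_t)*bf++) << 40;
	if (len > 4)
		value += ((uint64_t)*bf++) << 32;
	if (len > 3)
		value += ((uint64_t)*bf++) << 24;
	if (len > 2)
		value += ((uint64_t)*bf++) << 16;
	if (len > 1)
		value += ((uint64_t)*bf++) << 8;
	if (len > 0)
		value += *bf;
	return value;
}

int main(void)
{
	uint8_t buf[DCCP_OPTVAL_MAXLEN];
	uint64_t ndp = 0xa1b2c3d4e5f6ULL;	/* 48-bit sample value */

	encode_value_var(ndp, buf, sizeof(buf));
	assert(decode_value_var(buf, sizeof(buf)) == ndp);
	printf("round-trip ok: %llx\n", (unsigned long long)ndp);
	return 0;
}
/* end of standalone sketch */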
// SPDX-License-Identifier: GPL-2.0
/* Copyright 2011-2014 Autronica Fire and Security AS
 *
 * Author(s):
 *	2011-2014 Arvid Brodin, arvid.brodin@alten.se
 *
 * Event handling for HSR and PRP devices.
 */

#include <linux/netdevice.h>
#include <net/rtnetlink.h>
#include <linux/rculist.h>
#include <linux/timer.h>
#include <linux/etherdevice.h>
#include "hsr_main.h"
#include "hsr_device.h"
#include "hsr_netlink.h"
#include "hsr_framereg.h"
#include "hsr_slave.h"

static bool hsr_slave_empty(struct hsr_priv *hsr)
{
	struct hsr_port *port;

	hsr_for_each_port(hsr, port)
		if (port->type != HSR_PT_MASTER)
			return false;
	return true;
}

static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event,
			     void *ptr)
{
	struct hsr_port *port, *master;
	struct net_device *dev;
	struct hsr_priv *hsr;
	LIST_HEAD(list_kill);
	int mtu_max;
	int res;

	dev = netdev_notifier_info_to_dev(ptr);
	port = hsr_port_get_rtnl(dev);
	if (!port) {
		if (!is_hsr_master(dev))
			return NOTIFY_DONE;	/* Not an HSR device */
		hsr = netdev_priv(dev);
		port = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
		if (!port) {
			/* Resend of notification concerning removed device? */
			return NOTIFY_DONE;
		}
	} else {
		hsr = port->hsr;
	}

	switch (event) {
	case NETDEV_UP:		/* Administrative state DOWN */
	case NETDEV_DOWN:	/* Administrative state UP */
	case NETDEV_CHANGE:	/* Link (carrier) state changes */
		hsr_check_carrier_and_operstate(hsr);
		break;
	case NETDEV_CHANGENAME:
		if (is_hsr_master(dev))
			hsr_debugfs_rename(dev);
		break;
	case NETDEV_CHANGEADDR:
		if (port->type == HSR_PT_MASTER) {
			/* This should not happen since there's no
			 * ndo_set_mac_address() for HSR devices - i.e. not
			 * supported.
			 */
			break;
		}

		master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);

		if (port->type == HSR_PT_SLAVE_A) {
			eth_hw_addr_set(master->dev, dev->dev_addr);
			call_netdevice_notifiers(NETDEV_CHANGEADDR,
						 master->dev);
		}

		/* Make sure we recognize frames from ourselves in hsr_rcv() */
		port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
		res = hsr_create_self_node(hsr,
					   master->dev->dev_addr,
					   port ?
						port->dev->dev_addr :
						master->dev->dev_addr);
		if (res)
			netdev_warn(master->dev,
				    "Could not update HSR node address.\n");
		break;
	case NETDEV_CHANGEMTU:
		if (port->type == HSR_PT_MASTER)
			break; /* Handled in ndo_change_mtu() */
		mtu_max = hsr_get_max_mtu(port->hsr);
		master = hsr_port_get_hsr(port->hsr, HSR_PT_MASTER);
		WRITE_ONCE(master->dev->mtu, mtu_max);
		break;
	case NETDEV_UNREGISTER:
		if (!is_hsr_master(dev)) {
			master = hsr_port_get_hsr(port->hsr, HSR_PT_MASTER);
			hsr_del_port(port);
			if (hsr_slave_empty(master->hsr)) {
				const struct rtnl_link_ops *ops;

				ops = master->dev->rtnl_link_ops;
				ops->dellink(master->dev, &list_kill);
				unregister_netdevice_many(&list_kill);
			}
		}
		break;
	case NETDEV_PRE_TYPE_CHANGE:
		/* HSR works only on Ethernet devices. Refuse slave to change
		 * its type.
*/ return NOTIFY_BAD; } return NOTIFY_DONE; } struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt) { struct hsr_port *port; hsr_for_each_port(hsr, port) if (port->type == pt) return port; return NULL; } int hsr_get_version(struct net_device *dev, enum hsr_version *ver) { struct hsr_priv *hsr; hsr = netdev_priv(dev); *ver = hsr->prot_version; return 0; } EXPORT_SYMBOL(hsr_get_version); static struct notifier_block hsr_nb = { .notifier_call = hsr_netdev_notify, /* Slave event notifications */ }; static int __init hsr_init(void) { int err; BUILD_BUG_ON(sizeof(struct hsr_tag) != HSR_HLEN); err = register_netdevice_notifier(&hsr_nb); if (err) return err; err = hsr_netlink_init(); if (err) { unregister_netdevice_notifier(&hsr_nb); return err; } return 0; } static void __exit hsr_exit(void) { hsr_netlink_exit(); hsr_debugfs_remove_root(); unregister_netdevice_notifier(&hsr_nb); } module_init(hsr_init); module_exit(hsr_exit); MODULE_DESCRIPTION("High-availability Seamless Redundancy (HSR) driver"); MODULE_LICENSE("GPL");
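On NETDEV_CHANGEMTU the notifier above refreshes the master device's MTU from hsr_get_max_mtu(), which is defined outside this file. The sketch below only illustrates the intended effect, under the assumption that the usable MTU is the smallest slave MTU minus the per-frame HSR tag overhead HSR_HLEN (the BUILD_BUG_ON in hsr_init() ties HSR_HLEN to sizeof(struct hsr_tag)); both the helper and the 6-byte value are assumptions here, not taken from this file.

#include <stdio.h>

#define HSR_HLEN 6	/* assumed HSR tag length used as per-frame overhead */

/* Illustrative only: derive a master MTU from the slave devices' MTUs. */
static unsigned int master_mtu_from_slaves(const unsigned int *slave_mtu,
					   int n_slaves)
{
	unsigned int mtu_max = 1500;	/* start from the Ethernet default */
	int i;

	for (i = 0; i < n_slaves; i++)
		if (slave_mtu[i] < mtu_max)
			mtu_max = slave_mtu[i];

	return mtu_max > HSR_HLEN ? mtu_max - HSR_HLEN : 0;
}

int main(void)
{
	unsigned int slaves[] = { 1500, 1400 };

	printf("master mtu: %u\n", master_mtu_from_slaves(slaves, 2)); /* 1394 */
	return 0;
}
/* end of standalone sketch */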
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (C) B.A.T.M.A.N. contributors:
 *
 * Marek Lindner, Simon Wunderlich
 */

#ifndef _NET_BATMAN_ADV_HARD_INTERFACE_H_
#define _NET_BATMAN_ADV_HARD_INTERFACE_H_

#include "main.h"

#include <linux/compiler.h>
#include <linux/kref.h>
#include <linux/netdevice.h>
#include <linux/notifier.h>
#include <linux/rcupdate.h>
#include <linux/stddef.h>
#include <linux/types.h>

/**
 * enum batadv_hard_if_state - State of a hard interface
 */
enum batadv_hard_if_state {
	/**
	 * @BATADV_IF_NOT_IN_USE: interface is not used as slave interface of a
	 * batman-adv soft interface
	 */
	BATADV_IF_NOT_IN_USE,

	/**
	 * @BATADV_IF_TO_BE_REMOVED: interface will be removed from soft
	 * interface
	 */
	BATADV_IF_TO_BE_REMOVED,

	/** @BATADV_IF_INACTIVE: interface is deactivated */
	BATADV_IF_INACTIVE,

	/** @BATADV_IF_ACTIVE: interface is used */
	BATADV_IF_ACTIVE,

	/** @BATADV_IF_TO_BE_ACTIVATED: interface is getting activated */
	BATADV_IF_TO_BE_ACTIVATED,
};

/**
 * enum batadv_hard_if_bcast - broadcast avoidance options
 */
enum batadv_hard_if_bcast {
	/** @BATADV_HARDIF_BCAST_OK: Do broadcast on according hard interface */
	BATADV_HARDIF_BCAST_OK = 0,

	/**
	 * @BATADV_HARDIF_BCAST_NORECIPIENT: Broadcast not needed, there is no
	 * recipient
	 */
	BATADV_HARDIF_BCAST_NORECIPIENT,

	/**
	 * @BATADV_HARDIF_BCAST_DUPFWD: There is just the neighbor we got it
	 * from
	 */
	BATADV_HARDIF_BCAST_DUPFWD,

	/** @BATADV_HARDIF_BCAST_DUPORIG: There is just the originator */
	BATADV_HARDIF_BCAST_DUPORIG,
};

extern struct notifier_block batadv_hard_if_notifier;

struct net_device *batadv_get_real_netdev(struct net_device *net_device);
bool batadv_is_cfg80211_hardif(struct batadv_hard_iface *hard_iface);
bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface);
struct batadv_hard_iface*
batadv_hardif_get_by_netdev(const struct net_device *net_dev);
int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
				   struct net_device *soft_iface);
void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface);
int batadv_hardif_min_mtu(struct net_device *soft_iface);
void batadv_update_min_mtu(struct net_device *soft_iface);
void batadv_hardif_release(struct kref *ref);
int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing,
			       u8 *orig_addr, u8 *orig_neigh);

/**
 * batadv_hardif_put() - decrement the hard interface refcounter and possibly
 *  release it
 * @hard_iface: the hard interface to free
 */
static inline void batadv_hardif_put(struct batadv_hard_iface *hard_iface)
{
	if (!hard_iface)
		return;

	kref_put(&hard_iface->refcount, batadv_hardif_release);
}

/**
 * batadv_primary_if_get_selected() - Get reference to primary interface
 * @bat_priv: the bat priv with all the soft interface information
 *
 * Return: primary interface (with increased refcnt), otherwise NULL
 */
static inline struct batadv_hard_iface *
batadv_primary_if_get_selected(struct batadv_priv *bat_priv)
{
	struct batadv_hard_iface *hard_iface;

	rcu_read_lock();
	hard_iface = rcu_dereference(bat_priv->primary_if);
	if (!hard_iface)
		goto out;

	if (!kref_get_unless_zero(&hard_iface->refcount))
		hard_iface = NULL;

out:
	rcu_read_unlock();
	return hard_iface;
}

#endif /* _NET_BATMAN_ADV_HARD_INTERFACE_H_ */
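batadv_primary_if_get_selected() above shows the common RCU lookup pattern: dereference under rcu_read_lock() and hand the object out only after kref_get_unless_zero() succeeds, so a concurrently released interface is never returned. The standalone model below captures just the refcount half with C11 atomics (no RCU); it is a conceptual sketch, not the kref implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
	atomic_int refcount;
};

/* Take a reference only if the object is still live (refcount > 0). */
static bool get_unless_zero(struct obj *o)
{
	int old = atomic_load(&o->refcount);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&o->refcount, &old, old + 1))
			return true;
		/* old was reloaded by the failed CAS; retry */
	}
	return false;
}

/* Drop a reference; the final put runs the release callback. */
static void put(struct obj *o, void (*release)(struct obj *))
{
	if (atomic_fetch_sub(&o->refcount, 1) == 1)
		release(o);
}

static void release_obj(struct obj *o)
{
	(void)o;
	printf("released\n");
}

int main(void)
{
	struct obj o = { .refcount = 1 };

	if (get_unless_zero(&o))
		put(&o, release_obj);	/* pairs with the successful get */
	put(&o, release_obj);		/* drops the initial reference */
	return 0;
}
/* end of standalone sketch */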
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_LWTUNNEL_H
#define __NET_LWTUNNEL_H 1

#include <linux/lwtunnel.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <net/route.h>

#define LWTUNNEL_HASH_BITS   7
#define LWTUNNEL_HASH_SIZE   (1 << LWTUNNEL_HASH_BITS)

/* lw tunnel state flags */
#define LWTUNNEL_STATE_OUTPUT_REDIRECT	BIT(0)
#define LWTUNNEL_STATE_INPUT_REDIRECT	BIT(1)
#define LWTUNNEL_STATE_XMIT_REDIRECT	BIT(2)

/* LWTUNNEL_XMIT_CONTINUE should be distinguishable from dst_output return
 * values (NET_XMIT_xxx and NETDEV_TX_xxx in linux/netdevice.h) for safety.
 */
enum {
	LWTUNNEL_XMIT_DONE,
	LWTUNNEL_XMIT_CONTINUE = 0x100,
};

struct lwtunnel_state {
	__u16		type;
	__u16		flags;
	__u16		headroom;
	atomic_t	refcnt;
	int		(*orig_output)(struct net *net, struct sock *sk, struct sk_buff *skb);
	int		(*orig_input)(struct sk_buff *);
	struct rcu_head	rcu;
	__u8		data[];
};

struct lwtunnel_encap_ops {
	int (*build_state)(struct net *net, struct nlattr *encap,
			   unsigned int family, const void *cfg,
			   struct lwtunnel_state **ts,
			   struct netlink_ext_ack *extack);
	void (*destroy_state)(struct lwtunnel_state *lws);
	int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb);
	int (*input)(struct sk_buff *skb);
	int (*fill_encap)(struct sk_buff *skb,
			  struct lwtunnel_state *lwtstate);
	int (*get_encap_size)(struct lwtunnel_state *lwtstate);
	int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b);
	int (*xmit)(struct sk_buff *skb);

	struct module *owner;
};

#ifdef CONFIG_LWTUNNEL

DECLARE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled);

void lwtstate_free(struct lwtunnel_state *lws);

static inline struct lwtunnel_state *
lwtstate_get(struct lwtunnel_state *lws)
{
	if (lws)
		atomic_inc(&lws->refcnt);

	return lws;
}

static inline void lwtstate_put(struct lwtunnel_state *lws)
{
	if (!lws)
		return;

	if (atomic_dec_and_test(&lws->refcnt))
		lwtstate_free(lws);
}

static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate)
{
	if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_OUTPUT_REDIRECT))
		return true;

	return false;
}

static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate)
{
	if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_INPUT_REDIRECT))
		return true;

	return false;
}

static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate)
{
	if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_XMIT_REDIRECT))
		return true;

	return false;
}

static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate,
					     unsigned int mtu)
{
	if
((lwtunnel_xmit_redirect(lwtstate) || lwtunnel_output_redirect(lwtstate)) && lwtstate->headroom < mtu) return lwtstate->headroom; return 0; } int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack); int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, struct netlink_ext_ack *extack); int lwtunnel_build_state(struct net *net, u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, struct lwtunnel_state **lws, struct netlink_ext_ack *extack); int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate, int encap_attr, int encap_type_attr); int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate); struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb); int lwtunnel_input(struct sk_buff *skb); int lwtunnel_xmit(struct sk_buff *skb); int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress); static inline void lwtunnel_set_redirect(struct dst_entry *dst) { if (lwtunnel_output_redirect(dst->lwtstate)) { dst->lwtstate->orig_output = dst->output; dst->output = lwtunnel_output; } if (lwtunnel_input_redirect(dst->lwtstate)) { dst->lwtstate->orig_input = dst->input; dst->input = lwtunnel_input; } } #else static inline void lwtstate_free(struct lwtunnel_state *lws) { } static inline struct lwtunnel_state * lwtstate_get(struct lwtunnel_state *lws) { return lws; } static inline void lwtstate_put(struct lwtunnel_state *lws) { } static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) { return false; } static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate) { return false; } static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate) { return false; } static inline void lwtunnel_set_redirect(struct dst_entry *dst) { } static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate, unsigned int mtu) { return 0; } static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, unsigned int num) { return -EOPNOTSUPP; } static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, unsigned int num) { return -EOPNOTSUPP; } static inline int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "CONFIG_LWTUNNEL is not enabled in this kernel"); return -EOPNOTSUPP; } static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, struct netlink_ext_ack *extack) { /* return 0 since we are not walking attr looking for * RTA_ENCAP_TYPE attribute on nexthops. 
*/ return 0; } static inline int lwtunnel_build_state(struct net *net, u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, struct lwtunnel_state **lws, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate, int encap_attr, int encap_type_attr) { return 0; } static inline int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) { return 0; } static inline struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len) { return NULL; } static inline int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) { return 0; } static inline int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return -EOPNOTSUPP; } static inline int lwtunnel_input(struct sk_buff *skb) { return -EOPNOTSUPP; } static inline int lwtunnel_xmit(struct sk_buff *skb) { return -EOPNOTSUPP; } #endif /* CONFIG_LWTUNNEL */ #define MODULE_ALIAS_RTNL_LWT(encap_type) MODULE_ALIAS("rtnl-lwt-" __stringify(encap_type)) #endif /* __NET_LWTUNNEL_H */
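/*
 * Editor's illustrative sketch (not part of the kernel sources above): a
 * minimal lightweight-tunnel encap provider wired into the hooks declared in
 * this header, registered with lwtunnel_encap_add_ops(). All demo_* names are
 * hypothetical, and LWTUNNEL_ENCAP_IP is used only as a placeholder encap
 * type for the registration call; a real provider would use its own type.
 */
#include <linux/errno.h>
#include <linux/module.h>
#include <net/dst.h>
#include <net/lwtunnel.h>

/* Allocate per-route state; a real provider would parse 'encap' netlink attrs. */
static int demo_build_state(struct net *net, struct nlattr *encap,
			    unsigned int family, const void *cfg,
			    struct lwtunnel_state **ts,
			    struct netlink_ext_ack *extack)
{
	struct lwtunnel_state *lws = lwtunnel_state_alloc(0);

	if (!lws)
		return -ENOMEM;
	lws->type = LWTUNNEL_ENCAP_IP;			/* placeholder type */
	lws->flags = LWTUNNEL_STATE_OUTPUT_REDIRECT;	/* hook dst_output() */
	*ts = lws;
	return 0;
}

/* Runs instead of the original dst->output, via lwtunnel_set_redirect(). */
static int demo_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct lwtunnel_state *lws = skb_dst(skb)->lwtstate;

	/* ... add encapsulation here ..., then resume the original path. */
	return lws->orig_output(net, sk, skb);
}

static const struct lwtunnel_encap_ops demo_encap_ops = {
	.build_state	= demo_build_state,
	.output		= demo_output,
	.owner		= THIS_MODULE,
};

static int __init demo_lwt_init(void)
{
	return lwtunnel_encap_add_ops(&demo_encap_ops, LWTUNNEL_ENCAP_IP);
}

static void __exit demo_lwt_exit(void)
{
	lwtunnel_encap_del_ops(&demo_encap_ops, LWTUNNEL_ENCAP_IP);
}

module_init(demo_lwt_init);
module_exit(demo_lwt_exit);
MODULE_LICENSE("GPL");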
12 12 12 6 12 9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 // SPDX-License-Identifier: (GPL-2.0-only OR Apache-2.0) /* * Generic implementation of the BLAKE2b digest algorithm. Based on the BLAKE2b * reference implementation, but it has been heavily modified for use in the * kernel. The reference implementation was: * * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under * the terms of the CC0, the OpenSSL Licence, or the Apache Public License * 2.0, at your option. The terms of these licenses can be found at: * * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 * - OpenSSL license : https://www.openssl.org/source/license.html * - Apache 2.0 : https://www.apache.org/licenses/LICENSE-2.0 * * More information about BLAKE2 can be found at https://blake2.net. */ #include <linux/unaligned.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/bitops.h> #include <crypto/internal/blake2b.h> #include <crypto/internal/hash.h> static const u8 blake2b_sigma[12][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } }; static void blake2b_increment_counter(struct blake2b_state *S, const u64 inc) { S->t[0] += inc; S->t[1] += (S->t[0] < inc); } #define G(r,i,a,b,c,d) \ do { \ a = a + b + m[blake2b_sigma[r][2*i+0]]; \ d = ror64(d ^ a, 32); \ c = c + d; \ b = ror64(b ^ c, 24); \ a = a + b + m[blake2b_sigma[r][2*i+1]]; \ d = ror64(d ^ a, 16); \ c = c + d; \ b = ror64(b ^ c, 63); \ } while (0) #define ROUND(r) \ do { \ G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ G(r,2,v[ 2],v[ 6],v[10],v[14]); \ G(r,3,v[ 3],v[ 7],v[11],v[15]); \ G(r,4,v[ 0],v[ 5],v[10],v[15]); \ G(r,5,v[ 1],v[ 6],v[11],v[12]); \ G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ } while (0) static void blake2b_compress_one_generic(struct blake2b_state *S, const u8 block[BLAKE2B_BLOCK_SIZE]) { u64 m[16]; u64 v[16]; size_t i; for (i = 0; i < 16; ++i) m[i] = get_unaligned_le64(block + i * sizeof(m[i])); for (i = 0; i < 8; ++i) v[i] = S->h[i]; v[ 8] = BLAKE2B_IV0; v[ 9] = BLAKE2B_IV1; v[10] = BLAKE2B_IV2; v[11] = BLAKE2B_IV3; v[12] = BLAKE2B_IV4 ^ S->t[0]; v[13] = BLAKE2B_IV5 ^ S->t[1]; v[14] = BLAKE2B_IV6 ^ S->f[0]; v[15] = BLAKE2B_IV7 ^ S->f[1]; ROUND(0); ROUND(1); ROUND(2); ROUND(3); ROUND(4); ROUND(5); ROUND(6); 
ROUND(7); ROUND(8); ROUND(9); ROUND(10); ROUND(11); #ifdef CONFIG_CC_IS_CLANG #pragma nounroll /* https://llvm.org/pr45803 */ #endif for (i = 0; i < 8; ++i) S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; } #undef G #undef ROUND void blake2b_compress_generic(struct blake2b_state *state, const u8 *block, size_t nblocks, u32 inc) { do { blake2b_increment_counter(state, inc); blake2b_compress_one_generic(state, block); block += BLAKE2B_BLOCK_SIZE; } while (--nblocks); } EXPORT_SYMBOL(blake2b_compress_generic); static int crypto_blake2b_update_generic(struct shash_desc *desc, const u8 *in, unsigned int inlen) { return crypto_blake2b_update(desc, in, inlen, blake2b_compress_generic); } static int crypto_blake2b_final_generic(struct shash_desc *desc, u8 *out) { return crypto_blake2b_final(desc, out, blake2b_compress_generic); } #define BLAKE2B_ALG(name, driver_name, digest_size) \ { \ .base.cra_name = name, \ .base.cra_driver_name = driver_name, \ .base.cra_priority = 100, \ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \ .base.cra_blocksize = BLAKE2B_BLOCK_SIZE, \ .base.cra_ctxsize = sizeof(struct blake2b_tfm_ctx), \ .base.cra_module = THIS_MODULE, \ .digestsize = digest_size, \ .setkey = crypto_blake2b_setkey, \ .init = crypto_blake2b_init, \ .update = crypto_blake2b_update_generic, \ .final = crypto_blake2b_final_generic, \ .descsize = sizeof(struct blake2b_state), \ } static struct shash_alg blake2b_algs[] = { BLAKE2B_ALG("blake2b-160", "blake2b-160-generic", BLAKE2B_160_HASH_SIZE), BLAKE2B_ALG("blake2b-256", "blake2b-256-generic", BLAKE2B_256_HASH_SIZE), BLAKE2B_ALG("blake2b-384", "blake2b-384-generic", BLAKE2B_384_HASH_SIZE), BLAKE2B_ALG("blake2b-512", "blake2b-512-generic", BLAKE2B_512_HASH_SIZE), }; static int __init blake2b_mod_init(void) { return crypto_register_shashes(blake2b_algs, ARRAY_SIZE(blake2b_algs)); } static void __exit blake2b_mod_fini(void) { crypto_unregister_shashes(blake2b_algs, ARRAY_SIZE(blake2b_algs)); } subsys_initcall(blake2b_mod_init); module_exit(blake2b_mod_fini); MODULE_AUTHOR("David Sterba <kdave@kernel.org>"); MODULE_DESCRIPTION("BLAKE2b generic implementation"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CRYPTO("blake2b-160"); MODULE_ALIAS_CRYPTO("blake2b-160-generic"); MODULE_ALIAS_CRYPTO("blake2b-256"); MODULE_ALIAS_CRYPTO("blake2b-256-generic"); MODULE_ALIAS_CRYPTO("blake2b-384"); MODULE_ALIAS_CRYPTO("blake2b-384-generic"); MODULE_ALIAS_CRYPTO("blake2b-512"); MODULE_ALIAS_CRYPTO("blake2b-512-generic");
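/*
 * Editor's illustrative sketch (not part of blake2b_generic.c): computing a
 * one-shot digest through the shash algorithms registered above. The
 * "blake2b-256" name matches BLAKE2B_ALG() above; everything else here
 * (demo_* names, the sample message) is hypothetical.
 */
#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/module.h>

static int __init demo_blake2b_digest(void)
{
	static const u8 msg[] = "abc";
	u8 out[32];				/* BLAKE2B_256_HASH_SIZE */
	struct crypto_shash *tfm;
	int err;

	tfm = crypto_alloc_shash("blake2b-256", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	/* Unkeyed use; crypto_shash_setkey() could supply a 1..64 byte key. */
	err = crypto_shash_tfm_digest(tfm, msg, sizeof(msg) - 1, out);

	crypto_free_shash(tfm);
	return err;
}

module_init(demo_blake2b_digest);
MODULE_LICENSE("GPL");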
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 // SPDX-License-Identifier: GPL-2.0-only /* * IPV6 GSO/GRO offload support * Linux INET implementation * * Copyright (C) 2016 secunet Security Networks AG * Author: Steffen Klassert <steffen.klassert@secunet.com> * * ESP GRO support */ #include <linux/skbuff.h> #include <linux/init.h> #include <net/protocol.h> #include <crypto/aead.h> #include <crypto/authenc.h> #include <linux/err.h> #include <linux/module.h> #include <net/gro.h> #include <net/gso.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/esp.h> #include <linux/scatterlist.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <net/ip6_route.h> #include <net/ipv6.h> #include <linux/icmpv6.h> static __u16 esp6_nexthdr_esp_offset(struct ipv6hdr *ipv6_hdr, int nhlen) { int off = sizeof(struct ipv6hdr); struct ipv6_opt_hdr *exthdr; /* ESP or ESPINUDP */ if (likely(ipv6_hdr->nexthdr == NEXTHDR_ESP || ipv6_hdr->nexthdr == NEXTHDR_UDP)) return offsetof(struct ipv6hdr, nexthdr); while (off < nhlen) { exthdr = (void *)ipv6_hdr + off; if (exthdr->nexthdr == NEXTHDR_ESP) return off; off += ipv6_optlen(exthdr); } return 0; } static struct sk_buff *esp6_gro_receive(struct list_head *head, struct sk_buff *skb) { int offset = skb_gro_offset(skb); struct xfrm_offload *xo; struct xfrm_state *x; int encap_type = 0; __be32 seq; __be32 spi; int nhoff; if (NAPI_GRO_CB(skb)->proto == IPPROTO_UDP) encap_type = UDP_ENCAP_ESPINUDP; if (!pskb_pull(skb, offset)) return NULL; if (xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq) != 0) goto out; xo = xfrm_offload(skb); if (!xo || !(xo->flags & CRYPTO_DONE)) { struct sec_path *sp = secpath_set(skb); if (!sp) goto out; if (sp->len == XFRM_MAX_DEPTH) goto out_reset; x = xfrm_input_state_lookup(dev_net(skb->dev), skb->mark, (xfrm_address_t *)&ipv6_hdr(skb)->daddr, spi, IPPROTO_ESP, AF_INET6); if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) { /* non-offload path will 
record the error and audit log */ xfrm_state_put(x); x = NULL; } if (!x) goto out_reset; skb->mark = xfrm_smark_get(skb->mark, x); sp->xvec[sp->len++] = x; sp->olen++; xo = xfrm_offload(skb); if (!xo) goto out_reset; } xo->flags |= XFRM_GRO; nhoff = esp6_nexthdr_esp_offset(ipv6_hdr(skb), offset); if (!nhoff) goto out; IP6CB(skb)->nhoff = nhoff; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; XFRM_SPI_SKB_CB(skb)->family = AF_INET6; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); XFRM_SPI_SKB_CB(skb)->seq = seq; /* We don't need to handle errors from xfrm_input, it does all * the error handling and frees the resources on error. */ xfrm_input(skb, IPPROTO_ESP, spi, encap_type); return ERR_PTR(-EINPROGRESS); out_reset: secpath_reset(skb); out: skb_push(skb, offset); NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 1; return NULL; } static void esp6_gso_encap(struct xfrm_state *x, struct sk_buff *skb) { struct ip_esp_hdr *esph; struct ipv6hdr *iph = ipv6_hdr(skb); struct xfrm_offload *xo = xfrm_offload(skb); u8 proto = iph->nexthdr; skb_push(skb, -skb_network_offset(skb)); if (x->outer_mode.encap == XFRM_MODE_TRANSPORT) { __be16 frag; ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto, &frag); } esph = ip_esp_hdr(skb); *skb_mac_header(skb) = IPPROTO_ESP; esph->spi = x->id.spi; esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); xo->proto = proto; } static struct sk_buff *xfrm6_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { __be16 type = x->inner_mode.family == AF_INET ? htons(ETH_P_IP) : htons(ETH_P_IPV6); return skb_eth_gso_segment(skb, features, type); } static struct sk_buff *xfrm6_transport_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { const struct net_offload *ops; struct sk_buff *segs = ERR_PTR(-EINVAL); struct xfrm_offload *xo = xfrm_offload(skb); skb->transport_header += x->props.header_len; ops = rcu_dereference(inet6_offloads[xo->proto]); if (likely(ops && ops->callbacks.gso_segment)) segs = ops->callbacks.gso_segment(skb, features); return segs; } static struct sk_buff *xfrm6_beet_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { struct xfrm_offload *xo = xfrm_offload(skb); struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; u8 proto = xo->proto; skb->transport_header += x->props.header_len; if (x->sel.family != AF_INET6) { skb->transport_header -= (sizeof(struct ipv6hdr) - sizeof(struct iphdr)); if (proto == IPPROTO_BEETPH) { struct ip_beet_phdr *ph = (struct ip_beet_phdr *)skb->data; skb->transport_header += ph->hdrlen * 8; proto = ph->nexthdr; } else { skb->transport_header -= IPV4_BEET_PHMAXLEN; } if (proto == IPPROTO_TCP) skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; } else { __be16 frag; skb->transport_header += ipv6_skip_exthdr(skb, 0, &proto, &frag); } if (proto == IPPROTO_IPIP) skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6; __skb_pull(skb, skb_transport_offset(skb)); ops = rcu_dereference(inet6_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) segs = ops->callbacks.gso_segment(skb, features); return segs; } static struct sk_buff *xfrm6_outer_mode_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { switch (x->outer_mode.encap) { case XFRM_MODE_TUNNEL: return xfrm6_tunnel_gso_segment(x, skb, features); case XFRM_MODE_TRANSPORT: return xfrm6_transport_gso_segment(x, skb, features); case XFRM_MODE_BEET: return xfrm6_beet_gso_segment(x, skb, features); } return 
ERR_PTR(-EOPNOTSUPP); } static struct sk_buff *esp6_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct xfrm_state *x; struct ip_esp_hdr *esph; struct crypto_aead *aead; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); struct sec_path *sp; if (!xo) return ERR_PTR(-EINVAL); if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP)) return ERR_PTR(-EINVAL); sp = skb_sec_path(skb); x = sp->xvec[sp->len - 1]; aead = x->data; esph = ip_esp_hdr(skb); if (esph->spi != x->id.spi) return ERR_PTR(-EINVAL); if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) return ERR_PTR(-EINVAL); __skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)); skb->encap_hdr_csum = 1; if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC); else if (!(features & NETIF_F_HW_ESP_TX_CSUM)) esp_features = features & ~(NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC); xo->flags |= XFRM_GSO_SEGMENT; return xfrm6_outer_mode_gso_segment(x, skb, esp_features); } static int esp6_input_tail(struct xfrm_state *x, struct sk_buff *skb) { struct crypto_aead *aead = x->data; struct xfrm_offload *xo = xfrm_offload(skb); if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead))) return -EINVAL; if (!(xo->flags & CRYPTO_DONE)) skb->ip_summed = CHECKSUM_NONE; return esp6_input_done2(skb, 0); } static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { int len; int err; int alen; int blksize; struct xfrm_offload *xo; struct crypto_aead *aead; struct esp_info esp; bool hw_offload = true; __u32 seq; esp.inplace = true; xo = xfrm_offload(skb); if (!xo) return -EINVAL; if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) { xo->flags |= CRYPTO_FALLBACK; hw_offload = false; } esp.proto = xo->proto; /* skb is pure payload to encrypt */ aead = x->data; alen = crypto_aead_authsize(aead); esp.tfclen = 0; /* XXX: Add support for tfc padding here. 
*/ blksize = ALIGN(crypto_aead_blocksize(aead), 4); esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize); esp.plen = esp.clen - skb->len - esp.tfclen; esp.tailen = esp.tfclen + esp.plen + alen; if (!hw_offload || !skb_is_gso(skb)) { esp.nfrags = esp6_output_head(x, skb, &esp); if (esp.nfrags < 0) return esp.nfrags; } seq = xo->seq.low; esp.esph = ip_esp_hdr(skb); esp.esph->spi = x->id.spi; skb_push(skb, -skb_network_offset(skb)); if (xo->flags & XFRM_GSO_SEGMENT) { esp.esph->seq_no = htonl(seq); if (!skb_is_gso(skb)) xo->seq.low++; else xo->seq.low += skb_shinfo(skb)->gso_segs; } if (xo->seq.low < seq) xo->seq.hi++; esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); len = skb->len - sizeof(struct ipv6hdr); if (len > IPV6_MAXPLEN) len = 0; ipv6_hdr(skb)->payload_len = htons(len); if (hw_offload) { if (!skb_ext_add(skb, SKB_EXT_SEC_PATH)) return -ENOMEM; xo = xfrm_offload(skb); if (!xo) return -EINVAL; xo->flags |= XFRM_XMIT; return 0; } err = esp6_output_tail(x, skb, &esp); if (err) return err; secpath_reset(skb); if (skb_needs_linearize(skb, skb->dev->features) && __skb_linearize(skb)) return -ENOMEM; return 0; } static const struct net_offload esp6_offload = { .callbacks = { .gro_receive = esp6_gro_receive, .gso_segment = esp6_gso_segment, }, }; static const struct xfrm_type_offload esp6_type_offload = { .owner = THIS_MODULE, .proto = IPPROTO_ESP, .input_tail = esp6_input_tail, .xmit = esp6_xmit, .encap = esp6_gso_encap, }; static int __init esp6_offload_init(void) { if (xfrm_register_type_offload(&esp6_type_offload, AF_INET6) < 0) { pr_info("%s: can't add xfrm type offload\n", __func__); return -EAGAIN; } return inet6_add_offload(&esp6_offload, IPPROTO_ESP); } static void __exit esp6_offload_exit(void) { xfrm_unregister_type_offload(&esp6_type_offload, AF_INET6); inet6_del_offload(&esp6_offload, IPPROTO_ESP); } module_init(esp6_offload_init); module_exit(esp6_offload_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET6, XFRM_PROTO_ESP); MODULE_DESCRIPTION("IPV6 GSO/GRO offload support");
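/*
 * Editor's illustrative sketch (not part of esp6_offload.c): the carry
 * handling for the 64-bit extended sequence number, in the spirit of the
 * xo->seq.low / xo->seq.hi arithmetic in esp6_xmit() above. The helper name
 * and parameters are hypothetical.
 */
#include <linux/types.h>
#include <asm/byteorder.h>

static __be64 demo_esp_next_seqno(u32 *seq_hi, u32 *seq_lo, u32 nsegs)
{
	u32 old = *seq_lo;

	/* Each (GSO) segment consumes one sequence number. */
	*seq_lo += nsegs;

	/* Low 32 bits wrapped around: carry into the ESN high word. */
	if (*seq_lo < old)
		(*seq_hi)++;

	/* 64-bit value handed to the crypto layer, low word in bits 0..31. */
	return cpu_to_be64(*seq_lo + ((u64)*seq_hi << 32));
}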
1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 // SPDX-License-Identifier: GPL-2.0 /* * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * NOHZ implementation for low and high resolution timers * * Started by: Thomas Gleixner and Ingo Molnar */ #include <linux/compiler.h> #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/percpu.h> #include <linux/nmi.h> #include <linux/profile.h> #include <linux/sched/signal.h> #include <linux/sched/clock.h> #include <linux/sched/stat.h> #include <linux/sched/nohz.h> #include <linux/sched/loadavg.h> #include <linux/module.h> #include <linux/irq_work.h> #include <linux/posix-timers.h> #include <linux/context_tracking.h> #include <linux/mm.h> #include <asm/irq_regs.h> #include "tick-internal.h" #include <trace/events/timer.h> /* * Per-CPU nohz control structure */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); struct tick_sched *tick_get_tick_sched(int cpu) { return &per_cpu(tick_cpu_sched, cpu); } /* * The time when the last jiffy update happened. Write access must hold * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a * consistent view of jiffies and last_jiffies_update. */ static ktime_t last_jiffies_update; /* * Must be called with interrupts disabled ! */ static void tick_do_update_jiffies64(ktime_t now) { unsigned long ticks = 1; ktime_t delta, nextp; /* * 64-bit can do a quick check without holding the jiffies lock and * without looking at the sequence count. The smp_load_acquire() * pairs with the update done later in this function. * * 32-bit cannot do that because the store of 'tick_next_period' * consists of two 32-bit stores, and the first store could be * moved by the CPU to a random point in the future. */ if (IS_ENABLED(CONFIG_64BIT)) { if (ktime_before(now, smp_load_acquire(&tick_next_period))) return; } else { unsigned int seq; /* * Avoid contention on 'jiffies_lock' and protect the quick * check with the sequence count. */ do { seq = read_seqcount_begin(&jiffies_seq); nextp = tick_next_period; } while (read_seqcount_retry(&jiffies_seq, seq)); if (ktime_before(now, nextp)) return; } /* Quick check failed, i.e. update is required. */ raw_spin_lock(&jiffies_lock); /* * Re-evaluate with the lock held. Another CPU might have done the * update already. */ if (ktime_before(now, tick_next_period)) { raw_spin_unlock(&jiffies_lock); return; } write_seqcount_begin(&jiffies_seq); delta = ktime_sub(now, tick_next_period); if (unlikely(delta >= TICK_NSEC)) { /* Slow path for long idle sleep times */ s64 incr = TICK_NSEC; ticks += ktime_divns(delta, incr); last_jiffies_update = ktime_add_ns(last_jiffies_update, incr * ticks); } else { last_jiffies_update = ktime_add_ns(last_jiffies_update, TICK_NSEC); } /* Advance jiffies to complete the 'jiffies_seq' protected job */ jiffies_64 += ticks; /* Keep the tick_next_period variable up to date */ nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC); if (IS_ENABLED(CONFIG_64BIT)) { /* * Pairs with smp_load_acquire() in the lockless quick * check above, and ensures that the update to 'jiffies_64' is * not reordered vs. the store to 'tick_next_period', neither * by the compiler nor by the CPU. 
*/ smp_store_release(&tick_next_period, nextp); } else { /* * A plain store is good enough on 32-bit, as the quick check * above is protected by the sequence count. */ tick_next_period = nextp; } /* * Release the sequence count. calc_global_load() below is not * protected by it, but 'jiffies_lock' needs to be held to prevent * concurrent invocations. */ write_seqcount_end(&jiffies_seq); calc_global_load(); raw_spin_unlock(&jiffies_lock); update_wall_time(); } /* * Initialize and return retrieve the jiffies update. */ static ktime_t tick_init_jiffy_update(void) { ktime_t period; raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); /* Have we started the jiffies update yet ? */ if (last_jiffies_update == 0) { u32 rem; /* * Ensure that the tick is aligned to a multiple of * TICK_NSEC. */ div_u64_rem(tick_next_period, TICK_NSEC, &rem); if (rem) tick_next_period += TICK_NSEC - rem; last_jiffies_update = tick_next_period; } period = last_jiffies_update; write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); return period; } static inline int tick_sched_flag_test(struct tick_sched *ts, unsigned long flag) { return !!(ts->flags & flag); } static inline void tick_sched_flag_set(struct tick_sched *ts, unsigned long flag) { lockdep_assert_irqs_disabled(); ts->flags |= flag; } static inline void tick_sched_flag_clear(struct tick_sched *ts, unsigned long flag) { lockdep_assert_irqs_disabled(); ts->flags &= ~flag; } #define MAX_STALLED_JIFFIES 5 static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { int tick_cpu, cpu = smp_processor_id(); /* * Check if the do_timer duty was dropped. We don't care about * concurrency: This happens only when the CPU in charge went * into a long sleep. If two CPUs happen to assign themselves to * this duty, then the jiffies update is still serialized by * 'jiffies_lock'. * * If nohz_full is enabled, this should not happen because the * 'tick_do_timer_cpu' CPU never relinquishes. */ tick_cpu = READ_ONCE(tick_do_timer_cpu); if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && unlikely(tick_cpu == TICK_DO_TIMER_NONE)) { #ifdef CONFIG_NO_HZ_FULL WARN_ON_ONCE(tick_nohz_full_running); #endif WRITE_ONCE(tick_do_timer_cpu, cpu); tick_cpu = cpu; } /* Check if jiffies need an update */ if (tick_cpu == cpu) tick_do_update_jiffies64(now); /* * If the jiffies update stalled for too long (timekeeper in stop_machine() * or VMEXIT'ed for several msecs), force an update. */ if (ts->last_tick_jiffies != jiffies) { ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } else { if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) { tick_do_update_jiffies64(now); ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } } if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) ts->got_idle_tick = 1; } static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) { /* * When we are idle and the tick is stopped, we have to touch * the watchdog as we might not schedule for a really long * time. This happens on completely idle SMP systems while * waiting on the login prompt. We also increment the "start of * idle" jiffy stamp so the idle accounting adjustment we do * when we go busy again does not account too many ticks. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; /* * In case the current tick fired too early past its expected * expiration, make sure we don't bypass the next clock reprogramming * to the same deadline. 
*/ ts->next_tick = 0; } update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); } /* * We rearm the timer until we get disabled by the idle code. * Called with interrupts disabled. */ static enum hrtimer_restart tick_nohz_handler(struct hrtimer *timer) { struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); tick_sched_do_timer(ts, now); /* * Do not call when we are not in IRQ context and have * no valid 'regs' pointer */ if (regs) tick_sched_handle(ts, regs); else ts->next_tick = 0; /* * In dynticks mode, tick reprogram is deferred: * - to the idle task if in dynticks-idle * - to IRQ exit if in full-dynticks. */ if (unlikely(tick_sched_flag_test(ts, TS_FLAG_STOPPED))) return HRTIMER_NORESTART; hrtimer_forward(timer, now, TICK_NSEC); return HRTIMER_RESTART; } #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; EXPORT_SYMBOL_GPL(tick_nohz_full_mask); bool tick_nohz_full_running; EXPORT_SYMBOL_GPL(tick_nohz_full_running); static atomic_t tick_dep_mask; static bool check_tick_dependency(atomic_t *dep) { int val = atomic_read(dep); if (val & TICK_DEP_MASK_POSIX_TIMER) { trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); return true; } if (val & TICK_DEP_MASK_PERF_EVENTS) { trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); return true; } if (val & TICK_DEP_MASK_SCHED) { trace_tick_stop(0, TICK_DEP_MASK_SCHED); return true; } if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) { trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); return true; } if (val & TICK_DEP_MASK_RCU) { trace_tick_stop(0, TICK_DEP_MASK_RCU); return true; } if (val & TICK_DEP_MASK_RCU_EXP) { trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP); return true; } return false; } static bool can_stop_full_tick(int cpu, struct tick_sched *ts) { lockdep_assert_irqs_disabled(); if (unlikely(!cpu_online(cpu))) return false; if (check_tick_dependency(&tick_dep_mask)) return false; if (check_tick_dependency(&ts->tick_dep_mask)) return false; if (check_tick_dependency(&current->tick_dep_mask)) return false; if (check_tick_dependency(&current->signal->tick_dep_mask)) return false; return true; } static void nohz_full_kick_func(struct irq_work *work) { /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = IRQ_WORK_INIT_HARD(nohz_full_kick_func); /* * Kick this CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), * is NMI safe. */ static void tick_nohz_full_kick(void) { if (!tick_nohz_full_cpu(smp_processor_id())) return; irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); } /* * Kick the CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. */ void tick_nohz_full_kick_cpu(int cpu) { if (!tick_nohz_full_cpu(cpu)) return; irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); } static void tick_nohz_kick_task(struct task_struct *tsk) { int cpu; /* * If the task is not running, run_posix_cpu_timers() * has nothing to elapse, and an IPI can then be optimized out. 
* * activate_task() STORE p->tick_dep_mask * STORE p->on_rq * __schedule() (switch to task 'p') smp_mb() (atomic_fetch_or()) * LOCK rq->lock LOAD p->on_rq * smp_mb__after_spin_lock() * tick_nohz_task_switch() * LOAD p->tick_dep_mask * * XXX given a task picks up the dependency on schedule(), should we * only care about tasks that are currently on the CPU instead of all * that are on the runqueue? * * That is, does this want to be: task_on_cpu() / task_curr()? */ if (!sched_task_on_rq(tsk)) return; /* * If the task concurrently migrates to another CPU, * we guarantee it sees the new tick dependency upon * schedule. * * set_task_cpu(p, cpu); * STORE p->cpu = @cpu * __schedule() (switch to task 'p') * LOCK rq->lock * smp_mb__after_spin_lock() STORE p->tick_dep_mask * tick_nohz_task_switch() smp_mb() (atomic_fetch_or()) * LOAD p->tick_dep_mask LOAD p->cpu */ cpu = task_cpu(tsk); preempt_disable(); if (cpu_online(cpu)) tick_nohz_full_kick_cpu(cpu); preempt_enable(); } /* * Kick all full dynticks CPUs in order to force these to re-evaluate * their dependency on the tick and restart it if necessary. */ static void tick_nohz_full_kick_all(void) { int cpu; if (!tick_nohz_full_running) return; preempt_disable(); for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask) tick_nohz_full_kick_cpu(cpu); preempt_enable(); } static void tick_nohz_dep_set_all(atomic_t *dep, enum tick_dep_bits bit) { int prev; prev = atomic_fetch_or(BIT(bit), dep); if (!prev) tick_nohz_full_kick_all(); } /* * Set a global tick dependency. Used by perf events that rely on freq and * unstable clocks. */ void tick_nohz_dep_set(enum tick_dep_bits bit) { tick_nohz_dep_set_all(&tick_dep_mask, bit); } void tick_nohz_dep_clear(enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tick_dep_mask); } /* * Set per-CPU tick dependency. Used by scheduler and perf events in order to * manage event-throttling. */ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) { int prev; struct tick_sched *ts; ts = per_cpu_ptr(&tick_cpu_sched, cpu); prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask); if (!prev) { preempt_disable(); /* Perf needs local kick that is NMI safe */ if (cpu == smp_processor_id()) { tick_nohz_full_kick(); } else { /* Remote IRQ work not NMI-safe */ if (!WARN_ON_ONCE(in_nmi())) tick_nohz_full_kick_cpu(cpu); } preempt_enable(); } } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu); void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); atomic_andnot(BIT(bit), &ts->tick_dep_mask); } EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* * Set a per-task tick dependency. RCU needs this. Also posix CPU timers * in order to elapse per task timers. */ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) { if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) tick_nohz_kick_task(tsk); } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tsk->tick_dep_mask); } EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task); /* * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse * per process timers. 
*/ void tick_nohz_dep_set_signal(struct task_struct *tsk, enum tick_dep_bits bit) { int prev; struct signal_struct *sig = tsk->signal; prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask); if (!prev) { struct task_struct *t; lockdep_assert_held(&tsk->sighand->siglock); __for_each_thread(sig, t) tick_nohz_kick_task(t); } } void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &sig->tick_dep_mask); } /* * Re-evaluate the need for the tick as we switch the current task. * It might need the tick due to per task/process properties: * perf events, posix CPU timers, ... */ void __tick_nohz_task_switch(void) { struct tick_sched *ts; if (!tick_nohz_full_cpu(smp_processor_id())) return; ts = this_cpu_ptr(&tick_cpu_sched); if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { if (atomic_read(&current->tick_dep_mask) || atomic_read(&current->signal->tick_dep_mask)) tick_nohz_full_kick(); } } /* Get the boot-time nohz CPU list from the kernel parameters. */ void __init tick_nohz_full_setup(cpumask_var_t cpumask) { alloc_bootmem_cpumask_var(&tick_nohz_full_mask); cpumask_copy(tick_nohz_full_mask, cpumask); tick_nohz_full_running = true; } bool tick_nohz_cpu_hotpluggable(unsigned int cpu) { /* * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound * timers, workqueues, timekeeping, ...) on behalf of full dynticks * CPUs. It must remain online when nohz full is enabled. */ if (tick_nohz_full_running && READ_ONCE(tick_do_timer_cpu) == cpu) return false; return true; } static int tick_nohz_cpu_down(unsigned int cpu) { return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY; } void __init tick_nohz_init(void) { int cpu, ret; if (!tick_nohz_full_running) return; /* * Full dynticks uses IRQ work to drive the tick rescheduling on safe * locking contexts. But then we need IRQ work to raise its own * interrupts to avoid circular dependency on the tick. */ if (!arch_irq_work_has_interrupt()) { pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); tick_nohz_full_running = false; return; } if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) && !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) { cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { pr_warn("NO_HZ: Clearing %d from nohz_full range " "for timekeeping\n", cpu); cpumask_clear_cpu(cpu, tick_nohz_full_mask); } } for_each_cpu(cpu, tick_nohz_full_mask) ct_cpu_track_user(cpu); ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "kernel/nohz:predown", NULL, tick_nohz_cpu_down); WARN_ON(ret < 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); } #endif /* #ifdef CONFIG_NO_HZ_FULL */ /* * NOHZ - aka dynamic tick functionality */ #ifdef CONFIG_NO_HZ_COMMON /* * NO HZ enabled ? 
*/ bool tick_nohz_enabled __read_mostly = true; unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ static int __init setup_tick_nohz(char *str) { return (kstrtobool(str, &tick_nohz_enabled) == 0); } __setup("nohz=", setup_tick_nohz); bool tick_nohz_tick_stopped(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); return tick_sched_flag_test(ts, TS_FLAG_STOPPED); } bool tick_nohz_tick_stopped_cpu(int cpu) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); return tick_sched_flag_test(ts, TS_FLAG_STOPPED); } /** * tick_nohz_update_jiffies - update jiffies when idle was interrupted * @now: current ktime_t * * Called from interrupt entry when the CPU was idle * * In case the sched_tick was stopped on this CPU, we have to check if jiffies * must be updated. Otherwise an interrupt handler could use a stale jiffy * value. We do this unconditionally on any CPU, as we don't know whether the * CPU, which has the update task assigned, is in a long sleep. */ static void tick_nohz_update_jiffies(ktime_t now) { unsigned long flags; __this_cpu_write(tick_cpu_sched.idle_waketime, now); local_irq_save(flags); tick_do_update_jiffies64(now); local_irq_restore(flags); touch_softlockup_watchdog_sched(); } static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) { ktime_t delta; if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE))) return; delta = ktime_sub(now, ts->idle_entrytime); write_seqcount_begin(&ts->idle_sleeptime_seq); if (nr_iowait_cpu(smp_processor_id()) > 0) ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); else ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); ts->idle_entrytime = now; tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE); write_seqcount_end(&ts->idle_sleeptime_seq); sched_clock_idle_wakeup_event(); } static void tick_nohz_start_idle(struct tick_sched *ts) { write_seqcount_begin(&ts->idle_sleeptime_seq); ts->idle_entrytime = ktime_get(); tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE); write_seqcount_end(&ts->idle_sleeptime_seq); sched_clock_idle_sleep_event(); } static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime, bool compute_delta, u64 *last_update_time) { ktime_t now, idle; unsigned int seq; if (!tick_nohz_active) return -1; now = ktime_get(); if (last_update_time) *last_update_time = ktime_to_us(now); do { seq = read_seqcount_begin(&ts->idle_sleeptime_seq); if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) && compute_delta) { ktime_t delta = ktime_sub(now, ts->idle_entrytime); idle = ktime_add(*sleeptime, delta); } else { idle = *sleeptime; } } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq)); return ktime_to_us(idle); } /** * get_cpu_idle_time_us - get the total idle time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * * Return the cumulative idle time (since boot) for a given * CPU, in microseconds. Note that this is partially broken due to * the counter of iowait tasks that can be remotely updated without * any synchronization. Therefore it is possible to observe backward * values within two consecutive reads. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. 
* * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu */ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime, !nr_iowait_cpu(cpu), last_update_time); } EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** * get_cpu_iowait_time_us - get the total iowait time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * * Return the cumulative iowait time (since boot) for a given * CPU, in microseconds. Note this is partially broken due to * the counter of iowait tasks that can be remotely updated without * any synchronization. Therefore it is possible to observe backward * values within two consecutive reads. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu */ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime, nr_iowait_cpu(cpu), last_update_time); } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { hrtimer_cancel(&ts->sched_timer); hrtimer_set_expires(&ts->sched_timer, ts->last_tick); /* Forward the time to expire in the future */ hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) { hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); } else { tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } /* * Reset to make sure the next tick stop doesn't get fooled by past * cached clock deadline. */ ts->next_tick = 0; } static inline bool local_timer_softirq_pending(void) { return local_timers_pending() & BIT(TIMER_SOFTIRQ); } /* * Read jiffies and the time when jiffies were updated last */ u64 get_jiffies_update(unsigned long *basej) { unsigned long basejiff; unsigned int seq; u64 basemono; do { seq = read_seqcount_begin(&jiffies_seq); basemono = last_jiffies_update; basejiff = jiffies; } while (read_seqcount_retry(&jiffies_seq, seq)); *basej = basejiff; return basemono; } /** * tick_nohz_next_event() - return the clock monotonic based next event * @ts: pointer to tick_sched struct * @cpu: CPU number * * Return: * *%0 - When the next event is a maximum of TICK_NSEC in the future * and the tick is not stopped yet * *%next_event - Next event based on clock monotonic */ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { u64 basemono, next_tick, delta, expires; unsigned long basejiff; int tick_cpu; basemono = get_jiffies_update(&basejiff); ts->last_jiffies = basejiff; ts->timer_expires_base = basemono; /* * Keep the periodic tick, when RCU, architecture or irq_work * requests it. * Aside of that, check whether the local timer softirq is * pending. If so, its a bad idea to call get_next_timer_interrupt(), * because there is an already expired timer, so it will request * immediate expiry, which rearms the hardware timer with a * minimal delta, which brings us back to this place * immediately. Lather, rinse and repeat... */ if (rcu_needs_cpu() || arch_needs_cpu() || irq_work_needs_cpu() || local_timer_softirq_pending()) { next_tick = basemono + TICK_NSEC; } else { /* * Get the next pending timer. If high resolution * timers are enabled this only takes the timer wheel * timers into account. 
If high resolution timers are * disabled this also looks at the next expiring * hrtimer. */ next_tick = get_next_timer_interrupt(basejiff, basemono); ts->next_timer = next_tick; } /* Make sure next_tick is never before basemono! */ if (WARN_ON_ONCE(basemono > next_tick)) next_tick = basemono; /* * If the tick is due in the next period, keep it ticking or * force prod the timer. */ delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { /* * We've not stopped the tick yet, and there's a timer in the * next period, so no point in stopping it either, bail. */ if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { ts->timer_expires = 0; goto out; } } /* * If this CPU is the one which had the do_timer() duty last, we limit * the sleep time to the timekeeping 'max_deferment' value. * Otherwise we can sleep as long as we want. */ delta = timekeeping_max_deferment(); tick_cpu = READ_ONCE(tick_do_timer_cpu); if (tick_cpu != cpu && (tick_cpu != TICK_DO_TIMER_NONE || !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST))) delta = KTIME_MAX; /* Calculate the next expiry time */ if (delta < (KTIME_MAX - basemono)) expires = basemono + delta; else expires = KTIME_MAX; ts->timer_expires = min_t(u64, expires, next_tick); out: return ts->timer_expires; } static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); unsigned long basejiff = ts->last_jiffies; u64 basemono = ts->timer_expires_base; bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED); int tick_cpu; u64 expires; /* Make sure we won't be trying to stop it twice in a row. */ ts->timer_expires_base = 0; /* * Now the tick should be stopped definitely - so the timer base needs * to be marked idle as well to not miss a newly queued timer. */ expires = timer_base_try_to_set_idle(basejiff, basemono, &timer_idle); if (expires > ts->timer_expires) { /* * This path could only happen when the first timer was removed * between calculating the possible sleep length and now (when * high resolution mode is not active, timer could also be a * hrtimer). * * We have to stick to the original calculated expiry value to * not stop the tick for too long with a shallow C-state (which * was programmed by cpuidle because of an early next expiration * value). */ expires = ts->timer_expires; } /* If the timer base is not idle, retain the not yet stopped tick. */ if (!timer_idle) return; /* * If this CPU is the one which updates jiffies, then give up * the assignment and let it be taken by the CPU which runs * the tick timer next, which might be this CPU as well. If we * don't drop this here, the jiffies might be stale and * do_timer() never gets invoked. Keep track of the fact that it * was the one which had the do_timer() duty last. 
*/ tick_cpu = READ_ONCE(tick_do_timer_cpu); if (tick_cpu == cpu) { WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE); tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST); } else if (tick_cpu != TICK_DO_TIMER_NONE) { tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST); } /* Skip reprogram of event if it's not changed */ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED) && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) return; WARN_ONCE(1, "basemono: %llu ts->next_tick: %llu dev->next_event: %llu " "timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick, dev->next_event, hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); } /* * tick_nohz_stop_tick() can be called several times before * tick_nohz_restart_sched_tick() is called. This happens when * interrupts arrive which do not cause a reschedule. In the first * call we save the current tick time, so we can restart the * scheduler tick in tick_nohz_restart_sched_tick(). */ if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { calc_load_nohz_start(); quiet_vmstat(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); tick_sched_flag_set(ts, TS_FLAG_STOPPED); trace_tick_stop(1, TICK_DEP_MASK_NONE); } ts->next_tick = expires; /* * If the expiration time == KTIME_MAX, then we simply stop * the tick timer. */ if (unlikely(expires == KTIME_MAX)) { if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) hrtimer_cancel(&ts->sched_timer); else tick_program_event(KTIME_MAX, 1); return; } if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) { hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED_HARD); } else { hrtimer_set_expires(&ts->sched_timer, expires); tick_program_event(expires, 1); } } static void tick_nohz_retain_tick(struct tick_sched *ts) { ts->timer_expires_base = 0; } #ifdef CONFIG_NO_HZ_FULL static void tick_nohz_full_stop_tick(struct tick_sched *ts, int cpu) { if (tick_nohz_next_event(ts, cpu)) tick_nohz_stop_tick(ts, cpu); else tick_nohz_retain_tick(ts); } #endif /* CONFIG_NO_HZ_FULL */ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ tick_do_update_jiffies64(now); /* * Clear the timer idle flag, so we avoid IPIs on remote queueing and * the clock forward checks in the enqueue path: */ timer_clear_idle(); calc_load_nohz_stop(); touch_softlockup_watchdog_sched(); /* Cancel the scheduled timer and restore the tick: */ tick_sched_flag_clear(ts, TS_FLAG_STOPPED); tick_nohz_restart(ts, now); } static void __tick_nohz_full_update_tick(struct tick_sched *ts, ktime_t now) { #ifdef CONFIG_NO_HZ_FULL int cpu = smp_processor_id(); if (can_stop_full_tick(cpu, ts)) tick_nohz_full_stop_tick(ts, cpu); else if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) tick_nohz_restart_sched_tick(ts, now); #endif } static void tick_nohz_full_update_tick(struct tick_sched *ts) { if (!tick_nohz_full_cpu(smp_processor_id())) return; if (!tick_sched_flag_test(ts, TS_FLAG_NOHZ)) return; __tick_nohz_full_update_tick(ts, ktime_get()); } /* * A pending softirq outside an IRQ (or softirq disabled section) context * should be waiting for ksoftirqd to handle it. Therefore we shouldn't * reach this code due to the need_resched() early check in can_stop_idle_tick(). 
* * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the * cpu_down() process, softirqs can still be raised while ksoftirqd is parked, * triggering the code below, since wakep_softirqd() is ignored. * */ static bool report_idle_softirq(void) { static int ratelimit; unsigned int pending = local_softirq_pending(); if (likely(!pending)) return false; /* Some softirqs claim to be safe against hotplug and ksoftirqd parking */ if (!cpu_active(smp_processor_id())) { pending &= ~SOFTIRQ_HOTPLUG_SAFE_MASK; if (!pending) return false; } if (ratelimit >= 10) return false; /* On RT, softirq handling may be waiting on some lock */ if (local_bh_blocked()) return false; pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", pending); ratelimit++; return true; } static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) { WARN_ON_ONCE(cpu_is_offline(cpu)); if (unlikely(!tick_sched_flag_test(ts, TS_FLAG_NOHZ))) return false; if (need_resched()) return false; if (unlikely(report_idle_softirq())) return false; if (tick_nohz_full_enabled()) { int tick_cpu = READ_ONCE(tick_do_timer_cpu); /* * Keep the tick alive to guarantee timekeeping progression * if there are full dynticks CPUs around */ if (tick_cpu == cpu) return false; /* Should not happen for nohz-full */ if (WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE)) return false; } return true; } /** * tick_nohz_idle_stop_tick - stop the idle tick from the idle task * * When the next event is more than a tick into the future, stop the idle tick */ void tick_nohz_idle_stop_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); ktime_t expires; /* * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the * tick timer expiration time is known already. */ if (ts->timer_expires_base) expires = ts->timer_expires; else if (can_stop_idle_tick(cpu, ts)) expires = tick_nohz_next_event(ts, cpu); else return; ts->idle_calls++; if (expires > 0LL) { int was_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED); tick_nohz_stop_tick(ts, cpu); ts->idle_sleeps++; ts->idle_expires = expires; if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { ts->idle_jiffies = ts->last_jiffies; nohz_balance_enter_idle(cpu); } } else { tick_nohz_retain_tick(ts); } } void tick_nohz_idle_retain_tick(void) { tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); } /** * tick_nohz_idle_enter - prepare for entering idle on the current CPU * * Called when we start the idle loop. */ void tick_nohz_idle_enter(void) { struct tick_sched *ts; lockdep_assert_irqs_enabled(); local_irq_disable(); ts = this_cpu_ptr(&tick_cpu_sched); WARN_ON_ONCE(ts->timer_expires_base); tick_sched_flag_set(ts, TS_FLAG_INIDLE); tick_nohz_start_idle(ts); local_irq_enable(); } /** * tick_nohz_irq_exit - Notify the tick about IRQ exit * * A timer may have been added/modified/deleted either by the current IRQ, * or by another place using this IRQ as a notification. This IRQ may have * also updated the RCU callback list. These events may require a * re-evaluation of the next tick. Depending on the context: * * 1) If the CPU is idle and no resched is pending, just proceed with idle * time accounting. The next tick will be re-evaluated on the next idle * loop iteration. * * 2) If the CPU is nohz_full: * * 2.1) If there is any tick dependency, restart the tick if stopped. * * 2.2) If there is no tick dependency, (re-)evaluate the next tick and * stop/update it accordingly. 
*/ void tick_nohz_irq_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) tick_nohz_start_idle(ts); else tick_nohz_full_update_tick(ts); } /** * tick_nohz_idle_got_tick - Check whether or not the tick handler has run * * Return: %true if the tick handler has run, otherwise %false */ bool tick_nohz_idle_got_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->got_idle_tick) { ts->got_idle_tick = 0; return true; } return false; } /** * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer * or the tick, whichever expires first. Note that, if the tick has been * stopped, it returns the next hrtimer. * * Called from power state control code with interrupts disabled * * Return: the next expiration time */ ktime_t tick_nohz_get_next_hrtimer(void) { return __this_cpu_read(tick_cpu_device.evtdev)->next_event; } /** * tick_nohz_get_sleep_length - return the expected length of the current sleep * @delta_next: duration until the next event if the tick cannot be stopped * * Called from power state control code with interrupts disabled. * * The return value of this function and/or the value returned by it through the * @delta_next pointer can be negative which must be taken into account by its * callers. * * Return: the expected length of the current sleep */ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); /* * The idle entry time is expected to be a sufficient approximation of * the current time at this point. */ ktime_t now = ts->idle_entrytime; ktime_t next_event; WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE)); *delta_next = ktime_sub(dev->next_event, now); if (!can_stop_idle_tick(cpu, ts)) return *delta_next; next_event = tick_nohz_next_event(ts, cpu); if (!next_event) return *delta_next; /* * If the next highres timer to expire is earlier than 'next_event', the * idle governor needs to know that. */ next_event = min_t(u64, next_event, hrtimer_next_event_without(&ts->sched_timer)); return ktime_sub(next_event, now); } /** * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value * for a particular CPU. * @cpu: target CPU number * * Called from the schedutil frequency scaling governor in scheduler context. * * Return: the current idle calls counter value for @cpu */ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) { struct tick_sched *ts = tick_get_tick_sched(cpu); return ts->idle_calls; } static void tick_nohz_account_idle_time(struct tick_sched *ts, ktime_t now) { unsigned long ticks; ts->idle_exittime = now; if (vtime_accounting_enabled_this_cpu()) return; /* * We stopped the tick in idle. update_process_times() would miss the * time we slept, as it does only a 1 tick accounting. * Enforce that this is accounted to idle ! */ ticks = jiffies - ts->idle_jiffies; /* * We might be one off. Do not randomly account a huge number of ticks! 
*/ if (ticks && ticks < LONG_MAX) account_idle_ticks(ticks); } void tick_nohz_idle_restart_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { ktime_t now = ktime_get(); tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_time(ts, now); } } static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now) { if (tick_nohz_full_cpu(smp_processor_id())) __tick_nohz_full_update_tick(ts, now); else tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_time(ts, now); } /** * tick_nohz_idle_exit - Update the tick upon idle task exit * * When the idle task exits, update the tick depending on the * following situations: * * 1) If the CPU is not in nohz_full mode (most cases), then * restart the tick. * * 2) If the CPU is in nohz_full mode (corner case): * 2.1) If the tick can be kept stopped (no tick dependencies) * then re-evaluate the next tick and try to keep it stopped * as long as possible. * 2.2) If the tick has dependencies, restart the tick. * */ void tick_nohz_idle_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); bool idle_active, tick_stopped; ktime_t now; local_irq_disable(); WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE)); WARN_ON_ONCE(ts->timer_expires_base); tick_sched_flag_clear(ts, TS_FLAG_INIDLE); idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE); tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED); if (idle_active || tick_stopped) now = ktime_get(); if (idle_active) tick_nohz_stop_idle(ts, now); if (tick_stopped) tick_nohz_idle_update_tick(ts, now); local_irq_enable(); } /* * In low-resolution mode, the tick handler must be implemented directly * at the clockevent level. hrtimer can't be used instead, because its * infrastructure actually relies on the tick itself as a backend in * low-resolution mode (see hrtimer_run_queues()). */ static void tick_nohz_lowres_handler(struct clock_event_device *dev) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); dev->next_event = KTIME_MAX; if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART)) tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } static inline void tick_nohz_activate(struct tick_sched *ts) { if (!tick_nohz_enabled) return; tick_sched_flag_set(ts, TS_FLAG_NOHZ); /* One update is enough */ if (!test_and_set_bit(0, &tick_nohz_active)) timers_update_nohz(); } /** * tick_nohz_switch_to_nohz - switch to NOHZ mode */ static void tick_nohz_switch_to_nohz(void) { if (!tick_nohz_enabled) return; if (tick_switch_to_oneshot(tick_nohz_lowres_handler)) return; /* * Recycle the hrtimer in 'ts', so we can share the * highres code. */ tick_setup_sched_timer(false); } static inline void tick_nohz_irq_enter(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED | TS_FLAG_IDLE_ACTIVE)) return; now = ktime_get(); if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)) tick_nohz_stop_idle(ts, now); /* * If all CPUs are idle we may need to update a stale jiffies value. * Note nohz_full is a special case: a timekeeper is guaranteed to stay * alive but it might be busy looping with interrupts disabled in some * rare case (typically stop machine). So we must make sure we have a * last resort. 
*/ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) tick_nohz_update_jiffies(now); } #else static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_nohz_irq_enter(void) { } static inline void tick_nohz_activate(struct tick_sched *ts) { } #endif /* CONFIG_NO_HZ_COMMON */ /* * Called from irq_enter() to notify about the possible interruption of idle() */ void tick_irq_enter(void) { tick_check_oneshot_broadcast_this_cpu(); tick_nohz_irq_enter(); } static int sched_skew_tick; static int __init skew_tick(char *str) { get_option(&str, &sched_skew_tick); return 0; } early_param("skew_tick", skew_tick); /** * tick_setup_sched_timer - setup the tick emulation timer * @hrtimer: whether to use the hrtimer or not */ void tick_setup_sched_timer(bool hrtimer) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); /* Emulate tick processing via per-CPU hrtimers: */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) { tick_sched_flag_set(ts, TS_FLAG_HIGHRES); ts->sched_timer.function = tick_nohz_handler; } /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); /* Offset the tick to avert 'jiffies_lock' contention. */ if (sched_skew_tick) { u64 offset = TICK_NSEC >> 1; do_div(offset, num_possible_cpus()); offset *= smp_processor_id(); hrtimer_add_expires_ns(&ts->sched_timer, offset); } hrtimer_forward_now(&ts->sched_timer, TICK_NSEC); if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); else tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); tick_nohz_activate(ts); } /* * Shut down the tick and make sure the CPU won't try to retake the timekeeping * duty before disabling IRQs in idle for the last time. */ void tick_sched_timer_dying(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t idle_sleeptime, iowait_sleeptime; unsigned long idle_calls, idle_sleeps; /* This must happen before hrtimers are migrated! */ if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) hrtimer_cancel(&ts->sched_timer); idle_sleeptime = ts->idle_sleeptime; iowait_sleeptime = ts->iowait_sleeptime; idle_calls = ts->idle_calls; idle_sleeps = ts->idle_sleeps; memset(ts, 0, sizeof(*ts)); ts->idle_sleeptime = idle_sleeptime; ts->iowait_sleeptime = iowait_sleeptime; ts->idle_calls = idle_calls; ts->idle_sleeps = idle_sleeps; } /* * Async notification about clocksource changes */ void tick_clock_notify(void) { int cpu; for_each_possible_cpu(cpu) set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); } /* * Async notification about clock event changes */ void tick_oneshot_notify(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); set_bit(0, &ts->check_clocks); } /* * Check if a change happened, which makes oneshot possible. * * Called cyclically from the hrtimer softirq (driven by the timer * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ * mode, because high resolution timers are disabled (either compile * or runtime). Called with interrupts disabled. */ int tick_check_oneshot_change(int allow_nohz) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (!test_and_clear_bit(0, &ts->check_clocks)) return 0; if (tick_sched_flag_test(ts, TS_FLAG_NOHZ)) return 0; if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available()) return 0; if (!allow_nohz) return 1; tick_nohz_switch_to_nohz(); return 0; }
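The tick_setup_sched_timer() path above staggers each CPU's tick by a per-CPU offset when the skew_tick= boot parameter is set, so that the CPUs do not all contend on 'jiffies_lock' at the same instant. The following standalone sketch (not kernel code) reproduces only that arithmetic; the TICK_NSEC value is an assumption for HZ=250, purely for illustration.

#include <stdio.h>
#include <stdint.h>

#define TICK_NSEC 4000000ULL	/* assumed: 4 ms tick period (HZ=250) */

static uint64_t skew_tick_offset(unsigned int cpu, unsigned int nr_cpus)
{
	uint64_t offset = TICK_NSEC >> 1;	/* spread CPUs across half a tick */

	offset /= nr_cpus;			/* per-CPU step, as in the do_div() above */
	return offset * cpu;			/* this CPU's shift in nanoseconds */
}

int main(void)
{
	for (unsigned int cpu = 0; cpu < 4; cpu++)
		printf("cpu%u tick offset: %llu ns\n",
		       cpu, (unsigned long long)skew_tick_offset(cpu, 4));
	return 0;
}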
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2007, 2008, 2009 Siemens AG
 *
 * Written by:
 * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
 */

#ifndef __NET_CFG802154_H
#define __NET_CFG802154_H

#include <linux/ieee802154.h>
#include <linux/netdevice.h>
#include <linux/spinlock.h>
#include <linux/bug.h>

#include <net/nl802154.h>

struct wpan_phy;
struct wpan_phy_cca;
struct cfg802154_scan_request;
struct cfg802154_beacon_request;
struct ieee802154_addr;

#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
struct ieee802154_llsec_device_key;
struct ieee802154_llsec_seclevel;
struct ieee802154_llsec_params;
struct ieee802154_llsec_device;
struct ieee802154_llsec_table;
struct ieee802154_llsec_key_id;
struct ieee802154_llsec_key;
#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */

struct cfg802154_ops {
	struct net_device * (*add_virtual_intf_deprecated)(struct wpan_phy *wpan_phy,
							    const char *name,
							    unsigned char name_assign_type,
							    int type);
	void	(*del_virtual_intf_deprecated)(struct wpan_phy *wpan_phy,
					       struct net_device *dev);
	int	(*suspend)(struct wpan_phy *wpan_phy);
	int	(*resume)(struct wpan_phy *wpan_phy);
	int	(*add_virtual_intf)(struct wpan_phy *wpan_phy,
				    const char *name,
				    unsigned char name_assign_type,
				    enum
nl802154_iftype type, __le64 extended_addr); int (*del_virtual_intf)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev); int (*set_channel)(struct wpan_phy *wpan_phy, u8 page, u8 channel); int (*set_cca_mode)(struct wpan_phy *wpan_phy, const struct wpan_phy_cca *cca); int (*set_cca_ed_level)(struct wpan_phy *wpan_phy, s32 ed_level); int (*set_tx_power)(struct wpan_phy *wpan_phy, s32 power); int (*set_pan_id)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, __le16 pan_id); int (*set_short_addr)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, __le16 short_addr); int (*set_backoff_exponent)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, u8 min_be, u8 max_be); int (*set_max_csma_backoffs)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, u8 max_csma_backoffs); int (*set_max_frame_retries)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, s8 max_frame_retries); int (*set_lbt_mode)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, bool mode); int (*set_ackreq_default)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, bool ackreq); int (*trigger_scan)(struct wpan_phy *wpan_phy, struct cfg802154_scan_request *request); int (*abort_scan)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev); int (*send_beacons)(struct wpan_phy *wpan_phy, struct cfg802154_beacon_request *request); int (*stop_beacons)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev); int (*associate)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, struct ieee802154_addr *coord); int (*disassociate)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, struct ieee802154_addr *target); #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL void (*get_llsec_table)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, struct ieee802154_llsec_table **table); void (*lock_llsec_table)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev); void (*unlock_llsec_table)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev); /* TODO remove locking/get table callbacks, this is part of the * nl802154 interface and should be accessible from ieee802154 layer. 
*/ int (*get_llsec_params)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, struct ieee802154_llsec_params *params); int (*set_llsec_params)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_params *params, int changed); int (*add_llsec_key)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_key_id *id, const struct ieee802154_llsec_key *key); int (*del_llsec_key)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_key_id *id); int (*add_seclevel)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_seclevel *sl); int (*del_seclevel)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_seclevel *sl); int (*add_device)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_device *dev); int (*del_device)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, __le64 extended_addr); int (*add_devkey)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, __le64 extended_addr, const struct ieee802154_llsec_device_key *key); int (*del_devkey)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, __le64 extended_addr, const struct ieee802154_llsec_device_key *key); #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ }; static inline bool wpan_phy_supported_bool(bool b, enum nl802154_supported_bool_states st) { switch (st) { case NL802154_SUPPORTED_BOOL_TRUE: return b; case NL802154_SUPPORTED_BOOL_FALSE: return !b; case NL802154_SUPPORTED_BOOL_BOTH: return true; default: WARN_ON(1); } return false; } struct wpan_phy_supported { u32 channels[IEEE802154_MAX_PAGE + 1], cca_modes, cca_opts, iftypes; enum nl802154_supported_bool_states lbt; u8 min_minbe, max_minbe, min_maxbe, max_maxbe, min_csma_backoffs, max_csma_backoffs; s8 min_frame_retries, max_frame_retries; size_t tx_powers_size, cca_ed_levels_size; const s32 *tx_powers, *cca_ed_levels; }; struct wpan_phy_cca { enum nl802154_cca_modes mode; enum nl802154_cca_opts opt; }; static inline bool wpan_phy_cca_cmp(const struct wpan_phy_cca *a, const struct wpan_phy_cca *b) { if (a->mode != b->mode) return false; if (a->mode == NL802154_CCA_ENERGY_CARRIER) return a->opt == b->opt; return true; } /** * enum wpan_phy_flags - WPAN PHY state flags * @WPAN_PHY_FLAG_TXPOWER: Indicates that transceiver will support * transmit power setting. * @WPAN_PHY_FLAG_CCA_ED_LEVEL: Indicates that transceiver will support cca ed * level setting. * @WPAN_PHY_FLAG_CCA_MODE: Indicates that transceiver will support cca mode * setting. * @WPAN_PHY_FLAG_STATE_QUEUE_STOPPED: Indicates that the transmit queue was * temporarily stopped. * @WPAN_PHY_FLAG_DATAGRAMS_ONLY: Indicates that transceiver is only able to * send/receive datagrams. */ enum wpan_phy_flags { WPAN_PHY_FLAG_TXPOWER = BIT(1), WPAN_PHY_FLAG_CCA_ED_LEVEL = BIT(2), WPAN_PHY_FLAG_CCA_MODE = BIT(3), WPAN_PHY_FLAG_STATE_QUEUE_STOPPED = BIT(4), WPAN_PHY_FLAG_DATAGRAMS_ONLY = BIT(5), }; struct wpan_phy { /* If multiple wpan_phys are registered and you're handed e.g. * a regular netdev with assigned ieee802154_ptr, you won't * know whether it points to a wpan_phy your driver has registered * or not. Assign this to something global to your driver to * help determine whether you own this wpan_phy or not. */ const void *privid; unsigned long flags; /* * This is a PIB according to 802.15.4-2011. 
* We do not provide timing-related variables, as they * aren't used outside of driver */ u8 current_channel; u8 current_page; struct wpan_phy_supported supported; /* current transmit_power in mBm */ s32 transmit_power; struct wpan_phy_cca cca; __le64 perm_extended_addr; /* current cca ed threshold in mBm */ s32 cca_ed_level; /* PHY depended MAC PIB values */ /* 802.15.4 acronym: Tdsym in nsec */ u32 symbol_duration; /* lifs and sifs periods timing */ u16 lifs_period; u16 sifs_period; struct device dev; /* the network namespace this phy lives in currently */ possible_net_t _net; /* Transmission monitoring and control */ spinlock_t queue_lock; atomic_t ongoing_txs; atomic_t hold_txs; wait_queue_head_t sync_txq; /* Current filtering level on reception. * Only allowed to be changed if phy is not operational. */ enum ieee802154_filtering_level filtering; char priv[] __aligned(NETDEV_ALIGN); }; static inline struct net *wpan_phy_net(struct wpan_phy *wpan_phy) { return read_pnet(&wpan_phy->_net); } static inline void wpan_phy_net_set(struct wpan_phy *wpan_phy, struct net *net) { write_pnet(&wpan_phy->_net, net); } static inline bool ieee802154_chan_is_valid(struct wpan_phy *phy, u8 page, u8 channel) { if (page > IEEE802154_MAX_PAGE || channel > IEEE802154_MAX_CHANNEL || !(phy->supported.channels[page] & BIT(channel))) return false; return true; } /** * struct ieee802154_addr - IEEE802.15.4 device address * @mode: Address mode from frame header. Can be one of: * - @IEEE802154_ADDR_NONE * - @IEEE802154_ADDR_SHORT * - @IEEE802154_ADDR_LONG * @pan_id: The PAN ID this address belongs to * @short_addr: address if @mode is @IEEE802154_ADDR_SHORT * @extended_addr: address if @mode is @IEEE802154_ADDR_LONG */ struct ieee802154_addr { u8 mode; __le16 pan_id; union { __le16 short_addr; __le64 extended_addr; }; }; /** * struct ieee802154_coord_desc - Coordinator descriptor * @addr: PAN ID and coordinator address * @page: page this coordinator is using * @channel: channel this coordinator is using * @superframe_spec: SuperFrame specification as received * @link_quality: link quality indicator at which the beacon was received * @gts_permit: the coordinator accepts GTS requests */ struct ieee802154_coord_desc { struct ieee802154_addr addr; u8 page; u8 channel; u16 superframe_spec; u8 link_quality; bool gts_permit; }; /** * struct ieee802154_pan_device - PAN device information * @pan_id: the PAN ID of this device * @mode: the preferred mode to reach the device * @short_addr: the short address of this device * @extended_addr: the extended address of this device * @node: the list node */ struct ieee802154_pan_device { __le16 pan_id; u8 mode; __le16 short_addr; __le64 extended_addr; struct list_head node; }; /** * struct cfg802154_scan_request - Scan request * * @type: type of scan to be performed * @page: page on which to perform the scan * @channels: channels in te %page to be scanned * @duration: time spent on each channel, calculated with: * aBaseSuperframeDuration * (2 ^ duration + 1) * @wpan_dev: the wpan device on which to perform the scan * @wpan_phy: the wpan phy on which to perform the scan */ struct cfg802154_scan_request { enum nl802154_scan_types type; u8 page; u32 channels; u8 duration; struct wpan_dev *wpan_dev; struct wpan_phy *wpan_phy; }; /** * struct cfg802154_beacon_request - Beacon request descriptor * * @interval: interval n between sendings, in multiple order of the super frame * duration: aBaseSuperframeDuration * (2^n) unless the interval * order is greater or equal to 15, in this case 
beacons won't be * passively sent out at a fixed rate but instead inform the device * that it should answer beacon requests as part of active scan * procedures * @wpan_dev: the concerned wpan device * @wpan_phy: the wpan phy this was for */ struct cfg802154_beacon_request { u8 interval; struct wpan_dev *wpan_dev; struct wpan_phy *wpan_phy; }; /** * struct cfg802154_mac_pkt - MAC packet descriptor (beacon/command) * @node: MAC packets to process list member * @skb: the received sk_buff * @sdata: the interface on which @skb was received * @page: page configuration when @skb was received * @channel: channel configuration when @skb was received */ struct cfg802154_mac_pkt { struct list_head node; struct sk_buff *skb; struct ieee802154_sub_if_data *sdata; u8 page; u8 channel; }; struct ieee802154_llsec_key_id { u8 mode; u8 id; union { struct ieee802154_addr device_addr; __le32 short_source; __le64 extended_source; }; }; #define IEEE802154_LLSEC_KEY_SIZE 16 struct ieee802154_llsec_key { u8 frame_types; u32 cmd_frame_ids; /* TODO replace with NL802154_KEY_SIZE */ u8 key[IEEE802154_LLSEC_KEY_SIZE]; }; struct ieee802154_llsec_key_entry { struct list_head list; struct rcu_head rcu; struct ieee802154_llsec_key_id id; struct ieee802154_llsec_key *key; }; struct ieee802154_llsec_params { bool enabled; __be32 frame_counter; u8 out_level; struct ieee802154_llsec_key_id out_key; __le64 default_key_source; __le16 pan_id; __le64 hwaddr; __le64 coord_hwaddr; __le16 coord_shortaddr; }; struct ieee802154_llsec_table { struct list_head keys; struct list_head devices; struct list_head security_levels; }; struct ieee802154_llsec_seclevel { struct list_head list; u8 frame_type; u8 cmd_frame_id; bool device_override; u32 sec_levels; }; struct ieee802154_llsec_device { struct list_head list; __le16 pan_id; __le16 short_addr; __le64 hwaddr; u32 frame_counter; bool seclevel_exempt; u8 key_mode; struct list_head keys; }; struct ieee802154_llsec_device_key { struct list_head list; struct ieee802154_llsec_key_id key_id; u32 frame_counter; }; struct wpan_dev_header_ops { /* TODO create callback currently assumes ieee802154_mac_cb inside * skb->cb. This should be changed to give these information as * parameter. 
*/ int (*create)(struct sk_buff *skb, struct net_device *dev, const struct ieee802154_addr *daddr, const struct ieee802154_addr *saddr, unsigned int len); }; struct wpan_dev { struct wpan_phy *wpan_phy; int iftype; /* the remainder of this struct should be private to cfg802154 */ struct list_head list; struct net_device *netdev; const struct wpan_dev_header_ops *header_ops; /* lowpan interface, set when the wpan_dev belongs to one lowpan_dev */ struct net_device *lowpan_dev; u32 identifier; /* MAC PIB */ __le16 pan_id; __le16 short_addr; __le64 extended_addr; /* MAC BSN field */ atomic_t bsn; /* MAC DSN field */ atomic_t dsn; u8 min_be; u8 max_be; u8 csma_retries; s8 frame_retries; bool lbt; /* fallback for acknowledgment bit setting */ bool ackreq; /* Associations */ struct mutex association_lock; struct ieee802154_pan_device *parent; struct list_head children; unsigned int max_associations; unsigned int nchildren; }; #define to_phy(_dev) container_of(_dev, struct wpan_phy, dev) #if IS_ENABLED(CONFIG_IEEE802154) || IS_ENABLED(CONFIG_6LOWPAN) static inline int wpan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, const struct ieee802154_addr *daddr, const struct ieee802154_addr *saddr, unsigned int len) { struct wpan_dev *wpan_dev = dev->ieee802154_ptr; return wpan_dev->header_ops->create(skb, dev, daddr, saddr, len); } #endif struct wpan_phy * wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size); static inline void wpan_phy_set_dev(struct wpan_phy *phy, struct device *dev) { phy->dev.parent = dev; } int wpan_phy_register(struct wpan_phy *phy); void wpan_phy_unregister(struct wpan_phy *phy); void wpan_phy_free(struct wpan_phy *phy); /* Same semantics as for class_for_each_device */ int wpan_phy_for_each(int (*fn)(struct wpan_phy *phy, void *data), void *data); static inline void *wpan_phy_priv(struct wpan_phy *phy) { BUG_ON(!phy); return &phy->priv; } struct wpan_phy *wpan_phy_find(const char *str); static inline void wpan_phy_put(struct wpan_phy *phy) { put_device(&phy->dev); } static inline const char *wpan_phy_name(struct wpan_phy *phy) { return dev_name(&phy->dev); } void ieee802154_configure_durations(struct wpan_phy *phy, unsigned int page, unsigned int channel); /** * cfg802154_device_is_associated - Checks whether we are associated to any device * @wpan_dev: the wpan device * @return: true if we are associated */ bool cfg802154_device_is_associated(struct wpan_dev *wpan_dev); /** * cfg802154_device_is_parent - Checks if a device is our coordinator * @wpan_dev: the wpan device * @target: the expected parent * @return: true if @target is our coordinator */ bool cfg802154_device_is_parent(struct wpan_dev *wpan_dev, struct ieee802154_addr *target); /** * cfg802154_device_is_child - Checks whether a device is associated to us * @wpan_dev: the wpan device * @target: the expected child * @return: the PAN device */ struct ieee802154_pan_device * cfg802154_device_is_child(struct wpan_dev *wpan_dev, struct ieee802154_addr *target); /** * cfg802154_set_max_associations - Limit the number of future associations * @wpan_dev: the wpan device * @max: the maximum number of devices we accept to associate * @return: the old maximum value */ unsigned int cfg802154_set_max_associations(struct wpan_dev *wpan_dev, unsigned int max); /** * cfg802154_get_free_short_addr - Get a free address among the known devices * @wpan_dev: the wpan device * @return: a random short address expectedly unused on our PAN */ __le16 cfg802154_get_free_short_addr(struct wpan_dev *wpan_dev); #endif /* 
__NET_CFG802154_H */
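The header above declares wpan_phy_new(), wpan_phy_priv(), wpan_phy_set_dev(), wpan_phy_register() and wpan_phy_free() for drivers that supply a struct cfg802154_ops. The sketch below illustrates, under stated assumptions, how a driver might allocate a PHY with a private area and register it; my_phy_priv and my_register_phy are hypothetical names, and error handling is reduced to the essentials.

#include <linux/device.h>
#include <net/cfg802154.h>

/* Hypothetical driver-private state, stored in phy->priv[] */
struct my_phy_priv {
	int some_state;
};

static int my_register_phy(struct device *parent,
			   const struct cfg802154_ops *ops)
{
	struct wpan_phy *phy;
	struct my_phy_priv *priv;
	int ret;

	/* Allocate the wpan_phy together with the driver-private area */
	phy = wpan_phy_new(ops, sizeof(*priv));
	if (!phy)
		return -ENOMEM;

	priv = wpan_phy_priv(phy);	/* points at phy->priv[] */
	priv->some_state = 0;

	wpan_phy_set_dev(phy, parent);	/* parent for the sysfs hierarchy */

	ret = wpan_phy_register(phy);
	if (ret)
		wpan_phy_free(phy);

	return ret;
}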
// SPDX-License-Identifier: GPL-2.0-only
/*
 * HID driver for some ITE "special" devices
 * Copyright (c) 2017 Hans de Goede <hdegoede@redhat.com>
 */

#include <linux/device.h>
#include <linux/input.h>
#include <linux/hid.h>
#include <linux/module.h>

#include "hid-ids.h"

#define QUIRK_TOUCHPAD_ON_OFF_REPORT	BIT(0)

static const __u8 *ite_report_fixup(struct hid_device *hdev, __u8 *rdesc,
				    unsigned int *rsize)
{
	unsigned long quirks = (unsigned long)hid_get_drvdata(hdev);

	if (quirks & QUIRK_TOUCHPAD_ON_OFF_REPORT) {
		/* For Acer Aspire Switch 10 SW5-012 keyboard-dock */
		if (*rsize == 188 && rdesc[162] == 0x81 && rdesc[163] == 0x02) {
			hid_info(hdev, "Fixing up Acer Sw5-012 ITE keyboard report descriptor\n");
			rdesc[163] = HID_MAIN_ITEM_RELATIVE;
		}
		/* For Acer One S1002/S1003 keyboard-dock */
		if (*rsize == 188 && rdesc[185] == 0x81 && rdesc[186] == 0x02) {
			hid_info(hdev, "Fixing up Acer S1002/S1003 ITE keyboard report descriptor\n");
			rdesc[186] = HID_MAIN_ITEM_RELATIVE;
		}
		/* For Acer Aspire Switch 10E (SW3-016) keyboard-dock */
		if (*rsize == 210 && rdesc[184] == 0x81 && rdesc[185] == 0x02) {
			hid_info(hdev, "Fixing up Acer Aspire Switch 10E (SW3-016) ITE keyboard report descriptor\n");
			rdesc[185] = HID_MAIN_ITEM_RELATIVE;
		}
	}

	return rdesc;
}

static int ite_input_mapping(struct hid_device *hdev,
			     struct hid_input *hi, struct hid_field *field,
			     struct hid_usage *usage, unsigned long **bit,
			     int *max)
{
	unsigned long quirks = (unsigned long)hid_get_drvdata(hdev);

	if ((quirks & QUIRK_TOUCHPAD_ON_OFF_REPORT) &&
	    (usage->hid & HID_USAGE_PAGE) == 0x00880000) {
		if (usage->hid == 0x00880078) {
			/* Touchpad on, userspace expects F22 for this */
			hid_map_usage_clear(hi, usage, bit, max, EV_KEY, KEY_F22);
			return 1;
		}
		if (usage->hid == 0x00880079) {
			/* Touchpad off, userspace expects F23 for this */
			hid_map_usage_clear(hi, usage, bit, max, EV_KEY, KEY_F23);
			return 1;
		}
		return -1;
	}

	return 0;
}

static int ite_event(struct hid_device *hdev, struct hid_field *field,
		     struct hid_usage *usage, __s32 value)
{
	struct input_dev *input;

	if (!(hdev->claimed & HID_CLAIMED_INPUT) || !field->hidinput)
		return 0;

	input = field->hidinput->input;

	/*
	 * The ITE8595 always reports 0 as value for the rfkill button. Luckily
	 * it is the only button in its report, and it sends a report on
	 * release only, so receiving a report means the button was pressed.
	 */
	if (usage->hid == HID_GD_RFKILL_BTN) {
		input_event(input, EV_KEY, KEY_RFKILL, 1);
		input_sync(input);
		input_event(input, EV_KEY, KEY_RFKILL, 0);
		input_sync(input);
		return 1;
	}

	return 0;
}

static int ite_probe(struct hid_device *hdev, const struct hid_device_id *id)
{
	int ret;

	hid_set_drvdata(hdev, (void *)id->driver_data);

	ret = hid_open_report(hdev);
	if (ret)
		return ret;

	return hid_hw_start(hdev, HID_CONNECT_DEFAULT);
}

static const struct hid_device_id ite_devices[] = {
	{ HID_USB_DEVICE(USB_VENDOR_ID_ITE, USB_DEVICE_ID_ITE8595) },
	{ HID_USB_DEVICE(USB_VENDOR_ID_258A, USB_DEVICE_ID_258A_6A88) },
	/* ITE8595 USB kbd ctlr, with Synaptics touchpad connected to it.
*/ { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, USB_VENDOR_ID_SYNAPTICS, USB_DEVICE_ID_SYNAPTICS_ACER_SWITCH5_012), .driver_data = QUIRK_TOUCHPAD_ON_OFF_REPORT }, /* ITE8910 USB kbd ctlr, with Synaptics touchpad connected to it. */ { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, USB_VENDOR_ID_SYNAPTICS, USB_DEVICE_ID_SYNAPTICS_ACER_ONE_S1002), .driver_data = QUIRK_TOUCHPAD_ON_OFF_REPORT }, /* ITE8910 USB kbd ctlr, with Synaptics touchpad connected to it. */ { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, USB_VENDOR_ID_SYNAPTICS, USB_DEVICE_ID_SYNAPTICS_ACER_ONE_S1003), .driver_data = QUIRK_TOUCHPAD_ON_OFF_REPORT }, /* ITE8910 USB kbd ctlr, with Synaptics touchpad connected to it. */ { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, USB_VENDOR_ID_SYNAPTICS, USB_DEVICE_ID_SYNAPTICS_ACER_SWITCH5_017), .driver_data = QUIRK_TOUCHPAD_ON_OFF_REPORT }, { } }; MODULE_DEVICE_TABLE(hid, ite_devices); static struct hid_driver ite_driver = { .name = "itetech", .id_table = ite_devices, .probe = ite_probe, .report_fixup = ite_report_fixup, .input_mapping = ite_input_mapping, .event = ite_event, }; module_hid_driver(ite_driver); MODULE_AUTHOR("Hans de Goede <hdegoede@redhat.com>"); MODULE_DESCRIPTION("HID driver for some ITE \"special\" devices"); MODULE_LICENSE("GPL");
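In ite_input_mapping() above, a 32-bit HID usage packs the usage page in the upper 16 bits and the usage ID in the lower 16 bits, which is why the driver masks with HID_USAGE_PAGE and compares against 0x00880000 before matching usages 0x78/0x79. The standalone sketch below (not driver code) just demonstrates that decoding; the local macro names and values mirror the layout implied above and are assumptions here.

#include <stdio.h>
#include <stdint.h>

/* Assumed masks mirroring the layout used by ite_input_mapping() above */
#define EXAMPLE_HID_USAGE_PAGE	0xffff0000u	/* upper 16 bits: usage page */
#define EXAMPLE_HID_USAGE	0x0000ffffu	/* lower 16 bits: usage ID */

int main(void)
{
	const uint32_t usages[] = { 0x00880078u, 0x00880079u };

	for (unsigned int i = 0; i < 2; i++) {
		unsigned int page = (usages[i] & EXAMPLE_HID_USAGE_PAGE) >> 16;
		unsigned int id = usages[i] & EXAMPLE_HID_USAGE;

		printf("usage 0x%08x -> page 0x%04x, id 0x%02x (%s)\n",
		       (unsigned int)usages[i], page, id,
		       id == 0x78 ? "touchpad on -> KEY_F22" :
				    "touchpad off -> KEY_F23");
	}
	return 0;
}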
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * IPVS		An implementation of the IP virtual server support for the
 *		LINUX operating system. IPVS is now implemented as a module
 *		over the Netfilter framework. IPVS can be used to build a
 *		high-performance and highly available server based on a
 *		cluster of servers.
 *
 * Authors:	Wensong Zhang <wensong@linuxvirtualserver.org>
 *		Peter Kese <peter.kese@ijs.si>
 *		Julian Anastasov <ja@ssi.bg>
 *
 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
 * and others.
 *
 * Changes:
 *	Paul `Rusty' Russell		properly handle non-linear skbs
 *	Harald Welte			don't use nfcache
 */

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/sctp.h>
#include <linux/icmp.h>
#include <linux/slab.h>

#include <net/ip.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/icmp.h>			/* for icmp_send */
#include <net/gue.h>
#include <net/gre.h>
#include <net/route.h>
#include <net/ip6_checksum.h>
#include <net/netns/generic.h>		/* net_generic() */

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
#include <linux/netfilter_ipv6.h>
#include <net/ip6_route.h>
#endif

#include <net/ip_vs.h>
#include <linux/indirect_call_wrapper.h>

EXPORT_SYMBOL(register_ip_vs_scheduler);
EXPORT_SYMBOL(unregister_ip_vs_scheduler);
EXPORT_SYMBOL(ip_vs_proto_name);
EXPORT_SYMBOL(ip_vs_conn_new);
EXPORT_SYMBOL(ip_vs_conn_in_get);
EXPORT_SYMBOL(ip_vs_conn_out_get);
#ifdef CONFIG_IP_VS_PROTO_TCP
EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
#endif
EXPORT_SYMBOL(ip_vs_conn_put);
#ifdef CONFIG_IP_VS_DEBUG
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
EXPORT_SYMBOL(ip_vs_new_conn_out);

#if defined(CONFIG_IP_VS_PROTO_TCP) && defined(CONFIG_IP_VS_PROTO_UDP)
#define SNAT_CALL(f, ...) \
	INDIRECT_CALL_2(f, tcp_snat_handler, udp_snat_handler, __VA_ARGS__)
#elif defined(CONFIG_IP_VS_PROTO_TCP)
#define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, tcp_snat_handler, __VA_ARGS__)
#elif defined(CONFIG_IP_VS_PROTO_UDP)
#define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, udp_snat_handler, __VA_ARGS__)
#else
#define SNAT_CALL(f, ...)
f(__VA_ARGS__) #endif static unsigned int ip_vs_net_id __read_mostly; /* netns cnt used for uniqueness */ static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); /* ID used in ICMP lookups */ #define icmp_id(icmph) (((icmph)->un).echo.id) #define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) const char *ip_vs_proto_name(unsigned int proto) { static char buf[20]; switch (proto) { case IPPROTO_IP: return "IP"; case IPPROTO_UDP: return "UDP"; case IPPROTO_TCP: return "TCP"; case IPPROTO_SCTP: return "SCTP"; case IPPROTO_ICMP: return "ICMP"; #ifdef CONFIG_IP_VS_IPV6 case IPPROTO_ICMPV6: return "ICMPv6"; #endif default: sprintf(buf, "IP_%u", proto); return buf; } } void ip_vs_init_hash_table(struct list_head *table, int rows) { while (--rows >= 0) INIT_LIST_HEAD(&table[rows]); } static inline void ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) { struct ip_vs_dest *dest = cp->dest; struct netns_ipvs *ipvs = cp->ipvs; if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { struct ip_vs_cpu_stats *s; struct ip_vs_service *svc; local_bh_disable(); s = this_cpu_ptr(dest->stats.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.inpkts); u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.inpkts); u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.inpkts); u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); local_bh_enable(); } } static inline void ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) { struct ip_vs_dest *dest = cp->dest; struct netns_ipvs *ipvs = cp->ipvs; if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { struct ip_vs_cpu_stats *s; struct ip_vs_service *svc; local_bh_disable(); s = this_cpu_ptr(dest->stats.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.outpkts); u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.outpkts); u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.outpkts); u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); local_bh_enable(); } } static inline void ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) { struct netns_ipvs *ipvs = svc->ipvs; struct ip_vs_cpu_stats *s; local_bh_disable(); s = this_cpu_ptr(cp->dest->stats.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); local_bh_enable(); } static inline void ip_vs_set_state(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd) { if (likely(pd->pp->state_transition)) pd->pp->state_transition(cp, direction, skb, pd); } static inline int ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, struct sk_buff *skb, int protocol, const union nf_inet_addr *caddr, 
__be16 cport, const union nf_inet_addr *vaddr, __be16 vport, struct ip_vs_conn_param *p) { ip_vs_conn_fill_param(svc->ipvs, svc->af, protocol, caddr, cport, vaddr, vport, p); p->pe = rcu_dereference(svc->pe); if (p->pe && p->pe->fill_param) return p->pe->fill_param(p, skb); return 0; } /* * IPVS persistent scheduling function * It creates a connection entry according to its template if exists, * or selects a server and creates a connection entry plus a template. * Locking: we are svc user (svc->refcnt), so we hold all dests too * Protocols supported: TCP, UDP */ static struct ip_vs_conn * ip_vs_sched_persist(struct ip_vs_service *svc, struct sk_buff *skb, __be16 src_port, __be16 dst_port, int *ignored, struct ip_vs_iphdr *iph) { struct ip_vs_conn *cp = NULL; struct ip_vs_dest *dest; struct ip_vs_conn *ct; __be16 dport = 0; /* destination port to forward */ unsigned int flags; struct ip_vs_conn_param param; const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; union nf_inet_addr snet; /* source network of the client, after masking */ const union nf_inet_addr *src_addr, *dst_addr; if (likely(!ip_vs_iph_inverse(iph))) { src_addr = &iph->saddr; dst_addr = &iph->daddr; } else { src_addr = &iph->daddr; dst_addr = &iph->saddr; } /* Mask saddr with the netmask to adjust template granularity */ #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) ipv6_addr_prefix(&snet.in6, &src_addr->in6, (__force __u32) svc->netmask); else #endif snet.ip = src_addr->ip & svc->netmask; IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " "mnet %s\n", IP_VS_DBG_ADDR(svc->af, src_addr), ntohs(src_port), IP_VS_DBG_ADDR(svc->af, dst_addr), ntohs(dst_port), IP_VS_DBG_ADDR(svc->af, &snet)); /* * As far as we know, FTP is a very complicated network protocol, and * it uses control connection and data connections. For active FTP, * FTP server initialize data connection to the client, its source port * is often 20. For passive FTP, FTP server tells the clients the port * that it passively listens to, and the client issues the data * connection. In the tunneling or direct routing mode, the load * balancer is on the client-to-server half of connection, the port * number is unknown to the load balancer. So, a conn template like * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP * service, and a template like <caddr, 0, vaddr, vport, daddr, dport> * is created for other persistent services. */ { int protocol = iph->protocol; const union nf_inet_addr *vaddr = dst_addr; __be16 vport = 0; if (dst_port == svc->port) { /* non-FTP template: * <protocol, caddr, 0, vaddr, vport, daddr, dport> * FTP template: * <protocol, caddr, 0, vaddr, 0, daddr, 0> */ if (svc->port != FTPPORT) vport = dst_port; } else { /* Note: persistent fwmark-based services and * persistent port zero service are handled here. * fwmark template: * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0> * port zero template: * <protocol,caddr,0,vaddr,0,daddr,0> */ if (svc->fwmark) { protocol = IPPROTO_IP; vaddr = &fwmark; } } /* return *ignored = -1 so NF_DROP can be used */ if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, vaddr, vport, &param) < 0) { *ignored = -1; return NULL; } } /* Check if a template already exists */ ct = ip_vs_ct_in_get(&param); if (!ct || !ip_vs_check_template(ct, NULL)) { struct ip_vs_scheduler *sched; /* * No template found or the dest of the connection * template is not available. * return *ignored=0 i.e. 
ICMP and NF_DROP */ sched = rcu_dereference(svc->scheduler); if (sched) { /* read svc->sched_data after svc->scheduler */ smp_rmb(); dest = sched->schedule(svc, skb, iph); } else { dest = NULL; } if (!dest) { IP_VS_DBG(1, "p-schedule: no dest found.\n"); kfree(param.pe_data); *ignored = 0; return NULL; } if (dst_port == svc->port && svc->port != FTPPORT) dport = dest->port; /* Create a template * This adds param.pe_data to the template, * and thus param.pe_data will be destroyed * when the template expires */ ct = ip_vs_conn_new(&param, dest->af, &dest->addr, dport, IP_VS_CONN_F_TEMPLATE, dest, skb->mark); if (ct == NULL) { kfree(param.pe_data); *ignored = -1; return NULL; } ct->timeout = svc->timeout; } else { /* set destination with the found template */ dest = ct->dest; kfree(param.pe_data); } dport = dst_port; if (dport == svc->port && dest->port) dport = dest->port; flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; /* * Create a new connection according to the template */ ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, src_addr, src_port, dst_addr, dst_port, &param); cp = ip_vs_conn_new(&param, dest->af, &dest->addr, dport, flags, dest, skb->mark); if (cp == NULL) { ip_vs_conn_put(ct); *ignored = -1; return NULL; } /* * Add its control */ ip_vs_control_add(cp, ct); ip_vs_conn_put(ct); ip_vs_conn_stats(cp, svc); return cp; } /* * IPVS main scheduling function * It selects a server according to the virtual service, and * creates a connection entry. * Protocols supported: TCP, UDP * * Usage of *ignored * * 1 : protocol tried to schedule (eg. on SYN), found svc but the * svc/scheduler decides that this packet should be accepted with * NF_ACCEPT because it must not be scheduled. * * 0 : scheduler can not find destination, so try bypass or * return ICMP and then NF_DROP (ip_vs_leave). * * -1 : scheduler tried to schedule but fatal error occurred, eg. * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param * failure such as missing Call-ID, ENOMEM on skb_linearize * or pe_data. In this case we should return NF_DROP without * any attempts to send ICMP with ip_vs_leave. */ struct ip_vs_conn * ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *ignored, struct ip_vs_iphdr *iph) { struct ip_vs_protocol *pp = pd->pp; struct ip_vs_conn *cp = NULL; struct ip_vs_scheduler *sched; struct ip_vs_dest *dest; __be16 _ports[2], *pptr, cport, vport; const void *caddr, *vaddr; unsigned int flags; *ignored = 1; /* * IPv6 frags, only the first hit here. */ pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports); if (pptr == NULL) return NULL; if (likely(!ip_vs_iph_inverse(iph))) { cport = pptr[0]; caddr = &iph->saddr; vport = pptr[1]; vaddr = &iph->daddr; } else { cport = pptr[1]; caddr = &iph->daddr; vport = pptr[0]; vaddr = &iph->saddr; } /* * FTPDATA needs this check when using local real server. * Never schedule Active FTPDATA connections from real server. * For LVS-NAT they must be already created. For other methods * with persistence the connection is created on SYN+ACK. */ if (cport == FTPDATA) { IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off, "Not scheduling FTPDATA"); return NULL; } /* * Do not schedule replies from local real server. 
*/ if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK)) { iph->hdr_flags ^= IP_VS_HDR_INVERSE; cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, svc->ipvs, svc->af, skb, iph); iph->hdr_flags ^= IP_VS_HDR_INVERSE; if (cp) { IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off, "Not scheduling reply for existing" " connection"); __ip_vs_conn_put(cp); return NULL; } } /* * Persistent service */ if (svc->flags & IP_VS_SVC_F_PERSISTENT) return ip_vs_sched_persist(svc, skb, cport, vport, ignored, iph); *ignored = 0; /* * Non-persistent service */ if (!svc->fwmark && vport != svc->port) { if (!svc->port) pr_err("Schedule: port zero only supported " "in persistent services, " "check your ipvs configuration\n"); return NULL; } sched = rcu_dereference(svc->scheduler); if (sched) { /* read svc->sched_data after svc->scheduler */ smp_rmb(); dest = sched->schedule(svc, skb, iph); } else { dest = NULL; } if (dest == NULL) { IP_VS_DBG(1, "Schedule: no dest found.\n"); return NULL; } flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; /* * Create a connection entry. */ { struct ip_vs_conn_param p; ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, caddr, cport, vaddr, vport, &p); cp = ip_vs_conn_new(&p, dest->af, &dest->addr, dest->port ? dest->port : vport, flags, dest, skb->mark); if (!cp) { *ignored = -1; return NULL; } } IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " "d:%s:%u conn->flags:%X conn->refcnt:%d\n", ip_vs_fwd_tag(cp), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), cp->flags, refcount_read(&cp->refcnt)); ip_vs_conn_stats(cp, svc); return cp; } static inline int ip_vs_addr_is_unicast(struct net *net, int af, union nf_inet_addr *addr) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) return ipv6_addr_type(&addr->in6) & IPV6_ADDR_UNICAST; #endif return (inet_addr_type(net, addr->ip) == RTN_UNICAST); } /* * Pass or drop the packet. * Called by ip_vs_in, when the virtual service is available but * no destination is available for a new connection. */ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph) { __be16 _ports[2], *pptr, dport; struct netns_ipvs *ipvs = svc->ipvs; struct net *net = ipvs->net; pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports); if (!pptr) return NF_DROP; dport = likely(!ip_vs_iph_inverse(iph)) ? pptr[1] : pptr[0]; /* if it is fwmark-based service, the cache_bypass sysctl is up and the destination is a non-local unicast, then create a cache_bypass connection entry */ if (sysctl_cache_bypass(ipvs) && svc->fwmark && !(iph->hdr_flags & (IP_VS_HDR_INVERSE | IP_VS_HDR_ICMP)) && ip_vs_addr_is_unicast(net, svc->af, &iph->daddr)) { int ret; struct ip_vs_conn *cp; unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && iph->protocol == IPPROTO_UDP) ? 
IP_VS_CONN_F_ONE_PACKET : 0; union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; /* create a new connection entry */ IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); { struct ip_vs_conn_param p; ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, &iph->saddr, pptr[0], &iph->daddr, pptr[1], &p); cp = ip_vs_conn_new(&p, svc->af, &daddr, 0, IP_VS_CONN_F_BYPASS | flags, NULL, skb->mark); if (!cp) return NF_DROP; } /* statistics */ ip_vs_in_stats(cp, skb); /* set state */ ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); /* transmit the first SYN packet */ ret = cp->packet_xmit(skb, cp, pd->pp, iph); /* do not touch skb anymore */ if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) atomic_inc(&cp->control->in_pkts); else atomic_inc(&cp->in_pkts); ip_vs_conn_put(cp); return ret; } /* * When the virtual ftp service is presented, packets destined * for other services on the VIP may get here (except services * listed in the ipvs table), pass the packets, because it is * not ipvs job to decide to drop the packets. */ if (svc->port == FTPPORT && dport != FTPPORT) return NF_ACCEPT; if (unlikely(ip_vs_iph_icmp(iph))) return NF_DROP; /* * Notify the client that the destination is unreachable, and * release the socket buffer. * Since it is in IP layer, the TCP socket is not actually * created, the TCP RST packet cannot be sent, instead that * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ */ #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) { if (!skb->dev) skb->dev = net->loopback_dev; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); } else #endif icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); return NF_DROP; } #ifdef CONFIG_SYSCTL static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return ipvs->sysctl_snat_reroute; } static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return ipvs->sysctl_nat_icmp_send; } #else static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; } static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; } #endif __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) { return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); } static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) { if (NF_INET_LOCAL_IN == hooknum) return IP_DEFRAG_VS_IN; if (NF_INET_FORWARD == hooknum) return IP_DEFRAG_VS_FWD; return IP_DEFRAG_VS_OUT; } static inline int ip_vs_gather_frags(struct netns_ipvs *ipvs, struct sk_buff *skb, u_int32_t user) { int err; local_bh_disable(); err = ip_defrag(ipvs->net, skb, user); local_bh_enable(); if (!err) ip_send_check(ip_hdr(skb)); return err; } static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, unsigned int hooknum) { if (!sysctl_snat_reroute(ipvs)) return 0; /* Reroute replies only to remote clients (FORWARD and LOCAL_OUT) */ if (NF_INET_LOCAL_IN == hooknum) return 0; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { struct dst_entry *dst = skb_dst(skb); if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) && ip6_route_me_harder(ipvs->net, skb->sk, skb) != 0) return 1; } else #endif if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) && ip_route_me_harder(ipvs->net, skb->sk, skb, RTN_LOCAL) != 0) return 1; return 0; } /* * Packet has been made sufficiently writable in caller * - inout: 1=in->out, 0=out->in */ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int inout) { struct iphdr *iph = ip_hdr(skb); unsigned int icmp_offset = iph->ihl*4; struct icmphdr *icmph = 
(struct icmphdr *)(skb_network_header(skb) + icmp_offset); struct iphdr *ciph = (struct iphdr *)(icmph + 1); if (inout) { iph->saddr = cp->vaddr.ip; ip_send_check(iph); ciph->daddr = cp->vaddr.ip; ip_send_check(ciph); } else { iph->daddr = cp->daddr.ip; ip_send_check(iph); ciph->saddr = cp->daddr.ip; ip_send_check(ciph); } /* the TCP/UDP/SCTP port */ if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol || IPPROTO_SCTP == ciph->protocol) { __be16 *ports = (void *)ciph + ciph->ihl*4; if (inout) ports[1] = cp->vport; else ports[0] = cp->dport; } /* And finally the ICMP checksum */ icmph->checksum = 0; icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); skb->ip_summed = CHECKSUM_UNNECESSARY; if (inout) IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered outgoing ICMP"); else IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered incoming ICMP"); } #ifdef CONFIG_IP_VS_IPV6 void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int inout) { struct ipv6hdr *iph = ipv6_hdr(skb); unsigned int icmp_offset = 0; unsigned int offs = 0; /* header offset*/ int protocol; struct icmp6hdr *icmph; struct ipv6hdr *ciph; unsigned short fragoffs; ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL); icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset); offs = icmp_offset + sizeof(struct icmp6hdr); ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs); protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL); if (inout) { iph->saddr = cp->vaddr.in6; ciph->daddr = cp->vaddr.in6; } else { iph->daddr = cp->daddr.in6; ciph->saddr = cp->daddr.in6; } /* the TCP/UDP/SCTP port */ if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || IPPROTO_SCTP == protocol)) { __be16 *ports = (void *)(skb_network_header(skb) + offs); IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__, ntohs(inout ? ports[1] : ports[0]), ntohs(inout ? cp->vport : cp->dport)); if (inout) ports[1] = cp->vport; else ports[0] = cp->dport; } /* And finally the ICMP checksum */ icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, skb->len - icmp_offset, IPPROTO_ICMPV6, 0); skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset; skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum); skb->ip_summed = CHECKSUM_PARTIAL; if (inout) IP_VS_DBG_PKT(11, AF_INET6, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered outgoing ICMPv6"); else IP_VS_DBG_PKT(11, AF_INET6, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered incoming ICMPv6"); } #endif /* Handle relevant response ICMP messages - forward to the right * destination host. */ static int handle_response_icmp(int af, struct sk_buff *skb, union nf_inet_addr *snet, __u8 protocol, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, unsigned int offset, unsigned int ihl, unsigned int hooknum) { unsigned int verdict = NF_DROP; if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) goto after_nat; /* Ensure the checksum is correct */ if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { /* Failed checksum! 
*/ IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n", IP_VS_DBG_ADDR(af, snet)); goto out; } if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || IPPROTO_SCTP == protocol) offset += 2 * sizeof(__u16); if (skb_ensure_writable(skb, offset)) goto out; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) ip_vs_nat_icmp_v6(skb, pp, cp, 1); else #endif ip_vs_nat_icmp(skb, pp, cp, 1); if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum)) goto out; after_nat: /* do the statistics and put it back */ ip_vs_out_stats(cp, skb); skb->ipvs_property = 1; if (!(cp->flags & IP_VS_CONN_F_NFCT)) ip_vs_notrack(skb); else ip_vs_update_conntrack(skb, cp, 0); verdict = NF_ACCEPT; out: __ip_vs_conn_put(cp); return verdict; } /* * Handle ICMP messages in the inside-to-outside direction (outgoing). * Find any that might be relevant, check against existing connections. * Currently handles error types - unreachable, quench, ttl exceeded. */ static int ip_vs_out_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, unsigned int hooknum) { struct iphdr *iph; struct icmphdr _icmph, *ic; struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ struct ip_vs_iphdr ciph; struct ip_vs_conn *cp; struct ip_vs_protocol *pp; unsigned int offset, ihl; union nf_inet_addr snet; *related = 1; /* reassemble IP fragments */ if (ip_is_fragment(ip_hdr(skb))) { if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } iph = ip_hdr(skb); offset = ihl = iph->ihl * 4; ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); if (ic == NULL) return NF_DROP; IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n", ic->type, ntohs(icmp_id(ic)), &iph->saddr, &iph->daddr); /* * Work through seeing if this is for us. * These checks are supposed to be in an order that means easy * things are checked first to speed up processing.... however * this means that some packets will manage to get a long way * down this stack and then be rejected, but that's life. */ if ((ic->type != ICMP_DEST_UNREACH) && (ic->type != ICMP_SOURCE_QUENCH) && (ic->type != ICMP_TIME_EXCEEDED)) { *related = 0; return NF_ACCEPT; } /* Now find the contained IP header */ offset += sizeof(_icmph); cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ pp = ip_vs_proto_get(cih->protocol); if (!pp) return NF_ACCEPT; /* Is the embedded protocol header present? */ if (unlikely(cih->frag_off & htons(IP_OFFSET) && pp->dont_defrag)) return NF_ACCEPT; IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, "Checking outgoing ICMP for"); ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, true, &ciph); /* The embedded headers contain source and dest in reverse order */ cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, ipvs, AF_INET, skb, &ciph); if (!cp) return NF_ACCEPT; snet.ip = iph->saddr; return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, pp, ciph.len, ihl, hooknum); } #ifdef CONFIG_IP_VS_IPV6 static int ip_vs_out_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, unsigned int hooknum, struct ip_vs_iphdr *ipvsh) { struct icmp6hdr _icmph, *ic; struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ struct ip_vs_conn *cp; struct ip_vs_protocol *pp; union nf_inet_addr snet; unsigned int offset; *related = 1; ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph); if (ic == NULL) return NF_DROP; /* * Work through seeing if this is for us. 
* These checks are supposed to be in an order that means easy * things are checked first to speed up processing.... however * this means that some packets will manage to get a long way * down this stack and then be rejected, but that's life. */ if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { *related = 0; return NF_ACCEPT; } /* Fragment header that is before ICMP header tells us that: * it's not an error message since they can't be fragmented. */ if (ipvsh->flags & IP6_FH_F_FRAG) return NF_DROP; IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n", ic->icmp6_type, ntohs(icmpv6_id(ic)), &ipvsh->saddr, &ipvsh->daddr); if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, ipvsh->len + sizeof(_icmph), true, &ciph)) return NF_ACCEPT; /* The packet looks wrong, ignore */ pp = ip_vs_proto_get(ciph.protocol); if (!pp) return NF_ACCEPT; /* The embedded headers contain source and dest in reverse order */ cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, ipvs, AF_INET6, skb, &ciph); if (!cp) return NF_ACCEPT; snet.in6 = ciph.saddr.in6; offset = ciph.len; return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp, pp, offset, sizeof(struct ipv6hdr), hooknum); } #endif /* * Check if sctp chunc is ABORT chunk */ static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len) { struct sctp_chunkhdr *sch, schunk; sch = skb_header_pointer(skb, nh_len + sizeof(struct sctphdr), sizeof(schunk), &schunk); if (sch == NULL) return 0; if (sch->type == SCTP_CID_ABORT) return 1; return 0; } static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) { struct tcphdr _tcph, *th; th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph); if (th == NULL) return 0; return th->rst; } static inline bool is_new_conn(const struct sk_buff *skb, struct ip_vs_iphdr *iph) { switch (iph->protocol) { case IPPROTO_TCP: { struct tcphdr _tcph, *th; th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); if (th == NULL) return false; return th->syn; } case IPPROTO_SCTP: { struct sctp_chunkhdr *sch, schunk; sch = skb_header_pointer(skb, iph->len + sizeof(struct sctphdr), sizeof(schunk), &schunk); if (sch == NULL) return false; return sch->type == SCTP_CID_INIT; } default: return false; } } static inline bool is_new_conn_expected(const struct ip_vs_conn *cp, int conn_reuse_mode) { /* Controlled (FTP DATA or persistence)? 
*/ if (cp->control) return false; switch (cp->protocol) { case IPPROTO_TCP: return (cp->state == IP_VS_TCP_S_TIME_WAIT) || (cp->state == IP_VS_TCP_S_CLOSE) || ((conn_reuse_mode & 2) && (cp->state == IP_VS_TCP_S_FIN_WAIT) && (cp->flags & IP_VS_CONN_F_NOOUTPUT)); case IPPROTO_SCTP: return cp->state == IP_VS_SCTP_S_CLOSED; default: return false; } } /* Generic function to create new connections for outgoing RS packets * * Pre-requisites for successful connection creation: * 1) Virtual Service is NOT fwmark based: * In fwmark-VS actual vaddr and vport are unknown to IPVS * 2) Real Server and Virtual Service were NOT configured without port: * This is to allow match of different VS to the same RS ip-addr */ struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct sk_buff *skb, const struct ip_vs_iphdr *iph, __be16 dport, __be16 cport) { struct ip_vs_conn_param param; struct ip_vs_conn *ct = NULL, *cp = NULL; const union nf_inet_addr *vaddr, *daddr, *caddr; union nf_inet_addr snet; __be16 vport; unsigned int flags; vaddr = &svc->addr; vport = svc->port; daddr = &iph->saddr; caddr = &iph->daddr; /* check pre-requisites are satisfied */ if (svc->fwmark) return NULL; if (!vport || !dport) return NULL; /* for persistent service first create connection template */ if (svc->flags & IP_VS_SVC_F_PERSISTENT) { /* apply netmask the same way ingress-side does */ #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) ipv6_addr_prefix(&snet.in6, &caddr->in6, (__force __u32)svc->netmask); else #endif snet.ip = caddr->ip & svc->netmask; /* fill params and create template if not existent */ if (ip_vs_conn_fill_param_persist(svc, skb, iph->protocol, &snet, 0, vaddr, vport, &param) < 0) return NULL; ct = ip_vs_ct_in_get(&param); /* check if template exists and points to the same dest */ if (!ct || !ip_vs_check_template(ct, dest)) { ct = ip_vs_conn_new(&param, dest->af, daddr, dport, IP_VS_CONN_F_TEMPLATE, dest, 0); if (!ct) { kfree(param.pe_data); return NULL; } ct->timeout = svc->timeout; } else { kfree(param.pe_data); } } /* connection flags */ flags = ((svc->flags & IP_VS_SVC_F_ONEPACKET) && iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; /* create connection */ ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, caddr, cport, vaddr, vport, &param); cp = ip_vs_conn_new(&param, dest->af, daddr, dport, flags, dest, 0); if (!cp) { if (ct) ip_vs_conn_put(ct); return NULL; } if (ct) { ip_vs_control_add(cp, ct); ip_vs_conn_put(ct); } ip_vs_conn_stats(cp, svc); /* return connection (will be used to handle outgoing packet) */ IP_VS_DBG_BUF(6, "New connection RS-initiated:%c c:%s:%u v:%s:%u " "d:%s:%u conn->flags:%X conn->refcnt:%d\n", ip_vs_fwd_tag(cp), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), cp->flags, refcount_read(&cp->refcnt)); return cp; } /* Handle outgoing packets which are considered requests initiated by * real servers, so that subsequent responses from external client can be * routed to the right real server. * Used also for outgoing responses in OPS mode. * * Connection management is handled by persistent-engine specific callback. 
*/ static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum, struct netns_ipvs *ipvs, int af, struct sk_buff *skb, const struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest; struct ip_vs_conn *cp = NULL; __be16 _ports[2], *pptr; if (hooknum == NF_INET_LOCAL_IN) return NULL; pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports); if (!pptr) return NULL; dest = ip_vs_find_real_service(ipvs, af, iph->protocol, &iph->saddr, pptr[0]); if (dest) { struct ip_vs_service *svc; struct ip_vs_pe *pe; svc = rcu_dereference(dest->svc); if (svc) { pe = rcu_dereference(svc->pe); if (pe && pe->conn_out) cp = pe->conn_out(svc, dest, skb, iph, pptr[0], pptr[1]); } } return cp; } /* Handle response packets: rewrite addresses and send away... */ static unsigned int handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph, unsigned int hooknum) { struct ip_vs_protocol *pp = pd->pp; if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) goto after_nat; IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet"); if (skb_ensure_writable(skb, iph->len)) goto drop; /* mangle the packet */ if (pp->snat_handler && !SNAT_CALL(pp->snat_handler, skb, pp, cp, iph)) goto drop; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) ipv6_hdr(skb)->saddr = cp->vaddr.in6; else #endif { ip_hdr(skb)->saddr = cp->vaddr.ip; ip_send_check(ip_hdr(skb)); } /* * nf_iterate does not expect change in the skb->dst->dev. * It looks like it is not fatal to enable this code for hooks * where our handlers are at the end of the chain list and * when all next handlers use skb->dst->dev and not outdev. * It will definitely route properly the inout NAT traffic * when multiple paths are used. */ /* For policy routing, packets originating from this * machine itself may be routed differently to packets * passing through. We want this packet to be routed as * if it came from this machine itself. So re-compute * the routing information. */ if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum)) goto drop; IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT"); after_nat: ip_vs_out_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd); skb->ipvs_property = 1; if (!(cp->flags & IP_VS_CONN_F_NFCT)) ip_vs_notrack(skb); else ip_vs_update_conntrack(skb, cp, 0); ip_vs_conn_put(cp); return NF_ACCEPT; drop: ip_vs_conn_put(cp); kfree_skb(skb); return NF_STOLEN; } /* * Check if outgoing packet belongs to the established ip_vs_conn. */ static unsigned int ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct netns_ipvs *ipvs = net_ipvs(state->net); unsigned int hooknum = state->hook; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; int af = state->pf; struct sock *sk; /* Already marked as IPVS request or reply? */ if (skb->ipvs_property) return NF_ACCEPT; sk = skb_to_full_sk(skb); /* Bad... 
Do not break raw sockets */ if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && af == AF_INET)) { if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk)) return NF_ACCEPT; } if (unlikely(!skb_dst(skb))) return NF_ACCEPT; if (!ipvs->enable) return NF_ACCEPT; ip_vs_fill_iph_skb(af, skb, false, &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { int related; int verdict = ip_vs_out_icmp_v6(ipvs, skb, &related, hooknum, &iph); if (related) return verdict; } } else #endif if (unlikely(iph.protocol == IPPROTO_ICMP)) { int related; int verdict = ip_vs_out_icmp(ipvs, skb, &related, hooknum); if (related) return verdict; } pd = ip_vs_proto_data_get(ipvs, iph.protocol); if (unlikely(!pd)) return NF_ACCEPT; pp = pd->pp; /* reassemble IP fragments */ #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET) #endif if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) { if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; ip_vs_fill_iph_skb(AF_INET, skb, false, &iph); } /* * Check if the packet belongs to an existing entry */ cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, ipvs, af, skb, &iph); if (likely(cp)) return handle_response(af, skb, pd, cp, &iph, hooknum); /* Check for real-server-started requests */ if (atomic_read(&ipvs->conn_out_counter)) { /* Currently only for UDP: * connection oriented protocols typically use * ephemeral ports for outgoing connections, so * related incoming responses would not match any VS */ if (pp->protocol == IPPROTO_UDP) { cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph); if (likely(cp)) return handle_response(af, skb, pd, cp, &iph, hooknum); } } if (sysctl_nat_icmp_send(ipvs) && (pp->protocol == IPPROTO_TCP || pp->protocol == IPPROTO_UDP || pp->protocol == IPPROTO_SCTP)) { __be16 _ports[2], *pptr; pptr = frag_safe_skb_hp(skb, iph.len, sizeof(_ports), _ports); if (pptr == NULL) return NF_ACCEPT; /* Not for me */ if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr, pptr[0])) { /* * Notify the real server: there is no * existing entry if it is not RST * packet or not TCP packet. 
*/ if ((iph.protocol != IPPROTO_TCP && iph.protocol != IPPROTO_SCTP) || ((iph.protocol == IPPROTO_TCP && !is_tcp_reset(skb, iph.len)) || (iph.protocol == IPPROTO_SCTP && !is_sctp_abort(skb, iph.len)))) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (!skb->dev) skb->dev = ipvs->net->loopback_dev; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); } else #endif icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); return NF_DROP; } } } IP_VS_DBG_PKT(12, af, pp, skb, iph.off, "ip_vs_out: packet continues traversal as normal"); return NF_ACCEPT; } static unsigned int ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { struct ip_vs_protocol *pp = pd->pp; if (!iph->fragoffs) { /* No (second) fragments need to enter here, as nf_defrag_ipv6 * replayed fragment zero will already have created the cp */ /* Schedule and create new connection entry into cpp */ if (!pp->conn_schedule(ipvs, af, skb, pd, verdict, cpp, iph)) return 0; } if (unlikely(!*cpp)) { /* sorry, all this trouble for a no-hit :) */ IP_VS_DBG_PKT(12, af, pp, skb, iph->off, "ip_vs_in: packet continues traversal as normal"); /* Fragment couldn't be mapped to a conn entry */ if (iph->fragoffs) IP_VS_DBG_PKT(7, af, pp, skb, iph->off, "unhandled fragment"); *verdict = NF_ACCEPT; return 0; } return 1; } /* Check the UDP tunnel and return its header length */ static int ipvs_udp_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, unsigned int offset, __u16 af, const union nf_inet_addr *daddr, __u8 *proto) { struct udphdr _udph, *udph; struct ip_vs_dest *dest; udph = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); if (!udph) goto unk; offset += sizeof(struct udphdr); dest = ip_vs_find_tunnel(ipvs, af, daddr, udph->dest); if (!dest) goto unk; if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { struct guehdr _gueh, *gueh; gueh = skb_header_pointer(skb, offset, sizeof(_gueh), &_gueh); if (!gueh) goto unk; if (gueh->control != 0 || gueh->version != 0) goto unk; /* Later we can support also IPPROTO_IPV6 */ if (gueh->proto_ctype != IPPROTO_IPIP) goto unk; *proto = gueh->proto_ctype; return sizeof(struct udphdr) + sizeof(struct guehdr) + (gueh->hlen << 2); } unk: return 0; } /* Check the GRE tunnel and return its header length */ static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, unsigned int offset, __u16 af, const union nf_inet_addr *daddr, __u8 *proto) { struct gre_base_hdr _greh, *greh; struct ip_vs_dest *dest; greh = skb_header_pointer(skb, offset, sizeof(_greh), &_greh); if (!greh) goto unk; dest = ip_vs_find_tunnel(ipvs, af, daddr, 0); if (!dest) goto unk; if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { IP_TUNNEL_DECLARE_FLAGS(flags); __be16 type; /* Only support version 0 and C (csum) */ if ((greh->flags & ~GRE_CSUM) != 0) goto unk; type = greh->protocol; /* Later we can support also IPPROTO_IPV6 */ if (type != htons(ETH_P_IP)) goto unk; *proto = IPPROTO_IPIP; gre_flags_to_tnl_flags(flags, greh->flags); return gre_calc_hlen(flags); } unk: return 0; } /* * Handle ICMP messages in the outside-to-inside direction (incoming). * Find any that might be relevant, check against existing connections, * forward to the right destination host if relevant. * Currently handles error types - unreachable, quench, ttl exceeded. 
*/ static int ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, unsigned int hooknum) { struct iphdr *iph; struct icmphdr _icmph, *ic; struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ struct ip_vs_iphdr ciph; struct ip_vs_conn *cp; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; unsigned int offset, offset2, ihl, verdict; bool tunnel, new_cp = false; union nf_inet_addr *raddr; char *outer_proto = "IPIP"; *related = 1; /* reassemble IP fragments */ if (ip_is_fragment(ip_hdr(skb))) { if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } iph = ip_hdr(skb); offset = ihl = iph->ihl * 4; ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); if (ic == NULL) return NF_DROP; IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n", ic->type, ntohs(icmp_id(ic)), &iph->saddr, &iph->daddr); /* * Work through seeing if this is for us. * These checks are supposed to be in an order that means easy * things are checked first to speed up processing.... however * this means that some packets will manage to get a long way * down this stack and then be rejected, but that's life. */ if ((ic->type != ICMP_DEST_UNREACH) && (ic->type != ICMP_SOURCE_QUENCH) && (ic->type != ICMP_TIME_EXCEEDED)) { *related = 0; return NF_ACCEPT; } /* Now find the contained IP header */ offset += sizeof(_icmph); cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ raddr = (union nf_inet_addr *)&cih->daddr; /* Special case for errors for IPIP/UDP/GRE tunnel packets */ tunnel = false; if (cih->protocol == IPPROTO_IPIP) { struct ip_vs_dest *dest; if (unlikely(cih->frag_off & htons(IP_OFFSET))) return NF_ACCEPT; /* Error for our IPIP must arrive at LOCAL_IN */ if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL)) return NF_ACCEPT; dest = ip_vs_find_tunnel(ipvs, AF_INET, raddr, 0); /* Only for known tunnel */ if (!dest || dest->tun_type != IP_VS_CONN_F_TUNNEL_TYPE_IPIP) return NF_ACCEPT; offset += cih->ihl * 4; cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ tunnel = true; } else if ((cih->protocol == IPPROTO_UDP || /* Can be UDP encap */ cih->protocol == IPPROTO_GRE) && /* Can be GRE encap */ /* Error for our tunnel must arrive at LOCAL_IN */ (skb_rtable(skb)->rt_flags & RTCF_LOCAL)) { __u8 iproto; int ulen; /* Non-first fragment has no UDP/GRE header */ if (unlikely(cih->frag_off & htons(IP_OFFSET))) return NF_ACCEPT; offset2 = offset + cih->ihl * 4; if (cih->protocol == IPPROTO_UDP) { ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET, raddr, &iproto); outer_proto = "UDP"; } else { ulen = ipvs_gre_decap(ipvs, skb, offset2, AF_INET, raddr, &iproto); outer_proto = "GRE"; } if (ulen > 0) { /* Skip IP and UDP/GRE tunnel headers */ offset = offset2 + ulen; /* Now we should be at the original IP header */ cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); if (cih && cih->version == 4 && cih->ihl >= 5 && iproto == IPPROTO_IPIP) tunnel = true; else return NF_ACCEPT; } } pd = ip_vs_proto_data_get(ipvs, cih->protocol); if (!pd) return NF_ACCEPT; pp = pd->pp; /* Is the embedded protocol header present? 
*/ if (unlikely(cih->frag_off & htons(IP_OFFSET) && pp->dont_defrag)) return NF_ACCEPT; IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, "Checking incoming ICMP for"); offset2 = offset; ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !tunnel, &ciph); offset = ciph.len; /* The embedded headers contain source and dest in reverse order. * For IPIP/UDP/GRE tunnel this is error for request, not for reply. */ cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, ipvs, AF_INET, skb, &ciph); if (!cp) { int v; if (tunnel || !sysctl_schedule_icmp(ipvs)) return NF_ACCEPT; if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph)) return v; new_cp = true; } verdict = NF_DROP; /* Ensure the checksum is correct */ if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { /* Failed checksum! */ IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n", &iph->saddr); goto out; } if (tunnel) { __be32 info = ic->un.gateway; __u8 type = ic->type; __u8 code = ic->code; /* Update the MTU */ if (ic->type == ICMP_DEST_UNREACH && ic->code == ICMP_FRAG_NEEDED) { struct ip_vs_dest *dest = cp->dest; u32 mtu = ntohs(ic->un.frag.mtu); __be16 frag_off = cih->frag_off; /* Strip outer IP and ICMP, go to IPIP/UDP/GRE header */ if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL) goto ignore_tunnel; offset2 -= ihl + sizeof(_icmph); skb_reset_network_header(skb); IP_VS_DBG(12, "ICMP for %s %pI4->%pI4: mtu=%u\n", outer_proto, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu); ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0); /* Client uses PMTUD? */ if (!(frag_off & htons(IP_DF))) goto ignore_tunnel; /* Prefer the resulting PMTU */ if (dest) { struct ip_vs_dest_dst *dest_dst; dest_dst = rcu_dereference(dest->dest_dst); if (dest_dst) mtu = dst_mtu(dest_dst->dst_cache); } if (mtu > 68 + sizeof(struct iphdr)) mtu -= sizeof(struct iphdr); info = htonl(mtu); } /* Strip outer IP, ICMP and IPIP/UDP/GRE, go to IP header of * original request. */ if (pskb_pull(skb, offset2) == NULL) goto ignore_tunnel; skb_reset_network_header(skb); IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n", &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, type, code, ntohl(info)); icmp_send(skb, type, code, info); /* ICMP can be shorter but anyways, account it */ ip_vs_out_stats(cp, skb); ignore_tunnel: consume_skb(skb); verdict = NF_STOLEN; goto out; } /* do the statistics and put it back */ ip_vs_in_stats(cp, skb); if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol || IPPROTO_SCTP == cih->protocol) offset += 2 * sizeof(__u16); verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph); out: if (likely(!new_cp)) __ip_vs_conn_put(cp); else ip_vs_conn_put(cp); return verdict; } #ifdef CONFIG_IP_VS_IPV6 static int ip_vs_in_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, unsigned int hooknum, struct ip_vs_iphdr *iph) { struct icmp6hdr _icmph, *ic; struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ struct ip_vs_conn *cp; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; unsigned int offset, verdict; bool new_cp = false; *related = 1; ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph); if (ic == NULL) return NF_DROP; /* * Work through seeing if this is for us. * These checks are supposed to be in an order that means easy * things are checked first to speed up processing.... however * this means that some packets will manage to get a long way * down this stack and then be rejected, but that's life. 
*/ if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { *related = 0; return NF_ACCEPT; } /* Fragment header that is before ICMP header tells us that: * it's not an error message since they can't be fragmented. */ if (iph->flags & IP6_FH_F_FRAG) return NF_DROP; IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n", ic->icmp6_type, ntohs(icmpv6_id(ic)), &iph->saddr, &iph->daddr); offset = iph->len + sizeof(_icmph); if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, offset, true, &ciph)) return NF_ACCEPT; pd = ip_vs_proto_data_get(ipvs, ciph.protocol); if (!pd) return NF_ACCEPT; pp = pd->pp; /* Cannot handle fragmented embedded protocol */ if (ciph.fragoffs) return NF_ACCEPT; IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, "Checking incoming ICMPv6 for"); /* The embedded headers contain source and dest in reverse order * if not from localhost */ cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, ipvs, AF_INET6, skb, &ciph); if (!cp) { int v; if (!sysctl_schedule_icmp(ipvs)) return NF_ACCEPT; if (!ip_vs_try_to_schedule(ipvs, AF_INET6, skb, pd, &v, &cp, &ciph)) return v; new_cp = true; } /* VS/TUN, VS/DR and LOCALNODE just let it go */ if ((hooknum == NF_INET_LOCAL_OUT) && (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) { verdict = NF_ACCEPT; goto out; } /* do the statistics and put it back */ ip_vs_in_stats(cp, skb); /* Need to mangle contained IPv6 header in ICMPv6 packet */ offset = ciph.len; if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol || IPPROTO_SCTP == ciph.protocol) offset += 2 * sizeof(__u16); /* Also mangle ports */ verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum, &ciph); out: if (likely(!new_cp)) __ip_vs_conn_put(cp); else ip_vs_conn_put(cp); return verdict; } #endif /* * Check if it's for virtual services, look it up, * and send it on its way... */ static unsigned int ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct netns_ipvs *ipvs = net_ipvs(state->net); unsigned int hooknum = state->hook; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; int ret, pkts; struct sock *sk; int af = state->pf; /* Already marked as IPVS request or reply? */ if (skb->ipvs_property) return NF_ACCEPT; /* * Big tappo: * - remote client: only PACKET_HOST * - route: used for struct net when skb->dev is unset */ if (unlikely((skb->pkt_type != PACKET_HOST && hooknum != NF_INET_LOCAL_OUT) || !skb_dst(skb))) { ip_vs_fill_iph_skb(af, skb, false, &iph); IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s" " ignored in hook %u\n", skb->pkt_type, iph.protocol, IP_VS_DBG_ADDR(af, &iph.daddr), hooknum); return NF_ACCEPT; } /* ipvs enabled in this netns ? */ if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; ip_vs_fill_iph_skb(af, skb, false, &iph); /* Bad... Do not break raw sockets */ sk = skb_to_full_sk(skb); if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && af == AF_INET)) { if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk)) return NF_ACCEPT; } #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { int related; int verdict = ip_vs_in_icmp_v6(ipvs, skb, &related, hooknum, &iph); if (related) return verdict; } } else #endif if (unlikely(iph.protocol == IPPROTO_ICMP)) { int related; int verdict = ip_vs_in_icmp(ipvs, skb, &related, hooknum); if (related) return verdict; } /* Protocol supported? 
*/ pd = ip_vs_proto_data_get(ipvs, iph.protocol); if (unlikely(!pd)) { /* The only way we'll see this packet again is if it's * encapsulated, so mark it with ipvs_property=1 so we * skip it if we're ignoring tunneled packets */ if (sysctl_ignore_tunneled(ipvs)) skb->ipvs_property = 1; return NF_ACCEPT; } pp = pd->pp; /* * Check if the packet belongs to an existing connection entry */ cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, ipvs, af, skb, &iph); if (!iph.fragoffs && is_new_conn(skb, &iph) && cp) { int conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); bool old_ct = false, resched = false; if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest && unlikely(!atomic_read(&cp->dest->weight))) { resched = true; old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); } else if (conn_reuse_mode && is_new_conn_expected(cp, conn_reuse_mode)) { old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); if (!atomic_read(&cp->n_control)) { resched = true; } else { /* Do not reschedule controlling connection * that uses conntrack while it is still * referenced by controlled connection(s). */ resched = !old_ct; } } if (resched) { if (!old_ct) cp->flags &= ~IP_VS_CONN_F_NFCT; if (!atomic_read(&cp->n_control)) ip_vs_conn_expire_now(cp); __ip_vs_conn_put(cp); if (old_ct) return NF_DROP; cp = NULL; } } /* Check the server status */ if (cp && cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { /* the destination server is not available */ if (sysctl_expire_nodest_conn(ipvs)) { bool old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); if (!old_ct) cp->flags &= ~IP_VS_CONN_F_NFCT; ip_vs_conn_expire_now(cp); __ip_vs_conn_put(cp); if (old_ct) return NF_DROP; cp = NULL; } else { __ip_vs_conn_put(cp); return NF_DROP; } } if (unlikely(!cp)) { int v; if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph)) return v; } IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet"); ip_vs_in_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); if (cp->packet_xmit) ret = cp->packet_xmit(skb, cp, pp, &iph); /* do not touch skb anymore */ else { IP_VS_DBG_RL("warning: packet_xmit is null"); ret = NF_ACCEPT; } /* Increase its packet counter and check if it is needed * to be synchronized * * Sync connection if it is about to close to * encorage the standby servers to update the connections timeout * * For ONE_PKT let ip_vs_sync_conn() do the filter work. */ if (cp->flags & IP_VS_CONN_F_ONE_PACKET) pkts = sysctl_sync_threshold(ipvs); else pkts = atomic_inc_return(&cp->in_pkts); if (ipvs->sync_state & IP_VS_STATE_MASTER) ip_vs_sync_conn(ipvs, cp, pkts); else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) /* increment is done inside ip_vs_sync_conn too */ atomic_inc(&cp->control->in_pkts); ip_vs_conn_put(cp); return ret; } /* * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP * related packets destined for 0.0.0.0/0. * When fwmark-based virtual service is used, such as transparent * cache cluster, TCP packets can be marked and routed to ip_vs_in, * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain * and send them to ip_vs_in_icmp. */ static unsigned int ip_vs_forward_icmp(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct netns_ipvs *ipvs = net_ipvs(state->net); int r; /* ipvs enabled in this netns ? 
*/ if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; if (state->pf == NFPROTO_IPV4) { if (ip_hdr(skb)->protocol != IPPROTO_ICMP) return NF_ACCEPT; #ifdef CONFIG_IP_VS_IPV6 } else { struct ip_vs_iphdr iphdr; ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr); if (iphdr.protocol != IPPROTO_ICMPV6) return NF_ACCEPT; return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr); #endif } return ip_vs_in_icmp(ipvs, skb, &r, state->hook); } static const struct nf_hook_ops ip_vs_ops4[] = { /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_out_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC - 2, }, /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be * applied to IPVS. */ { .hook = ip_vs_in_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC - 1, }, /* Before ip_vs_in, change source only for VS/NAT */ { .hook = ip_vs_out_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST + 1, }, /* After mangle, schedule and forward local requests */ { .hook = ip_vs_in_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST + 2, }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { .hook = ip_vs_forward_icmp, .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = 99, }, /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_out_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = 100, }, }; #ifdef CONFIG_IP_VS_IPV6 static const struct nf_hook_ops ip_vs_ops6[] = { /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_out_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC - 2, }, /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be * applied to IPVS. 
*/ { .hook = ip_vs_in_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC - 1, }, /* Before ip_vs_in, change source only for VS/NAT */ { .hook = ip_vs_out_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST + 1, }, /* After mangle, schedule and forward local requests */ { .hook = ip_vs_in_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST + 2, }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { .hook = ip_vs_forward_icmp, .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = 99, }, /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_out_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = 100, }, }; #endif int ip_vs_register_hooks(struct netns_ipvs *ipvs, unsigned int af) { const struct nf_hook_ops *ops; unsigned int count; unsigned int afmask; int ret = 0; if (af == AF_INET6) { #ifdef CONFIG_IP_VS_IPV6 ops = ip_vs_ops6; count = ARRAY_SIZE(ip_vs_ops6); afmask = 2; #else return -EINVAL; #endif } else { ops = ip_vs_ops4; count = ARRAY_SIZE(ip_vs_ops4); afmask = 1; } if (!(ipvs->hooks_afmask & afmask)) { ret = nf_register_net_hooks(ipvs->net, ops, count); if (ret >= 0) ipvs->hooks_afmask |= afmask; } return ret; } void ip_vs_unregister_hooks(struct netns_ipvs *ipvs, unsigned int af) { const struct nf_hook_ops *ops; unsigned int count; unsigned int afmask; if (af == AF_INET6) { #ifdef CONFIG_IP_VS_IPV6 ops = ip_vs_ops6; count = ARRAY_SIZE(ip_vs_ops6); afmask = 2; #else return; #endif } else { ops = ip_vs_ops4; count = ARRAY_SIZE(ip_vs_ops4); afmask = 1; } if (ipvs->hooks_afmask & afmask) { nf_unregister_net_hooks(ipvs->net, ops, count); ipvs->hooks_afmask &= ~afmask; } } /* * Initialize IP Virtual Server netns mem. 
*/ static int __net_init __ip_vs_init(struct net *net) { struct netns_ipvs *ipvs; ipvs = net_generic(net, ip_vs_net_id); if (ipvs == NULL) return -ENOMEM; /* Hold the beast until a service is registered */ ipvs->enable = 0; ipvs->net = net; /* Counters used for creating unique names */ ipvs->gen = atomic_read(&ipvs_netns_cnt); atomic_inc(&ipvs_netns_cnt); net->ipvs = ipvs; if (ip_vs_estimator_net_init(ipvs) < 0) goto estimator_fail; if (ip_vs_control_net_init(ipvs) < 0) goto control_fail; if (ip_vs_protocol_net_init(ipvs) < 0) goto protocol_fail; if (ip_vs_app_net_init(ipvs) < 0) goto app_fail; if (ip_vs_conn_net_init(ipvs) < 0) goto conn_fail; if (ip_vs_sync_net_init(ipvs) < 0) goto sync_fail; return 0; /* * Error handling */ sync_fail: ip_vs_conn_net_cleanup(ipvs); conn_fail: ip_vs_app_net_cleanup(ipvs); app_fail: ip_vs_protocol_net_cleanup(ipvs); protocol_fail: ip_vs_control_net_cleanup(ipvs); control_fail: ip_vs_estimator_net_cleanup(ipvs); estimator_fail: net->ipvs = NULL; return -ENOMEM; } static void __net_exit __ip_vs_cleanup_batch(struct list_head *net_list) { struct netns_ipvs *ipvs; struct net *net; ip_vs_service_nets_cleanup(net_list); /* ip_vs_flush() with locks */ list_for_each_entry(net, net_list, exit_list) { ipvs = net_ipvs(net); ip_vs_conn_net_cleanup(ipvs); ip_vs_app_net_cleanup(ipvs); ip_vs_protocol_net_cleanup(ipvs); ip_vs_control_net_cleanup(ipvs); ip_vs_estimator_net_cleanup(ipvs); IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen); net->ipvs = NULL; } } static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list) { struct netns_ipvs *ipvs; struct net *net; list_for_each_entry(net, net_list, exit_list) { ipvs = net_ipvs(net); ip_vs_unregister_hooks(ipvs, AF_INET); ip_vs_unregister_hooks(ipvs, AF_INET6); ipvs->enable = 0; /* Disable packet reception */ smp_wmb(); ip_vs_sync_net_cleanup(ipvs); } } static struct pernet_operations ipvs_core_ops = { .init = __ip_vs_init, .exit_batch = __ip_vs_cleanup_batch, .id = &ip_vs_net_id, .size = sizeof(struct netns_ipvs), }; static struct pernet_operations ipvs_core_dev_ops = { .exit_batch = __ip_vs_dev_cleanup_batch, }; /* * Initialize IP Virtual Server */ static int __init ip_vs_init(void) { int ret; ret = ip_vs_control_init(); if (ret < 0) { pr_err("can't setup control.\n"); goto exit; } ip_vs_protocol_init(); ret = ip_vs_conn_init(); if (ret < 0) { pr_err("can't setup connection table.\n"); goto cleanup_protocol; } ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ if (ret < 0) goto cleanup_conn; ret = register_pernet_device(&ipvs_core_dev_ops); if (ret < 0) goto cleanup_sub; ret = ip_vs_register_nl_ioctl(); if (ret < 0) { pr_err("can't register netlink/ioctl.\n"); goto cleanup_dev; } pr_info("ipvs loaded.\n"); return ret; cleanup_dev: unregister_pernet_device(&ipvs_core_dev_ops); cleanup_sub: unregister_pernet_subsys(&ipvs_core_ops); cleanup_conn: ip_vs_conn_cleanup(); cleanup_protocol: ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); exit: return ret; } static void __exit ip_vs_cleanup(void) { ip_vs_unregister_nl_ioctl(); unregister_pernet_device(&ipvs_core_dev_ops); unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ ip_vs_conn_cleanup(); ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); /* common rcu_barrier() used by: * - ip_vs_control_cleanup() */ rcu_barrier(); pr_info("ipvs unloaded.\n"); } module_init(ip_vs_init); module_exit(ip_vs_cleanup); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IP Virtual Server");
// SPDX-License-Identifier: GPL-2.0-or-later
/* mpih-rshift.c - MPI helper functions
 * Copyright (C) 1994, 1996, 1998, 1999,
 * 2000, 2001 Free Software Foundation, Inc.
 *
 * This file is part of GNUPG
 *
 * Note: This code is heavily based on the GNU MP Library.
 * Actually it's the same code with only minor changes in the
 * way the data is stored; this is to support the abstraction
 * of an optional secure memory allocation which may be used
 * to avoid revealing of sensitive data due to paging etc.
 * The GNU MP Library itself is published under the LGPL;
 * however I decided to publish this code under the plain GPL.
 */

#include "mpi-internal.h"

/* Shift U (pointed to by UP and USIZE limbs long) CNT bits to the right
 * and store the USIZE least significant limbs of the result at WP.
 * The bits shifted out to the right are returned.
 *
 * Argument constraints:
 * 1. 0 < CNT < BITS_PER_MP_LIMB
 * 2. If the result is to be written over the input, WP must be <= UP.
 */
mpi_limb_t mpihelp_rshift(mpi_ptr_t wp, mpi_ptr_t up, mpi_size_t usize, unsigned cnt)
{
	mpi_limb_t high_limb, low_limb;
	unsigned sh_1, sh_2;
	mpi_size_t i;
	mpi_limb_t retval;

	sh_1 = cnt;
	wp -= 1;
	sh_2 = BITS_PER_MPI_LIMB - sh_1;
	high_limb = up[0];
	retval = high_limb << sh_2;
	low_limb = high_limb;
	for (i = 1; i < usize; i++) {
		high_limb = up[i];
		wp[i] = (low_limb >> sh_1) | (high_limb << sh_2);
		low_limb = high_limb;
	}
	wp[i] = low_limb >> sh_1;

	return retval;
}
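/*
 * Worked example (a minimal sketch, assuming BITS_PER_MPI_LIMB == 32,
 * usize == 2 and cnt == 4), illustrating how bits cross the limb
 * boundary and how the shifted-out bits are returned:
 *
 *   up[0] = 0x00000003 (least significant limb), up[1] = 0x00000001
 *
 *   wp[0] = (up[0] >> 4) | (up[1] << 28) = 0x10000000
 *   wp[1] =  up[1] >> 4                  = 0x00000000
 *   return value = up[0] << 28           = 0x30000000
 *
 * i.e. 0x0000000100000003 >> 4 == 0x0000000010000000, with the low
 * nibble 0x3 handed back left-aligned in a limb.
 */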
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *
 *  Broadcom Blutonium firmware driver
 *
 *  Copyright (C) 2003  Maxim Krasnyansky <maxk@qualcomm.com>
 *  Copyright (C) 2003  Marcel Holtmann <marcel@holtmann.org>
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/device.h>
#include <linux/firmware.h>
#include <linux/usb.h>

#include <net/bluetooth/bluetooth.h>

#define VERSION "1.2"

static const struct usb_device_id bcm203x_table[] = {
	/* Broadcom Blutonium (BCM2033) */
	{ USB_DEVICE(0x0a5c, 0x2033) },

	{ }	/* Terminating entry */
};

MODULE_DEVICE_TABLE(usb, bcm203x_table);

#define BCM203X_ERROR		0
#define BCM203X_RESET		1
#define BCM203X_LOAD_MINIDRV	2
#define BCM203X_SELECT_MEMORY	3
#define BCM203X_CHECK_MEMORY	4
#define BCM203X_LOAD_FIRMWARE	5
#define BCM203X_CHECK_FIRMWARE	6

#define BCM203X_IN_EP		0x81
#define BCM203X_OUT_EP		0x02

struct bcm203x_data {
	struct usb_device	*udev;

	unsigned long		state;

	struct work_struct	work;
	atomic_t		shutdown;

	struct urb		*urb;
	unsigned char		*buffer;

	unsigned char		*fw_data;
	unsigned int		fw_size;
	unsigned int		fw_sent;
};

static void bcm203x_complete(struct urb *urb)
{
	struct bcm203x_data *data = urb->context;
	struct usb_device *udev = urb->dev;
	int len;

	BT_DBG("udev %p urb %p", udev, urb);

	if (urb->status) {
		BT_ERR("URB failed with status %d", urb->status);
		data->state = BCM203X_ERROR;
		return;
	}

	switch (data->state) {
	case BCM203X_LOAD_MINIDRV:
		memcpy(data->buffer, "#", 1);

		usb_fill_bulk_urb(urb, udev, usb_sndbulkpipe(udev, BCM203X_OUT_EP),
				  data->buffer, 1, bcm203x_complete, data);

		data->state = BCM203X_SELECT_MEMORY;

		/* use workqueue to have a small delay */
		schedule_work(&data->work);
		break;

	case BCM203X_SELECT_MEMORY:
		usb_fill_int_urb(urb, udev, usb_rcvintpipe(udev, BCM203X_IN_EP),
				 data->buffer, 32, bcm203x_complete, data, 1);

		data->state = BCM203X_CHECK_MEMORY;

		if (usb_submit_urb(data->urb, GFP_ATOMIC) < 0)
			BT_ERR("Can't submit URB");
		break;

	case BCM203X_CHECK_MEMORY:
		if (data->buffer[0] != '#') {
			BT_ERR("Memory select failed");
			data->state = BCM203X_ERROR;
			break;
		}

		data->state = BCM203X_LOAD_FIRMWARE;
		fallthrough;

	case BCM203X_LOAD_FIRMWARE:
		if (data->fw_sent == data->fw_size) {
			usb_fill_int_urb(urb, udev, usb_rcvintpipe(udev, BCM203X_IN_EP),
					 data->buffer, 32, bcm203x_complete, data, 1);

			data->state = BCM203X_CHECK_FIRMWARE;
		} else {
			len = min_t(uint, data->fw_size - data->fw_sent, 4096);

			usb_fill_bulk_urb(urb, udev, usb_sndbulkpipe(udev, BCM203X_OUT_EP),
					  data->fw_data + data->fw_sent, len,
bcm203x_complete, data); data->fw_sent += len; } if (usb_submit_urb(data->urb, GFP_ATOMIC) < 0) BT_ERR("Can't submit URB"); break; case BCM203X_CHECK_FIRMWARE: if (data->buffer[0] != '.') { BT_ERR("Firmware loading failed"); data->state = BCM203X_ERROR; break; } data->state = BCM203X_RESET; break; } } static void bcm203x_work(struct work_struct *work) { struct bcm203x_data *data = container_of(work, struct bcm203x_data, work); if (atomic_read(&data->shutdown)) return; if (usb_submit_urb(data->urb, GFP_KERNEL) < 0) BT_ERR("Can't submit URB"); } static int bcm203x_probe(struct usb_interface *intf, const struct usb_device_id *id) { const struct firmware *firmware; struct usb_device *udev = interface_to_usbdev(intf); struct bcm203x_data *data; int size; BT_DBG("intf %p id %p", intf, id); if (intf->cur_altsetting->desc.bInterfaceNumber != 0) return -ENODEV; data = devm_kzalloc(&intf->dev, sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; data->udev = udev; data->state = BCM203X_LOAD_MINIDRV; data->urb = usb_alloc_urb(0, GFP_KERNEL); if (!data->urb) return -ENOMEM; if (request_firmware(&firmware, "BCM2033-MD.hex", &udev->dev) < 0) { BT_ERR("Mini driver request failed"); usb_free_urb(data->urb); return -EIO; } BT_DBG("minidrv data %p size %zu", firmware->data, firmware->size); size = max_t(uint, firmware->size, 4096); data->buffer = kmalloc(size, GFP_KERNEL); if (!data->buffer) { BT_ERR("Can't allocate memory for mini driver"); release_firmware(firmware); usb_free_urb(data->urb); return -ENOMEM; } memcpy(data->buffer, firmware->data, firmware->size); usb_fill_bulk_urb(data->urb, udev, usb_sndbulkpipe(udev, BCM203X_OUT_EP), data->buffer, firmware->size, bcm203x_complete, data); release_firmware(firmware); if (request_firmware(&firmware, "BCM2033-FW.bin", &udev->dev) < 0) { BT_ERR("Firmware request failed"); usb_free_urb(data->urb); kfree(data->buffer); return -EIO; } BT_DBG("firmware data %p size %zu", firmware->data, firmware->size); data->fw_data = kmemdup(firmware->data, firmware->size, GFP_KERNEL); if (!data->fw_data) { BT_ERR("Can't allocate memory for firmware image"); release_firmware(firmware); usb_free_urb(data->urb); kfree(data->buffer); return -ENOMEM; } data->fw_size = firmware->size; data->fw_sent = 0; release_firmware(firmware); INIT_WORK(&data->work, bcm203x_work); usb_set_intfdata(intf, data); /* use workqueue to have a small delay */ schedule_work(&data->work); return 0; } static void bcm203x_disconnect(struct usb_interface *intf) { struct bcm203x_data *data = usb_get_intfdata(intf); BT_DBG("intf %p", intf); atomic_inc(&data->shutdown); cancel_work_sync(&data->work); usb_kill_urb(data->urb); usb_set_intfdata(intf, NULL); usb_free_urb(data->urb); kfree(data->fw_data); kfree(data->buffer); } static struct usb_driver bcm203x_driver = { .name = "bcm203x", .probe = bcm203x_probe, .disconnect = bcm203x_disconnect, .id_table = bcm203x_table, .disable_hub_initiated_lpm = 1, }; module_usb_driver(bcm203x_driver); MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>"); MODULE_DESCRIPTION("Broadcom Blutonium firmware driver ver " VERSION); MODULE_VERSION(VERSION); MODULE_LICENSE("GPL"); MODULE_FIRMWARE("BCM2033-MD.hex"); MODULE_FIRMWARE("BCM2033-FW.bin");
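/*
 * Overview of the download sequence as implemented above by
 * bcm203x_probe(), bcm203x_work() and bcm203x_complete():
 *
 *   BCM203X_LOAD_MINIDRV   - bulk-out transfer of BCM2033-MD.hex, then "#"
 *   BCM203X_SELECT_MEMORY  - interrupt-in read of the acknowledgement
 *   BCM203X_CHECK_MEMORY   - expect '#', fall through to firmware download
 *   BCM203X_LOAD_FIRMWARE  - bulk-out BCM2033-FW.bin in chunks of up to 4096 bytes
 *   BCM203X_CHECK_FIRMWARE - interrupt-in read, '.' signals success
 *   BCM203X_RESET          - download finished
 *
 * URB failures and unexpected acknowledgement bytes set BCM203X_ERROR
 * and stop the sequence.
 */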
// SPDX-License-Identifier: GPL-2.0-only /* * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which * are not related to any other subsystem * * Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org> */ #include <asm/byteorder.h> #include <linux/kobject.h> #include <linux/string.h> #include <linux/sysfs.h> #include <linux/export.h> #include <linux/init.h> #include <linux/kexec.h> #include <linux/profile.h> #include <linux/stat.h> #include <linux/sched.h> #include <linux/capability.h> #include <linux/compiler.h> #include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */ #if defined(__LITTLE_ENDIAN) #define CPU_BYTEORDER_STRING "little" #elif defined(__BIG_ENDIAN) #define CPU_BYTEORDER_STRING "big" #else #error Unknown byteorder #endif #define KERNEL_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) #define KERNEL_ATTR_RW(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RW(_name) /* current uevent sequence number */ static ssize_t uevent_seqnum_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%llu\n", (u64)atomic64_read(&uevent_seqnum)); } KERNEL_ATTR_RO(uevent_seqnum); /* cpu byteorder */ static ssize_t cpu_byteorder_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", CPU_BYTEORDER_STRING); } KERNEL_ATTR_RO(cpu_byteorder); /* address bits */ static ssize_t address_bits_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%zu\n", sizeof(void *) * 8 /* CHAR_BIT */); } KERNEL_ATTR_RO(address_bits); #ifdef CONFIG_UEVENT_HELPER /* uevent helper program, used during early boot */ static ssize_t uevent_helper_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", uevent_helper); } static ssize_t uevent_helper_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { if (count+1 > UEVENT_HELPER_PATH_LEN) return -ENOENT; memcpy(uevent_helper, buf, count); uevent_helper[count] = '\0'; if (count && uevent_helper[count-1] == '\n') uevent_helper[count-1] = '\0'; return count; } KERNEL_ATTR_RW(uevent_helper); #endif #ifdef CONFIG_PROFILING static ssize_t profiling_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", prof_on); } static
ssize_t profiling_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int ret; static DEFINE_MUTEX(lock); /* * We need serialization, for profile_setup() initializes prof_on * value and profile_init() must not reallocate prof_buffer after * once allocated. */ guard(mutex)(&lock); if (prof_on) return -EEXIST; /* * This eventually calls into get_option() which * has a ton of callers and is not const. It is * easiest to cast it away here. */ profile_setup((char *)buf); ret = profile_init(); if (ret) return ret; ret = create_proc_profile(); if (ret) return ret; return count; } KERNEL_ATTR_RW(profiling); #endif #ifdef CONFIG_KEXEC_CORE static ssize_t kexec_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", !!kexec_image); } KERNEL_ATTR_RO(kexec_loaded); #ifdef CONFIG_CRASH_DUMP static ssize_t kexec_crash_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", kexec_crash_loaded()); } KERNEL_ATTR_RO(kexec_crash_loaded); static ssize_t kexec_crash_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t size = crash_get_memory_size(); if (size < 0) return size; return sysfs_emit(buf, "%zd\n", size); } static ssize_t kexec_crash_size_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned long cnt; int ret; if (kstrtoul(buf, 0, &cnt)) return -EINVAL; ret = crash_shrink_memory(cnt); return ret < 0 ? ret : count; } KERNEL_ATTR_RW(kexec_crash_size); #endif /* CONFIG_CRASH_DUMP*/ #endif /* CONFIG_KEXEC_CORE */ #ifdef CONFIG_VMCORE_INFO static ssize_t vmcoreinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { phys_addr_t vmcore_base = paddr_vmcoreinfo_note(); return sysfs_emit(buf, "%pa %x\n", &vmcore_base, (unsigned int)VMCOREINFO_NOTE_SIZE); } KERNEL_ATTR_RO(vmcoreinfo); #ifdef CONFIG_CRASH_HOTPLUG static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { unsigned int sz = crash_get_elfcorehdr_size(); return sysfs_emit(buf, "%u\n", sz); } KERNEL_ATTR_RO(crash_elfcorehdr_size); #endif #endif /* CONFIG_VMCORE_INFO */ /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", file_caps_enabled); } KERNEL_ATTR_RO(fscaps); #ifndef CONFIG_TINY_RCU int rcu_expedited; static ssize_t rcu_expedited_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", READ_ONCE(rcu_expedited)); } static ssize_t rcu_expedited_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { if (kstrtoint(buf, 0, &rcu_expedited)) return -EINVAL; return count; } KERNEL_ATTR_RW(rcu_expedited); int rcu_normal; static ssize_t rcu_normal_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", READ_ONCE(rcu_normal)); } static ssize_t rcu_normal_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { if (kstrtoint(buf, 0, &rcu_normal)) return -EINVAL; return count; } KERNEL_ATTR_RW(rcu_normal); #endif /* #ifndef CONFIG_TINY_RCU */ /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 
*/ extern const void __start_notes; extern const void __stop_notes; #define notes_size (&__stop_notes - &__start_notes) static __ro_after_init BIN_ATTR_SIMPLE_RO(notes); struct kobject *kernel_kobj; EXPORT_SYMBOL_GPL(kernel_kobj); static struct attribute * kernel_attrs[] = { &fscaps_attr.attr, &uevent_seqnum_attr.attr, &cpu_byteorder_attr.attr, &address_bits_attr.attr, #ifdef CONFIG_UEVENT_HELPER &uevent_helper_attr.attr, #endif #ifdef CONFIG_PROFILING &profiling_attr.attr, #endif #ifdef CONFIG_KEXEC_CORE &kexec_loaded_attr.attr, #ifdef CONFIG_CRASH_DUMP &kexec_crash_loaded_attr.attr, &kexec_crash_size_attr.attr, #endif #endif #ifdef CONFIG_VMCORE_INFO &vmcoreinfo_attr.attr, #ifdef CONFIG_CRASH_HOTPLUG &crash_elfcorehdr_size_attr.attr, #endif #endif #ifndef CONFIG_TINY_RCU &rcu_expedited_attr.attr, &rcu_normal_attr.attr, #endif NULL }; static const struct attribute_group kernel_attr_group = { .attrs = kernel_attrs, }; static int __init ksysfs_init(void) { int error; kernel_kobj = kobject_create_and_add("kernel", NULL); if (!kernel_kobj) { error = -ENOMEM; goto exit; } error = sysfs_create_group(kernel_kobj, &kernel_attr_group); if (error) goto kset_exit; if (notes_size > 0) { bin_attr_notes.private = (void *)&__start_notes; bin_attr_notes.size = notes_size; error = sysfs_create_bin_file(kernel_kobj, &bin_attr_notes); if (error) goto group_exit; } return 0; group_exit: sysfs_remove_group(kernel_kobj, &kernel_attr_group); kset_exit: kobject_put(kernel_kobj); exit: return error; } core_initcall(ksysfs_init);
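Each entry in kernel_attrs[] above follows the same recipe: a _show() (and optionally _store()) callback built on sysfs_emit(), wrapped by the KERNEL_ATTR_RO()/KERNEL_ATTR_RW() macros defined near the top of the file. As a sketch only, a hypothetical read-only attribute exposed as /sys/kernel/example would look like this:

/* Hypothetical attribute, shown only to illustrate the pattern above;
 * it is not part of ksysfs.c. */
static ssize_t example_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", 42);
}
KERNEL_ATTR_RO(example);

/* The generated &example_attr.attr would then be listed in kernel_attrs[]
 * so that sysfs_create_group() publishes it under /sys/kernel/. */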
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2009 Patrick McHardy <kaber@trash.net> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/list.h> #include <linux/rbtree.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> struct nft_lookup { struct nft_set *set; u8 sreg; u8 dreg; bool dreg_set; bool invert; struct nft_set_binding binding; }; #ifdef CONFIG_MITIGATION_RETPOLINE bool nft_set_do_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) { if (set->ops == &nft_set_hash_fast_type.ops) return nft_hash_lookup_fast(net, set, key, ext); if (set->ops == &nft_set_hash_type.ops) return nft_hash_lookup(net, set, key, ext); if (set->ops == &nft_set_rhash_type.ops) return nft_rhash_lookup(net, set, key, ext); if (set->ops == &nft_set_bitmap_type.ops) return nft_bitmap_lookup(net, set, key, ext); if (set->ops == &nft_set_pipapo_type.ops) return nft_pipapo_lookup(net, set, key, ext); #if defined(CONFIG_X86_64) && !defined(CONFIG_UML) if (set->ops == &nft_set_pipapo_avx2_type.ops) return nft_pipapo_avx2_lookup(net, set, key, ext); #endif if (set->ops == &nft_set_rbtree_type.ops) return nft_rbtree_lookup(net, set, key, ext); WARN_ON_ONCE(1); return set->ops->lookup(net, set, key, ext); } EXPORT_SYMBOL_GPL(nft_set_do_lookup); #endif void nft_lookup_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_lookup *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; const struct nft_set_ext *ext = NULL; const struct net *net = nft_net(pkt); bool found; found = nft_set_do_lookup(net, set, &regs->data[priv->sreg], &ext) ^ priv->invert; if (!found) { ext = nft_set_catchall_lookup(net, set); if (!ext) { regs->verdict.code = NFT_BREAK; return; } } if (ext) { if (priv->dreg_set) nft_data_copy(&regs->data[priv->dreg], nft_set_ext_data(ext), set->dlen); nft_set_elem_update_expr(ext, regs, pkt); } } static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { [NFTA_LOOKUP_SET] = { .type = NLA_STRING, .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 }, [NFTA_LOOKUP_SREG] = { .type = NLA_U32 }, [NFTA_LOOKUP_DREG] = { .type = NLA_U32 }, [NFTA_LOOKUP_FLAGS] = NLA_POLICY_MASK(NLA_BE32, NFT_LOOKUP_F_INV), }; static int nft_lookup_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct
nlattr * const tb[]) { struct nft_lookup *priv = nft_expr_priv(expr); u8 genmask = nft_genmask_next(ctx->net); struct nft_set *set; u32 flags; int err; if (tb[NFTA_LOOKUP_SET] == NULL || tb[NFTA_LOOKUP_SREG] == NULL) return -EINVAL; set = nft_set_lookup_global(ctx->net, ctx->table, tb[NFTA_LOOKUP_SET], tb[NFTA_LOOKUP_SET_ID], genmask); if (IS_ERR(set)) return PTR_ERR(set); err = nft_parse_register_load(ctx, tb[NFTA_LOOKUP_SREG], &priv->sreg, set->klen); if (err < 0) return err; if (tb[NFTA_LOOKUP_FLAGS]) { flags = ntohl(nla_get_be32(tb[NFTA_LOOKUP_FLAGS])); if (flags & NFT_LOOKUP_F_INV) priv->invert = true; } if (tb[NFTA_LOOKUP_DREG] != NULL) { if (priv->invert) return -EINVAL; if (!(set->flags & NFT_SET_MAP)) return -EINVAL; err = nft_parse_register_store(ctx, tb[NFTA_LOOKUP_DREG], &priv->dreg, NULL, nft_set_datatype(set), set->dlen); if (err < 0) return err; priv->dreg_set = true; } else if (set->flags & NFT_SET_MAP) { /* Map given, but user asks for lookup only (i.e. to * ignore value assoicated with key). * * This makes no sense for anonymous maps since they are * scoped to the rule, but for named sets this can be useful. */ if (set->flags & NFT_SET_ANONYMOUS) return -EINVAL; } priv->binding.flags = set->flags & NFT_SET_MAP; err = nf_tables_bind_set(ctx, set, &priv->binding); if (err < 0) return err; priv->set = set; return 0; } static void nft_lookup_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { struct nft_lookup *priv = nft_expr_priv(expr); nf_tables_deactivate_set(ctx, priv->set, &priv->binding, phase); } static void nft_lookup_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_lookup *priv = nft_expr_priv(expr); nf_tables_activate_set(ctx, priv->set); } static void nft_lookup_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_lookup *priv = nft_expr_priv(expr); nf_tables_destroy_set(ctx, priv->set); } static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_lookup *priv = nft_expr_priv(expr); u32 flags = priv->invert ? 
NFT_LOOKUP_F_INV : 0; if (nla_put_string(skb, NFTA_LOOKUP_SET, priv->set->name)) goto nla_put_failure; if (nft_dump_register(skb, NFTA_LOOKUP_SREG, priv->sreg)) goto nla_put_failure; if (priv->dreg_set) if (nft_dump_register(skb, NFTA_LOOKUP_DREG, priv->dreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_LOOKUP_FLAGS, htonl(flags))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int nft_lookup_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_lookup *priv = nft_expr_priv(expr); struct nft_set_iter iter; if (!(priv->set->flags & NFT_SET_MAP) || priv->set->dtype != NFT_DATA_VERDICT) return 0; iter.genmask = nft_genmask_next(ctx->net); iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; iter.fn = nft_setelem_validate; priv->set->ops->walk(ctx, priv->set, &iter); if (!iter.err) iter.err = nft_set_catchall_validate(ctx, priv->set); if (iter.err < 0) return iter.err; return 0; } static bool nft_lookup_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_lookup *priv = nft_expr_priv(expr); if (priv->set->flags & NFT_SET_MAP) nft_reg_track_cancel(track, priv->dreg, priv->set->dlen); return false; } static const struct nft_expr_ops nft_lookup_ops = { .type = &nft_lookup_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)), .eval = nft_lookup_eval, .init = nft_lookup_init, .activate = nft_lookup_activate, .deactivate = nft_lookup_deactivate, .destroy = nft_lookup_destroy, .dump = nft_lookup_dump, .validate = nft_lookup_validate, .reduce = nft_lookup_reduce, }; struct nft_expr_type nft_lookup_type __read_mostly = { .name = "lookup", .ops = &nft_lookup_ops, .policy = nft_lookup_policy, .maxattr = NFTA_LOOKUP_MAX, .owner = THIS_MODULE, };
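For orientation, this expression is what the nft front end generates when a rule matches a packet field against a named set or map; the kernel only ever sees the netlink attributes parsed by nft_lookup_init(). A sketch of the userspace side follows as a comment; the table, chain and set names are invented for illustration.

/*
 * Illustrative nft commands (names invented for this example):
 *
 *   nft add set inet filter blocklist '{ type ipv4_addr; }'
 *   nft add rule inet filter input ip saddr @blocklist drop
 *
 * The second command creates a rule whose lookup expression carries
 * NFTA_LOOKUP_SET = "blocklist" and an NFTA_LOOKUP_SREG pointing at the
 * register that the preceding payload expression loaded "ip saddr" into;
 * nft_lookup_eval() then resolves the match via nft_set_do_lookup().
 */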
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2021, Google LLC. * Pasha Tatashin <pasha.tatashin@soleen.com> */ #include <linux/kstrtox.h> #include <linux/mm.h> #include <linux/page_table_check.h> #include <linux/swap.h> #include <linux/swapops.h> #undef pr_fmt #define pr_fmt(fmt) "page_table_check: " fmt struct page_table_check { atomic_t anon_map_count; atomic_t file_map_count; }; static bool __page_table_check_enabled __initdata = IS_ENABLED(CONFIG_PAGE_TABLE_CHECK_ENFORCED); DEFINE_STATIC_KEY_TRUE(page_table_check_disabled); EXPORT_SYMBOL(page_table_check_disabled); static int __init early_page_table_check_param(char *buf) { return kstrtobool(buf, &__page_table_check_enabled); } early_param("page_table_check", early_page_table_check_param); static bool __init need_page_table_check(void) { return __page_table_check_enabled; } static void __init init_page_table_check(void) { if (!__page_table_check_enabled) return; static_branch_disable(&page_table_check_disabled); } struct page_ext_operations page_table_check_ops = { .size = sizeof(struct page_table_check), .need = need_page_table_check, .init = init_page_table_check, .need_shared_flags = false, }; static struct page_table_check *get_page_table_check(struct page_ext *page_ext) { BUG_ON(!page_ext); return page_ext_data(page_ext, &page_table_check_ops); } /* * An entry is removed from the page table: decrement the counters for that page, * verify that it is of the correct type, and check that the counters do not become negative.
*/ static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt) { struct page_ext *page_ext; struct page *page; unsigned long i; bool anon; if (!pfn_valid(pfn)) return; page = pfn_to_page(pfn); page_ext = page_ext_get(page); if (!page_ext) return; BUG_ON(PageSlab(page)); anon = PageAnon(page); for (i = 0; i < pgcnt; i++) { struct page_table_check *ptc = get_page_table_check(page_ext); if (anon) { BUG_ON(atomic_read(&ptc->file_map_count)); BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0); } else { BUG_ON(atomic_read(&ptc->anon_map_count)); BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0); } page_ext = page_ext_next(page_ext); } page_ext_put(page_ext); } /* * A new entry is added to the page table, increment the counters for that page * verify that it is of correct type and is not being mapped with a different * type to a different process. */ static void page_table_check_set(unsigned long pfn, unsigned long pgcnt, bool rw) { struct page_ext *page_ext; struct page *page; unsigned long i; bool anon; if (!pfn_valid(pfn)) return; page = pfn_to_page(pfn); page_ext = page_ext_get(page); if (!page_ext) return; BUG_ON(PageSlab(page)); anon = PageAnon(page); for (i = 0; i < pgcnt; i++) { struct page_table_check *ptc = get_page_table_check(page_ext); if (anon) { BUG_ON(atomic_read(&ptc->file_map_count)); BUG_ON(atomic_inc_return(&ptc->anon_map_count) > 1 && rw); } else { BUG_ON(atomic_read(&ptc->anon_map_count)); BUG_ON(atomic_inc_return(&ptc->file_map_count) < 0); } page_ext = page_ext_next(page_ext); } page_ext_put(page_ext); } /* * page is on free list, or is being allocated, verify that counters are zeroes * crash if they are not. */ void __page_table_check_zero(struct page *page, unsigned int order) { struct page_ext *page_ext; unsigned long i; BUG_ON(PageSlab(page)); page_ext = page_ext_get(page); if (!page_ext) return; for (i = 0; i < (1ul << order); i++) { struct page_table_check *ptc = get_page_table_check(page_ext); BUG_ON(atomic_read(&ptc->anon_map_count)); BUG_ON(atomic_read(&ptc->file_map_count)); page_ext = page_ext_next(page_ext); } page_ext_put(page_ext); } void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { if (&init_mm == mm) return; if (pte_user_accessible_page(pte)) { page_table_check_clear(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pte_clear); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { if (&init_mm == mm) return; if (pmd_user_accessible_page(pmd)) { page_table_check_clear(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pmd_clear); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { if (&init_mm == mm) return; if (pud_user_accessible_page(pud)) { page_table_check_clear(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pud_clear); /* Whether the swap entry cached writable information */ static inline bool swap_cached_writable(swp_entry_t entry) { return is_writable_device_exclusive_entry(entry) || is_writable_device_private_entry(entry) || is_writable_migration_entry(entry); } static inline void page_table_check_pte_flags(pte_t pte) { if (pte_present(pte) && pte_uffd_wp(pte)) WARN_ON_ONCE(pte_write(pte)); else if (is_swap_pte(pte) && pte_swp_uffd_wp(pte)) WARN_ON_ONCE(swap_cached_writable(pte_to_swp_entry(pte))); } void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, unsigned int nr) { unsigned int i; if (&init_mm == mm) return; page_table_check_pte_flags(pte); for (i = 0; i < nr; i++) 
__page_table_check_pte_clear(mm, ptep_get(ptep + i)); if (pte_user_accessible_page(pte)) page_table_check_set(pte_pfn(pte), nr, pte_write(pte)); } EXPORT_SYMBOL(__page_table_check_ptes_set); static inline void page_table_check_pmd_flags(pmd_t pmd) { if (pmd_present(pmd) && pmd_uffd_wp(pmd)) WARN_ON_ONCE(pmd_write(pmd)); else if (is_swap_pmd(pmd) && pmd_swp_uffd_wp(pmd)) WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd))); } void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { if (&init_mm == mm) return; page_table_check_pmd_flags(pmd); __page_table_check_pmd_clear(mm, *pmdp); if (pmd_user_accessible_page(pmd)) { page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT, pmd_write(pmd)); } } EXPORT_SYMBOL(__page_table_check_pmd_set); void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) { if (&init_mm == mm) return; __page_table_check_pud_clear(mm, *pudp); if (pud_user_accessible_page(pud)) { page_table_check_set(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT, pud_write(pud)); } } EXPORT_SYMBOL(__page_table_check_pud_set); void __page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, pmd_t pmd) { if (&init_mm == mm) return; if (!pmd_bad(pmd) && !pmd_leaf(pmd)) { pte_t *ptep = pte_offset_map(&pmd, addr); unsigned long i; if (WARN_ON(!ptep)) return; for (i = 0; i < PTRS_PER_PTE; i++) { __page_table_check_pte_clear(mm, ptep_get(ptep)); addr += PAGE_SIZE; ptep++; } pte_unmap(ptep - PTRS_PER_PTE); } }
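A usage note grounded in the early_param() handling above: with CONFIG_PAGE_TABLE_CHECK built in, the checks stay behind the page_table_check_disabled static key unless CONFIG_PAGE_TABLE_CHECK_ENFORCED is set or the kernel is booted with page_table_check=on (the value is parsed by kstrtobool()). The comment below summarizes, as an illustrative sketch, the invariant the two counters enforce.

/*
 * Invariant tracked by the {anon,file}_map_count pair, per base page
 * (illustrative summary, not additional code in this file):
 *
 *   - a page is mapped either as anonymous or as file-backed, never both;
 *   - an anonymous page may only be mapped writable while it is not
 *     mapped anywhere else: adding a writable mapping to an already
 *     mapped anonymous page trips the
 *     BUG_ON(atomic_inc_return(&ptc->anon_map_count) > 1 && rw)
 *     in page_table_check_set().
 *
 * Enable at boot with "page_table_check=on" when the enforced config
 * option is not set.
 */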
// SPDX-License-Identifier: GPL-2.0 /* * drivers/base/power/main.c - Where the driver meets power management. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * * The driver model core calls device_pm_add() when a device is registered. * This will initialize the embedded device_pm_info object in the device * and add it to the list of power-controlled devices. sysfs entries for * controlling device power management will also be added. * * A separate list is used for keeping track of power info, because the power * domain dependencies may differ from the ancestral dependencies that the * subsystem list maintains.
*/ #define pr_fmt(fmt) "PM: " fmt #define dev_fmt pr_fmt #include <linux/device.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/pm.h> #include <linux/pm_runtime.h> #include <linux/pm-trace.h> #include <linux/pm_wakeirq.h> #include <linux/interrupt.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/async.h> #include <linux/suspend.h> #include <trace/events/power.h> #include <linux/cpufreq.h> #include <linux/devfreq.h> #include <linux/timer.h> #include "../base.h" #include "power.h" typedef int (*pm_callback_t)(struct device *); #define list_for_each_entry_rcu_locked(pos, head, member) \ list_for_each_entry_rcu(pos, head, member, \ device_links_read_lock_held()) /* * The entries in the dpm_list list are in a depth first order, simply * because children are guaranteed to be discovered after parents, and * are inserted at the back of the list on discovery. * * Since device_pm_add() may be called with a device lock held, * we must never try to acquire a device lock while holding * dpm_list_mutex. */ LIST_HEAD(dpm_list); static LIST_HEAD(dpm_prepared_list); static LIST_HEAD(dpm_suspended_list); static LIST_HEAD(dpm_late_early_list); static LIST_HEAD(dpm_noirq_list); static DEFINE_MUTEX(dpm_list_mtx); static pm_message_t pm_transition; static int async_error; static const char *pm_verb(int event) { switch (event) { case PM_EVENT_SUSPEND: return "suspend"; case PM_EVENT_RESUME: return "resume"; case PM_EVENT_FREEZE: return "freeze"; case PM_EVENT_QUIESCE: return "quiesce"; case PM_EVENT_HIBERNATE: return "hibernate"; case PM_EVENT_THAW: return "thaw"; case PM_EVENT_RESTORE: return "restore"; case PM_EVENT_RECOVER: return "recover"; default: return "(unknown PM event)"; } } /** * device_pm_sleep_init - Initialize system suspend-related device fields. * @dev: Device object being initialized. */ void device_pm_sleep_init(struct device *dev) { dev->power.is_prepared = false; dev->power.is_suspended = false; dev->power.is_noirq_suspended = false; dev->power.is_late_suspended = false; init_completion(&dev->power.completion); complete_all(&dev->power.completion); dev->power.wakeup = NULL; INIT_LIST_HEAD(&dev->power.entry); } /** * device_pm_lock - Lock the list of active devices used by the PM core. */ void device_pm_lock(void) { mutex_lock(&dpm_list_mtx); } /** * device_pm_unlock - Unlock the list of active devices used by the PM core. */ void device_pm_unlock(void) { mutex_unlock(&dpm_list_mtx); } /** * device_pm_add - Add a device to the PM core's list of active devices. * @dev: Device to add to the list. */ void device_pm_add(struct device *dev) { /* Skip PM setup/initialization. */ if (device_pm_not_required(dev)) return; pr_debug("Adding info for %s:%s\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); device_pm_check_callbacks(dev); mutex_lock(&dpm_list_mtx); if (dev->parent && dev->parent->power.is_prepared) dev_warn(dev, "parent %s should not be sleeping\n", dev_name(dev->parent)); list_add_tail(&dev->power.entry, &dpm_list); dev->power.in_dpm_list = true; mutex_unlock(&dpm_list_mtx); } /** * device_pm_remove - Remove a device from the PM core's list of active devices. * @dev: Device to be removed from the list. */ void device_pm_remove(struct device *dev) { if (device_pm_not_required(dev)) return; pr_debug("Removing info for %s:%s\n", dev->bus ? 
dev->bus->name : "No Bus", dev_name(dev)); complete_all(&dev->power.completion); mutex_lock(&dpm_list_mtx); list_del_init(&dev->power.entry); dev->power.in_dpm_list = false; mutex_unlock(&dpm_list_mtx); device_wakeup_disable(dev); pm_runtime_remove(dev); device_pm_check_callbacks(dev); } /** * device_pm_move_before - Move device in the PM core's list of active devices. * @deva: Device to move in dpm_list. * @devb: Device @deva should come before. */ void device_pm_move_before(struct device *deva, struct device *devb) { pr_debug("Moving %s:%s before %s:%s\n", deva->bus ? deva->bus->name : "No Bus", dev_name(deva), devb->bus ? devb->bus->name : "No Bus", dev_name(devb)); /* Delete deva from dpm_list and reinsert before devb. */ list_move_tail(&deva->power.entry, &devb->power.entry); } /** * device_pm_move_after - Move device in the PM core's list of active devices. * @deva: Device to move in dpm_list. * @devb: Device @deva should come after. */ void device_pm_move_after(struct device *deva, struct device *devb) { pr_debug("Moving %s:%s after %s:%s\n", deva->bus ? deva->bus->name : "No Bus", dev_name(deva), devb->bus ? devb->bus->name : "No Bus", dev_name(devb)); /* Delete deva from dpm_list and reinsert after devb. */ list_move(&deva->power.entry, &devb->power.entry); } /** * device_pm_move_last - Move device to end of the PM core's list of devices. * @dev: Device to move in dpm_list. */ void device_pm_move_last(struct device *dev) { pr_debug("Moving %s:%s to end of list\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); list_move_tail(&dev->power.entry, &dpm_list); } static ktime_t initcall_debug_start(struct device *dev, void *cb) { if (!pm_print_times_enabled) return 0; dev_info(dev, "calling %ps @ %i, parent: %s\n", cb, task_pid_nr(current), dev->parent ? dev_name(dev->parent) : "none"); return ktime_get(); } static void initcall_debug_report(struct device *dev, ktime_t calltime, void *cb, int error) { ktime_t rettime; if (!pm_print_times_enabled) return; rettime = ktime_get(); dev_info(dev, "%ps returned %d after %Ld usecs\n", cb, error, (unsigned long long)ktime_us_delta(rettime, calltime)); } /** * dpm_wait - Wait for a PM operation to complete. * @dev: Device to wait for. * @async: If unset, wait only if the device's power.async_suspend flag is set. */ static void dpm_wait(struct device *dev, bool async) { if (!dev) return; if (async || (pm_async_enabled && dev->power.async_suspend)) wait_for_completion(&dev->power.completion); } static int dpm_wait_fn(struct device *dev, void *async_ptr) { dpm_wait(dev, *((bool *)async_ptr)); return 0; } static void dpm_wait_for_children(struct device *dev, bool async) { device_for_each_child(dev, &async, dpm_wait_fn); } static void dpm_wait_for_suppliers(struct device *dev, bool async) { struct device_link *link; int idx; idx = device_links_read_lock(); /* * If the supplier goes away right after we've checked the link to it, * we'll wait for its completion to change the state, but that's fine, * because the only things that will block as a result are the SRCU * callbacks freeing the link objects for the links in the list we're * walking. 
*/ list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_wait(link->supplier, async); device_links_read_unlock(idx); } static bool dpm_wait_for_superior(struct device *dev, bool async) { struct device *parent; /* * If the device is resumed asynchronously and the parent's callback * deletes both the device and the parent itself, the parent object may * be freed while this function is running, so avoid that by reference * counting the parent once more unless the device has been deleted * already (in which case return right away). */ mutex_lock(&dpm_list_mtx); if (!device_pm_initialized(dev)) { mutex_unlock(&dpm_list_mtx); return false; } parent = get_device(dev->parent); mutex_unlock(&dpm_list_mtx); dpm_wait(parent, async); put_device(parent); dpm_wait_for_suppliers(dev, async); /* * If the parent's callback has deleted the device, attempting to resume * it would be invalid, so avoid doing that then. */ return device_pm_initialized(dev); } static void dpm_wait_for_consumers(struct device *dev, bool async) { struct device_link *link; int idx; idx = device_links_read_lock(); /* * The status of a device link can only be changed from "dormant" by a * probe, but that cannot happen during system suspend/resume. In * theory it can change to "dormant" at that time, but then it is * reasonable to wait for the target device anyway (eg. if it goes * away, it's better to wait for it to go away completely and then * continue instead of trying to continue in parallel with its * unregistration). */ list_for_each_entry_rcu_locked(link, &dev->links.consumers, s_node) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_wait(link->consumer, async); device_links_read_unlock(idx); } static void dpm_wait_for_subordinate(struct device *dev, bool async) { dpm_wait_for_children(dev, async); dpm_wait_for_consumers(dev, async); } /** * pm_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. */ static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend; case PM_EVENT_RESUME: return ops->resume; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze; case PM_EVENT_HIBERNATE: return ops->poweroff; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw; case PM_EVENT_RESTORE: return ops->restore; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } /** * pm_late_early_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. * * Runtime PM is disabled for @dev while this function is being executed. */ static pm_callback_t pm_late_early_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend_late; case PM_EVENT_RESUME: return ops->resume_early; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_late; case PM_EVENT_HIBERNATE: return ops->poweroff_late; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw_early; case PM_EVENT_RESTORE: return ops->restore_early; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } /** * pm_noirq_op - Return the PM operation appropriate for given PM event. 
* @ops: PM operations to choose from. * @state: PM transition of the system being carried out. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static pm_callback_t pm_noirq_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend_noirq; case PM_EVENT_RESUME: return ops->resume_noirq; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_noirq; case PM_EVENT_HIBERNATE: return ops->poweroff_noirq; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw_noirq; case PM_EVENT_RESTORE: return ops->restore_noirq; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } static void pm_dev_dbg(struct device *dev, pm_message_t state, const char *info) { dev_dbg(dev, "%s%s%s driver flags: %x\n", info, pm_verb(state.event), ((state.event & PM_EVENT_SLEEP) && device_may_wakeup(dev)) ? ", may wakeup" : "", dev->power.driver_flags); } static void pm_dev_err(struct device *dev, pm_message_t state, const char *info, int error) { dev_err(dev, "failed to %s%s: error %d\n", pm_verb(state.event), info, error); } static void dpm_show_time(ktime_t starttime, pm_message_t state, int error, const char *info) { ktime_t calltime; u64 usecs64; int usecs; calltime = ktime_get(); usecs64 = ktime_to_ns(ktime_sub(calltime, starttime)); do_div(usecs64, NSEC_PER_USEC); usecs = usecs64; if (usecs == 0) usecs = 1; pm_pr_dbg("%s%s%s of devices %s after %ld.%03ld msecs\n", info ?: "", info ? " " : "", pm_verb(state.event), error ? "aborted" : "complete", usecs / USEC_PER_MSEC, usecs % USEC_PER_MSEC); } static int dpm_run_callback(pm_callback_t cb, struct device *dev, pm_message_t state, const char *info) { ktime_t calltime; int error; if (!cb) return 0; calltime = initcall_debug_start(dev, cb); pm_dev_dbg(dev, state, info); trace_device_pm_callback_start(dev, info, state.event); error = cb(dev); trace_device_pm_callback_end(dev, error); suspend_report_result(dev, cb, error); initcall_debug_report(dev, calltime, cb, error); return error; } #ifdef CONFIG_DPM_WATCHDOG struct dpm_watchdog { struct device *dev; struct task_struct *tsk; struct timer_list timer; bool fatal; }; #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \ struct dpm_watchdog wd /** * dpm_watchdog_handler - Driver suspend / resume watchdog handler. * @t: The timer that PM watchdog depends on. * * Called when a driver has timed out suspending or resuming. * There's not much we can do here to recover so panic() to * capture a crash-dump in pstore. */ static void dpm_watchdog_handler(struct timer_list *t) { struct dpm_watchdog *wd = from_timer(wd, t, timer); struct timer_list *timer = &wd->timer; unsigned int time_left; if (wd->fatal) { dev_emerg(wd->dev, "**** DPM device timeout ****\n"); show_stack(wd->tsk, NULL, KERN_EMERG); panic("%s %s: unrecoverable failure\n", dev_driver_string(wd->dev), dev_name(wd->dev)); } time_left = CONFIG_DPM_WATCHDOG_TIMEOUT - CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT; dev_warn(wd->dev, "**** DPM device timeout after %u seconds; %u seconds until panic ****\n", CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT, time_left); show_stack(wd->tsk, NULL, KERN_WARNING); wd->fatal = true; mod_timer(timer, jiffies + HZ * time_left); } /** * dpm_watchdog_set - Enable pm watchdog for given device. * @wd: Watchdog. Must be allocated on the stack. * @dev: Device to handle. 
*/ static void dpm_watchdog_set(struct dpm_watchdog *wd, struct device *dev) { struct timer_list *timer = &wd->timer; wd->dev = dev; wd->tsk = current; wd->fatal = CONFIG_DPM_WATCHDOG_TIMEOUT == CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT; timer_setup_on_stack(timer, dpm_watchdog_handler, 0); /* use same timeout value for both suspend and resume */ timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT; add_timer(timer); } /** * dpm_watchdog_clear - Disable suspend/resume watchdog. * @wd: Watchdog to disable. */ static void dpm_watchdog_clear(struct dpm_watchdog *wd) { struct timer_list *timer = &wd->timer; del_timer_sync(timer); destroy_timer_on_stack(timer); } #else #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) #define dpm_watchdog_set(x, y) #define dpm_watchdog_clear(x) #endif /*------------------------- Resume routines -------------------------*/ /** * dev_pm_skip_resume - System-wide device resume optimization check. * @dev: Target device. * * Return: * - %false if the transition under way is RESTORE. * - Return value of dev_pm_skip_suspend() if the transition under way is THAW. * - The logical negation of %power.must_resume otherwise (that is, when the * transition under way is RESUME). */ bool dev_pm_skip_resume(struct device *dev) { if (pm_transition.event == PM_EVENT_RESTORE) return false; if (pm_transition.event == PM_EVENT_THAW) return dev_pm_skip_suspend(dev); return !dev->power.must_resume; } static bool is_async(struct device *dev) { return dev->power.async_suspend && pm_async_enabled && !pm_trace_is_enabled(); } static bool dpm_async_fn(struct device *dev, async_func_t func) { reinit_completion(&dev->power.completion); if (is_async(dev)) { dev->power.async_in_progress = true; get_device(dev); if (async_schedule_dev_nocall(func, dev)) return true; put_device(dev); } /* * Because async_schedule_dev_nocall() above has returned false or it * has not been called at all, func() is not running and it is safe to * update the async_in_progress flag without extra synchronization. */ dev->power.async_in_progress = false; return false; } /** * device_resume_noirq - Execute a "noirq resume" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static void device_resume_noirq(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; bool skip_resume; int error = 0; TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.syscore || dev->power.direct_complete) goto Out; if (!dev->power.is_noirq_suspended) goto Out; if (!dpm_wait_for_superior(dev, async)) goto Out; skip_resume = dev_pm_skip_resume(dev); /* * If the driver callback is skipped below or by the middle layer * callback and device_resume_early() also skips the driver callback for * this device later, it needs to appear as "suspended" to PM-runtime, * so change its status accordingly. * * Otherwise, the device is going to be resumed, so set its PM-runtime * status to "active" unless its power.set_active flag is clear, in * which case it is not necessary to update its PM-runtime status. 
*/ if (skip_resume) { pm_runtime_set_suspended(dev); } else if (dev->power.set_active) { pm_runtime_set_active(dev); dev->power.set_active = false; } if (dev->pm_domain) { info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (callback) goto Run; if (skip_resume) goto Skip; if (dev->driver && dev->driver->pm) { info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); Skip: dev->power.is_noirq_suspended = false; Out: complete_all(&dev->power.completion); TRACE_RESUME(error); if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async noirq" : " noirq", error); } } static void async_resume_noirq(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume_noirq(dev, pm_transition, true); put_device(dev); } static void dpm_noirq_resume_devices(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, true); async_error = 0; pm_transition = state; mutex_lock(&dpm_list_mtx); /* * Trigger the resume of "async" devices upfront so they don't have to * wait for the "non-async" ones they don't depend on. */ list_for_each_entry(dev, &dpm_noirq_list, power.entry) dpm_async_fn(dev, async_resume_noirq); while (!list_empty(&dpm_noirq_list)) { dev = to_device(dpm_noirq_list.next); list_move_tail(&dev->power.entry, &dpm_late_early_list); if (!dev->power.async_in_progress) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume_noirq(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, "noirq"); if (async_error) dpm_save_failed_step(SUSPEND_RESUME_NOIRQ); trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, false); } /** * dpm_resume_noirq - Execute "noirq resume" callbacks for all devices. * @state: PM transition of the system being carried out. * * Invoke the "noirq" resume callbacks for all devices in dpm_noirq_list and * allow device drivers' interrupt handlers to be called. */ void dpm_resume_noirq(pm_message_t state) { dpm_noirq_resume_devices(state); resume_device_irqs(); device_wakeup_disarm_wake_irqs(); } /** * device_resume_early - Execute an "early resume" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. * * Runtime PM is disabled for @dev while this function is being executed. 
*/ static void device_resume_early(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.syscore || dev->power.direct_complete) goto Out; if (!dev->power.is_late_suspended) goto Out; if (!dpm_wait_for_superior(dev, async)) goto Out; if (dev->pm_domain) { info = "early power domain "; callback = pm_late_early_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "early type "; callback = pm_late_early_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "early class "; callback = pm_late_early_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "early bus "; callback = pm_late_early_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_resume(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "early driver "; callback = pm_late_early_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); Skip: dev->power.is_late_suspended = false; Out: TRACE_RESUME(error); pm_runtime_enable(dev); complete_all(&dev->power.completion); if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async early" : " early", error); } } static void async_resume_early(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume_early(dev, pm_transition, true); put_device(dev); } /** * dpm_resume_early - Execute "early resume" callbacks for all devices. * @state: PM transition of the system being carried out. */ void dpm_resume_early(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume_early"), state.event, true); async_error = 0; pm_transition = state; mutex_lock(&dpm_list_mtx); /* * Trigger the resume of "async" devices upfront so they don't have to * wait for the "non-async" ones they don't depend on. */ list_for_each_entry(dev, &dpm_late_early_list, power.entry) dpm_async_fn(dev, async_resume_early); while (!list_empty(&dpm_late_early_list)) { dev = to_device(dpm_late_early_list.next); list_move_tail(&dev->power.entry, &dpm_suspended_list); if (!dev->power.async_in_progress) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume_early(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, "early"); if (async_error) dpm_save_failed_step(SUSPEND_RESUME_EARLY); trace_suspend_resume(TPS("dpm_resume_early"), state.event, false); } /** * dpm_resume_start - Execute "noirq" and "early" device callbacks. * @state: PM transition of the system being carried out. */ void dpm_resume_start(pm_message_t state) { dpm_resume_noirq(state); dpm_resume_early(state); } EXPORT_SYMBOL_GPL(dpm_resume_start); /** * device_resume - Execute "resume" callbacks for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. */ static void device_resume(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; DECLARE_DPM_WATCHDOG_ON_STACK(wd); TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.syscore) goto Complete; if (dev->power.direct_complete) { /* Match the pm_runtime_disable() in device_suspend(). 
*/ pm_runtime_enable(dev); goto Complete; } if (!dpm_wait_for_superior(dev, async)) goto Complete; dpm_watchdog_set(&wd, dev); device_lock(dev); /* * This is a fib. But we'll allow new children to be added below * a resumed device, even if the device hasn't been completed yet. */ dev->power.is_prepared = false; if (!dev->power.is_suspended) goto Unlock; if (dev->pm_domain) { info = "power domain "; callback = pm_op(&dev->pm_domain->ops, state); goto Driver; } if (dev->type && dev->type->pm) { info = "type "; callback = pm_op(dev->type->pm, state); goto Driver; } if (dev->class && dev->class->pm) { info = "class "; callback = pm_op(dev->class->pm, state); goto Driver; } if (dev->bus) { if (dev->bus->pm) { info = "bus "; callback = pm_op(dev->bus->pm, state); } else if (dev->bus->resume) { info = "legacy bus "; callback = dev->bus->resume; goto End; } } Driver: if (!callback && dev->driver && dev->driver->pm) { info = "driver "; callback = pm_op(dev->driver->pm, state); } End: error = dpm_run_callback(callback, dev, state, info); dev->power.is_suspended = false; Unlock: device_unlock(dev); dpm_watchdog_clear(&wd); Complete: complete_all(&dev->power.completion); TRACE_RESUME(error); if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async" : "", error); } } static void async_resume(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume(dev, pm_transition, true); put_device(dev); } /** * dpm_resume - Execute "resume" callbacks for non-sysdev devices. * @state: PM transition of the system being carried out. * * Execute the appropriate "resume" callback for all devices whose status * indicates that they are suspended. */ void dpm_resume(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume"), state.event, true); might_sleep(); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); /* * Trigger the resume of "async" devices upfront so they don't have to * wait for the "non-async" ones they don't depend on. */ list_for_each_entry(dev, &dpm_suspended_list, power.entry) dpm_async_fn(dev, async_resume); while (!list_empty(&dpm_suspended_list)) { dev = to_device(dpm_suspended_list.next); list_move_tail(&dev->power.entry, &dpm_prepared_list); if (!dev->power.async_in_progress) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, NULL); if (async_error) dpm_save_failed_step(SUSPEND_RESUME); cpufreq_resume(); devfreq_resume(); trace_suspend_resume(TPS("dpm_resume"), state.event, false); } /** * device_complete - Complete a PM transition for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. 
*/ static void device_complete(struct device *dev, pm_message_t state) { void (*callback)(struct device *) = NULL; const char *info = NULL; if (dev->power.syscore) goto out; device_lock(dev); if (dev->pm_domain) { info = "completing power domain "; callback = dev->pm_domain->ops.complete; } else if (dev->type && dev->type->pm) { info = "completing type "; callback = dev->type->pm->complete; } else if (dev->class && dev->class->pm) { info = "completing class "; callback = dev->class->pm->complete; } else if (dev->bus && dev->bus->pm) { info = "completing bus "; callback = dev->bus->pm->complete; } if (!callback && dev->driver && dev->driver->pm) { info = "completing driver "; callback = dev->driver->pm->complete; } if (callback) { pm_dev_dbg(dev, state, info); callback(dev); } device_unlock(dev); out: pm_runtime_put(dev); } /** * dpm_complete - Complete a PM transition for all non-sysdev devices. * @state: PM transition of the system being carried out. * * Execute the ->complete() callbacks for all devices whose PM status is not * DPM_ON (this allows new devices to be registered). */ void dpm_complete(pm_message_t state) { struct list_head list; trace_suspend_resume(TPS("dpm_complete"), state.event, true); might_sleep(); INIT_LIST_HEAD(&list); mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_prepared_list)) { struct device *dev = to_device(dpm_prepared_list.prev); get_device(dev); dev->power.is_prepared = false; list_move(&dev->power.entry, &list); mutex_unlock(&dpm_list_mtx); trace_device_pm_callback_start(dev, "", state.event); device_complete(dev, state); trace_device_pm_callback_end(dev, 0); put_device(dev); mutex_lock(&dpm_list_mtx); } list_splice(&list, &dpm_list); mutex_unlock(&dpm_list_mtx); /* Allow device probing and trigger re-probing of deferred devices */ device_unblock_probing(); trace_suspend_resume(TPS("dpm_complete"), state.event, false); } /** * dpm_resume_end - Execute "resume" callbacks and complete system transition. * @state: PM transition of the system being carried out. * * Execute "resume" callbacks for all devices and complete the PM transition of * the system. */ void dpm_resume_end(pm_message_t state) { dpm_resume(state); dpm_complete(state); } EXPORT_SYMBOL_GPL(dpm_resume_end); /*------------------------- Suspend routines -------------------------*/ /** * resume_event - Return a "resume" message for given "suspend" sleep state. * @sleep_state: PM message representing a sleep state. * * Return a PM message representing the resume event corresponding to given * sleep state. */ static pm_message_t resume_event(pm_message_t sleep_state) { switch (sleep_state.event) { case PM_EVENT_SUSPEND: return PMSG_RESUME; case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return PMSG_RECOVER; case PM_EVENT_HIBERNATE: return PMSG_RESTORE; } return PMSG_ON; } static void dpm_superior_set_must_resume(struct device *dev) { struct device_link *link; int idx; if (dev->parent) dev->parent->power.must_resume = true; idx = device_links_read_lock(); list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) link->supplier->power.must_resume = true; device_links_read_unlock(idx); } /** * device_suspend_noirq - Execute a "noirq suspend" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. * * The driver of @dev will not receive interrupts while this function is being * executed. 
*/ static int device_suspend_noirq(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_SUSPEND(0); dpm_wait_for_subordinate(dev, async); if (async_error) goto Complete; if (dev->power.syscore || dev->power.direct_complete) goto Complete; if (dev->pm_domain) { info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_suspend(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async noirq" : " noirq", error); goto Complete; } Skip: dev->power.is_noirq_suspended = true; /* * Skipping the resume of devices that were in use right before the * system suspend (as indicated by their PM-runtime usage counters) * would be suboptimal. Also resume them if doing that is not allowed * to be skipped. */ if (atomic_read(&dev->power.usage_count) > 1 || !(dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME) && dev->power.may_skip_resume)) dev->power.must_resume = true; if (dev->power.must_resume) { if (dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND)) { dev->power.set_active = true; if (dev->parent && !dev->parent->power.ignore_children) dev->parent->power.set_active = true; } dpm_superior_set_must_resume(dev); } Complete: complete_all(&dev->power.completion); TRACE_SUSPEND(error); return error; } static void async_suspend_noirq(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend_noirq(dev, pm_transition, true); put_device(dev); } static int dpm_noirq_suspend_devices(pm_message_t state) { ktime_t starttime = ktime_get(); int error = 0; trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, true); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_late_early_list)) { struct device *dev = to_device(dpm_late_early_list.prev); list_move(&dev->power.entry, &dpm_noirq_list); if (dpm_async_fn(dev, async_suspend_noirq)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); error = device_suspend_noirq(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (error || async_error) break; } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); if (!error) error = async_error; if (error) dpm_save_failed_step(SUSPEND_SUSPEND_NOIRQ); dpm_show_time(starttime, state, error, "noirq"); trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, false); return error; } /** * dpm_suspend_noirq - Execute "noirq suspend" callbacks for all devices. * @state: PM transition of the system being carried out. * * Prevent device drivers' interrupt handlers from being called and invoke * "noirq" suspend callbacks for all non-sysdev devices. 
*/ int dpm_suspend_noirq(pm_message_t state) { int ret; device_wakeup_arm_wake_irqs(); suspend_device_irqs(); ret = dpm_noirq_suspend_devices(state); if (ret) dpm_resume_noirq(resume_event(state)); return ret; } static void dpm_propagate_wakeup_to_parent(struct device *dev) { struct device *parent = dev->parent; if (!parent) return; spin_lock_irq(&parent->power.lock); if (device_wakeup_path(dev) && !parent->power.ignore_children) parent->power.wakeup_path = true; spin_unlock_irq(&parent->power.lock); } /** * device_suspend_late - Execute a "late suspend" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. * * Runtime PM is disabled for @dev while this function is being executed. */ static int device_suspend_late(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_SUSPEND(0); __pm_runtime_disable(dev, false); dpm_wait_for_subordinate(dev, async); if (async_error) goto Complete; if (pm_wakeup_pending()) { async_error = -EBUSY; goto Complete; } if (dev->power.syscore || dev->power.direct_complete) goto Complete; if (dev->pm_domain) { info = "late power domain "; callback = pm_late_early_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "late type "; callback = pm_late_early_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "late class "; callback = pm_late_early_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "late bus "; callback = pm_late_early_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_suspend(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "late driver "; callback = pm_late_early_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async late" : " late", error); goto Complete; } dpm_propagate_wakeup_to_parent(dev); Skip: dev->power.is_late_suspended = true; Complete: TRACE_SUSPEND(error); complete_all(&dev->power.completion); return error; } static void async_suspend_late(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend_late(dev, pm_transition, true); put_device(dev); } /** * dpm_suspend_late - Execute "late suspend" callbacks for all devices. * @state: PM transition of the system being carried out. 
*/ int dpm_suspend_late(pm_message_t state) { ktime_t starttime = ktime_get(); int error = 0; trace_suspend_resume(TPS("dpm_suspend_late"), state.event, true); pm_transition = state; async_error = 0; wake_up_all_idle_cpus(); mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_suspended_list)) { struct device *dev = to_device(dpm_suspended_list.prev); list_move(&dev->power.entry, &dpm_late_early_list); if (dpm_async_fn(dev, async_suspend_late)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); error = device_suspend_late(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (error || async_error) break; } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); if (!error) error = async_error; if (error) { dpm_save_failed_step(SUSPEND_SUSPEND_LATE); dpm_resume_early(resume_event(state)); } dpm_show_time(starttime, state, error, "late"); trace_suspend_resume(TPS("dpm_suspend_late"), state.event, false); return error; } /** * dpm_suspend_end - Execute "late" and "noirq" device suspend callbacks. * @state: PM transition of the system being carried out. */ int dpm_suspend_end(pm_message_t state) { ktime_t starttime = ktime_get(); int error; error = dpm_suspend_late(state); if (error) goto out; error = dpm_suspend_noirq(state); if (error) dpm_resume_early(resume_event(state)); out: dpm_show_time(starttime, state, error, "end"); return error; } EXPORT_SYMBOL_GPL(dpm_suspend_end); /** * legacy_suspend - Execute a legacy (bus or class) suspend callback for device. * @dev: Device to suspend. * @state: PM transition of the system being carried out. * @cb: Suspend callback to execute. * @info: string description of caller. */ static int legacy_suspend(struct device *dev, pm_message_t state, int (*cb)(struct device *dev, pm_message_t state), const char *info) { int error; ktime_t calltime; calltime = initcall_debug_start(dev, cb); trace_device_pm_callback_start(dev, info, state.event); error = cb(dev, state); trace_device_pm_callback_end(dev, error); suspend_report_result(dev, cb, error); initcall_debug_report(dev, calltime, cb, error); return error; } static void dpm_clear_superiors_direct_complete(struct device *dev) { struct device_link *link; int idx; if (dev->parent) { spin_lock_irq(&dev->parent->power.lock); dev->parent->power.direct_complete = false; spin_unlock_irq(&dev->parent->power.lock); } idx = device_links_read_lock(); list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) { spin_lock_irq(&link->supplier->power.lock); link->supplier->power.direct_complete = false; spin_unlock_irq(&link->supplier->power.lock); } device_links_read_unlock(idx); } /** * device_suspend - Execute "suspend" callbacks for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. */ static int device_suspend(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; DECLARE_DPM_WATCHDOG_ON_STACK(wd); TRACE_DEVICE(dev); TRACE_SUSPEND(0); dpm_wait_for_subordinate(dev, async); if (async_error) { dev->power.direct_complete = false; goto Complete; } /* * Wait for possible runtime PM transitions of the device in progress * to complete and if there's a runtime resume request pending for it, * resume it before proceeding with invoking the system-wide suspend * callbacks for it. 
* * If the system-wide suspend callbacks below change the configuration * of the device, they must disable runtime PM for it or otherwise * ensure that its runtime-resume callbacks will not be confused by that * change in case they are invoked going forward. */ pm_runtime_barrier(dev); if (pm_wakeup_pending()) { dev->power.direct_complete = false; async_error = -EBUSY; goto Complete; } if (dev->power.syscore) goto Complete; /* Avoid direct_complete to let wakeup_path propagate. */ if (device_may_wakeup(dev) || device_wakeup_path(dev)) dev->power.direct_complete = false; if (dev->power.direct_complete) { if (pm_runtime_status_suspended(dev)) { pm_runtime_disable(dev); if (pm_runtime_status_suspended(dev)) { pm_dev_dbg(dev, state, "direct-complete "); goto Complete; } pm_runtime_enable(dev); } dev->power.direct_complete = false; } dev->power.may_skip_resume = true; dev->power.must_resume = !dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME); dpm_watchdog_set(&wd, dev); device_lock(dev); if (dev->pm_domain) { info = "power domain "; callback = pm_op(&dev->pm_domain->ops, state); goto Run; } if (dev->type && dev->type->pm) { info = "type "; callback = pm_op(dev->type->pm, state); goto Run; } if (dev->class && dev->class->pm) { info = "class "; callback = pm_op(dev->class->pm, state); goto Run; } if (dev->bus) { if (dev->bus->pm) { info = "bus "; callback = pm_op(dev->bus->pm, state); } else if (dev->bus->suspend) { pm_dev_dbg(dev, state, "legacy bus "); error = legacy_suspend(dev, state, dev->bus->suspend, "legacy bus "); goto End; } } Run: if (!callback && dev->driver && dev->driver->pm) { info = "driver "; callback = pm_op(dev->driver->pm, state); } error = dpm_run_callback(callback, dev, state, info); End: if (!error) { dev->power.is_suspended = true; if (device_may_wakeup(dev)) dev->power.wakeup_path = true; dpm_propagate_wakeup_to_parent(dev); dpm_clear_superiors_direct_complete(dev); } device_unlock(dev); dpm_watchdog_clear(&wd); Complete: if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async" : "", error); } complete_all(&dev->power.completion); TRACE_SUSPEND(error); return error; } static void async_suspend(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend(dev, pm_transition, true); put_device(dev); } /** * dpm_suspend - Execute "suspend" callbacks for all non-sysdev devices. * @state: PM transition of the system being carried out. */ int dpm_suspend(pm_message_t state) { ktime_t starttime = ktime_get(); int error = 0; trace_suspend_resume(TPS("dpm_suspend"), state.event, true); might_sleep(); devfreq_suspend(); cpufreq_suspend(); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_prepared_list)) { struct device *dev = to_device(dpm_prepared_list.prev); list_move(&dev->power.entry, &dpm_suspended_list); if (dpm_async_fn(dev, async_suspend)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); error = device_suspend(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (error || async_error) break; } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); if (!error) error = async_error; if (error) dpm_save_failed_step(SUSPEND_SUSPEND); dpm_show_time(starttime, state, error, NULL); trace_suspend_resume(TPS("dpm_suspend"), state.event, false); return error; } /** * device_prepare - Prepare a device for system power transition. * @dev: Device to handle. * @state: PM transition of the system being carried out. 
* * Execute the ->prepare() callback(s) for given device. No new children of the * device may be registered after this function has returned. */ static int device_prepare(struct device *dev, pm_message_t state) { int (*callback)(struct device *) = NULL; int ret = 0; /* * If a device's parent goes into runtime suspend at the wrong time, * it won't be possible to resume the device. To prevent this we * block runtime suspend here, during the prepare phase, and allow * it again during the complete phase. */ pm_runtime_get_noresume(dev); if (dev->power.syscore) return 0; device_lock(dev); dev->power.wakeup_path = false; if (dev->power.no_pm_callbacks) goto unlock; if (dev->pm_domain) callback = dev->pm_domain->ops.prepare; else if (dev->type && dev->type->pm) callback = dev->type->pm->prepare; else if (dev->class && dev->class->pm) callback = dev->class->pm->prepare; else if (dev->bus && dev->bus->pm) callback = dev->bus->pm->prepare; if (!callback && dev->driver && dev->driver->pm) callback = dev->driver->pm->prepare; if (callback) ret = callback(dev); unlock: device_unlock(dev); if (ret < 0) { suspend_report_result(dev, callback, ret); pm_runtime_put(dev); return ret; } /* * A positive return value from ->prepare() means "this device appears * to be runtime-suspended and its state is fine, so if it really is * runtime-suspended, you can leave it in that state provided that you * will do the same thing with all of its descendants". This only * applies to suspend transitions, however. */ spin_lock_irq(&dev->power.lock); dev->power.direct_complete = state.event == PM_EVENT_SUSPEND && (ret > 0 || dev->power.no_pm_callbacks) && !dev_pm_test_driver_flags(dev, DPM_FLAG_NO_DIRECT_COMPLETE); spin_unlock_irq(&dev->power.lock); return 0; } /** * dpm_prepare - Prepare all non-sysdev devices for a system PM transition. * @state: PM transition of the system being carried out. * * Execute the ->prepare() callback(s) for all devices. */ int dpm_prepare(pm_message_t state) { int error = 0; trace_suspend_resume(TPS("dpm_prepare"), state.event, true); might_sleep(); /* * Give a chance for the known devices to complete their probes, before * disable probing of devices. This sync point is important at least * at boot time + hibernation restore. */ wait_for_device_probe(); /* * It is unsafe if probing of devices will happen during suspend or * hibernation and system behavior will be unpredictable in this case. * So, let's prohibit device's probing here and defer their probes * instead. The normal behavior will be restored in dpm_complete(). */ device_block_probing(); mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_list) && !error) { struct device *dev = to_device(dpm_list.next); get_device(dev); mutex_unlock(&dpm_list_mtx); trace_device_pm_callback_start(dev, "", state.event); error = device_prepare(dev, state); trace_device_pm_callback_end(dev, error); mutex_lock(&dpm_list_mtx); if (!error) { dev->power.is_prepared = true; if (!list_empty(&dev->power.entry)) list_move_tail(&dev->power.entry, &dpm_prepared_list); } else if (error == -EAGAIN) { error = 0; } else { dev_info(dev, "not prepared for power transition: code %d\n", error); } mutex_unlock(&dpm_list_mtx); put_device(dev); mutex_lock(&dpm_list_mtx); } mutex_unlock(&dpm_list_mtx); trace_suspend_resume(TPS("dpm_prepare"), state.event, false); return error; } /** * dpm_suspend_start - Prepare devices for PM transition and suspend them. * @state: PM transition of the system being carried out. 
* * Prepare all non-sysdev devices for system PM transition and execute "suspend" * callbacks for them. */ int dpm_suspend_start(pm_message_t state) { ktime_t starttime = ktime_get(); int error; error = dpm_prepare(state); if (error) dpm_save_failed_step(SUSPEND_PREPARE); else error = dpm_suspend(state); dpm_show_time(starttime, state, error, "start"); return error; } EXPORT_SYMBOL_GPL(dpm_suspend_start); void __suspend_report_result(const char *function, struct device *dev, void *fn, int ret) { if (ret) dev_err(dev, "%s(): %ps returns %d\n", function, fn, ret); } EXPORT_SYMBOL_GPL(__suspend_report_result); /** * device_pm_wait_for_dev - Wait for suspend/resume of a device to complete. * @subordinate: Device that needs to wait for @dev. * @dev: Device to wait for. */ int device_pm_wait_for_dev(struct device *subordinate, struct device *dev) { dpm_wait(dev, subordinate->power.async_suspend); return async_error; } EXPORT_SYMBOL_GPL(device_pm_wait_for_dev); /** * dpm_for_each_dev - device iterator. * @data: data for the callback. * @fn: function to be called for each device. * * Iterate over devices in dpm_list, and call @fn for each device, * passing it @data. */ void dpm_for_each_dev(void *data, void (*fn)(struct device *, void *)) { struct device *dev; if (!fn) return; device_pm_lock(); list_for_each_entry(dev, &dpm_list, power.entry) fn(dev, data); device_pm_unlock(); } EXPORT_SYMBOL_GPL(dpm_for_each_dev); static bool pm_ops_is_empty(const struct dev_pm_ops *ops) { if (!ops) return true; return !ops->prepare && !ops->suspend && !ops->suspend_late && !ops->suspend_noirq && !ops->resume_noirq && !ops->resume_early && !ops->resume && !ops->complete; } void device_pm_check_callbacks(struct device *dev) { unsigned long flags; spin_lock_irqsave(&dev->power.lock, flags); dev->power.no_pm_callbacks = (!dev->bus || (pm_ops_is_empty(dev->bus->pm) && !dev->bus->suspend && !dev->bus->resume)) && (!dev->class || pm_ops_is_empty(dev->class->pm)) && (!dev->type || pm_ops_is_empty(dev->type->pm)) && (!dev->pm_domain || pm_ops_is_empty(&dev->pm_domain->ops)) && (!dev->driver || (pm_ops_is_empty(dev->driver->pm) && !dev->driver->suspend && !dev->driver->resume)); spin_unlock_irqrestore(&dev->power.lock, flags); } bool dev_pm_skip_suspend(struct device *dev) { return dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) && pm_runtime_status_suspended(dev); }
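/*
 * Illustrative sketch (not part of the file above): a hypothetical platform
 * driver showing how the callbacks and driver flags consumed by the PM core
 * above are typically wired up.  The "foo" names are made up; dev_pm_ops,
 * dev_pm_set_driver_flags(), DPM_FLAG_SMART_SUSPEND and
 * DPM_FLAG_MAY_SKIP_RESUME are the interfaces referenced by
 * device_suspend_noirq() and device_resume_early() above.
 */
#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/pm.h>
#include <linux/pm_runtime.h>

static int foo_suspend_late(struct device *dev)
{
	/* Called from dpm_suspend_late(); runtime PM is already disabled. */
	return pm_runtime_force_suspend(dev);
}

static int foo_resume_early(struct device *dev)
{
	/* Called from dpm_resume_early(); may be skipped via dev_pm_skip_resume(). */
	return pm_runtime_force_resume(dev);
}

static const struct dev_pm_ops foo_pm_ops = {
	SET_LATE_SYSTEM_SLEEP_PM_OPS(foo_suspend_late, foo_resume_early)
};

static int foo_probe(struct platform_device *pdev)
{
	/*
	 * SMART_SUSPEND lets the core leave a runtime-suspended device alone
	 * during the late/noirq suspend phases; MAY_SKIP_RESUME additionally
	 * allows the early/noirq resume callbacks to be skipped when safe,
	 * which is the dev_pm_skip_suspend()/dev_pm_skip_resume() logic above.
	 */
	dev_pm_set_driver_flags(&pdev->dev,
				DPM_FLAG_SMART_SUSPEND | DPM_FLAG_MAY_SKIP_RESUME);
	pm_runtime_enable(&pdev->dev);
	return 0;
}

static struct platform_driver foo_driver = {
	.probe	= foo_probe,
	.driver	= {
		.name	= "foo-pm-example",
		.pm	= &foo_pm_ops,
	},
};
module_platform_driver(foo_driver);
MODULE_LICENSE("GPL");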
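/*
 * Illustrative sketch (assumed caller, loosely modelled on the system sleep
 * core): how the exported entry points above pair up over one sleep cycle.
 * dpm_suspend_start() runs ->prepare() and ->suspend(), dpm_suspend_end()
 * runs ->suspend_late() and ->suspend_noirq(); dpm_resume_start() and
 * dpm_resume_end() mirror them in reverse.  dpm_suspend_end() unwinds the
 * "late" and "noirq" phases itself when one of them fails, so in either
 * error path the caller only needs to finish with dpm_resume_end(), which
 * runs ->resume() and ->complete() for whatever was prepared or suspended.
 */
#include <linux/pm.h>

int example_sleep_cycle(void)
{
	int error;

	error = dpm_suspend_start(PMSG_SUSPEND);	/* ->prepare() + ->suspend() */
	if (error)
		goto resume;

	error = dpm_suspend_end(PMSG_SUSPEND);		/* ->suspend_late() + ->suspend_noirq() */
	if (error)
		goto resume;

	/* ... the platform would enter the sleep state here ... */

	dpm_resume_start(PMSG_RESUME);			/* ->resume_noirq() + ->resume_early() */
resume:
	dpm_resume_end(PMSG_RESUME);			/* ->resume() + ->complete() */
	return error;
}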
// SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API for algorithms (i.e., low-level API). * * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/algapi.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/fips.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/module.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/workqueue.h> #include "internal.h" static LIST_HEAD(crypto_template_list); static inline void crypto_check_module_sig(struct module *mod) { if (fips_enabled && mod && !module_sig_ok(mod)) panic("Module %s signature verification failed in FIPS mode\n", module_name(mod)); } static int crypto_check_alg(struct crypto_alg *alg) { crypto_check_module_sig(alg->cra_module); if (!alg->cra_name[0] || !alg->cra_driver_name[0]) return -EINVAL; if (alg->cra_alignmask & (alg->cra_alignmask + 1)) return -EINVAL; /* General maximums for all algs. */ if (alg->cra_alignmask > MAX_ALGAPI_ALIGNMASK) return -EINVAL; if (alg->cra_blocksize > MAX_ALGAPI_BLOCKSIZE) return -EINVAL; /* Lower maximums for specific alg types. */ if (!alg->cra_type && (alg->cra_flags & CRYPTO_ALG_TYPE_MASK) == CRYPTO_ALG_TYPE_CIPHER) { if (alg->cra_alignmask > MAX_CIPHER_ALIGNMASK) return -EINVAL; if (alg->cra_blocksize > MAX_CIPHER_BLOCKSIZE) return -EINVAL; } if (alg->cra_priority < 0) return -EINVAL; refcount_set(&alg->cra_refcnt, 1); return 0; } static void crypto_free_instance(struct crypto_instance *inst) { inst->alg.cra_type->free(inst); } static void crypto_destroy_instance_workfn(struct work_struct *w) { struct crypto_instance *inst = container_of(w, struct crypto_instance, free_work); struct crypto_template *tmpl = inst->tmpl; crypto_free_instance(inst); crypto_tmpl_put(tmpl); } static void crypto_destroy_instance(struct crypto_alg *alg) { struct crypto_instance *inst = container_of(alg, struct crypto_instance, alg); INIT_WORK(&inst->free_work, crypto_destroy_instance_workfn); schedule_work(&inst->free_work); } /* * This function adds a spawn to the list secondary_spawns which * will be used at the end of crypto_remove_spawns to unregister * instances, unless the spawn happens to be one that is depended * on by the new algorithm (nalg in crypto_remove_spawns). * * This function is also responsible for resurrecting any algorithms * in the dependency chain of nalg by unsetting n->dead.
*/ static struct list_head *crypto_more_spawns(struct crypto_alg *alg, struct list_head *stack, struct list_head *top, struct list_head *secondary_spawns) { struct crypto_spawn *spawn, *n; spawn = list_first_entry_or_null(stack, struct crypto_spawn, list); if (!spawn) return NULL; n = list_prev_entry(spawn, list); list_move(&spawn->list, secondary_spawns); if (list_is_last(&n->list, stack)) return top; n = list_next_entry(n, list); if (!spawn->dead) n->dead = false; return &n->inst->alg.cra_users; } static void crypto_remove_instance(struct crypto_instance *inst, struct list_head *list) { struct crypto_template *tmpl = inst->tmpl; if (crypto_is_dead(&inst->alg)) return; inst->alg.cra_flags |= CRYPTO_ALG_DEAD; if (!tmpl || !crypto_tmpl_get(tmpl)) return; list_move(&inst->alg.cra_list, list); hlist_del(&inst->list); inst->alg.cra_destroy = crypto_destroy_instance; BUG_ON(!list_empty(&inst->alg.cra_users)); } /* * Given an algorithm alg, remove all algorithms that depend on it * through spawns. If nalg is not null, then exempt any algorithms * that is depended on by nalg. This is useful when nalg itself * depends on alg. */ void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list, struct crypto_alg *nalg) { u32 new_type = (nalg ?: alg)->cra_flags; struct crypto_spawn *spawn, *n; LIST_HEAD(secondary_spawns); struct list_head *spawns; LIST_HEAD(stack); LIST_HEAD(top); spawns = &alg->cra_users; list_for_each_entry_safe(spawn, n, spawns, list) { if ((spawn->alg->cra_flags ^ new_type) & spawn->mask) continue; list_move(&spawn->list, &top); } /* * Perform a depth-first walk starting from alg through * the cra_users tree. The list stack records the path * from alg to the current spawn. */ spawns = &top; do { while (!list_empty(spawns)) { struct crypto_instance *inst; spawn = list_first_entry(spawns, struct crypto_spawn, list); inst = spawn->inst; list_move(&spawn->list, &stack); spawn->dead = !spawn->registered || &inst->alg != nalg; if (!spawn->registered) break; BUG_ON(&inst->alg == alg); if (&inst->alg == nalg) break; spawns = &inst->alg.cra_users; /* * Even if spawn->registered is true, the * instance itself may still be unregistered. * This is because it may have failed during * registration. Therefore we still need to * make the following test. * * We may encounter an unregistered instance here, since * an instance's spawns are set up prior to the instance * being registered. An unregistered instance will have * NULL ->cra_users.next, since ->cra_users isn't * properly initialized until registration. But an * unregistered instance cannot have any users, so treat * it the same as ->cra_users being empty. */ if (spawns->next == NULL) break; } } while ((spawns = crypto_more_spawns(alg, &stack, &top, &secondary_spawns))); /* * Remove all instances that are marked as dead. Also * complete the resurrection of the others by moving them * back to the cra_users list. 
*/ list_for_each_entry_safe(spawn, n, &secondary_spawns, list) { if (!spawn->dead) list_move(&spawn->list, &spawn->alg->cra_users); else if (spawn->registered) crypto_remove_instance(spawn->inst, list); } } EXPORT_SYMBOL_GPL(crypto_remove_spawns); static void crypto_alg_finish_registration(struct crypto_alg *alg, struct list_head *algs_to_put) { struct crypto_alg *q; list_for_each_entry(q, &crypto_alg_list, cra_list) { if (q == alg) continue; if (crypto_is_moribund(q)) continue; if (crypto_is_larval(q)) continue; if (strcmp(alg->cra_name, q->cra_name)) continue; if (strcmp(alg->cra_driver_name, q->cra_driver_name) && q->cra_priority > alg->cra_priority) continue; crypto_remove_spawns(q, algs_to_put, alg); } crypto_notify(CRYPTO_MSG_ALG_LOADED, alg); } static struct crypto_larval *crypto_alloc_test_larval(struct crypto_alg *alg) { struct crypto_larval *larval; if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER) || IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) || (alg->cra_flags & CRYPTO_ALG_INTERNAL)) return NULL; /* No self-test needed */ larval = crypto_larval_alloc(alg->cra_name, alg->cra_flags | CRYPTO_ALG_TESTED, 0); if (IS_ERR(larval)) return larval; larval->adult = crypto_mod_get(alg); if (!larval->adult) { kfree(larval); return ERR_PTR(-ENOENT); } refcount_set(&larval->alg.cra_refcnt, 1); memcpy(larval->alg.cra_driver_name, alg->cra_driver_name, CRYPTO_MAX_ALG_NAME); larval->alg.cra_priority = alg->cra_priority; return larval; } static struct crypto_larval * __crypto_register_alg(struct crypto_alg *alg, struct list_head *algs_to_put) { struct crypto_alg *q; struct crypto_larval *larval; int ret = -EAGAIN; if (crypto_is_dead(alg)) goto err; INIT_LIST_HEAD(&alg->cra_users); ret = -EEXIST; list_for_each_entry(q, &crypto_alg_list, cra_list) { if (q == alg) goto err; if (crypto_is_moribund(q)) continue; if (crypto_is_larval(q)) { if (!strcmp(alg->cra_driver_name, q->cra_driver_name)) goto err; continue; } if (!strcmp(q->cra_driver_name, alg->cra_name) || !strcmp(q->cra_driver_name, alg->cra_driver_name) || !strcmp(q->cra_name, alg->cra_driver_name)) goto err; } larval = crypto_alloc_test_larval(alg); if (IS_ERR(larval)) goto out; list_add(&alg->cra_list, &crypto_alg_list); if (larval) { /* No cheating! 
*/ alg->cra_flags &= ~CRYPTO_ALG_TESTED; list_add(&larval->alg.cra_list, &crypto_alg_list); } else { alg->cra_flags |= CRYPTO_ALG_TESTED; crypto_alg_finish_registration(alg, algs_to_put); } out: return larval; err: larval = ERR_PTR(ret); goto out; } void crypto_alg_tested(const char *name, int err) { struct crypto_larval *test; struct crypto_alg *alg; struct crypto_alg *q; LIST_HEAD(list); down_write(&crypto_alg_sem); list_for_each_entry(q, &crypto_alg_list, cra_list) { if (crypto_is_moribund(q) || !crypto_is_larval(q)) continue; test = (struct crypto_larval *)q; if (!strcmp(q->cra_driver_name, name)) goto found; } pr_err("alg: Unexpected test result for %s: %d\n", name, err); up_write(&crypto_alg_sem); return; found: q->cra_flags |= CRYPTO_ALG_DEAD; alg = test->adult; if (crypto_is_dead(alg)) goto complete; if (err == -ECANCELED) alg->cra_flags |= CRYPTO_ALG_FIPS_INTERNAL; else if (err) goto complete; else alg->cra_flags &= ~CRYPTO_ALG_FIPS_INTERNAL; alg->cra_flags |= CRYPTO_ALG_TESTED; crypto_alg_finish_registration(alg, &list); complete: list_del_init(&test->alg.cra_list); complete_all(&test->completion); up_write(&crypto_alg_sem); crypto_alg_put(&test->alg); crypto_remove_final(&list); } EXPORT_SYMBOL_GPL(crypto_alg_tested); void crypto_remove_final(struct list_head *list) { struct crypto_alg *alg; struct crypto_alg *n; list_for_each_entry_safe(alg, n, list, cra_list) { list_del_init(&alg->cra_list); crypto_alg_put(alg); } } EXPORT_SYMBOL_GPL(crypto_remove_final); int crypto_register_alg(struct crypto_alg *alg) { struct crypto_larval *larval; bool test_started = false; LIST_HEAD(algs_to_put); int err; alg->cra_flags &= ~CRYPTO_ALG_DEAD; err = crypto_check_alg(alg); if (err) return err; down_write(&crypto_alg_sem); larval = __crypto_register_alg(alg, &algs_to_put); if (!IS_ERR_OR_NULL(larval)) { test_started = crypto_boot_test_finished(); larval->test_started = test_started; } up_write(&crypto_alg_sem); if (IS_ERR(larval)) return PTR_ERR(larval); if (test_started) crypto_schedule_test(larval); else crypto_remove_final(&algs_to_put); return 0; } EXPORT_SYMBOL_GPL(crypto_register_alg); static int crypto_remove_alg(struct crypto_alg *alg, struct list_head *list) { if (unlikely(list_empty(&alg->cra_list))) return -ENOENT; alg->cra_flags |= CRYPTO_ALG_DEAD; list_del_init(&alg->cra_list); crypto_remove_spawns(alg, list, NULL); return 0; } void crypto_unregister_alg(struct crypto_alg *alg) { int ret; LIST_HEAD(list); down_write(&crypto_alg_sem); ret = crypto_remove_alg(alg, &list); up_write(&crypto_alg_sem); if (WARN(ret, "Algorithm %s is not registered", alg->cra_driver_name)) return; if (WARN_ON(refcount_read(&alg->cra_refcnt) != 1)) return; if (alg->cra_destroy) alg->cra_destroy(alg); crypto_remove_final(&list); } EXPORT_SYMBOL_GPL(crypto_unregister_alg); int crypto_register_algs(struct crypto_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_alg(&algs[i]); if (ret) goto err; } return 0; err: for (--i; i >= 0; --i) crypto_unregister_alg(&algs[i]); return ret; } EXPORT_SYMBOL_GPL(crypto_register_algs); void crypto_unregister_algs(struct crypto_alg *algs, int count) { int i; for (i = 0; i < count; i++) crypto_unregister_alg(&algs[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_algs); int crypto_register_template(struct crypto_template *tmpl) { struct crypto_template *q; int err = -EEXIST; down_write(&crypto_alg_sem); crypto_check_module_sig(tmpl->module); list_for_each_entry(q, &crypto_template_list, list) { if (q == tmpl) goto out; } list_add(&tmpl->list, 
&crypto_template_list); err = 0; out: up_write(&crypto_alg_sem); return err; } EXPORT_SYMBOL_GPL(crypto_register_template); int crypto_register_templates(struct crypto_template *tmpls, int count) { int i, err; for (i = 0; i < count; i++) { err = crypto_register_template(&tmpls[i]); if (err) goto out; } return 0; out: for (--i; i >= 0; --i) crypto_unregister_template(&tmpls[i]); return err; } EXPORT_SYMBOL_GPL(crypto_register_templates); void crypto_unregister_template(struct crypto_template *tmpl) { struct crypto_instance *inst; struct hlist_node *n; struct hlist_head *list; LIST_HEAD(users); down_write(&crypto_alg_sem); BUG_ON(list_empty(&tmpl->list)); list_del_init(&tmpl->list); list = &tmpl->instances; hlist_for_each_entry(inst, list, list) { int err = crypto_remove_alg(&inst->alg, &users); BUG_ON(err); } up_write(&crypto_alg_sem); hlist_for_each_entry_safe(inst, n, list, list) { BUG_ON(refcount_read(&inst->alg.cra_refcnt) != 1); crypto_free_instance(inst); } crypto_remove_final(&users); } EXPORT_SYMBOL_GPL(crypto_unregister_template); void crypto_unregister_templates(struct crypto_template *tmpls, int count) { int i; for (i = count - 1; i >= 0; --i) crypto_unregister_template(&tmpls[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_templates); static struct crypto_template *__crypto_lookup_template(const char *name) { struct crypto_template *q, *tmpl = NULL; down_read(&crypto_alg_sem); list_for_each_entry(q, &crypto_template_list, list) { if (strcmp(q->name, name)) continue; if (unlikely(!crypto_tmpl_get(q))) continue; tmpl = q; break; } up_read(&crypto_alg_sem); return tmpl; } struct crypto_template *crypto_lookup_template(const char *name) { return try_then_request_module(__crypto_lookup_template(name), "crypto-%s", name); } EXPORT_SYMBOL_GPL(crypto_lookup_template); int crypto_register_instance(struct crypto_template *tmpl, struct crypto_instance *inst) { struct crypto_larval *larval; struct crypto_spawn *spawn; u32 fips_internal = 0; LIST_HEAD(algs_to_put); int err; err = crypto_check_alg(&inst->alg); if (err) return err; inst->alg.cra_module = tmpl->module; inst->alg.cra_flags |= CRYPTO_ALG_INSTANCE; down_write(&crypto_alg_sem); larval = ERR_PTR(-EAGAIN); for (spawn = inst->spawns; spawn;) { struct crypto_spawn *next; if (spawn->dead) goto unlock; next = spawn->next; spawn->inst = inst; spawn->registered = true; fips_internal |= spawn->alg->cra_flags; crypto_mod_put(spawn->alg); spawn = next; } inst->alg.cra_flags |= (fips_internal & CRYPTO_ALG_FIPS_INTERNAL); larval = __crypto_register_alg(&inst->alg, &algs_to_put); if (IS_ERR(larval)) goto unlock; else if (larval) larval->test_started = true; hlist_add_head(&inst->list, &tmpl->instances); inst->tmpl = tmpl; unlock: up_write(&crypto_alg_sem); if (IS_ERR(larval)) return PTR_ERR(larval); if (larval) crypto_schedule_test(larval); else crypto_remove_final(&algs_to_put); return 0; } EXPORT_SYMBOL_GPL(crypto_register_instance); void crypto_unregister_instance(struct crypto_instance *inst) { LIST_HEAD(list); down_write(&crypto_alg_sem); crypto_remove_spawns(&inst->alg, &list, NULL); crypto_remove_instance(inst, &list); up_write(&crypto_alg_sem); crypto_remove_final(&list); } EXPORT_SYMBOL_GPL(crypto_unregister_instance); int crypto_grab_spawn(struct crypto_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { struct crypto_alg *alg; int err = -EAGAIN; if (WARN_ON_ONCE(inst == NULL)) return -EINVAL; /* Allow the result of crypto_attr_alg_name() to be passed directly */ if (IS_ERR(name)) return PTR_ERR(name); alg = 
crypto_find_alg(name, spawn->frontend, type | CRYPTO_ALG_FIPS_INTERNAL, mask); if (IS_ERR(alg)) return PTR_ERR(alg); down_write(&crypto_alg_sem); if (!crypto_is_moribund(alg)) { list_add(&spawn->list, &alg->cra_users); spawn->alg = alg; spawn->mask = mask; spawn->next = inst->spawns; inst->spawns = spawn; inst->alg.cra_flags |= (alg->cra_flags & CRYPTO_ALG_INHERITED_FLAGS); err = 0; } up_write(&crypto_alg_sem); if (err) crypto_mod_put(alg); return err; } EXPORT_SYMBOL_GPL(crypto_grab_spawn); void crypto_drop_spawn(struct crypto_spawn *spawn) { if (!spawn->alg) /* not yet initialized? */ return; down_write(&crypto_alg_sem); if (!spawn->dead) list_del(&spawn->list); up_write(&crypto_alg_sem); if (!spawn->registered) crypto_mod_put(spawn->alg); } EXPORT_SYMBOL_GPL(crypto_drop_spawn); static struct crypto_alg *crypto_spawn_alg(struct crypto_spawn *spawn) { struct crypto_alg *alg = ERR_PTR(-EAGAIN); struct crypto_alg *target; bool shoot = false; down_read(&crypto_alg_sem); if (!spawn->dead) { alg = spawn->alg; if (!crypto_mod_get(alg)) { target = crypto_alg_get(alg); shoot = true; alg = ERR_PTR(-EAGAIN); } } up_read(&crypto_alg_sem); if (shoot) { crypto_shoot_alg(target); crypto_alg_put(target); } return alg; } struct crypto_tfm *crypto_spawn_tfm(struct crypto_spawn *spawn, u32 type, u32 mask) { struct crypto_alg *alg; struct crypto_tfm *tfm; alg = crypto_spawn_alg(spawn); if (IS_ERR(alg)) return ERR_CAST(alg); tfm = ERR_PTR(-EINVAL); if (unlikely((alg->cra_flags ^ type) & mask)) goto out_put_alg; tfm = __crypto_alloc_tfm(alg, type, mask); if (IS_ERR(tfm)) goto out_put_alg; return tfm; out_put_alg: crypto_mod_put(alg); return tfm; } EXPORT_SYMBOL_GPL(crypto_spawn_tfm); void *crypto_spawn_tfm2(struct crypto_spawn *spawn) { struct crypto_alg *alg; struct crypto_tfm *tfm; alg = crypto_spawn_alg(spawn); if (IS_ERR(alg)) return ERR_CAST(alg); tfm = crypto_create_tfm(alg, spawn->frontend); if (IS_ERR(tfm)) goto out_put_alg; return tfm; out_put_alg: crypto_mod_put(alg); return tfm; } EXPORT_SYMBOL_GPL(crypto_spawn_tfm2); int crypto_register_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&crypto_chain, nb); } EXPORT_SYMBOL_GPL(crypto_register_notifier); int crypto_unregister_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&crypto_chain, nb); } EXPORT_SYMBOL_GPL(crypto_unregister_notifier); struct crypto_attr_type *crypto_get_attr_type(struct rtattr **tb) { struct rtattr *rta = tb[0]; struct crypto_attr_type *algt; if (!rta) return ERR_PTR(-ENOENT); if (RTA_PAYLOAD(rta) < sizeof(*algt)) return ERR_PTR(-EINVAL); if (rta->rta_type != CRYPTOA_TYPE) return ERR_PTR(-EINVAL); algt = RTA_DATA(rta); return algt; } EXPORT_SYMBOL_GPL(crypto_get_attr_type); /** * crypto_check_attr_type() - check algorithm type and compute inherited mask * @tb: the template parameters * @type: the algorithm type the template would be instantiated as * @mask_ret: (output) the mask that should be passed to crypto_grab_*() * to restrict the flags of any inner algorithms * * Validate that the algorithm type the user requested is compatible with the * one the template would actually be instantiated as. E.g., if the user is * doing crypto_alloc_shash("cbc(aes)", ...), this would return an error because * the "cbc" template creates an "skcipher" algorithm, not an "shash" algorithm. * * Also compute the mask to use to restrict the flags of any inner algorithms. 
* * Return: 0 on success; -errno on failure */ int crypto_check_attr_type(struct rtattr **tb, u32 type, u32 *mask_ret) { struct crypto_attr_type *algt; algt = crypto_get_attr_type(tb); if (IS_ERR(algt)) return PTR_ERR(algt); if ((algt->type ^ type) & algt->mask) return -EINVAL; *mask_ret = crypto_algt_inherited_mask(algt); return 0; } EXPORT_SYMBOL_GPL(crypto_check_attr_type); const char *crypto_attr_alg_name(struct rtattr *rta) { struct crypto_attr_alg *alga; if (!rta) return ERR_PTR(-ENOENT); if (RTA_PAYLOAD(rta) < sizeof(*alga)) return ERR_PTR(-EINVAL); if (rta->rta_type != CRYPTOA_ALG) return ERR_PTR(-EINVAL); alga = RTA_DATA(rta); alga->name[CRYPTO_MAX_ALG_NAME - 1] = 0; return alga->name; } EXPORT_SYMBOL_GPL(crypto_attr_alg_name); int crypto_inst_setname(struct crypto_instance *inst, const char *name, struct crypto_alg *alg) { if (snprintf(inst->alg.cra_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", name, alg->cra_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; if (snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", name, alg->cra_driver_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; return 0; } EXPORT_SYMBOL_GPL(crypto_inst_setname); void crypto_init_queue(struct crypto_queue *queue, unsigned int max_qlen) { INIT_LIST_HEAD(&queue->list); queue->backlog = &queue->list; queue->qlen = 0; queue->max_qlen = max_qlen; } EXPORT_SYMBOL_GPL(crypto_init_queue); int crypto_enqueue_request(struct crypto_queue *queue, struct crypto_async_request *request) { int err = -EINPROGRESS; if (unlikely(queue->qlen >= queue->max_qlen)) { if (!(request->flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) { err = -ENOSPC; goto out; } err = -EBUSY; if (queue->backlog == &queue->list) queue->backlog = &request->list; } queue->qlen++; list_add_tail(&request->list, &queue->list); out: return err; } EXPORT_SYMBOL_GPL(crypto_enqueue_request); void crypto_enqueue_request_head(struct crypto_queue *queue, struct crypto_async_request *request) { if (unlikely(queue->qlen >= queue->max_qlen)) queue->backlog = queue->backlog->prev; queue->qlen++; list_add(&request->list, &queue->list); } EXPORT_SYMBOL_GPL(crypto_enqueue_request_head); struct crypto_async_request *crypto_dequeue_request(struct crypto_queue *queue) { struct list_head *request; if (unlikely(!queue->qlen)) return NULL; queue->qlen--; if (queue->backlog != &queue->list) queue->backlog = queue->backlog->next; request = queue->list.next; list_del(request); return list_entry(request, struct crypto_async_request, list); } EXPORT_SYMBOL_GPL(crypto_dequeue_request); static inline void crypto_inc_byte(u8 *a, unsigned int size) { u8 *b = (a + size); u8 c; for (; size; size--) { c = *--b + 1; *b = c; if (c) break; } } void crypto_inc(u8 *a, unsigned int size) { __be32 *b = (__be32 *)(a + size); u32 c; if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || IS_ALIGNED((unsigned long)b, __alignof__(*b))) for (; size >= 4; size -= 4) { c = be32_to_cpu(*--b) + 1; *b = cpu_to_be32(c); if (likely(c)) return; } crypto_inc_byte(a, size); } EXPORT_SYMBOL_GPL(crypto_inc); unsigned int crypto_alg_extsize(struct crypto_alg *alg) { return alg->cra_ctxsize + (alg->cra_alignmask & ~(crypto_tfm_ctx_alignment() - 1)); } EXPORT_SYMBOL_GPL(crypto_alg_extsize); int crypto_type_has_alg(const char *name, const struct crypto_type *frontend, u32 type, u32 mask) { int ret = 0; struct crypto_alg *alg = crypto_find_alg(name, frontend, type, mask); if (!IS_ERR(alg)) { crypto_mod_put(alg); ret = 1; } return ret; } EXPORT_SYMBOL_GPL(crypto_type_has_alg); static void __init 
crypto_start_tests(void) { if (!IS_BUILTIN(CONFIG_CRYPTO_ALGAPI)) return; if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) return; set_crypto_boot_test_finished(); for (;;) { struct crypto_larval *larval = NULL; struct crypto_alg *q; down_write(&crypto_alg_sem); list_for_each_entry(q, &crypto_alg_list, cra_list) { struct crypto_larval *l; if (!crypto_is_larval(q)) continue; l = (void *)q; if (!crypto_is_test_larval(l)) continue; if (l->test_started) continue; l->test_started = true; larval = l; break; } up_write(&crypto_alg_sem); if (!larval) break; crypto_schedule_test(larval); } } static int __init crypto_algapi_init(void) { crypto_init_proc(); crypto_start_tests(); return 0; } static void __exit crypto_algapi_exit(void) { crypto_exit_proc(); } /* * We run this at late_initcall so that all the built-in algorithms * have had a chance to register themselves first. */ late_initcall(crypto_algapi_init); module_exit(crypto_algapi_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Cryptographic algorithms API"); MODULE_SOFTDEP("pre: cryptomgr");
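/*
 * Illustrative sketch (not part of the file above): registering a made-up
 * single-block cipher through crypto_register_alg().  The "foo" names and the
 * do-nothing block transform are placeholders; the struct crypto_alg fields
 * shown are the ones validated by crypto_check_alg() above (non-empty
 * cra_name/cra_driver_name, non-negative cra_priority, and the tighter
 * blocksize/alignmask limits applied to CRYPTO_ALG_TYPE_CIPHER).
 */
#include <crypto/algapi.h>
#include <linux/crypto.h>
#include <linux/module.h>
#include <linux/string.h>

#define FOO_BLOCK_SIZE	16
#define FOO_KEY_SIZE	16

struct foo_ctx {
	u8 key[FOO_KEY_SIZE];
};

static int foo_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen)
{
	struct foo_ctx *ctx = crypto_tfm_ctx(tfm);

	if (keylen != FOO_KEY_SIZE)
		return -EINVAL;
	memcpy(ctx->key, key, keylen);
	return 0;
}

/* Placeholder transform: a real driver would encrypt/decrypt the block. */
static void foo_crypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
	memcpy(dst, src, FOO_BLOCK_SIZE);
}

static struct crypto_alg foo_alg = {
	.cra_name		= "foo",
	.cra_driver_name	= "foo-generic",
	.cra_priority		= 100,
	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
	.cra_blocksize		= FOO_BLOCK_SIZE,
	.cra_ctxsize		= sizeof(struct foo_ctx),
	.cra_module		= THIS_MODULE,
	.cra_u			= { .cipher = {
		.cia_min_keysize	= FOO_KEY_SIZE,
		.cia_max_keysize	= FOO_KEY_SIZE,
		.cia_setkey		= foo_setkey,
		.cia_encrypt		= foo_crypt,
		.cia_decrypt		= foo_crypt } }
};

static int __init foo_mod_init(void)
{
	return crypto_register_alg(&foo_alg);
}

static void __exit foo_mod_exit(void)
{
	crypto_unregister_alg(&foo_alg);
}

module_init(foo_mod_init);
module_exit(foo_mod_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_CRYPTO("foo");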
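/*
 * Illustrative sketch (hypothetical driver code): using the crypto_queue
 * helpers above.  crypto_enqueue_request() returns -EINPROGRESS when queued,
 * -EBUSY when the request was only accepted onto the backlog (the caller set
 * CRYPTO_TFM_REQ_MAY_BACKLOG), and -ENOSPC when the queue is full; a worker
 * later pulls requests off with crypto_dequeue_request().  The "foo" names
 * and the locking scheme are assumptions, not taken from the file above.
 */
#include <crypto/algapi.h>
#include <linux/spinlock.h>

struct foo_engine {
	struct crypto_queue queue;
	spinlock_t lock;
};

static void foo_engine_init(struct foo_engine *eng)
{
	spin_lock_init(&eng->lock);
	crypto_init_queue(&eng->queue, 32);	/* max_qlen = 32 */
}

static int foo_engine_queue(struct foo_engine *eng,
			    struct crypto_async_request *req)
{
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&eng->lock, flags);
	ret = crypto_enqueue_request(&eng->queue, req);
	spin_unlock_irqrestore(&eng->lock, flags);

	return ret;	/* -EINPROGRESS, -EBUSY or -ENOSPC, as described above */
}

static struct crypto_async_request *foo_engine_dequeue(struct foo_engine *eng)
{
	struct crypto_async_request *req;
	unsigned long flags;

	spin_lock_irqsave(&eng->lock, flags);
	req = crypto_dequeue_request(&eng->queue);
	spin_unlock_irqrestore(&eng->lock, flags);

	return req;	/* NULL when the queue is empty */
}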
// SPDX-License-Identifier: GPL-2.0+ /* * Universal/legacy driver for 8250/16550-type serial ports * * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o. * * Copyright (C) 2001 Russell King. * * Supports: * early_serial_setup() ports * userspace-configurable "phantom" ports * serial8250_register_8250_port() ports */ #include <linux/acpi.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/ioport.h> #include <linux/init.h> #include <linux/console.h> #include <linux/sysrq.h> #include <linux/delay.h> #include <linux/platform_device.h> #include <linux/pm_runtime.h> #include <linux/tty.h> #include <linux/ratelimit.h> #include <linux/tty_flip.h> #include <linux/serial.h> #include <linux/serial_8250.h> #include <linux/nmi.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/string_helpers.h> #include <linux/uaccess.h> #include <linux/io.h> #include <asm/irq.h> #include "8250.h" #define PASS_LIMIT 512 struct irq_info { struct hlist_node node; int irq; spinlock_t lock; /* Protects list not the hash */ struct list_head *head; }; #define NR_IRQ_HASH 32 /* Can be adjusted later */ static struct hlist_head irq_lists[NR_IRQ_HASH]; static DEFINE_MUTEX(hash_mutex); /* Used to walk the hash */ /* * This is the serial driver's interrupt routine. * * Arjan thinks the old way was overly complex, so it got simplified. * Alan disagrees, saying that need the complexity to handle the weird * nature of ISA shared interrupts. (This is a special exception.) * * In order to handle ISA shared interrupts properly, we need to check * that all ports have been serviced, and therefore the ISA interrupt * line has been de-asserted. * * This means we need to loop through all ports. checking that they * don't have an interrupt pending. */ static irqreturn_t serial8250_interrupt(int irq, void *dev_id) { struct irq_info *i = dev_id; struct list_head *l, *end = NULL; int pass_counter = 0, handled = 0; pr_debug("%s(%d): start\n", __func__, irq); spin_lock(&i->lock); l = i->head; do { struct uart_8250_port *up; struct uart_port *port; up = list_entry(l, struct uart_8250_port, list); port = &up->port; if (port->handle_irq(port)) { handled = 1; end = NULL; } else if (end == NULL) end = l; l = l->next; if (l == i->head && pass_counter++ > PASS_LIMIT) break; } while (l != end); spin_unlock(&i->lock); pr_debug("%s(%d): end\n", __func__, irq); return IRQ_RETVAL(handled); } /* * To support ISA shared interrupts, we need to have one interrupt * handler that ensures that the IRQ line has been deasserted * before returning. Failing to do this will result in the IRQ * line being stuck active, and, since ISA irqs are edge triggered, * no more IRQs will be seen. 
*/ static void serial_do_unlink(struct irq_info *i, struct uart_8250_port *up) { spin_lock_irq(&i->lock); if (!list_empty(i->head)) { if (i->head == &up->list) i->head = i->head->next; list_del(&up->list); } else { BUG_ON(i->head != &up->list); i->head = NULL; } spin_unlock_irq(&i->lock); /* List empty so throw away the hash node */ if (i->head == NULL) { hlist_del(&i->node); kfree(i); } } static int serial_link_irq_chain(struct uart_8250_port *up) { struct hlist_head *h; struct irq_info *i; int ret; mutex_lock(&hash_mutex); h = &irq_lists[up->port.irq % NR_IRQ_HASH]; hlist_for_each_entry(i, h, node) if (i->irq == up->port.irq) break; if (i == NULL) { i = kzalloc(sizeof(struct irq_info), GFP_KERNEL); if (i == NULL) { mutex_unlock(&hash_mutex); return -ENOMEM; } spin_lock_init(&i->lock); i->irq = up->port.irq; hlist_add_head(&i->node, h); } mutex_unlock(&hash_mutex); spin_lock_irq(&i->lock); if (i->head) { list_add(&up->list, i->head); spin_unlock_irq(&i->lock); ret = 0; } else { INIT_LIST_HEAD(&up->list); i->head = &up->list; spin_unlock_irq(&i->lock); ret = request_irq(up->port.irq, serial8250_interrupt, up->port.irqflags, up->port.name, i); if (ret < 0) serial_do_unlink(i, up); } return ret; } static void serial_unlink_irq_chain(struct uart_8250_port *up) { struct irq_info *i; struct hlist_head *h; mutex_lock(&hash_mutex); h = &irq_lists[up->port.irq % NR_IRQ_HASH]; hlist_for_each_entry(i, h, node) if (i->irq == up->port.irq) break; BUG_ON(i == NULL); BUG_ON(i->head == NULL); if (list_empty(i->head)) free_irq(up->port.irq, i); serial_do_unlink(i, up); mutex_unlock(&hash_mutex); } /* * This function is used to handle ports that do not have an * interrupt. This doesn't work very well for 16450's, but gives * barely passable results for a 16550A. (Although at the expense * of much CPU overhead). */ static void serial8250_timeout(struct timer_list *t) { struct uart_8250_port *up = from_timer(up, t, timer); up->port.handle_irq(&up->port); mod_timer(&up->timer, jiffies + uart_poll_timeout(&up->port)); } static void serial8250_backup_timeout(struct timer_list *t) { struct uart_8250_port *up = from_timer(up, t, timer); unsigned int iir, ier = 0, lsr; unsigned long flags; uart_port_lock_irqsave(&up->port, &flags); /* * Must disable interrupts or else we risk racing with the interrupt * based handler. */ if (up->port.irq) { ier = serial_in(up, UART_IER); serial_out(up, UART_IER, 0); } iir = serial_in(up, UART_IIR); /* * This should be a safe test for anyone who doesn't trust the * IIR bits on their UART, but it's specifically designed for * the "Diva" UART used on the management processor on many HP * ia64 and parisc boxes. */ lsr = serial_lsr_in(up); if ((iir & UART_IIR_NO_INT) && (up->ier & UART_IER_THRI) && (!kfifo_is_empty(&up->port.state->port.xmit_fifo) || up->port.x_char) && (lsr & UART_LSR_THRE)) { iir &= ~(UART_IIR_ID | UART_IIR_NO_INT); iir |= UART_IIR_THRI; } if (!(iir & UART_IIR_NO_INT)) serial8250_tx_chars(up); if (up->port.irq) serial_out(up, UART_IER, ier); uart_port_unlock_irqrestore(&up->port, flags); /* Standard timer interval plus 0.2s to keep the port running */ mod_timer(&up->timer, jiffies + uart_poll_timeout(&up->port) + HZ / 5); } static void univ8250_setup_timer(struct uart_8250_port *up) { struct uart_port *port = &up->port; /* * The above check will only give an accurate result the first time * the port is opened so this value needs to be preserved. 
*/ if (up->bugs & UART_BUG_THRE) { pr_debug("%s - using backup timer\n", port->name); up->timer.function = serial8250_backup_timeout; mod_timer(&up->timer, jiffies + uart_poll_timeout(port) + HZ / 5); } /* * If the "interrupt" for this port doesn't correspond with any * hardware interrupt, we use a timer-based system. The original * driver used to do this with IRQ0. */ if (!port->irq) mod_timer(&up->timer, jiffies + uart_poll_timeout(port)); } static int univ8250_setup_irq(struct uart_8250_port *up) { struct uart_port *port = &up->port; if (port->irq) return serial_link_irq_chain(up); return 0; } static void univ8250_release_irq(struct uart_8250_port *up) { struct uart_port *port = &up->port; del_timer_sync(&up->timer); up->timer.function = serial8250_timeout; if (port->irq) serial_unlink_irq_chain(up); } const struct uart_ops *univ8250_port_base_ops = NULL; struct uart_ops univ8250_port_ops; static const struct uart_8250_ops univ8250_driver_ops = { .setup_irq = univ8250_setup_irq, .release_irq = univ8250_release_irq, .setup_timer = univ8250_setup_timer, }; static struct uart_8250_port serial8250_ports[UART_NR]; /** * serial8250_get_port - retrieve struct uart_8250_port * @line: serial line number * * This function retrieves struct uart_8250_port for the specific line. * This struct *must* *not* be used to perform a 8250 or serial core operation * which is not accessible otherwise. Its only purpose is to make the struct * accessible to the runtime-pm callbacks for context suspend/restore. * The lock assumption made here is none because runtime-pm suspend/resume * callbacks should not be invoked if there is any operation performed on the * port. */ struct uart_8250_port *serial8250_get_port(int line) { return &serial8250_ports[line]; } EXPORT_SYMBOL_GPL(serial8250_get_port); static inline void serial8250_apply_quirks(struct uart_8250_port *up) { up->port.quirks |= skip_txen_test ? UPQ_NO_TXEN_TEST : 0; } struct uart_8250_port *serial8250_setup_port(int index) { struct uart_8250_port *up; if (index >= UART_NR) return NULL; up = &serial8250_ports[index]; up->port.line = index; up->port.port_id = index; serial8250_init_port(up); if (!univ8250_port_base_ops) univ8250_port_base_ops = up->port.ops; up->port.ops = &univ8250_port_ops; timer_setup(&up->timer, serial8250_timeout, 0); up->ops = &univ8250_driver_ops; serial8250_set_defaults(up); return up; } void __init serial8250_register_ports(struct uart_driver *drv, struct device *dev) { int i; for (i = 0; i < nr_uarts; i++) { struct uart_8250_port *up = &serial8250_ports[i]; if (up->port.type == PORT_8250_CIR) continue; if (up->port.dev) continue; up->port.dev = dev; if (uart_console_registered(&up->port)) pm_runtime_get_sync(up->port.dev); serial8250_apply_quirks(up); uart_add_one_port(drv, &up->port); } } #ifdef CONFIG_SERIAL_8250_CONSOLE static void univ8250_console_write(struct console *co, const char *s, unsigned int count) { struct uart_8250_port *up = &serial8250_ports[co->index]; serial8250_console_write(up, s, count); } static int univ8250_console_setup(struct console *co, char *options) { struct uart_8250_port *up; struct uart_port *port; int retval, i; /* * Check whether an invalid uart number has been specified, and * if so, search for the first available port that does have * console support. */ if (co->index < 0 || co->index >= UART_NR) co->index = 0; /* * If the console is past the initial isa ports, init more ports up to * co->index as needed and increment nr_uarts accordingly. 
*/ for (i = nr_uarts; i <= co->index; i++) { up = serial8250_setup_port(i); if (!up) return -ENODEV; nr_uarts++; } port = &serial8250_ports[co->index].port; /* link port to console */ uart_port_set_cons(port, co); retval = serial8250_console_setup(port, options, false); if (retval != 0) uart_port_set_cons(port, NULL); return retval; } static int univ8250_console_exit(struct console *co) { struct uart_port *port; port = &serial8250_ports[co->index].port; return serial8250_console_exit(port); } /** * univ8250_console_match - non-standard console matching * @co: registering console * @name: name from console command line * @idx: index from console command line * @options: ptr to option string from console command line * * Only attempts to match console command lines of the form: * console=uart[8250],io|mmio|mmio16|mmio32,<addr>[,<options>] * console=uart[8250],0x<addr>[,<options>] * This form is used to register an initial earlycon boot console and * replace it with the serial8250_console at 8250 driver init. * * Performs console setup for a match (as required by interface) * If no <options> are specified, then assume the h/w is already setup. * * Returns 0 if console matches; otherwise non-zero to use default matching */ static int univ8250_console_match(struct console *co, char *name, int idx, char *options) { char match[] = "uart"; /* 8250-specific earlycon name */ unsigned char iotype; resource_size_t addr; int i; if (strncmp(name, match, 4) != 0) return -ENODEV; if (uart_parse_earlycon(options, &iotype, &addr, &options)) return -ENODEV; /* try to match the port specified on the command line */ for (i = 0; i < nr_uarts; i++) { struct uart_port *port = &serial8250_ports[i].port; if (port->iotype != iotype) continue; if ((iotype == UPIO_MEM || iotype == UPIO_MEM16 || iotype == UPIO_MEM32 || iotype == UPIO_MEM32BE) && (port->mapbase != addr)) continue; if (iotype == UPIO_PORT && port->iobase != addr) continue; co->index = i; uart_port_set_cons(port, co); return serial8250_console_setup(port, options, true); } return -ENODEV; } static struct console univ8250_console = { .name = "ttyS", .write = univ8250_console_write, .device = uart_console_device, .setup = univ8250_console_setup, .exit = univ8250_console_exit, .match = univ8250_console_match, .flags = CON_PRINTBUFFER | CON_ANYTIME, .index = -1, .data = &serial8250_reg, }; static int __init univ8250_console_init(void) { if (nr_uarts == 0) return -ENODEV; serial8250_isa_init_ports(); register_console(&univ8250_console); return 0; } console_initcall(univ8250_console_init); #define SERIAL8250_CONSOLE (&univ8250_console) #else #define SERIAL8250_CONSOLE NULL #endif struct uart_driver serial8250_reg = { .owner = THIS_MODULE, .driver_name = "serial", .dev_name = "ttyS", .major = TTY_MAJOR, .minor = 64, .cons = SERIAL8250_CONSOLE, }; /* * early_serial_setup - early registration for 8250 ports * * Setup an 8250 port structure prior to console initialisation. Use * after console initialisation will cause undefined behaviour. 
*/ int __init early_serial_setup(struct uart_port *port) { struct uart_port *p; if (port->line >= ARRAY_SIZE(serial8250_ports) || nr_uarts == 0) return -ENODEV; serial8250_isa_init_ports(); p = &serial8250_ports[port->line].port; p->iobase = port->iobase; p->membase = port->membase; p->irq = port->irq; p->irqflags = port->irqflags; p->uartclk = port->uartclk; p->fifosize = port->fifosize; p->regshift = port->regshift; p->iotype = port->iotype; p->flags = port->flags; p->mapbase = port->mapbase; p->mapsize = port->mapsize; p->private_data = port->private_data; p->type = port->type; p->line = port->line; serial8250_set_defaults(up_to_u8250p(p)); if (port->serial_in) p->serial_in = port->serial_in; if (port->serial_out) p->serial_out = port->serial_out; if (port->handle_irq) p->handle_irq = port->handle_irq; return 0; } /** * serial8250_suspend_port - suspend one serial port * @line: serial line number * * Suspend one serial port. */ void serial8250_suspend_port(int line) { struct uart_8250_port *up = &serial8250_ports[line]; struct uart_port *port = &up->port; if (!console_suspend_enabled && uart_console(port) && port->type != PORT_8250) { unsigned char canary = 0xa5; serial_out(up, UART_SCR, canary); if (serial_in(up, UART_SCR) == canary) up->canary = canary; } uart_suspend_port(&serial8250_reg, port); } EXPORT_SYMBOL(serial8250_suspend_port); /** * serial8250_resume_port - resume one serial port * @line: serial line number * * Resume one serial port. */ void serial8250_resume_port(int line) { struct uart_8250_port *up = &serial8250_ports[line]; struct uart_port *port = &up->port; up->canary = 0; if (up->capabilities & UART_NATSEMI) { /* Ensure it's still in high speed mode */ serial_port_out(port, UART_LCR, 0xE0); ns16550a_goto_highspeed(up); serial_port_out(port, UART_LCR, 0); port->uartclk = 921600*16; } uart_resume_port(&serial8250_reg, port); } EXPORT_SYMBOL(serial8250_resume_port); /* * serial8250_register_8250_port and serial8250_unregister_port allows for * 16x50 serial ports to be configured at run-time, to support PCMCIA * modems and PCI multiport cards. */ static DEFINE_MUTEX(serial_mutex); static struct uart_8250_port *serial8250_find_match_or_unused(const struct uart_port *port) { int i; /* * First, find a port entry which matches. */ for (i = 0; i < nr_uarts; i++) if (uart_match_port(&serial8250_ports[i].port, port)) return &serial8250_ports[i]; /* try line number first if still available */ i = port->line; if (i < nr_uarts && serial8250_ports[i].port.type == PORT_UNKNOWN && serial8250_ports[i].port.iobase == 0) return &serial8250_ports[i]; /* * We didn't find a matching entry, so look for the first * free entry. We look for one which hasn't been previously * used (indicated by zero iobase). */ for (i = 0; i < nr_uarts; i++) if (serial8250_ports[i].port.type == PORT_UNKNOWN && serial8250_ports[i].port.iobase == 0) return &serial8250_ports[i]; /* * That also failed. Last resort is to find any entry which * doesn't have a real port associated with it. 
*/ for (i = 0; i < nr_uarts; i++) if (serial8250_ports[i].port.type == PORT_UNKNOWN) return &serial8250_ports[i]; return NULL; } static void serial_8250_overrun_backoff_work(struct work_struct *work) { struct uart_8250_port *up = container_of(to_delayed_work(work), struct uart_8250_port, overrun_backoff); struct uart_port *port = &up->port; unsigned long flags; uart_port_lock_irqsave(port, &flags); up->ier |= UART_IER_RLSI | UART_IER_RDI; serial_out(up, UART_IER, up->ier); uart_port_unlock_irqrestore(port, flags); } /** * serial8250_register_8250_port - register a serial port * @up: serial port template * * Configure the serial port specified by the request. If the * port exists and is in use, it is hung up and unregistered * first. * * The port is then probed and if necessary the IRQ is autodetected * If this fails an error is returned. * * On success the port is ready to use and the line number is returned. */ int serial8250_register_8250_port(const struct uart_8250_port *up) { struct uart_8250_port *uart; int ret = -ENOSPC; if (up->port.uartclk == 0) return -EINVAL; mutex_lock(&serial_mutex); uart = serial8250_find_match_or_unused(&up->port); if (!uart) { /* * If the port is past the initial isa ports, initialize a new * port and increment nr_uarts accordingly. */ uart = serial8250_setup_port(nr_uarts); if (!uart) goto unlock; nr_uarts++; } if (uart->port.type != PORT_8250_CIR) { struct mctrl_gpios *gpios; if (uart->port.dev) uart_remove_one_port(&serial8250_reg, &uart->port); uart->port.ctrl_id = up->port.ctrl_id; uart->port.port_id = up->port.port_id; uart->port.iobase = up->port.iobase; uart->port.membase = up->port.membase; uart->port.irq = up->port.irq; uart->port.irqflags = up->port.irqflags; uart->port.uartclk = up->port.uartclk; uart->port.fifosize = up->port.fifosize; uart->port.regshift = up->port.regshift; uart->port.iotype = up->port.iotype; uart->port.flags = up->port.flags | UPF_BOOT_AUTOCONF; uart->bugs = up->bugs; uart->port.mapbase = up->port.mapbase; uart->port.mapsize = up->port.mapsize; uart->port.private_data = up->port.private_data; uart->tx_loadsz = up->tx_loadsz; uart->capabilities = up->capabilities; uart->port.throttle = up->port.throttle; uart->port.unthrottle = up->port.unthrottle; uart->port.rs485_config = up->port.rs485_config; uart->port.rs485_supported = up->port.rs485_supported; uart->port.rs485 = up->port.rs485; uart->rs485_start_tx = up->rs485_start_tx; uart->rs485_stop_tx = up->rs485_stop_tx; uart->lsr_save_mask = up->lsr_save_mask; uart->dma = up->dma; /* Take tx_loadsz from fifosize if it wasn't set separately */ if (uart->port.fifosize && !uart->tx_loadsz) uart->tx_loadsz = uart->port.fifosize; if (up->port.dev) { uart->port.dev = up->port.dev; ret = uart_get_rs485_mode(&uart->port); if (ret) goto err; } if (up->port.flags & UPF_FIXED_TYPE) uart->port.type = up->port.type; /* * Only call mctrl_gpio_init(), if the device has no ACPI * companion device */ if (!has_acpi_companion(uart->port.dev)) { gpios = mctrl_gpio_init(&uart->port, 0); if (IS_ERR(gpios)) { ret = PTR_ERR(gpios); goto err; } else { uart->gpios = gpios; } } serial8250_set_defaults(uart); /* Possibly override default I/O functions. 
*/ if (up->port.serial_in) uart->port.serial_in = up->port.serial_in; if (up->port.serial_out) uart->port.serial_out = up->port.serial_out; if (up->port.handle_irq) uart->port.handle_irq = up->port.handle_irq; /* Possibly override set_termios call */ if (up->port.set_termios) uart->port.set_termios = up->port.set_termios; if (up->port.set_ldisc) uart->port.set_ldisc = up->port.set_ldisc; if (up->port.get_mctrl) uart->port.get_mctrl = up->port.get_mctrl; if (up->port.set_mctrl) uart->port.set_mctrl = up->port.set_mctrl; if (up->port.get_divisor) uart->port.get_divisor = up->port.get_divisor; if (up->port.set_divisor) uart->port.set_divisor = up->port.set_divisor; if (up->port.startup) uart->port.startup = up->port.startup; if (up->port.shutdown) uart->port.shutdown = up->port.shutdown; if (up->port.pm) uart->port.pm = up->port.pm; if (up->port.handle_break) uart->port.handle_break = up->port.handle_break; if (up->dl_read) uart->dl_read = up->dl_read; if (up->dl_write) uart->dl_write = up->dl_write; if (uart->port.type != PORT_8250_CIR) { if (uart_console_registered(&uart->port)) pm_runtime_get_sync(uart->port.dev); if (serial8250_isa_config != NULL) serial8250_isa_config(0, &uart->port, &uart->capabilities); serial8250_apply_quirks(uart); ret = uart_add_one_port(&serial8250_reg, &uart->port); if (ret) goto err; ret = uart->port.line; } else { dev_info(uart->port.dev, "skipping CIR port at 0x%lx / 0x%llx, IRQ %d\n", uart->port.iobase, (unsigned long long)uart->port.mapbase, uart->port.irq); ret = 0; } if (!uart->lsr_save_mask) uart->lsr_save_mask = LSR_SAVE_FLAGS; /* Use default LSR mask */ /* Initialise interrupt backoff work if required */ if (up->overrun_backoff_time_ms > 0) { uart->overrun_backoff_time_ms = up->overrun_backoff_time_ms; INIT_DELAYED_WORK(&uart->overrun_backoff, serial_8250_overrun_backoff_work); } else { uart->overrun_backoff_time_ms = 0; } } unlock: mutex_unlock(&serial_mutex); return ret; err: uart->port.dev = NULL; mutex_unlock(&serial_mutex); return ret; } EXPORT_SYMBOL(serial8250_register_8250_port); /** * serial8250_unregister_port - remove a 16x50 serial port at runtime * @line: serial line number * * Remove one serial port. This may not be called from interrupt * context. We hand the port back to the our control. */ void serial8250_unregister_port(int line) { struct uart_8250_port *uart = &serial8250_ports[line]; mutex_lock(&serial_mutex); if (uart->em485) { unsigned long flags; uart_port_lock_irqsave(&uart->port, &flags); serial8250_em485_destroy(uart); uart_port_unlock_irqrestore(&uart->port, flags); } uart_remove_one_port(&serial8250_reg, &uart->port); if (serial8250_isa_devs) { uart->port.flags &= ~UPF_BOOT_AUTOCONF; uart->port.type = PORT_UNKNOWN; uart->port.dev = &serial8250_isa_devs->dev; uart->port.port_id = line; uart->capabilities = 0; serial8250_init_port(uart); serial8250_apply_quirks(uart); uart_add_one_port(&serial8250_reg, &uart->port); } else { uart->port.dev = NULL; } mutex_unlock(&serial_mutex); } EXPORT_SYMBOL(serial8250_unregister_port); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Generic 8250/16x50 serial driver");
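The interrupt chaining above is the subtle part of this file: a single registered handler walks every port sharing the IRQ and keeps looping until a complete pass finds nothing pending (or PASS_LIMIT trips), which is what keeps an edge-triggered ISA line from getting stuck. The stand-alone C sketch below models only that control flow; struct fake_port, handle_port() and the sample pending counts are invented for illustration and are not part of the driver.

/* Minimal user-space model of the serial8250_interrupt() pass loop.
 * All data here (fake_port, the pending counts) is invented for
 * illustration; only the control flow mirrors the driver.
 */
#include <stdio.h>

#define PASS_LIMIT 512
#define NPORTS 3

struct fake_port {
	int pending;			/* simulated pending work on this port */
};

/* Service one port; returns 1 if it had work (its IRQ was asserted). */
static int handle_port(struct fake_port *p)
{
	if (p->pending > 0) {
		p->pending--;
		return 1;
	}
	return 0;
}

int main(void)
{
	struct fake_port ports[NPORTS] = {
		{ .pending = 2 }, { .pending = 0 }, { .pending = 5 },
	};
	int idx = 0, quiet_streak = 0, passes = 0, handled = 0;

	/*
	 * Keep walking the shared line until every port reports idle in a
	 * row (the ISA line is then deasserted), bounded by PASS_LIMIT.
	 */
	while (quiet_streak < NPORTS) {
		if (handle_port(&ports[idx])) {
			handled = 1;
			quiet_streak = 0;	/* work found: restart the quiet count */
		} else {
			quiet_streak++;
		}
		idx = (idx + 1) % NPORTS;
		if (idx == 0 && ++passes > PASS_LIMIT)
			break;			/* safety valve against a stuck source */
	}

	printf("handled=%d passes=%d\n", handled, passes);
	return 0;
}

The quiet-streak counter plays the role of the "end" marker in the kernel's circular list walk: a full lap with no work means the shared line has finally been deasserted.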
// SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_prio.c Simple 3-band priority "scheduler". 
* * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>: * Init -- EINVAL when opt undefined */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> struct prio_sched_data { int bands; struct tcf_proto __rcu *filter_list; struct tcf_block *block; u8 prio2band[TC_PRIO_MAX+1]; struct Qdisc *queues[TCQ_PRIO_BANDS]; }; static struct Qdisc * prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct prio_sched_data *q = qdisc_priv(sch); u32 band = skb->priority; struct tcf_result res; struct tcf_proto *fl; int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { fl = rcu_dereference_bh(q->filter_list); err = tcf_classify(skb, NULL, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return NULL; } #endif if (!fl || err < 0) { if (TC_H_MAJ(band)) band = 0; return q->queues[q->prio2band[band & TC_PRIO_MAX]]; } band = res.classid; } band = TC_H_MIN(band) - 1; if (band >= q->bands) return q->queues[q->prio2band[0]]; return q->queues[band]; } static int prio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { unsigned int len = qdisc_pkt_len(skb); struct Qdisc *qdisc; int ret; qdisc = prio_classify(skb, sch, &ret); #ifdef CONFIG_NET_CLS_ACT if (qdisc == NULL) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return ret; } #endif ret = qdisc_enqueue(skb, qdisc, to_free); if (ret == NET_XMIT_SUCCESS) { sch->qstats.backlog += len; sch->q.qlen++; return NET_XMIT_SUCCESS; } if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); return ret; } static struct sk_buff *prio_peek(struct Qdisc *sch) { struct prio_sched_data *q = qdisc_priv(sch); int prio; for (prio = 0; prio < q->bands; prio++) { struct Qdisc *qdisc = q->queues[prio]; struct sk_buff *skb = qdisc->ops->peek(qdisc); if (skb) return skb; } return NULL; } static struct sk_buff *prio_dequeue(struct Qdisc *sch) { struct prio_sched_data *q = qdisc_priv(sch); int prio; for (prio = 0; prio < q->bands; prio++) { struct Qdisc *qdisc = q->queues[prio]; struct sk_buff *skb = qdisc_dequeue_peeked(qdisc); if (skb) { qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } } return NULL; } static void prio_reset(struct Qdisc *sch) { int prio; struct prio_sched_data *q = qdisc_priv(sch); for (prio = 0; prio < q->bands; prio++) qdisc_reset(q->queues[prio]); } static int prio_offload(struct Qdisc *sch, struct tc_prio_qopt *qopt) { struct net_device *dev = qdisc_dev(sch); struct tc_prio_qopt_offload opt = { .handle = sch->handle, .parent = sch->parent, }; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; if (qopt) { opt.command = TC_PRIO_REPLACE; opt.replace_params.bands = qopt->bands; memcpy(&opt.replace_params.priomap, qopt->priomap, TC_PRIO_MAX + 1); opt.replace_params.qstats = &sch->qstats; } else { opt.command = TC_PRIO_DESTROY; } return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO, &opt); } static void prio_destroy(struct Qdisc *sch) { int prio; struct prio_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); prio_offload(sch, NULL); for (prio = 0; prio < q->bands; 
prio++) qdisc_put(q->queues[prio]); } static int prio_tune(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct prio_sched_data *q = qdisc_priv(sch); struct Qdisc *queues[TCQ_PRIO_BANDS]; int oldbands = q->bands, i; struct tc_prio_qopt *qopt; if (nla_len(opt) < sizeof(*qopt)) return -EINVAL; qopt = nla_data(opt); if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < TCQ_MIN_PRIO_BANDS) return -EINVAL; for (i = 0; i <= TC_PRIO_MAX; i++) { if (qopt->priomap[i] >= qopt->bands) return -EINVAL; } /* Before commit, make sure we can allocate all new qdiscs */ for (i = oldbands; i < qopt->bands; i++) { queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(sch->handle, i + 1), extack); if (!queues[i]) { while (i > oldbands) qdisc_put(queues[--i]); return -ENOMEM; } } prio_offload(sch, qopt); sch_tree_lock(sch); q->bands = qopt->bands; memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); for (i = q->bands; i < oldbands; i++) qdisc_tree_flush_backlog(q->queues[i]); for (i = oldbands; i < q->bands; i++) { q->queues[i] = queues[i]; if (q->queues[i] != &noop_qdisc) qdisc_hash_add(q->queues[i], true); } sch_tree_unlock(sch); for (i = q->bands; i < oldbands; i++) qdisc_put(q->queues[i]); return 0; } static int prio_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct prio_sched_data *q = qdisc_priv(sch); int err; if (!opt) return -EINVAL; err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; return prio_tune(sch, opt, extack); } static int prio_dump_offload(struct Qdisc *sch) { struct tc_prio_qopt_offload hw_stats = { .command = TC_PRIO_STATS, .handle = sch->handle, .parent = sch->parent, { .stats = { .bstats = &sch->bstats, .qstats = &sch->qstats, }, }, }; return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_PRIO, &hw_stats); } static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct prio_sched_data *q = qdisc_priv(sch); unsigned char *b = skb_tail_pointer(skb); struct tc_prio_qopt opt; int err; opt.bands = q->bands; memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1); err = prio_dump_offload(sch); if (err) goto nla_put_failure; if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; return skb->len; nla_put_failure: nlmsg_trim(skb, b); return -1; } static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct prio_sched_data *q = qdisc_priv(sch); struct tc_prio_qopt_offload graft_offload; unsigned long band = arg - 1; if (!new) { new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(sch->handle, arg), extack); if (!new) new = &noop_qdisc; else qdisc_hash_add(new, true); } *old = qdisc_replace(sch, new, &q->queues[band]); graft_offload.handle = sch->handle; graft_offload.parent = sch->parent; graft_offload.graft_params.band = band; graft_offload.graft_params.child_handle = new->handle; graft_offload.command = TC_PRIO_GRAFT; qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, *old, TC_SETUP_QDISC_PRIO, &graft_offload, extack); return 0; } static struct Qdisc * prio_leaf(struct Qdisc *sch, unsigned long arg) { struct prio_sched_data *q = qdisc_priv(sch); unsigned long band = arg - 1; return q->queues[band]; } static unsigned long prio_find(struct Qdisc *sch, u32 classid) { struct prio_sched_data *q = qdisc_priv(sch); unsigned long band = TC_H_MIN(classid); if (band - 1 >= q->bands) return 0; return band; } static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 
classid) { return prio_find(sch, classid); } static void prio_unbind(struct Qdisc *q, unsigned long cl) { } static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct prio_sched_data *q = qdisc_priv(sch); tcm->tcm_handle |= TC_H_MIN(cl); tcm->tcm_info = q->queues[cl-1]->handle; return 0; } static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct prio_sched_data *q = qdisc_priv(sch); struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, &cl_q->bstats, true) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; return 0; } static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct prio_sched_data *q = qdisc_priv(sch); int prio; if (arg->stop) return; for (prio = 0; prio < q->bands; prio++) { if (!tc_qdisc_stats_dump(sch, prio + 1, arg)) break; } } static struct tcf_block *prio_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct prio_sched_data *q = qdisc_priv(sch); if (cl) return NULL; return q->block; } static const struct Qdisc_class_ops prio_class_ops = { .graft = prio_graft, .leaf = prio_leaf, .find = prio_find, .walk = prio_walk, .tcf_block = prio_tcf_block, .bind_tcf = prio_bind, .unbind_tcf = prio_unbind, .dump = prio_dump_class, .dump_stats = prio_dump_class_stats, }; static struct Qdisc_ops prio_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &prio_class_ops, .id = "prio", .priv_size = sizeof(struct prio_sched_data), .enqueue = prio_enqueue, .dequeue = prio_dequeue, .peek = prio_peek, .init = prio_init, .reset = prio_reset, .destroy = prio_destroy, .change = prio_tune, .dump = prio_dump, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("prio"); static int __init prio_module_init(void) { return register_qdisc(&prio_qdisc_ops); } static void __exit prio_module_exit(void) { unregister_qdisc(&prio_qdisc_ops); } module_init(prio_module_init) module_exit(prio_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Simple 3-band priority qdisc");
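When no tc filter is attached or none matches, prio_classify() above falls back to q->prio2band[skb->priority & TC_PRIO_MAX], i.e. a straight table lookup from the 16 possible TC_PRIO_* values to a band. The small user-space sketch below reproduces just that lookup; the priomap values used here are the commonly quoted defaults and should be treated as an assumption, only the indexing logic comes from the code above.

/* Stand-alone model of the prio2band fallback in prio_classify().
 * TC_PRIO_MAX is 15 (16 priority values); the table below is the
 * commonly cited default priomap and is an assumption for this demo.
 */
#include <stdio.h>

#define TC_PRIO_MAX 15

static const unsigned char prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
};

/* Map an skb->priority value to a band index (0 = dequeued first). */
static unsigned int classify(unsigned int skb_priority)
{
	return prio2band[skb_priority & TC_PRIO_MAX];
}

int main(void)
{
	unsigned int prio;

	for (prio = 0; prio <= TC_PRIO_MAX; prio++)
		printf("priority %2u -> band %u\n", prio, classify(prio));
	return 0;
}

Band 0 is scanned first in prio_dequeue(), so a lower band number means strictly higher priority; higher bands only get service when every lower band is empty.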
// SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/swapfile.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * Swap reorganised 29.12.95, Stephen Tweedie */ #include <linux/blkdev.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> #include <linux/namei.h> #include <linux/shmem_fs.h> #include <linux/blk-cgroup.h> #include <linux/random.h> #include <linux/writeback.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/security.h> #include <linux/backing-dev.h> #include <linux/mutex.h> #include <linux/capability.h> #include <linux/syscalls.h> #include <linux/memcontrol.h> #include <linux/poll.h> #include <linux/oom.h> #include <linux/swapfile.h> #include <linux/export.h> #include <linux/swap_slots.h> #include <linux/sort.h> #include <linux/completion.h> #include <linux/suspend.h> #include <linux/zswap.h> #include <linux/plist.h> #include <asm/tlbflush.h> #include <linux/swapops.h> #include <linux/swap_cgroup.h> #include "internal.h" #include "swap.h" static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); static void swap_entry_range_free(struct swap_info_struct *si, struct swap_cluster_info *ci, swp_entry_t entry, unsigned int nr_pages); static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, unsigned long offset); static inline void unlock_cluster(struct swap_cluster_info *ci); static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; atomic_long_t nr_swap_pages; /* * Some modules use swappable objects and may try to swap them out under * memory pressure (via the shrinker). Before doing so, they may wish to * check to see if any swap space is available. 
*/ EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; static int least_priority = -1; unsigned long swapfile_maximum_size; #ifdef CONFIG_MIGRATION bool swap_migration_ad_supported; #endif /* CONFIG_MIGRATION */ static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; /* * all active swap_info_structs * protected with swap_lock, and ordered by priority. */ static PLIST_HEAD(swap_active_head); /* * all available (active, not full) swap_info_structs * protected with swap_avail_lock, ordered by priority. * This is used by folio_alloc_swap() instead of swap_active_head * because swap_active_head includes all swap_info_structs, * but folio_alloc_swap() doesn't need to look at full ones. * This uses its own lock instead of swap_lock because when a * swap_info_struct changes between not-full/full, it needs to * add/remove itself to/from this list, but the swap_info_struct->lock * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock. */ static struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); static struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); /* Activity counter to indicate that a swapon or swapoff has occurred */ static atomic_t proc_poll_event = ATOMIC_INIT(0); atomic_t nr_rotate_swap = ATOMIC_INIT(0); static struct swap_info_struct *swap_type_to_swap_info(int type) { if (type >= MAX_SWAPFILES) return NULL; return READ_ONCE(swap_info[type]); /* rcu_dereference() */ } static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ } /* * Use the second highest bit of inuse_pages counter as the indicator * if one swap device is on the available plist, so the atomic can * still be updated arithmetically while having special data embedded. * * inuse_pages counter is the only thing indicating if a device should * be on avail_lists or not (except swapon / swapoff). By embedding the * off-list bit in the atomic counter, updates no longer need any lock * to check the list status. * * This bit will be set if the device is not on the plist and not * usable, will be cleared if the device is on the plist. 
*/ #define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2)) #define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT) static long swap_usage_in_pages(struct swap_info_struct *si) { return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK; } /* Reclaim the swap entry anyway if possible */ #define TTRS_ANYWAY 0x1 /* * Reclaim the swap entry if there are no more mappings of the * corresponding page */ #define TTRS_UNMAPPED 0x2 /* Reclaim the swap entry if swap is getting full */ #define TTRS_FULL 0x4 /* Reclaim directly, bypass the slot cache and don't touch device lock */ #define TTRS_DIRECT 0x8 static bool swap_is_has_cache(struct swap_info_struct *si, unsigned long offset, int nr_pages) { unsigned char *map = si->swap_map + offset; unsigned char *map_end = map + nr_pages; do { VM_BUG_ON(!(*map & SWAP_HAS_CACHE)); if (*map != SWAP_HAS_CACHE) return false; } while (++map < map_end); return true; } static bool swap_is_last_map(struct swap_info_struct *si, unsigned long offset, int nr_pages, bool *has_cache) { unsigned char *map = si->swap_map + offset; unsigned char *map_end = map + nr_pages; unsigned char count = *map; if (swap_count(count) != 1) return false; while (++map < map_end) { if (*map != count) return false; } *has_cache = !!(count & SWAP_HAS_CACHE); return true; } /* * returns number of pages in the folio that backs the swap entry. If positive, * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no * folio was associated with the swap entry. */ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset, unsigned long flags) { swp_entry_t entry = swp_entry(si->type, offset); struct address_space *address_space = swap_address_space(entry); struct swap_cluster_info *ci; struct folio *folio; int ret, nr_pages; bool need_reclaim; folio = filemap_get_folio(address_space, swap_cache_index(entry)); if (IS_ERR(folio)) return 0; nr_pages = folio_nr_pages(folio); ret = -nr_pages; /* * When this function is called from scan_swap_map_slots() and it's * called by vmscan.c at reclaiming folios. So we hold a folio lock * here. We have to use trylock for avoiding deadlock. This is a special * case and you should use folio_free_swap() with explicit folio_lock() * in usual operations. */ if (!folio_trylock(folio)) goto out; /* offset could point to the middle of a large folio */ entry = folio->swap; offset = swp_offset(entry); need_reclaim = ((flags & TTRS_ANYWAY) || ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); if (!need_reclaim || !folio_swapcache_freeable(folio)) goto out_unlock; /* * It's safe to delete the folio from swap cache only if the folio's * swap_map is HAS_CACHE only, which means the slots have no page table * reference or pending writeback, and can't be allocated to others. 
*/ ci = lock_cluster(si, offset); need_reclaim = swap_is_has_cache(si, offset, nr_pages); unlock_cluster(ci); if (!need_reclaim) goto out_unlock; if (!(flags & TTRS_DIRECT)) { /* Free through slot cache */ delete_from_swap_cache(folio); folio_set_dirty(folio); ret = nr_pages; goto out_unlock; } xa_lock_irq(&address_space->i_pages); __delete_from_swap_cache(folio, entry, NULL); xa_unlock_irq(&address_space->i_pages); folio_ref_sub(folio, nr_pages); folio_set_dirty(folio); ci = lock_cluster(si, offset); swap_entry_range_free(si, ci, entry, nr_pages); unlock_cluster(ci); ret = nr_pages; out_unlock: folio_unlock(folio); out: folio_put(folio); return ret; } static inline struct swap_extent *first_se(struct swap_info_struct *sis) { struct rb_node *rb = rb_first(&sis->swap_extent_root); return rb_entry(rb, struct swap_extent, rb_node); } static inline struct swap_extent *next_se(struct swap_extent *se) { struct rb_node *rb = rb_next(&se->rb_node); return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL; } /* * swapon tell device that all the old swap contents can be discarded, * to allow the swap device to optimize its wear-levelling. */ static int discard_swap(struct swap_info_struct *si) { struct swap_extent *se; sector_t start_block; sector_t nr_blocks; int err = 0; /* Do not discard the swap header page! */ se = first_se(si); start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); if (nr_blocks) { err = blkdev_issue_discard(si->bdev, start_block, nr_blocks, GFP_KERNEL); if (err) return err; cond_resched(); } for (se = next_se(se); se; se = next_se(se)) { start_block = se->start_block << (PAGE_SHIFT - 9); nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); err = blkdev_issue_discard(si->bdev, start_block, nr_blocks, GFP_KERNEL); if (err) break; cond_resched(); } return err; /* That will often be -EOPNOTSUPP */ } static struct swap_extent * offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) { struct swap_extent *se; struct rb_node *rb; rb = sis->swap_extent_root.rb_node; while (rb) { se = rb_entry(rb, struct swap_extent, rb_node); if (offset < se->start_page) rb = rb->rb_left; else if (offset >= se->start_page + se->nr_pages) rb = rb->rb_right; else return se; } /* It *must* be present */ BUG(); } sector_t swap_folio_sector(struct folio *folio) { struct swap_info_struct *sis = swp_swap_info(folio->swap); struct swap_extent *se; sector_t sector; pgoff_t offset; offset = swp_offset(folio->swap); se = offset_to_swap_extent(sis, offset); sector = se->start_block + (offset - se->start_page); return sector << (PAGE_SHIFT - 9); } /* * swap allocation tell device that a cluster of swap can now be discarded, * to allow the swap device to optimize its wear-levelling. 
*/ static void discard_swap_cluster(struct swap_info_struct *si, pgoff_t start_page, pgoff_t nr_pages) { struct swap_extent *se = offset_to_swap_extent(si, start_page); while (nr_pages) { pgoff_t offset = start_page - se->start_page; sector_t start_block = se->start_block + offset; sector_t nr_blocks = se->nr_pages - offset; if (nr_blocks > nr_pages) nr_blocks = nr_pages; start_page += nr_blocks; nr_pages -= nr_blocks; start_block <<= PAGE_SHIFT - 9; nr_blocks <<= PAGE_SHIFT - 9; if (blkdev_issue_discard(si->bdev, start_block, nr_blocks, GFP_NOIO)) break; se = next_se(se); } } #ifdef CONFIG_THP_SWAP #define SWAPFILE_CLUSTER HPAGE_PMD_NR #define swap_entry_order(order) (order) #else #define SWAPFILE_CLUSTER 256 /* * Define swap_entry_order() as constant to let compiler to optimize * out some code if !CONFIG_THP_SWAP */ #define swap_entry_order(order) 0 #endif #define LATENCY_LIMIT 256 static inline bool cluster_is_empty(struct swap_cluster_info *info) { return info->count == 0; } static inline bool cluster_is_discard(struct swap_cluster_info *info) { return info->flags == CLUSTER_FLAG_DISCARD; } static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order) { if (unlikely(ci->flags > CLUSTER_FLAG_USABLE)) return false; if (!order) return true; return cluster_is_empty(ci) || order == ci->order; } static inline unsigned int cluster_index(struct swap_info_struct *si, struct swap_cluster_info *ci) { return ci - si->cluster_info; } static inline struct swap_cluster_info *offset_to_cluster(struct swap_info_struct *si, unsigned long offset) { return &si->cluster_info[offset / SWAPFILE_CLUSTER]; } static inline unsigned int cluster_offset(struct swap_info_struct *si, struct swap_cluster_info *ci) { return cluster_index(si, ci) * SWAPFILE_CLUSTER; } static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, unsigned long offset) { struct swap_cluster_info *ci; ci = offset_to_cluster(si, offset); spin_lock(&ci->lock); return ci; } static inline void unlock_cluster(struct swap_cluster_info *ci) { spin_unlock(&ci->lock); } static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, enum swap_cluster_flags new_flags) { VM_WARN_ON(ci->flags == new_flags); BUILD_BUG_ON(1 << sizeof(ci->flags) * BITS_PER_BYTE < CLUSTER_FLAG_MAX); lockdep_assert_held(&ci->lock); spin_lock(&si->lock); if (ci->flags == CLUSTER_FLAG_NONE) list_add_tail(&ci->list, list); else list_move_tail(&ci->list, list); spin_unlock(&si->lock); if (ci->flags == CLUSTER_FLAG_FRAG) atomic_long_dec(&si->frag_cluster_nr[ci->order]); else if (new_flags == CLUSTER_FLAG_FRAG) atomic_long_inc(&si->frag_cluster_nr[ci->order]); ci->flags = new_flags; } /* Add a cluster to discard list and schedule it to do discard */ static void swap_cluster_schedule_discard(struct swap_info_struct *si, struct swap_cluster_info *ci) { unsigned int idx = cluster_index(si, ci); /* * If scan_swap_map_slots() can't find a free cluster, it will check * si->swap_map directly. To make sure the discarding cluster isn't * taken by scan_swap_map_slots(), mark the swap entries bad (occupied). 
 * The SWAP_MAP_BAD marking will be cleared after the discard.
 */
	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
	       SWAP_MAP_BAD, SWAPFILE_CLUSTER);
	VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
	move_cluster(si, ci, &si->discard_clusters, CLUSTER_FLAG_DISCARD);
	schedule_work(&si->discard_work);
}

static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
	lockdep_assert_held(&ci->lock);
	move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
	ci->order = 0;
}

/*
 * Isolate and lock the first cluster that is not contended on a list, and
 * clear its flag before taking it off the list. Cluster flags must be in
 * sync with the list status, so cluster updaters can always know the
 * cluster list status without touching the si lock.
 *
 * Note it's possible that all clusters on a list are contended so
 * this returns NULL for a non-empty list.
 */
static struct swap_cluster_info *isolate_lock_cluster(
		struct swap_info_struct *si, struct list_head *list)
{
	struct swap_cluster_info *ci, *ret = NULL;

	spin_lock(&si->lock);

	if (unlikely(!(si->flags & SWP_WRITEOK)))
		goto out;

	list_for_each_entry(ci, list, list) {
		if (!spin_trylock(&ci->lock))
			continue;

		/* We may only isolate and clear flags of following lists */
		VM_BUG_ON(!ci->flags);
		VM_BUG_ON(ci->flags > CLUSTER_FLAG_USABLE &&
			  ci->flags != CLUSTER_FLAG_FULL);

		list_del(&ci->list);
		ci->flags = CLUSTER_FLAG_NONE;
		ret = ci;
		break;
	}
out:
	spin_unlock(&si->lock);

	return ret;
}

/*
 * Do the actual discard work. After a cluster discard is finished, the
 * cluster will be added to the free cluster list. Discard clusters are a
 * bit special as they don't participate in allocation or reclaim, so
 * clusters marked as CLUSTER_FLAG_DISCARD must remain off-list or on the
 * discard list.
 */
static bool swap_do_scheduled_discard(struct swap_info_struct *si)
{
	struct swap_cluster_info *ci;
	bool ret = false;
	unsigned int idx;

	spin_lock(&si->lock);
	while (!list_empty(&si->discard_clusters)) {
		ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
		/*
		 * Delete the cluster from the list to prepare for discard, but keep
		 * the CLUSTER_FLAG_DISCARD flag: there could be a percpu_cluster
		 * pointing to it, or it could be run into by relocate_cluster.
		 */
		list_del(&ci->list);
		idx = cluster_index(si, ci);
		spin_unlock(&si->lock);
		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
				SWAPFILE_CLUSTER);

		spin_lock(&ci->lock);
		/*
		 * Discard is done, clear its flags as it's off-list, then
		 * return the cluster to the allocation list.
		 */
		ci->flags = CLUSTER_FLAG_NONE;
		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
		       0, SWAPFILE_CLUSTER);
		__free_cluster(si, ci);
		spin_unlock(&ci->lock);
		ret = true;
		spin_lock(&si->lock);
	}
	spin_unlock(&si->lock);
	return ret;
}

static void swap_discard_work(struct work_struct *work)
{
	struct swap_info_struct *si;

	si = container_of(work, struct swap_info_struct, discard_work);

	swap_do_scheduled_discard(si);
}

static void swap_users_ref_free(struct percpu_ref *ref)
{
	struct swap_info_struct *si;

	si = container_of(ref, struct swap_info_struct, users);
	complete(&si->comp);
}

/*
 * Must be called after freeing if ci->count == 0, moves the cluster to the
 * free or discard list.
 */
static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
	VM_BUG_ON(ci->count != 0);
	VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
	lockdep_assert_held(&ci->lock);

	/*
	 * If the swap is discardable, schedule a discard of the cluster
	 * instead of freeing it immediately. The cluster will be freed
	 * after the discard.
 */
	if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
	    (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
		swap_cluster_schedule_discard(si, ci);
		return;
	}

	__free_cluster(si, ci);
}

/*
 * Must be called after freeing if ci->count != 0, moves the cluster to the
 * nonfull list.
 */
static void partial_free_cluster(struct swap_info_struct *si,
				 struct swap_cluster_info *ci)
{
	VM_BUG_ON(!ci->count || ci->count == SWAPFILE_CLUSTER);
	lockdep_assert_held(&ci->lock);

	if (ci->flags != CLUSTER_FLAG_NONFULL)
		move_cluster(si, ci, &si->nonfull_clusters[ci->order],
			     CLUSTER_FLAG_NONFULL);
}

/*
 * Must be called after allocation, moves the cluster to the full or frag
 * list. Note: allocation doesn't acquire the si lock, and may drop the ci
 * lock for reclaim, so the cluster could be anywhere when called.
 */
static void relocate_cluster(struct swap_info_struct *si,
			     struct swap_cluster_info *ci)
{
	lockdep_assert_held(&ci->lock);

	/* Discard cluster must remain off-list or on discard list */
	if (cluster_is_discard(ci))
		return;

	if (!ci->count) {
		free_cluster(si, ci);
	} else if (ci->count != SWAPFILE_CLUSTER) {
		if (ci->flags != CLUSTER_FLAG_FRAG)
			move_cluster(si, ci, &si->frag_clusters[ci->order],
				     CLUSTER_FLAG_FRAG);
	} else {
		if (ci->flags != CLUSTER_FLAG_FULL)
			move_cluster(si, ci, &si->full_clusters,
				     CLUSTER_FLAG_FULL);
	}
}

/*
 * The cluster corresponding to page_nr will be used. The cluster will not be
 * added to the free cluster list and its usage counter will be increased by
 * 1. Only used for initialization.
 */
static void inc_cluster_info_page(struct swap_info_struct *si,
	struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
	struct swap_cluster_info *ci;

	ci = cluster_info + idx;
	ci->count++;

	VM_BUG_ON(ci->count > SWAPFILE_CLUSTER);
	VM_BUG_ON(ci->flags);
}

static bool cluster_reclaim_range(struct swap_info_struct *si,
				  struct swap_cluster_info *ci,
				  unsigned long start, unsigned long end)
{
	unsigned char *map = si->swap_map;
	unsigned long offset = start;
	int nr_reclaim;

	spin_unlock(&ci->lock);
	do {
		switch (READ_ONCE(map[offset])) {
		case 0:
			offset++;
			break;
		case SWAP_HAS_CACHE:
			nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT);
			if (nr_reclaim > 0)
				offset += nr_reclaim;
			else
				goto out;
			break;
		default:
			goto out;
		}
	} while (offset < end);
out:
	spin_lock(&ci->lock);
	/*
	 * Recheck the range whether or not reclaim succeeded: a slot
	 * could have been freed while we were not holding the lock.
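	 *
	 * Return true only if every slot in [start, end) is seen free under
	 * ci->lock; the caller may then go on to allocate the range.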
*/ for (offset = start; offset < end; offset++) if (READ_ONCE(map[offset])) return false; return true; } static bool cluster_scan_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned long start, unsigned int nr_pages, bool *need_reclaim) { unsigned long offset, end = start + nr_pages; unsigned char *map = si->swap_map; for (offset = start; offset < end; offset++) { switch (READ_ONCE(map[offset])) { case 0: continue; case SWAP_HAS_CACHE: if (!vm_swap_full()) return false; *need_reclaim = true; continue; default: return false; } } return true; } static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned int start, unsigned char usage, unsigned int order) { unsigned int nr_pages = 1 << order; lockdep_assert_held(&ci->lock); if (!(si->flags & SWP_WRITEOK)) return false; /* * The first allocation in a cluster makes the * cluster exclusive to this order */ if (cluster_is_empty(ci)) ci->order = order; memset(si->swap_map + start, usage, nr_pages); swap_range_alloc(si, nr_pages); ci->count += nr_pages; return true; } /* Try use a new cluster for current CPU and allocate from it. */ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned long offset, unsigned int order, unsigned char usage) { unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID; unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER); unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); unsigned int nr_pages = 1 << order; bool need_reclaim, ret; lockdep_assert_held(&ci->lock); if (end < nr_pages || ci->count + nr_pages > SWAPFILE_CLUSTER) goto out; for (end -= nr_pages; offset <= end; offset += nr_pages) { need_reclaim = false; if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim)) continue; if (need_reclaim) { ret = cluster_reclaim_range(si, ci, offset, offset + nr_pages); /* * Reclaim drops ci->lock and cluster could be used * by another order. Not checking flag as off-list * cluster has no flag set, and change of list * won't cause fragmentation. 
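			 *
			 * That is why cluster_is_usable() is re-checked below
			 * after reclaim, and why offset is reset to the cluster
			 * start when the cluster ends up empty, so the scan can
			 * start over from the beginning of the now-empty cluster.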
*/ if (!cluster_is_usable(ci, order)) goto out; if (cluster_is_empty(ci)) offset = start; /* Reclaim failed but cluster is usable, try next */ if (!ret) continue; } if (!cluster_alloc_range(si, ci, offset, usage, order)) break; found = offset; offset += nr_pages; if (ci->count < SWAPFILE_CLUSTER && offset <= end) next = offset; break; } out: relocate_cluster(si, ci); unlock_cluster(ci); if (si->flags & SWP_SOLIDSTATE) __this_cpu_write(si->percpu_cluster->next[order], next); else si->global_cluster->next[order] = next; return found; } /* Return true if reclaimed a whole cluster */ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) { long to_scan = 1; unsigned long offset, end; struct swap_cluster_info *ci; unsigned char *map = si->swap_map; int nr_reclaim; if (force) to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER; while ((ci = isolate_lock_cluster(si, &si->full_clusters))) { offset = cluster_offset(si, ci); end = min(si->max, offset + SWAPFILE_CLUSTER); to_scan--; while (offset < end) { if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { spin_unlock(&ci->lock); nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); spin_lock(&ci->lock); if (nr_reclaim) { offset += abs(nr_reclaim); continue; } } offset++; } unlock_cluster(ci); if (to_scan <= 0) break; } } static void swap_reclaim_work(struct work_struct *work) { struct swap_info_struct *si; si = container_of(work, struct swap_info_struct, reclaim_work); swap_reclaim_full_clusters(si, true); } /* * Try to get swap entries with specified order from current cpu's swap entry * pool (a cluster). This might involve allocating a new cluster for current CPU * too. */ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, unsigned char usage) { struct swap_cluster_info *ci; unsigned int offset, found = 0; if (si->flags & SWP_SOLIDSTATE) { /* Fast path using per CPU cluster */ local_lock(&si->percpu_cluster->lock); offset = __this_cpu_read(si->percpu_cluster->next[order]); } else { /* Serialize HDD SWAP allocation for each device. */ spin_lock(&si->global_cluster_lock); offset = si->global_cluster->next[order]; } if (offset) { ci = lock_cluster(si, offset); /* Cluster could have been used by another order */ if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci)) offset = cluster_offset(si, ci); found = alloc_swap_scan_cluster(si, ci, offset, order, usage); } else { unlock_cluster(ci); } if (found) goto done; } new_cluster: ci = isolate_lock_cluster(si, &si->free_clusters); if (ci) { found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), order, usage); if (found) goto done; } /* Try reclaim from full clusters if free clusters list is drained */ if (vm_swap_full()) swap_reclaim_full_clusters(si, false); if (order < PMD_ORDER) { unsigned int frags = 0, frags_existing; while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) { found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), order, usage); if (found) goto done; /* Clusters failed to allocate are moved to frag_clusters */ frags++; } frags_existing = atomic_long_read(&si->frag_cluster_nr[order]); while (frags < frags_existing && (ci = isolate_lock_cluster(si, &si->frag_clusters[order]))) { atomic_long_dec(&si->frag_cluster_nr[order]); /* * Rotate the frag list to iterate, they were all * failing high order allocation or moved here due to * per-CPU usage, but they could contain newly released * reclaimable (eg. lazy-freed swap cache) slots. 
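			 *
			 * The frags counter, compared against frags_existing
			 * sampled above, bounds this loop to the number of frag
			 * clusters that existed when the scan started, so
			 * clusters that fail and get moved back onto the frag
			 * list are not revisited forever.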
*/ found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), order, usage); if (found) goto done; frags++; } } /* * We don't have free cluster but have some clusters in * discarding, do discard now and reclaim them, then * reread cluster_next_cpu since we dropped si->lock */ if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si)) goto new_cluster; if (order) goto done; /* Order 0 stealing from higher order */ for (int o = 1; o < SWAP_NR_ORDERS; o++) { /* * Clusters here have at least one usable slots and can't fail order 0 * allocation, but reclaim may drop si->lock and race with another user. */ while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) { atomic_long_dec(&si->frag_cluster_nr[o]); found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), 0, usage); if (found) goto done; } while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) { found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), 0, usage); if (found) goto done; } } done: if (si->flags & SWP_SOLIDSTATE) local_unlock(&si->percpu_cluster->lock); else spin_unlock(&si->global_cluster_lock); return found; } /* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff) { int nid; unsigned long pages; spin_lock(&swap_avail_lock); if (swapoff) { /* * Forcefully remove it. Clear the SWP_WRITEOK flags for * swapoff here so it's synchronized by both si->lock and * swap_avail_lock, to ensure the result can be seen by * add_to_avail_list. */ lockdep_assert_held(&si->lock); si->flags &= ~SWP_WRITEOK; atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages); } else { /* * If not called by swapoff, take it off-list only if it's * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly * si->inuse_pages == pages), any concurrent slot freeing, * or device already removed from plist by someone else * will make this return false. */ pages = si->pages; if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages, pages | SWAP_USAGE_OFFLIST_BIT)) goto skip; } for_each_node(nid) plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]); skip: spin_unlock(&swap_avail_lock); } /* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */ static void add_to_avail_list(struct swap_info_struct *si, bool swapon) { int nid; long val; unsigned long pages; spin_lock(&swap_avail_lock); /* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */ if (swapon) { lockdep_assert_held(&si->lock); si->flags |= SWP_WRITEOK; } else { if (!(READ_ONCE(si->flags) & SWP_WRITEOK)) goto skip; } if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT)) goto skip; val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages); /* * When device is full and device is on the plist, only one updater will * see (inuse_pages == si->pages) and will call del_from_avail_list. If * that updater happen to be here, just skip adding. */ pages = si->pages; if (val == pages) { /* Just like the cmpxchg in del_from_avail_list */ if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages, pages | SWAP_USAGE_OFFLIST_BIT)) goto skip; } for_each_node(nid) plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]); skip: spin_unlock(&swap_avail_lock); } /* * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock * within each cluster, so the total contribution to the global counter should * always be positive and cannot exceed the total number of usable slots. 
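 *
 * si->inuse_pages therefore packs the usage count together with the
 * SWAP_USAGE_OFFLIST_BIT flag: when the count reaches si->pages the device
 * is taken off the avail plists by swap_usage_add(), and the next
 * swap_usage_sub() that sees the flag set puts it back on.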
*/ static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries) { long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages); /* * If device is full, and SWAP_USAGE_OFFLIST_BIT is not set, * remove it from the plist. */ if (unlikely(val == si->pages)) { del_from_avail_list(si, false); return true; } return false; } static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries) { long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages); /* * If device is not full, and SWAP_USAGE_OFFLIST_BIT is set, * remove it from the plist. */ if (unlikely(val & SWAP_USAGE_OFFLIST_BIT)) add_to_avail_list(si, false); } static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries) { if (swap_usage_add(si, nr_entries)) { if (vm_swap_full()) schedule_work(&si->reclaim_work); } } static void swap_range_free(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries) { unsigned long begin = offset; unsigned long end = offset + nr_entries - 1; void (*swap_slot_free_notify)(struct block_device *, unsigned long); unsigned int i; /* * Use atomic clear_bit operations only on zeromap instead of non-atomic * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes. */ for (i = 0; i < nr_entries; i++) { clear_bit(offset + i, si->zeromap); zswap_invalidate(swp_entry(si->type, offset + i)); } if (si->flags & SWP_BLKDEV) swap_slot_free_notify = si->bdev->bd_disk->fops->swap_slot_free_notify; else swap_slot_free_notify = NULL; while (offset <= end) { arch_swap_invalidate_page(si->type, offset); if (swap_slot_free_notify) swap_slot_free_notify(si->bdev, offset); offset++; } clear_shadow_from_swap_cache(si->type, begin, end); /* * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 * only after the above cleanups are done. */ smp_wmb(); atomic_long_add(nr_entries, &nr_swap_pages); swap_usage_sub(si, nr_entries); } static int cluster_alloc_swap(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[], int order) { int n_ret = 0; while (n_ret < nr) { unsigned long offset = cluster_alloc_swap_entry(si, order, usage); if (!offset) break; slots[n_ret++] = swp_entry(si->type, offset); } return n_ret; } static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[], int order) { unsigned int nr_pages = 1 << order; /* * We try to cluster swap pages by allocating them sequentially * in swap. Once we've allocated SWAPFILE_CLUSTER pages this * way, however, we resort to first-free allocation, starting * a new cluster. This prevents us from scattering swap pages * all over the entire swap partition, so that we reduce * overall disk seek times between swap pages. -- sct * But we do now try to find an empty cluster. -Andrea * And we let swap pages go all over an SSD partition. Hugh */ if (order > 0) { /* * Should not even be attempting large allocations when huge * page swap is disabled. Warn and fail the allocation. */ if (!IS_ENABLED(CONFIG_THP_SWAP) || nr_pages > SWAPFILE_CLUSTER) { VM_WARN_ON_ONCE(1); return 0; } /* * Swapfile is not block device so unable * to allocate large entries. 
*/ if (!(si->flags & SWP_BLKDEV)) return 0; } return cluster_alloc_swap(si, usage, nr, slots, order); } static bool get_swap_device_info(struct swap_info_struct *si) { if (!percpu_ref_tryget_live(&si->users)) return false; /* * Guarantee the si->users are checked before accessing other * fields of swap_info_struct, and si->flags (SWP_WRITEOK) is * up to dated. * * Paired with the spin_unlock() after setup_swap_info() in * enable_swap_info(), and smp_wmb() in swapoff. */ smp_rmb(); return true; } int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order) { int order = swap_entry_order(entry_order); unsigned long size = 1 << order; struct swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; int node; spin_lock(&swap_avail_lock); avail_pgs = atomic_long_read(&nr_swap_pages) / size; if (avail_pgs <= 0) { spin_unlock(&swap_avail_lock); goto noswap; } n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs); atomic_long_sub(n_goal * size, &nr_swap_pages); start_over: node = numa_node_id(); plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { /* requeue si to after same-priority siblings */ plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); spin_unlock(&swap_avail_lock); if (get_swap_device_info(si)) { n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal, swp_entries, order); put_swap_device(si); if (n_ret || size > 1) goto check_out; } spin_lock(&swap_avail_lock); /* * if we got here, it's likely that si was almost full before, * and since scan_swap_map_slots() can drop the si->lock, * multiple callers probably all tried to get a page from the * same si and it filled up before we could get one; or, the si * filled up between us dropping swap_avail_lock and taking * si->lock. Since we dropped the swap_avail_lock, the * swap_avail_head list may have been modified; so if next is * still in the swap_avail_head list then try it, otherwise * start over if we have not gotten any slots. */ if (plist_node_empty(&next->avail_lists[node])) goto start_over; } spin_unlock(&swap_avail_lock); check_out: if (n_ret < n_goal) atomic_long_add((long)(n_goal - n_ret) * size, &nr_swap_pages); noswap: return n_ret; } static struct swap_info_struct *_swap_info_get(swp_entry_t entry) { struct swap_info_struct *si; unsigned long offset; if (!entry.val) goto out; si = swp_swap_info(entry); if (!si) goto bad_nofile; if (data_race(!(si->flags & SWP_USED))) goto bad_device; offset = swp_offset(entry); if (offset >= si->max) goto bad_offset; if (data_race(!si->swap_map[swp_offset(entry)])) goto bad_free; return si; bad_free: pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val); goto out; bad_offset: pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); goto out; bad_device: pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val); goto out; bad_nofile: pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); out: return NULL; } static unsigned char __swap_entry_free_locked(struct swap_info_struct *si, unsigned long offset, unsigned char usage) { unsigned char count; unsigned char has_cache; count = si->swap_map[offset]; has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; if (usage == SWAP_HAS_CACHE) { VM_BUG_ON(!has_cache); has_cache = 0; } else if (count == SWAP_MAP_SHMEM) { /* * Or we could insist on shmem.c using a special * swap_shmem_free() and free_shmem_swap_and_cache()... 
*/ count = 0; } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { if (count == COUNT_CONTINUED) { if (swap_count_continued(si, offset, count)) count = SWAP_MAP_MAX | COUNT_CONTINUED; else count = SWAP_MAP_MAX; } else count--; } usage = count | has_cache; if (usage) WRITE_ONCE(si->swap_map[offset], usage); else WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE); return usage; } /* * When we get a swap entry, if there aren't some other ways to * prevent swapoff, such as the folio in swap cache is locked, RCU * reader side is locked, etc., the swap entry may become invalid * because of swapoff. Then, we need to enclose all swap related * functions with get_swap_device() and put_swap_device(), unless the * swap functions call get/put_swap_device() by themselves. * * RCU reader side lock (including any spinlock) is sufficient to * prevent swapoff, because synchronize_rcu() is called in swapoff() * before freeing data structures. * * Check whether swap entry is valid in the swap device. If so, * return pointer to swap_info_struct, and keep the swap entry valid * via preventing the swap device from being swapoff, until * put_swap_device() is called. Otherwise return NULL. * * Notice that swapoff or swapoff+swapon can still happen before the * percpu_ref_tryget_live() in get_swap_device() or after the * percpu_ref_put() in put_swap_device() if there isn't any other way * to prevent swapoff. The caller must be prepared for that. For * example, the following situation is possible. * * CPU1 CPU2 * do_swap_page() * ... swapoff+swapon * __read_swap_cache_async() * swapcache_prepare() * __swap_duplicate() * // check swap_map * // verify PTE not changed * * In __swap_duplicate(), the swap_map need to be checked before * changing partly because the specified swap entry may be for another * swap device which has been swapoff. And in do_swap_page(), after * the page is read from the swap device, the PTE is verified not * changed with the page table locked to check whether the swap device * has been swapoff or swapoff+swapon. 
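 *
 * A typical caller therefore looks roughly like this (illustrative sketch
 * only):
 *
 *	si = get_swap_device(entry);
 *	if (!si)
 *		return;		// entry went away, e.g. raced with swapoff
 *	... access si / si->swap_map for this entry ...
 *	put_swap_device(si);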
*/ struct swap_info_struct *get_swap_device(swp_entry_t entry) { struct swap_info_struct *si; unsigned long offset; if (!entry.val) goto out; si = swp_swap_info(entry); if (!si) goto bad_nofile; if (!get_swap_device_info(si)) goto out; offset = swp_offset(entry); if (offset >= si->max) goto put_out; return si; bad_nofile: pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); out: return NULL; put_out: pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); percpu_ref_put(&si->users); return NULL; } static unsigned char __swap_entry_free(struct swap_info_struct *si, swp_entry_t entry) { struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); unsigned char usage; ci = lock_cluster(si, offset); usage = __swap_entry_free_locked(si, offset, 1); if (!usage) swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); unlock_cluster(ci); return usage; } static bool __swap_entries_free(struct swap_info_struct *si, swp_entry_t entry, int nr) { unsigned long offset = swp_offset(entry); unsigned int type = swp_type(entry); struct swap_cluster_info *ci; bool has_cache = false; unsigned char count; int i; if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1) goto fallback; /* cross into another cluster */ if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) goto fallback; ci = lock_cluster(si, offset); if (!swap_is_last_map(si, offset, nr, &has_cache)) { unlock_cluster(ci); goto fallback; } for (i = 0; i < nr; i++) WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); if (!has_cache) swap_entry_range_free(si, ci, entry, nr); unlock_cluster(ci); return has_cache; fallback: for (i = 0; i < nr; i++) { if (data_race(si->swap_map[offset + i])) { count = __swap_entry_free(si, swp_entry(type, offset + i)); if (count == SWAP_HAS_CACHE) has_cache = true; } else { WARN_ON_ONCE(1); } } return has_cache; } /* * Drop the last HAS_CACHE flag of swap entries, caller have to * ensure all entries belong to the same cgroup. */ static void swap_entry_range_free(struct swap_info_struct *si, struct swap_cluster_info *ci, swp_entry_t entry, unsigned int nr_pages) { unsigned long offset = swp_offset(entry); unsigned char *map = si->swap_map + offset; unsigned char *map_end = map + nr_pages; /* It should never free entries across different clusters */ VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1)); VM_BUG_ON(cluster_is_empty(ci)); VM_BUG_ON(ci->count < nr_pages); ci->count -= nr_pages; do { VM_BUG_ON(*map != SWAP_HAS_CACHE); *map = 0; } while (++map < map_end); mem_cgroup_uncharge_swap(entry, nr_pages); swap_range_free(si, offset, nr_pages); if (!ci->count) free_cluster(si, ci); else partial_free_cluster(si, ci); } static void cluster_swap_free_nr(struct swap_info_struct *si, unsigned long offset, int nr_pages, unsigned char usage) { struct swap_cluster_info *ci; unsigned long end = offset + nr_pages; ci = lock_cluster(si, offset); do { if (!__swap_entry_free_locked(si, offset, usage)) swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); } while (++offset < end); unlock_cluster(ci); } /* * Caller has made sure that the swap device corresponding to entry * is still around or has not been recycled. 
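 *
 * swap_free_nr() below splits the range at SWAPFILE_CLUSTER boundaries, so
 * that each cluster_swap_free_nr() call only ever touches entries covered
 * by a single cluster lock.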
*/ void swap_free_nr(swp_entry_t entry, int nr_pages) { int nr; struct swap_info_struct *sis; unsigned long offset = swp_offset(entry); sis = _swap_info_get(entry); if (!sis) return; while (nr_pages) { nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); cluster_swap_free_nr(sis, offset, nr, 1); offset += nr; nr_pages -= nr; } } /* * Called after dropping swapcache to decrease refcnt to swap entries. */ void put_swap_folio(struct folio *folio, swp_entry_t entry) { unsigned long offset = swp_offset(entry); struct swap_cluster_info *ci; struct swap_info_struct *si; int size = 1 << swap_entry_order(folio_order(folio)); si = _swap_info_get(entry); if (!si) return; ci = lock_cluster(si, offset); if (swap_is_has_cache(si, offset, size)) swap_entry_range_free(si, ci, entry, size); else { for (int i = 0; i < size; i++, entry.val++) { if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) swap_entry_range_free(si, ci, entry, 1); } } unlock_cluster(ci); } void swapcache_free_entries(swp_entry_t *entries, int n) { int i; struct swap_cluster_info *ci; struct swap_info_struct *si = NULL; if (n <= 0) return; for (i = 0; i < n; ++i) { si = _swap_info_get(entries[i]); if (si) { ci = lock_cluster(si, swp_offset(entries[i])); swap_entry_range_free(si, ci, entries[i], 1); unlock_cluster(ci); } } } int __swap_count(swp_entry_t entry) { struct swap_info_struct *si = swp_swap_info(entry); pgoff_t offset = swp_offset(entry); return swap_count(si->swap_map[offset]); } /* * How many references to @entry are currently swapped out? * This does not give an exact answer when swap count is continued, * but does include the high COUNT_CONTINUED flag to allow for that. */ int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) { pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; int count; ci = lock_cluster(si, offset); count = swap_count(si->swap_map[offset]); unlock_cluster(ci); return count; } /* * How many references to @entry are currently swapped out? * This considers COUNT_CONTINUED so it returns exact answer. 
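 *
 * With continuation counts c1, c2, ... on successive continuation pages,
 * the total computed below is:
 *
 *	count = base
 *	      + c1 * (SWAP_MAP_MAX + 1)
 *	      + c2 * (SWAP_MAP_MAX + 1) * (SWAP_CONT_MAX + 1)
 *	      + ...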
*/ int swp_swapcount(swp_entry_t entry) { int count, tmp_count, n; struct swap_info_struct *si; struct swap_cluster_info *ci; struct page *page; pgoff_t offset; unsigned char *map; si = _swap_info_get(entry); if (!si) return 0; offset = swp_offset(entry); ci = lock_cluster(si, offset); count = swap_count(si->swap_map[offset]); if (!(count & COUNT_CONTINUED)) goto out; count &= ~COUNT_CONTINUED; n = SWAP_MAP_MAX + 1; page = vmalloc_to_page(si->swap_map + offset); offset &= ~PAGE_MASK; VM_BUG_ON(page_private(page) != SWP_CONTINUED); do { page = list_next_entry(page, lru); map = kmap_local_page(page); tmp_count = map[offset]; kunmap_local(map); count += (tmp_count & ~COUNT_CONTINUED) * n; n *= (SWAP_CONT_MAX + 1); } while (tmp_count & COUNT_CONTINUED); out: unlock_cluster(ci); return count; } static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, swp_entry_t entry, int order) { struct swap_cluster_info *ci; unsigned char *map = si->swap_map; unsigned int nr_pages = 1 << order; unsigned long roffset = swp_offset(entry); unsigned long offset = round_down(roffset, nr_pages); int i; bool ret = false; ci = lock_cluster(si, offset); if (nr_pages == 1) { if (swap_count(map[roffset])) ret = true; goto unlock_out; } for (i = 0; i < nr_pages; i++) { if (swap_count(map[offset + i])) { ret = true; break; } } unlock_out: unlock_cluster(ci); return ret; } static bool folio_swapped(struct folio *folio) { swp_entry_t entry = folio->swap; struct swap_info_struct *si = _swap_info_get(entry); if (!si) return false; if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) return swap_swapcount(si, entry) != 0; return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); } static bool folio_swapcache_freeable(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (!folio_test_swapcache(folio)) return false; if (folio_test_writeback(folio)) return false; /* * Once hibernation has begun to create its image of memory, * there's a danger that one of the calls to folio_free_swap() * - most probably a call from __try_to_reclaim_swap() while * hibernation is allocating its own swap pages for the image, * but conceivably even a call from memory reclaim - will free * the swap from a folio which has already been recorded in the * image as a clean swapcache folio, and then reuse its swap for * another page of the image. On waking from hibernation, the * original folio might be freed under memory pressure, then * later read back in from swap, now with the wrong data. * * Hibernation suspends storage while it is writing the image * to disk so check that here. */ if (pm_suspended_storage()) return false; return true; } /** * folio_free_swap() - Free the swap space used for this folio. * @folio: The folio to remove. * * If swap is getting full, or if there are no more mappings of this folio, * then call folio_free_swap to free its swap space. * * Return: true if we were able to release the swap space. */ bool folio_free_swap(struct folio *folio) { if (!folio_swapcache_freeable(folio)) return false; if (folio_swapped(folio)) return false; delete_from_swap_cache(folio); folio_set_dirty(folio); return true; } /** * free_swap_and_cache_nr() - Release reference on range of swap entries and * reclaim their cache if no more references remain. * @entry: First entry of range. * @nr: Number of entries in range. * * For each swap entry in the contiguous range, release a reference. If any swap * entries become free, try to reclaim their underlying folios, if present. 
The * offset range is defined by [entry.offset, entry.offset + nr). */ void free_swap_and_cache_nr(swp_entry_t entry, int nr) { const unsigned long start_offset = swp_offset(entry); const unsigned long end_offset = start_offset + nr; struct swap_info_struct *si; bool any_only_cache = false; unsigned long offset; if (non_swap_entry(entry)) return; si = get_swap_device(entry); if (!si) return; if (WARN_ON(end_offset > si->max)) goto out; /* * First free all entries in the range. */ any_only_cache = __swap_entries_free(si, entry, nr); /* * Short-circuit the below loop if none of the entries had their * reference drop to zero. */ if (!any_only_cache) goto out; /* * Now go back over the range trying to reclaim the swap cache. This is * more efficient for large folios because we will only try to reclaim * the swap once per folio in the common case. If we do * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the * latter will get a reference and lock the folio for every individual * page but will only succeed once the swap slot for every subpage is * zero. */ for (offset = start_offset; offset < end_offset; offset += nr) { nr = 1; if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { /* * Folios are always naturally aligned in swap so * advance forward to the next boundary. Zero means no * folio was found for the swap entry, so advance by 1 * in this case. Negative value means folio was found * but could not be reclaimed. Here we can still advance * to the next boundary. */ nr = __try_to_reclaim_swap(si, offset, TTRS_UNMAPPED | TTRS_FULL); if (nr == 0) nr = 1; else if (nr < 0) nr = -nr; nr = ALIGN(offset + 1, nr) - offset; } } out: put_swap_device(si); } #ifdef CONFIG_HIBERNATION swp_entry_t get_swap_page_of_type(int type) { struct swap_info_struct *si = swap_type_to_swap_info(type); swp_entry_t entry = {0}; if (!si) goto fail; /* This is called for allocating swap entry, not cache */ if (get_swap_device_info(si)) { if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0)) atomic_long_dec(&nr_swap_pages); put_swap_device(si); } fail: return entry; } /* * Find the swap type that corresponds to given device (if any). * * @offset - number of the PAGE_SIZE-sized block of the device, starting * from 0, in which the swap header is expected to be located. * * This is needed for the suspend to disk (aka swsusp). */ int swap_type_of(dev_t device, sector_t offset) { int type; if (!device) return -1; spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *sis = swap_info[type]; if (!(sis->flags & SWP_WRITEOK)) continue; if (device == sis->bdev->bd_dev) { struct swap_extent *se = first_se(sis); if (se->start_block == offset) { spin_unlock(&swap_lock); return type; } } } spin_unlock(&swap_lock); return -ENODEV; } int find_first_swap(dev_t *device) { int type; spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *sis = swap_info[type]; if (!(sis->flags & SWP_WRITEOK)) continue; *device = sis->bdev->bd_dev; spin_unlock(&swap_lock); return type; } spin_unlock(&swap_lock); return -ENODEV; } /* * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev * corresponding to given index in swap_info (swap type). 
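 *
 * This uses the same extent lookup as swap_folio_sector(), but returns the
 * result in PAGE_SIZE block units rather than 512-byte sectors.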
*/ sector_t swapdev_block(int type, pgoff_t offset) { struct swap_info_struct *si = swap_type_to_swap_info(type); struct swap_extent *se; if (!si || !(si->flags & SWP_WRITEOK)) return 0; se = offset_to_swap_extent(si, offset); return se->start_block + (offset - se->start_page); } /* * Return either the total number of swap pages of given type, or the number * of free pages of that type (depending on @free) * * This is needed for software suspend */ unsigned int count_swap_pages(int type, int free) { unsigned int n = 0; spin_lock(&swap_lock); if ((unsigned int)type < nr_swapfiles) { struct swap_info_struct *sis = swap_info[type]; spin_lock(&sis->lock); if (sis->flags & SWP_WRITEOK) { n = sis->pages; if (free) n -= swap_usage_in_pages(sis); } spin_unlock(&sis->lock); } spin_unlock(&swap_lock); return n; } #endif /* CONFIG_HIBERNATION */ static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) { return pte_same(pte_swp_clear_flags(pte), swp_pte); } /* * No need to decide whether this PTE shares the swap entry with others, * just let do_wp_page work it out if a write is requested later - to * force COW, vm_page_prot omits write permission from any private vma. */ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct folio *folio) { struct page *page; struct folio *swapcache; spinlock_t *ptl; pte_t *pte, new_pte, old_pte; bool hwpoisoned = false; int ret = 1; swapcache = folio; folio = ksm_might_need_to_copy(folio, vma, addr); if (unlikely(!folio)) return -ENOMEM; else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { hwpoisoned = true; folio = swapcache; } page = folio_file_page(folio, swp_offset(entry)); if (PageHWPoison(page)) hwpoisoned = true; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte), swp_entry_to_pte(entry)))) { ret = 0; goto out; } old_pte = ptep_get(pte); if (unlikely(hwpoisoned || !folio_test_uptodate(folio))) { swp_entry_t swp_entry; dec_mm_counter(vma->vm_mm, MM_SWAPENTS); if (hwpoisoned) { swp_entry = make_hwpoison_entry(page); } else { swp_entry = make_poisoned_swp_entry(); } new_pte = swp_entry_to_pte(swp_entry); ret = 0; goto setpte; } /* * Some architectures may have to restore extra metadata to the page * when reading from swap. This metadata may be indexed by swap entry * so this must be called before swap_free(). */ arch_swap_restore(folio_swap(entry, folio), folio); dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); folio_get(folio); if (folio == swapcache) { rmap_t rmap_flags = RMAP_NONE; /* * See do_swap_page(): writeback would be problematic. * However, we do a folio_wait_writeback() just before this * call and have the folio locked. */ VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); if (pte_swp_exclusive(old_pte)) rmap_flags |= RMAP_EXCLUSIVE; /* * We currently only expect small !anon folios, which are either * fully exclusive or fully shared. If we ever get large folios * here, we have to be careful. 
*/ if (!folio_test_anon(folio)) { VM_WARN_ON_ONCE(folio_test_large(folio)); VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); folio_add_new_anon_rmap(folio, vma, addr, rmap_flags); } else { folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags); } } else { /* ksm created a completely new copy */ folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); if (pte_swp_soft_dirty(old_pte)) new_pte = pte_mksoft_dirty(new_pte); if (pte_swp_uffd_wp(old_pte)) new_pte = pte_mkuffd_wp(new_pte); setpte: set_pte_at(vma->vm_mm, addr, pte, new_pte); swap_free(entry); out: if (pte) pte_unmap_unlock(pte, ptl); if (folio != swapcache) { folio_unlock(folio); folio_put(folio); } return ret; } static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned int type) { pte_t *pte = NULL; struct swap_info_struct *si; si = swap_info[type]; do { struct folio *folio; unsigned long offset; unsigned char swp_count; swp_entry_t entry; int ret; pte_t ptent; if (!pte++) { pte = pte_offset_map(pmd, addr); if (!pte) break; } ptent = ptep_get_lockless(pte); if (!is_swap_pte(ptent)) continue; entry = pte_to_swp_entry(ptent); if (swp_type(entry) != type) continue; offset = swp_offset(entry); pte_unmap(pte); pte = NULL; folio = swap_cache_get_folio(entry, vma, addr); if (!folio) { struct vm_fault vmf = { .vma = vma, .address = addr, .real_address = addr, .pmd = pmd, }; folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); } if (!folio) { swp_count = READ_ONCE(si->swap_map[offset]); if (swp_count == 0 || swp_count == SWAP_MAP_BAD) continue; return -ENOMEM; } folio_lock(folio); folio_wait_writeback(folio); ret = unuse_pte(vma, pmd, addr, entry, folio); if (ret < 0) { folio_unlock(folio); folio_put(folio); return ret; } folio_free_swap(folio); folio_unlock(folio); folio_put(folio); } while (addr += PAGE_SIZE, addr != end); if (pte) pte_unmap(pte); return 0; } static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, unsigned int type) { pmd_t *pmd; unsigned long next; int ret; pmd = pmd_offset(pud, addr); do { cond_resched(); next = pmd_addr_end(addr, end); ret = unuse_pte_range(vma, pmd, addr, next, type); if (ret) return ret; } while (pmd++, addr = next, addr != end); return 0; } static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned int type) { pud_t *pud; unsigned long next; int ret; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; ret = unuse_pmd_range(vma, pud, addr, next, type); if (ret) return ret; } while (pud++, addr = next, addr != end); return 0; } static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned int type) { p4d_t *p4d; unsigned long next; int ret; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; ret = unuse_pud_range(vma, p4d, addr, next, type); if (ret) return ret; } while (p4d++, addr = next, addr != end); return 0; } static int unuse_vma(struct vm_area_struct *vma, unsigned int type) { pgd_t *pgd; unsigned long addr, end, next; int ret; addr = vma->vm_start; end = vma->vm_end; pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; ret = unuse_p4d_range(vma, pgd, addr, next, type); if (ret) return 
ret; } while (pgd++, addr = next, addr != end); return 0; } static int unuse_mm(struct mm_struct *mm, unsigned int type) { struct vm_area_struct *vma; int ret = 0; VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); for_each_vma(vmi, vma) { if (vma->anon_vma && !is_vm_hugetlb_page(vma)) { ret = unuse_vma(vma, type); if (ret) break; } cond_resched(); } mmap_read_unlock(mm); return ret; } /* * Scan swap_map from current position to next entry still in use. * Return 0 if there are no inuse entries after prev till end of * the map. */ static unsigned int find_next_to_unuse(struct swap_info_struct *si, unsigned int prev) { unsigned int i; unsigned char count; /* * No need for swap_lock here: we're just looking * for whether an entry is in use, not modifying it; false * hits are okay, and sys_swapoff() has already prevented new * allocations from this area (while holding swap_lock). */ for (i = prev + 1; i < si->max; i++) { count = READ_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) break; if ((i % LATENCY_LIMIT) == 0) cond_resched(); } if (i == si->max) i = 0; return i; } static int try_to_unuse(unsigned int type) { struct mm_struct *prev_mm; struct mm_struct *mm; struct list_head *p; int retval = 0; struct swap_info_struct *si = swap_info[type]; struct folio *folio; swp_entry_t entry; unsigned int i; if (!swap_usage_in_pages(si)) goto success; retry: retval = shmem_unuse(type); if (retval) return retval; prev_mm = &init_mm; mmget(prev_mm); spin_lock(&mmlist_lock); p = &init_mm.mmlist; while (swap_usage_in_pages(si) && !signal_pending(current) && (p = p->next) != &init_mm.mmlist) { mm = list_entry(p, struct mm_struct, mmlist); if (!mmget_not_zero(mm)) continue; spin_unlock(&mmlist_lock); mmput(prev_mm); prev_mm = mm; retval = unuse_mm(mm, type); if (retval) { mmput(prev_mm); return retval; } /* * Make sure that we aren't completely killing * interactive performance. */ cond_resched(); spin_lock(&mmlist_lock); } spin_unlock(&mmlist_lock); mmput(prev_mm); i = 0; while (swap_usage_in_pages(si) && !signal_pending(current) && (i = find_next_to_unuse(si, i)) != 0) { entry = swp_entry(type, i); folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); if (IS_ERR(folio)) continue; /* * It is conceivable that a racing task removed this folio from * swap cache just before we acquired the page lock. The folio * might even be back in swap cache on another swap area. But * that is okay, folio_free_swap() only removes stale folios. */ folio_lock(folio); folio_wait_writeback(folio); folio_free_swap(folio); folio_unlock(folio); folio_put(folio); } /* * Lets check again to see if there are still swap entries in the map. * If yes, we would need to do retry the unuse logic again. * Under global memory pressure, swap entries can be reinserted back * into process space after the mmlist loop above passes over them. * * Limit the number of retries? No: when mmget_not_zero() * above fails, that mm is likely to be freeing swap from * exit_mmap(), which proceeds at its own independent pace; * and even shmem_writepage() could have been preempted after * folio_alloc_swap(), temporarily hiding that swap. It's easy * and robust (though cpu-intensive) just to keep retrying. */ if (swap_usage_in_pages(si)) { if (!signal_pending(current)) goto retry; return -EINTR; } success: /* * Make sure that further cleanups after try_to_unuse() returns happen * after swap_range_free() reduces si->inuse_pages to 0. 
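	 *
	 * This pairs with the smp_wmb() in swap_range_free(), which publishes
	 * its cleanups before dropping si->inuse_pages.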
*/ smp_mb(); return 0; } /* * After a successful try_to_unuse, if no swap is now in use, we know * we can empty the mmlist. swap_lock must be held on entry and exit. * Note that mmlist_lock nests inside swap_lock, and an mm must be * added to the mmlist just after page_duplicate - before would be racy. */ static void drain_mmlist(void) { struct list_head *p, *next; unsigned int type; for (type = 0; type < nr_swapfiles; type++) if (swap_usage_in_pages(swap_info[type])) return; spin_lock(&mmlist_lock); list_for_each_safe(p, next, &init_mm.mmlist) list_del_init(p); spin_unlock(&mmlist_lock); } /* * Free all of a swapdev's extent information */ static void destroy_swap_extents(struct swap_info_struct *sis) { while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) { struct rb_node *rb = sis->swap_extent_root.rb_node; struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node); rb_erase(rb, &sis->swap_extent_root); kfree(se); } if (sis->flags & SWP_ACTIVATED) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; sis->flags &= ~SWP_ACTIVATED; if (mapping->a_ops->swap_deactivate) mapping->a_ops->swap_deactivate(swap_file); } } /* * Add a block range (and the corresponding page range) into this swapdev's * extent tree. * * This function rather assumes that it is called in ascending page order. */ int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) { struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL; struct swap_extent *se; struct swap_extent *new_se; /* * place the new node at the right most since the * function is called in ascending page order. */ while (*link) { parent = *link; link = &parent->rb_right; } if (parent) { se = rb_entry(parent, struct swap_extent, rb_node); BUG_ON(se->start_page + se->nr_pages != start_page); if (se->start_block + se->nr_pages == start_block) { /* Merge it */ se->nr_pages += nr_pages; return 0; } } /* No merge, insert a new extent. */ new_se = kmalloc(sizeof(*se), GFP_KERNEL); if (new_se == NULL) return -ENOMEM; new_se->start_page = start_page; new_se->nr_pages = nr_pages; new_se->start_block = start_block; rb_link_node(&new_se->rb_node, parent, link); rb_insert_color(&new_se->rb_node, &sis->swap_extent_root); return 1; } EXPORT_SYMBOL_GPL(add_swap_extent); /* * A `swap extent' is a simple thing which maps a contiguous range of pages * onto a contiguous range of disk blocks. A rbtree of swap extents is * built at swapon time and is then used at swap_writepage/swap_read_folio * time for locating where on disk a page belongs. * * If the swapfile is an S_ISBLK block device, a single extent is installed. * This is done so that the main operating code can treat S_ISBLK and S_ISREG * swap files identically. * * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap * extent rbtree operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK * swapfiles are handled *identically* after swapon time. * * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks * and will parse them into a rbtree, in PAGE_SIZE chunks. If some stray * blocks are found which do not fall within the PAGE_SIZE alignment * requirements, they are simply tossed out - we will never use those blocks * for swapping. * * For all swap devices we set S_SWAPFILE across the life of the swapon. This * prevents users from writing to the swap device, which will corrupt memory. 
* * The amount of disk space which a single swap extent represents varies. * Typically it is in the 1-4 megabyte range. So we can have hundreds of * extents in the rbtree. - akpm. */ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; int ret; if (S_ISBLK(inode->i_mode)) { ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; return ret; } if (mapping->a_ops->swap_activate) { ret = mapping->a_ops->swap_activate(sis, swap_file, span); if (ret < 0) return ret; sis->flags |= SWP_ACTIVATED; if ((sis->flags & SWP_FS_OPS) && sio_pool_init() != 0) { destroy_swap_extents(sis); return -ENOMEM; } return ret; } return generic_swapfile_activate(sis, swap_file, span); } static int swap_node(struct swap_info_struct *si) { struct block_device *bdev; if (si->bdev) bdev = si->bdev; else bdev = si->swap_file->f_inode->i_sb->s_bdev; return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; } static void setup_swap_info(struct swap_info_struct *si, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info, unsigned long *zeromap) { int i; if (prio >= 0) si->prio = prio; else si->prio = --least_priority; /* * the plist prio is negated because plist ordering is * low-to-high, while swap ordering is high-to-low */ si->list.prio = -si->prio; for_each_node(i) { if (si->prio >= 0) si->avail_lists[i].prio = -si->prio; else { if (swap_node(si) == i) si->avail_lists[i].prio = 1; else si->avail_lists[i].prio = -si->prio; } } si->swap_map = swap_map; si->cluster_info = cluster_info; si->zeromap = zeromap; } static void _enable_swap_info(struct swap_info_struct *si) { atomic_long_add(si->pages, &nr_swap_pages); total_swap_pages += si->pages; assert_spin_locked(&swap_lock); /* * both lists are plists, and thus priority ordered. * swap_active_head needs to be priority ordered for swapoff(), * which on removal of any swap_info_struct with an auto-assigned * (i.e. negative) priority increments the auto-assigned priority * of any lower-priority swap_info_structs. * swap_avail_head needs to be priority ordered for folio_alloc_swap(), * which allocates swap pages from the highest available priority * swap_info_struct. */ plist_add(&si->list, &swap_active_head); /* Add back to available list */ add_to_avail_list(si, true); } static void enable_swap_info(struct swap_info_struct *si, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info, unsigned long *zeromap) { spin_lock(&swap_lock); spin_lock(&si->lock); setup_swap_info(si, prio, swap_map, cluster_info, zeromap); spin_unlock(&si->lock); spin_unlock(&swap_lock); /* * Finished initializing swap device, now it's safe to reference it. */ percpu_ref_resurrect(&si->users); spin_lock(&swap_lock); spin_lock(&si->lock); _enable_swap_info(si); spin_unlock(&si->lock); spin_unlock(&swap_lock); } static void reinsert_swap_info(struct swap_info_struct *si) { spin_lock(&swap_lock); spin_lock(&si->lock); setup_swap_info(si, si->prio, si->swap_map, si->cluster_info, si->zeromap); _enable_swap_info(si); spin_unlock(&si->lock); spin_unlock(&swap_lock); } static bool __has_usable_swap(void) { return !plist_head_empty(&swap_active_head); } bool has_usable_swap(void) { bool ret; spin_lock(&swap_lock); ret = __has_usable_swap(); spin_unlock(&swap_lock); return ret; } /* * Called after clearing SWP_WRITEOK, ensures cluster_alloc_range * see the updated flags, so there will be no more allocations. 
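 *
 * Taking and releasing every cluster lock in turn is enough: any allocator
 * that raced with clearing SWP_WRITEOK either finished before we got that
 * cluster's lock, or will observe the cleared flag in cluster_alloc_range()
 * afterwards.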
 */
static void wait_for_allocation(struct swap_info_struct *si)
{
	unsigned long offset;
	unsigned long end = ALIGN(si->max, SWAPFILE_CLUSTER);
	struct swap_cluster_info *ci;

	BUG_ON(si->flags & SWP_WRITEOK);

	for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) {
		ci = lock_cluster(si, offset);
		unlock_cluster(ci);
	}
}

SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
	struct swap_info_struct *p = NULL;
	unsigned char *swap_map;
	unsigned long *zeromap;
	struct swap_cluster_info *cluster_info;
	struct file *swap_file, *victim;
	struct address_space *mapping;
	struct inode *inode;
	struct filename *pathname;
	int err, found = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	BUG_ON(!current->mm);

	pathname = getname(specialfile);
	if (IS_ERR(pathname))
		return PTR_ERR(pathname);

	victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
	err = PTR_ERR(victim);
	if (IS_ERR(victim))
		goto out;

	mapping = victim->f_mapping;
	spin_lock(&swap_lock);
	plist_for_each_entry(p, &swap_active_head, list) {
		if (p->flags & SWP_WRITEOK) {
			if (p->swap_file->f_mapping == mapping) {
				found = 1;
				break;
			}
		}
	}
	if (!found) {
		err = -EINVAL;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	if (!security_vm_enough_memory_mm(current->mm, p->pages))
		vm_unacct_memory(p->pages);
	else {
		err = -ENOMEM;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	spin_lock(&p->lock);
	del_from_avail_list(p, true);
	if (p->prio < 0) {
		struct swap_info_struct *si = p;
		int nid;

		plist_for_each_entry_continue(si, &swap_active_head, list) {
			si->prio++;
			si->list.prio--;
			for_each_node(nid) {
				if (si->avail_lists[nid].prio != 1)
					si->avail_lists[nid].prio--;
			}
		}
		least_priority++;
	}
	plist_del(&p->list, &swap_active_head);
	atomic_long_sub(p->pages, &nr_swap_pages);
	total_swap_pages -= p->pages;
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);

	wait_for_allocation(p);

	disable_swap_slots_cache_lock();

	set_current_oom_origin();
	err = try_to_unuse(p->type);
	clear_current_oom_origin();

	if (err) {
		/* re-insert swap space back into swap_list */
		reinsert_swap_info(p);
		reenable_swap_slots_cache_unlock();
		goto out_dput;
	}

	reenable_swap_slots_cache_unlock();

	/*
	 * Wait for swap operations protected by get/put_swap_device()
	 * to complete. Because of synchronize_rcu() here, all swap
	 * operations protected by RCU reader side lock (including any
	 * spinlock) will be waited for too. This makes it easy to
	 * prevent folio_test_swapcache() and the following swap cache
	 * operations from racing with swapoff.
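	 *
	 * Readers that already took a reference via get_swap_device_info()
	 * are drained by the wait_for_completion(&p->comp) below, which is
	 * completed from swap_users_ref_free() once the last reference to
	 * p->users is dropped.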
*/ percpu_ref_kill(&p->users); synchronize_rcu(); wait_for_completion(&p->comp); flush_work(&p->discard_work); flush_work(&p->reclaim_work); destroy_swap_extents(p); if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); if (!p->bdev || !bdev_nonrot(p->bdev)) atomic_dec(&nr_rotate_swap); mutex_lock(&swapon_mutex); spin_lock(&swap_lock); spin_lock(&p->lock); drain_mmlist(); swap_file = p->swap_file; p->swap_file = NULL; p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; zeromap = p->zeromap; p->zeromap = NULL; cluster_info = p->cluster_info; p->cluster_info = NULL; spin_unlock(&p->lock); spin_unlock(&swap_lock); arch_swap_invalidate_area(p->type); zswap_swapoff(p->type); mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; kfree(p->global_cluster); p->global_cluster = NULL; vfree(swap_map); kvfree(zeromap); kvfree(cluster_info); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); exit_swap_address_space(p->type); inode = mapping->host; inode_lock(inode); inode->i_flags &= ~S_SWAPFILE; inode_unlock(inode); filp_close(swap_file, NULL); /* * Clear the SWP_USED flag after all resources are freed so that swapon * can reuse this swap_info in alloc_swap_info() safely. It is ok to * not hold p->lock after we cleared its SWP_WRITEOK. */ spin_lock(&swap_lock); p->flags = 0; spin_unlock(&swap_lock); err = 0; atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); out_dput: filp_close(victim, NULL); out: putname(pathname); return err; } #ifdef CONFIG_PROC_FS static __poll_t swaps_poll(struct file *file, poll_table *wait) { struct seq_file *seq = file->private_data; poll_wait(file, &proc_poll_wait, wait); if (seq->poll_event != atomic_read(&proc_poll_event)) { seq->poll_event = atomic_read(&proc_poll_event); return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI; } return EPOLLIN | EPOLLRDNORM; } /* iterator */ static void *swap_start(struct seq_file *swap, loff_t *pos) { struct swap_info_struct *si; int type; loff_t l = *pos; mutex_lock(&swapon_mutex); if (!l) return SEQ_START_TOKEN; for (type = 0; (si = swap_type_to_swap_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; if (!--l) return si; } return NULL; } static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) { struct swap_info_struct *si = v; int type; if (v == SEQ_START_TOKEN) type = 0; else type = si->type + 1; ++(*pos); for (; (si = swap_type_to_swap_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; return si; } return NULL; } static void swap_stop(struct seq_file *swap, void *v) { mutex_unlock(&swapon_mutex); } static int swap_show(struct seq_file *swap, void *v) { struct swap_info_struct *si = v; struct file *file; int len; unsigned long bytes, inuse; if (si == SEQ_START_TOKEN) { seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); return 0; } bytes = K(si->pages); inuse = K(swap_usage_in_pages(si)); file = si->swap_file; len = seq_file_path(swap, file, " \t\n\\"); seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file_inode(file)->i_mode) ? "partition" : "file\t", bytes, bytes < 10000000 ? "\t" : "", inuse, inuse < 10000000 ? 
"\t" : "", si->prio); return 0; } static const struct seq_operations swaps_op = { .start = swap_start, .next = swap_next, .stop = swap_stop, .show = swap_show }; static int swaps_open(struct inode *inode, struct file *file) { struct seq_file *seq; int ret; ret = seq_open(file, &swaps_op); if (ret) return ret; seq = file->private_data; seq->poll_event = atomic_read(&proc_poll_event); return 0; } static const struct proc_ops swaps_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = swaps_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release, .proc_poll = swaps_poll, }; static int __init procswaps_init(void) { proc_create("swaps", 0, NULL, &swaps_proc_ops); return 0; } __initcall(procswaps_init); #endif /* CONFIG_PROC_FS */ #ifdef MAX_SWAPFILES_CHECK static int __init max_swapfiles_check(void) { MAX_SWAPFILES_CHECK(); return 0; } late_initcall(max_swapfiles_check); #endif static struct swap_info_struct *alloc_swap_info(void) { struct swap_info_struct *p; struct swap_info_struct *defer = NULL; unsigned int type; int i; p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); if (percpu_ref_init(&p->users, swap_users_ref_free, PERCPU_REF_INIT_DEAD, GFP_KERNEL)) { kvfree(p); return ERR_PTR(-ENOMEM); } spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { if (!(swap_info[type]->flags & SWP_USED)) break; } if (type >= MAX_SWAPFILES) { spin_unlock(&swap_lock); percpu_ref_exit(&p->users); kvfree(p); return ERR_PTR(-EPERM); } if (type >= nr_swapfiles) { p->type = type; /* * Publish the swap_info_struct after initializing it. * Note that kvzalloc() above zeroes all its fields. */ smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */ nr_swapfiles++; } else { defer = p; p = swap_info[type]; /* * Do not memset this entry: a racing procfs swap_next() * would be relying on p->type to remain valid. */ } p->swap_extent_root = RB_ROOT; plist_node_init(&p->list, 0); for_each_node(i) plist_node_init(&p->avail_lists[i], 0); p->flags = SWP_USED; spin_unlock(&swap_lock); if (defer) { percpu_ref_exit(&defer->users); kvfree(defer); } spin_lock_init(&p->lock); spin_lock_init(&p->cont_lock); atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT); init_completion(&p->comp); return p; } static int claim_swapfile(struct swap_info_struct *si, struct inode *inode) { if (S_ISBLK(inode->i_mode)) { si->bdev = I_BDEV(inode); /* * Zoned block devices contain zones that have a sequential * write only restriction. Hence zoned block devices are not * suitable for swapping. Disallow them here. */ if (bdev_is_zoned(si->bdev)) return -EINVAL; si->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { si->bdev = inode->i_sb->s_bdev; } return 0; } /* * Find out how many pages are allowed for a single swap device. There * are two limiting factors: * 1) the number of bits for the swap offset in the swp_entry_t type, and * 2) the number of bits in the swap pte, as defined by the different * architectures. * * In order to find the largest possible bit mask, a swap entry with * swap type 0 and swap offset ~0UL is created, encoded to a swap pte, * decoded to a swp_entry_t again, and finally the swap offset is * extracted. * * This will mask all the bits from the initial ~0UL mask that can't * be encoded in either the swp_entry_t or the architecture definition * of a swap pte. 
*/ unsigned long generic_max_swapfile_size(void) { return swp_offset(pte_to_swp_entry( swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; } /* Can be overridden by an architecture for additional checks. */ __weak unsigned long arch_max_swapfile_size(void) { return generic_max_swapfile_size(); } static unsigned long read_swap_header(struct swap_info_struct *si, union swap_header *swap_header, struct inode *inode) { int i; unsigned long maxpages; unsigned long swapfilepages; unsigned long last_page; if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { pr_err("Unable to find swap-space signature\n"); return 0; } /* swap partition endianness hack... */ if (swab32(swap_header->info.version) == 1) { swab32s(&swap_header->info.version); swab32s(&swap_header->info.last_page); swab32s(&swap_header->info.nr_badpages); if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) return 0; for (i = 0; i < swap_header->info.nr_badpages; i++) swab32s(&swap_header->info.badpages[i]); } /* Check the swap header's sub-version */ if (swap_header->info.version != 1) { pr_warn("Unable to handle swap header version %d\n", swap_header->info.version); return 0; } maxpages = swapfile_maximum_size; last_page = swap_header->info.last_page; if (!last_page) { pr_warn("Empty swap-file\n"); return 0; } if (last_page > maxpages) { pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", K(maxpages), K(last_page)); } if (maxpages > last_page) { maxpages = last_page + 1; /* p->max is an unsigned int: don't overflow it */ if ((unsigned int)maxpages == 0) maxpages = UINT_MAX; } if (!maxpages) return 0; swapfilepages = i_size_read(inode) >> PAGE_SHIFT; if (swapfilepages && maxpages > swapfilepages) { pr_warn("Swap area shorter than signature indicates\n"); return 0; } if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) return 0; if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) return 0; return maxpages; } #define SWAP_CLUSTER_INFO_COLS \ DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) #define SWAP_CLUSTER_SPACE_COLS \ DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) #define SWAP_CLUSTER_COLS \ max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) static int setup_swap_map_and_extents(struct swap_info_struct *si, union swap_header *swap_header, unsigned char *swap_map, unsigned long maxpages, sector_t *span) { unsigned int nr_good_pages; unsigned long i; int nr_extents; nr_good_pages = maxpages - 1; /* omit header page */ for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; if (page_nr == 0 || page_nr > swap_header->info.last_page) return -EINVAL; if (page_nr < maxpages) { swap_map[page_nr] = SWAP_MAP_BAD; nr_good_pages--; } } if (nr_good_pages) { swap_map[0] = SWAP_MAP_BAD; si->max = maxpages; si->pages = nr_good_pages; nr_extents = setup_swap_extents(si, span); if (nr_extents < 0) return nr_extents; nr_good_pages = si->pages; } if (!nr_good_pages) { pr_warn("Empty swap-file\n"); return -EINVAL; } return nr_extents; } static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, union swap_header *swap_header, unsigned long maxpages) { unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); struct swap_cluster_info *cluster_info; unsigned long i, j, k, idx; int cpu, err = -ENOMEM; cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL); if (!cluster_info) goto err; for (i = 0; i < nr_clusters; i++) spin_lock_init(&cluster_info[i].lock); if (si->flags & 
SWP_SOLIDSTATE) { si->percpu_cluster = alloc_percpu(struct percpu_cluster); if (!si->percpu_cluster) goto err_free; for_each_possible_cpu(cpu) { struct percpu_cluster *cluster; cluster = per_cpu_ptr(si->percpu_cluster, cpu); for (i = 0; i < SWAP_NR_ORDERS; i++) cluster->next[i] = SWAP_ENTRY_INVALID; local_lock_init(&cluster->lock); } } else { si->global_cluster = kmalloc(sizeof(*si->global_cluster), GFP_KERNEL); if (!si->global_cluster) goto err_free; for (i = 0; i < SWAP_NR_ORDERS; i++) si->global_cluster->next[i] = SWAP_ENTRY_INVALID; spin_lock_init(&si->global_cluster_lock); } /* * Mark unusable pages as unavailable. The clusters aren't * marked free yet, so no list operations are involved yet. * * See setup_swap_map_and_extents(): header page, bad pages, * and the EOF part of the last cluster. */ inc_cluster_info_page(si, cluster_info, 0); for (i = 0; i < swap_header->info.nr_badpages; i++) inc_cluster_info_page(si, cluster_info, swap_header->info.badpages[i]); for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) inc_cluster_info_page(si, cluster_info, i); INIT_LIST_HEAD(&si->free_clusters); INIT_LIST_HEAD(&si->full_clusters); INIT_LIST_HEAD(&si->discard_clusters); for (i = 0; i < SWAP_NR_ORDERS; i++) { INIT_LIST_HEAD(&si->nonfull_clusters[i]); INIT_LIST_HEAD(&si->frag_clusters[i]); atomic_long_set(&si->frag_cluster_nr[i], 0); } /* * Reduce false cache line sharing between cluster_info and * sharing same address space. */ for (k = 0; k < SWAP_CLUSTER_COLS; k++) { j = k % SWAP_CLUSTER_COLS; for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { struct swap_cluster_info *ci; idx = i * SWAP_CLUSTER_COLS + j; ci = cluster_info + idx; if (idx >= nr_clusters) continue; if (ci->count) { ci->flags = CLUSTER_FLAG_NONFULL; list_add_tail(&ci->list, &si->nonfull_clusters[0]); continue; } ci->flags = CLUSTER_FLAG_FREE; list_add_tail(&ci->list, &si->free_clusters); } } return cluster_info; err_free: kvfree(cluster_info); err: return ERR_PTR(err); } SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *si; struct filename *name; struct file *swap_file = NULL; struct address_space *mapping; struct dentry *dentry; int prio; int error; union swap_header *swap_header; int nr_extents; sector_t span; unsigned long maxpages; unsigned char *swap_map = NULL; unsigned long *zeromap = NULL; struct swap_cluster_info *cluster_info = NULL; struct folio *folio = NULL; struct inode *inode = NULL; bool inced_nr_rotate_swap = false; if (swap_flags & ~SWAP_FLAGS_VALID) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!swap_avail_heads) return -ENOMEM; si = alloc_swap_info(); if (IS_ERR(si)) return PTR_ERR(si); INIT_WORK(&si->discard_work, swap_discard_work); INIT_WORK(&si->reclaim_work, swap_reclaim_work); name = getname(specialfile); if (IS_ERR(name)) { error = PTR_ERR(name); name = NULL; goto bad_swap; } swap_file = file_open_name(name, O_RDWR | O_LARGEFILE | O_EXCL, 0); if (IS_ERR(swap_file)) { error = PTR_ERR(swap_file); swap_file = NULL; goto bad_swap; } si->swap_file = swap_file; mapping = swap_file->f_mapping; dentry = swap_file->f_path.dentry; inode = mapping->host; error = claim_swapfile(si, inode); if (unlikely(error)) goto bad_swap; inode_lock(inode); if (d_unlinked(dentry) || cant_mount(dentry)) { error = -ENOENT; goto bad_swap_unlock_inode; } if (IS_SWAPFILE(inode)) { error = -EBUSY; goto bad_swap_unlock_inode; } /* * Read the swap header. 
*/ if (!mapping->a_ops->read_folio) { error = -EINVAL; goto bad_swap_unlock_inode; } folio = read_mapping_folio(mapping, 0, swap_file); if (IS_ERR(folio)) { error = PTR_ERR(folio); goto bad_swap_unlock_inode; } swap_header = kmap_local_folio(folio, 0); maxpages = read_swap_header(si, swap_header, inode); if (unlikely(!maxpages)) { error = -EINVAL; goto bad_swap_unlock_inode; } /* OK, set up the swap map and apply the bad block list */ swap_map = vzalloc(maxpages); if (!swap_map) { error = -ENOMEM; goto bad_swap_unlock_inode; } error = swap_cgroup_swapon(si->type, maxpages); if (error) goto bad_swap_unlock_inode; nr_extents = setup_swap_map_and_extents(si, swap_header, swap_map, maxpages, &span); if (unlikely(nr_extents < 0)) { error = nr_extents; goto bad_swap_unlock_inode; } /* * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might * be above MAX_PAGE_ORDER incase of a large swap file. */ zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long), GFP_KERNEL | __GFP_ZERO); if (!zeromap) { error = -ENOMEM; goto bad_swap_unlock_inode; } if (si->bdev && bdev_stable_writes(si->bdev)) si->flags |= SWP_STABLE_WRITES; if (si->bdev && bdev_synchronous(si->bdev)) si->flags |= SWP_SYNCHRONOUS_IO; if (si->bdev && bdev_nonrot(si->bdev)) { si->flags |= SWP_SOLIDSTATE; } else { atomic_inc(&nr_rotate_swap); inced_nr_rotate_swap = true; } cluster_info = setup_clusters(si, swap_header, maxpages); if (IS_ERR(cluster_info)) { error = PTR_ERR(cluster_info); cluster_info = NULL; goto bad_swap_unlock_inode; } if ((swap_flags & SWAP_FLAG_DISCARD) && si->bdev && bdev_max_discard_sectors(si->bdev)) { /* * When discard is enabled for swap with no particular * policy flagged, we set all swap discard flags here in * order to sustain backward compatibility with older * swapon(8) releases. */ si->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | SWP_PAGE_DISCARD); /* * By flagging sys_swapon, a sysadmin can tell us to * either do single-time area discards only, or to just * perform discards for released swap page-clusters. * Now it's time to adjust the p->flags accordingly. */ if (swap_flags & SWAP_FLAG_DISCARD_ONCE) si->flags &= ~SWP_PAGE_DISCARD; else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) si->flags &= ~SWP_AREA_DISCARD; /* issue a swapon-time discard if it's still required */ if (si->flags & SWP_AREA_DISCARD) { int err = discard_swap(si); if (unlikely(err)) pr_err("swapon: discard_swap(%p): %d\n", si, err); } } error = init_swap_address_space(si->type, maxpages); if (error) goto bad_swap_unlock_inode; error = zswap_swapon(si->type, maxpages); if (error) goto free_swap_address_space; /* * Flush any pending IO and dirty mappings before we start using this * swap device. */ inode->i_flags |= S_SWAPFILE; error = inode_drain_writes(inode); if (error) { inode->i_flags &= ~S_SWAPFILE; goto free_swap_zswap; } mutex_lock(&swapon_mutex); prio = -1; if (swap_flags & SWAP_FLAG_PREFER) prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; enable_swap_info(si, prio, swap_map, cluster_info, zeromap); pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n", K(si->pages), name->name, si->prio, nr_extents, K((unsigned long long)span), (si->flags & SWP_SOLIDSTATE) ? "SS" : "", (si->flags & SWP_DISCARDABLE) ? "D" : "", (si->flags & SWP_AREA_DISCARD) ? "s" : "", (si->flags & SWP_PAGE_DISCARD) ? 
"c" : ""); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); error = 0; goto out; free_swap_zswap: zswap_swapoff(si->type); free_swap_address_space: exit_swap_address_space(si->type); bad_swap_unlock_inode: inode_unlock(inode); bad_swap: free_percpu(si->percpu_cluster); si->percpu_cluster = NULL; kfree(si->global_cluster); si->global_cluster = NULL; inode = NULL; destroy_swap_extents(si); swap_cgroup_swapoff(si->type); spin_lock(&swap_lock); si->swap_file = NULL; si->flags = 0; spin_unlock(&swap_lock); vfree(swap_map); kvfree(zeromap); kvfree(cluster_info); if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); if (swap_file) filp_close(swap_file, NULL); out: if (!IS_ERR_OR_NULL(folio)) folio_release_kmap(folio, swap_header); if (name) putname(name); if (inode) inode_unlock(inode); if (!error) enable_swap_slots_cache(); return error; } void si_swapinfo(struct sysinfo *val) { unsigned int type; unsigned long nr_to_be_unused = 0; spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *si = swap_info[type]; if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) nr_to_be_unused += swap_usage_in_pages(si); } val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; spin_unlock(&swap_lock); } /* * Verify that nr swap entries are valid and increment their swap map counts. * * Returns error code in following case. * - success -> 0 * - swp_entry is invalid -> EINVAL * - swp_entry is migration entry -> EINVAL * - swap-cache reference is requested but there is already one. -> EEXIST * - swap-cache reference is requested but the entry is not used. -> ENOENT * - swap-mapped reference requested but needs continued swap count. -> ENOMEM */ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) { struct swap_info_struct *si; struct swap_cluster_info *ci; unsigned long offset; unsigned char count; unsigned char has_cache; int err, i; si = swp_swap_info(entry); offset = swp_offset(entry); VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); VM_WARN_ON(usage == 1 && nr > 1); ci = lock_cluster(si, offset); err = 0; for (i = 0; i < nr; i++) { count = si->swap_map[offset + i]; /* * swapin_readahead() doesn't check if a swap entry is valid, so the * swap entry could be SWAP_MAP_BAD. Check here with lock held. */ if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { err = -ENOENT; goto unlock_out; } has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; if (!count && !has_cache) { err = -ENOENT; } else if (usage == SWAP_HAS_CACHE) { if (has_cache) err = -EEXIST; } else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) { err = -EINVAL; } if (err) goto unlock_out; } for (i = 0; i < nr; i++) { count = si->swap_map[offset + i]; has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; if (usage == SWAP_HAS_CACHE) has_cache = SWAP_HAS_CACHE; else if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) count += usage; else if (swap_count_continued(si, offset + i, count)) count = COUNT_CONTINUED; else { /* * Don't need to rollback changes, because if * usage == 1, there must be nr == 1. */ err = -ENOMEM; goto unlock_out; } WRITE_ONCE(si->swap_map[offset + i], count | has_cache); } unlock_out: unlock_cluster(ci); return err; } /* * Help swapoff by noting that swap entry belongs to shmem/tmpfs * (in which case its reference count is never incremented). 
*/ void swap_shmem_alloc(swp_entry_t entry, int nr) { __swap_duplicate(entry, SWAP_MAP_SHMEM, nr); } /* * Increase reference count of swap entry by 1. * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required * but could not be atomically allocated. Returns 0, just as if it succeeded, * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which * might occur if a page table entry has got corrupted. */ int swap_duplicate(swp_entry_t entry) { int err = 0; while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM) err = add_swap_count_continuation(entry, GFP_ATOMIC); return err; } /* * @entry: first swap entry from which we allocate nr swap cache. * * Called when allocating swap cache for existing swap entries, * This can return error codes. Returns 0 at success. * -EEXIST means there is a swap cache. * Note: return code is different from swap_duplicate(). */ int swapcache_prepare(swp_entry_t entry, int nr) { return __swap_duplicate(entry, SWAP_HAS_CACHE, nr); } void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) { unsigned long offset = swp_offset(entry); cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE); } struct swap_info_struct *swp_swap_info(swp_entry_t entry) { return swap_type_to_swap_info(swp_type(entry)); } /* * out-of-line methods to avoid include hell. */ struct address_space *swapcache_mapping(struct folio *folio) { return swp_swap_info(folio->swap)->swap_file->f_mapping; } EXPORT_SYMBOL_GPL(swapcache_mapping); pgoff_t __folio_swap_cache_index(struct folio *folio) { return swap_cache_index(folio->swap); } EXPORT_SYMBOL_GPL(__folio_swap_cache_index); /* * add_swap_count_continuation - called when a swap count is duplicated * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's * page of the original vmalloc'ed swap_map, to hold the continuation count * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. * * These continuation pages are seldom referenced: the common paths all work * on the original swap_map, only referring to a continuation page when the * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. * * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) * can be called after dropping locks. */ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) { struct swap_info_struct *si; struct swap_cluster_info *ci; struct page *head; struct page *page; struct page *list_page; pgoff_t offset; unsigned char count; int ret = 0; /* * When debugging, it's easier to use __GFP_ZERO here; but it's better * for latency not to zero a page while GFP_ATOMIC and holding locks. */ page = alloc_page(gfp_mask | __GFP_HIGHMEM); si = get_swap_device(entry); if (!si) { /* * An acceptable race has occurred since the failing * __swap_duplicate(): the swap device may be swapoff */ goto outer; } offset = swp_offset(entry); ci = lock_cluster(si, offset); count = swap_count(si->swap_map[offset]); if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { /* * The higher the swap count, the more likely it is that tasks * will race to add swap count continuation: we need to avoid * over-provisioning. 
*/ goto out; } if (!page) { ret = -ENOMEM; goto out; } head = vmalloc_to_page(si->swap_map + offset); offset &= ~PAGE_MASK; spin_lock(&si->cont_lock); /* * Page allocation does not initialize the page's lru field, * but it does always reset its private field. */ if (!page_private(head)) { BUG_ON(count & COUNT_CONTINUED); INIT_LIST_HEAD(&head->lru); set_page_private(head, SWP_CONTINUED); si->flags |= SWP_CONTINUED; } list_for_each_entry(list_page, &head->lru, lru) { unsigned char *map; /* * If the previous map said no continuation, but we've found * a continuation page, free our allocation and use this one. */ if (!(count & COUNT_CONTINUED)) goto out_unlock_cont; map = kmap_local_page(list_page) + offset; count = *map; kunmap_local(map); /* * If this continuation count now has some space in it, * free our allocation and use this one. */ if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) goto out_unlock_cont; } list_add_tail(&page->lru, &head->lru); page = NULL; /* now it's attached, don't free it */ out_unlock_cont: spin_unlock(&si->cont_lock); out: unlock_cluster(ci); put_swap_device(si); outer: if (page) __free_page(page); return ret; } /* * swap_count_continued - when the original swap_map count is incremented * from SWAP_MAP_MAX, check if there is already a continuation page to carry * into, carry if so, or else fail until a new continuation page is allocated; * when the original swap_map count is decremented from 0 with continuation, * borrow from the continuation and report whether it still holds more. * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster * lock. */ static bool swap_count_continued(struct swap_info_struct *si, pgoff_t offset, unsigned char count) { struct page *head; struct page *page; unsigned char *map; bool ret; head = vmalloc_to_page(si->swap_map + offset); if (page_private(head) != SWP_CONTINUED) { BUG_ON(count & COUNT_CONTINUED); return false; /* need to add count continuation */ } spin_lock(&si->cont_lock); offset &= ~PAGE_MASK; page = list_next_entry(head, lru); map = kmap_local_page(page) + offset; if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ goto init_map; /* jump over SWAP_CONT_MAX checks */ if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ /* * Think of how you add 1 to 999 */ while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { kunmap_local(map); page = list_next_entry(page, lru); BUG_ON(page == head); map = kmap_local_page(page) + offset; } if (*map == SWAP_CONT_MAX) { kunmap_local(map); page = list_next_entry(page, lru); if (page == head) { ret = false; /* add count continuation */ goto out; } map = kmap_local_page(page) + offset; init_map: *map = 0; /* we didn't zero the page */ } *map += 1; kunmap_local(map); while ((page = list_prev_entry(page, lru)) != head) { map = kmap_local_page(page) + offset; *map = COUNT_CONTINUED; kunmap_local(map); } ret = true; /* incremented */ } else { /* decrementing */ /* * Think of how you subtract 1 from 1000 */ BUG_ON(count != COUNT_CONTINUED); while (*map == COUNT_CONTINUED) { kunmap_local(map); page = list_next_entry(page, lru); BUG_ON(page == head); map = kmap_local_page(page) + offset; } BUG_ON(*map == 0); *map -= 1; if (*map == 0) count = 0; kunmap_local(map); while ((page = list_prev_entry(page, lru)) != head) { map = kmap_local_page(page) + offset; *map = SWAP_CONT_MAX | count; count = COUNT_CONTINUED; kunmap_local(map); } ret = count == COUNT_CONTINUED; } out: spin_unlock(&si->cont_lock); return ret; } /* * free_swap_count_continuations - swapoff free 
all the continuation pages * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. */ static void free_swap_count_continuations(struct swap_info_struct *si) { pgoff_t offset; for (offset = 0; offset < si->max; offset += PAGE_SIZE) { struct page *head; head = vmalloc_to_page(si->swap_map + offset); if (page_private(head)) { struct page *page, *next; list_for_each_entry_safe(page, next, &head->lru, lru) { list_del(&page->lru); __free_page(page); } } } } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { struct swap_info_struct *si, *next; int nid = folio_nid(folio); if (!(gfp & __GFP_IO)) return; if (!__has_usable_swap()) return; if (!blk_cgroup_congested()) return; /* * We've already scheduled a throttle, avoid taking the global swap * lock. */ if (current->throttle_disk) return; spin_lock(&swap_avail_lock); plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], avail_lists[nid]) { if (si->bdev) { blkcg_schedule_throttle(si->bdev->bd_disk, true); break; } } spin_unlock(&swap_avail_lock); } #endif static int __init swapfile_init(void) { int nid; swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head), GFP_KERNEL); if (!swap_avail_heads) { pr_emerg("Not enough memory for swap heads, swap is disabled\n"); return -ENOMEM; } for_each_node(nid) plist_head_init(&swap_avail_heads[nid]); swapfile_maximum_size = arch_max_swapfile_size(); #ifdef CONFIG_MIGRATION if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS)) swap_migration_ad_supported = true; #endif /* CONFIG_MIGRATION */ return 0; } subsys_initcall(swapfile_init);
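/*
 * A minimal userspace sketch (illustrative only, not part of mm/swapfile.c
 * above): it drives the swapon(2)/swapoff(2) syscalls implemented by
 * SYSCALL_DEFINE2(swapon, ...) and SYSCALL_DEFINE1(swapoff, ...) and reads
 * back /proc/swaps, which swap_show() formats. The path "/swapfile" is an
 * assumed example; error handling is reduced to perror() for brevity.
 */
#include <stdio.h>
#include <sys/swap.h>

int main(void)
{
	/*
	 * Activate the swap area with an explicit priority of 10; the flag
	 * encoding mirrors the SWAP_FLAG_PREFER / SWAP_FLAG_PRIO_MASK
	 * handling in the swapon syscall above.
	 */
	int flags = SWAP_FLAG_PREFER |
		    ((10 << SWAP_FLAG_PRIO_SHIFT) & SWAP_FLAG_PRIO_MASK);

	if (swapon("/swapfile", flags) != 0)
		perror("swapon");

	/* swap_show() emits one line per active swap area in /proc/swaps. */
	FILE *fp = fopen("/proc/swaps", "r");
	if (fp) {
		char line[256];

		while (fgets(line, sizeof(line), fp))
			fputs(line, stdout);
		fclose(fp);
	}

	/* Deactivate again; this path ends in the swapoff syscall above. */
	if (swapoff("/swapfile") != 0)
		perror("swapoff");

	return 0;
}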
// SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) * Copyright (C) Darryl Miles G7LED (dlm@g7led.demon.co.uk) * Copyright (C) Steven Whitehouse GW7RRM (stevew@acm.org) * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de) * Copyright (C) Hans Alblas PE1AYX (hans@esrac.ele.tue.nl) * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr) */ #include <linux/capability.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/slab.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/notifier.h> #include <linux/proc_fs.h> #include <linux/stat.h> 
#include <linux/sysctl.h> #include <linux/init.h> #include <linux/spinlock.h> #include <net/net_namespace.h> #include <net/tcp_states.h> #include <net/ip.h> #include <net/arp.h> HLIST_HEAD(ax25_list); DEFINE_SPINLOCK(ax25_list_lock); static const struct proto_ops ax25_proto_ops; static void ax25_free_sock(struct sock *sk) { ax25_cb_put(sk_to_ax25(sk)); } /* * Socket removal during an interrupt is now safe. */ static void ax25_cb_del(ax25_cb *ax25) { spin_lock_bh(&ax25_list_lock); if (!hlist_unhashed(&ax25->ax25_node)) { hlist_del_init(&ax25->ax25_node); ax25_cb_put(ax25); } spin_unlock_bh(&ax25_list_lock); } /* * Kill all bound sockets on a dropped device. */ static void ax25_kill_by_device(struct net_device *dev) { ax25_dev *ax25_dev; ax25_cb *s; struct sock *sk; if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) return; ax25_dev->device_up = false; spin_lock_bh(&ax25_list_lock); again: ax25_for_each(s, &ax25_list) { if (s->ax25_dev == ax25_dev) { sk = s->sk; if (!sk) { spin_unlock_bh(&ax25_list_lock); ax25_disconnect(s, ENETUNREACH); s->ax25_dev = NULL; ax25_cb_del(s); spin_lock_bh(&ax25_list_lock); goto again; } sock_hold(sk); spin_unlock_bh(&ax25_list_lock); lock_sock(sk); ax25_disconnect(s, ENETUNREACH); s->ax25_dev = NULL; if (sk->sk_socket) { netdev_put(ax25_dev->dev, &s->dev_tracker); ax25_dev_put(ax25_dev); } ax25_cb_del(s); release_sock(sk); spin_lock_bh(&ax25_list_lock); sock_put(sk); /* The entry could have been deleted from the * list meanwhile and thus the next pointer is * no longer valid. Play it safe and restart * the scan. Forward progress is ensured * because we set s->ax25_dev to NULL and we * are never passed a NULL 'dev' argument. */ goto again; } } spin_unlock_bh(&ax25_list_lock); } /* * Handle device status changes. */ static int ax25_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; /* Reject non AX.25 devices */ if (dev->type != ARPHRD_AX25) return NOTIFY_DONE; switch (event) { case NETDEV_UP: ax25_dev_device_up(dev); break; case NETDEV_DOWN: ax25_kill_by_device(dev); ax25_rt_device_down(dev); ax25_dev_device_down(dev); break; default: break; } return NOTIFY_DONE; } /* * Add a socket to the bound sockets list. */ void ax25_cb_add(ax25_cb *ax25) { spin_lock_bh(&ax25_list_lock); ax25_cb_hold(ax25); hlist_add_head(&ax25->ax25_node, &ax25_list); spin_unlock_bh(&ax25_list_lock); } /* * Find a socket that wants to accept the SABM we have just * received. */ struct sock *ax25_find_listener(ax25_address *addr, int digi, struct net_device *dev, int type) { ax25_cb *s; spin_lock(&ax25_list_lock); ax25_for_each(s, &ax25_list) { if ((s->iamdigi && !digi) || (!s->iamdigi && digi)) continue; if (s->sk && !ax25cmp(&s->source_addr, addr) && s->sk->sk_type == type && s->sk->sk_state == TCP_LISTEN) { /* If device is null we match any device */ if (s->ax25_dev == NULL || s->ax25_dev->dev == dev) { sock_hold(s->sk); spin_unlock(&ax25_list_lock); return s->sk; } } } spin_unlock(&ax25_list_lock); return NULL; } /* * Find an AX.25 socket given both ends. 
*/ struct sock *ax25_get_socket(ax25_address *my_addr, ax25_address *dest_addr, int type) { struct sock *sk = NULL; ax25_cb *s; spin_lock(&ax25_list_lock); ax25_for_each(s, &ax25_list) { if (s->sk && !ax25cmp(&s->source_addr, my_addr) && !ax25cmp(&s->dest_addr, dest_addr) && s->sk->sk_type == type) { sk = s->sk; sock_hold(sk); break; } } spin_unlock(&ax25_list_lock); return sk; } /* * Find an AX.25 control block given both ends. It will only pick up * floating AX.25 control blocks or non Raw socket bound control blocks. */ ax25_cb *ax25_find_cb(const ax25_address *src_addr, ax25_address *dest_addr, ax25_digi *digi, struct net_device *dev) { ax25_cb *s; spin_lock_bh(&ax25_list_lock); ax25_for_each(s, &ax25_list) { if (s->sk && s->sk->sk_type != SOCK_SEQPACKET) continue; if (s->ax25_dev == NULL) continue; if (ax25cmp(&s->source_addr, src_addr) == 0 && ax25cmp(&s->dest_addr, dest_addr) == 0 && s->ax25_dev->dev == dev) { if (digi != NULL && digi->ndigi != 0) { if (s->digipeat == NULL) continue; if (ax25digicmp(s->digipeat, digi) != 0) continue; } else { if (s->digipeat != NULL && s->digipeat->ndigi != 0) continue; } ax25_cb_hold(s); spin_unlock_bh(&ax25_list_lock); return s; } } spin_unlock_bh(&ax25_list_lock); return NULL; } EXPORT_SYMBOL(ax25_find_cb); void ax25_send_to_raw(ax25_address *addr, struct sk_buff *skb, int proto) { ax25_cb *s; struct sk_buff *copy; spin_lock(&ax25_list_lock); ax25_for_each(s, &ax25_list) { if (s->sk != NULL && ax25cmp(&s->source_addr, addr) == 0 && s->sk->sk_type == SOCK_RAW && s->sk->sk_protocol == proto && s->ax25_dev->dev == skb->dev && atomic_read(&s->sk->sk_rmem_alloc) <= s->sk->sk_rcvbuf) { if ((copy = skb_clone(skb, GFP_ATOMIC)) == NULL) continue; if (sock_queue_rcv_skb(s->sk, copy) != 0) kfree_skb(copy); } } spin_unlock(&ax25_list_lock); } /* * Deferred destroy. */ void ax25_destroy_socket(ax25_cb *); /* * Handler for deferred kills. */ static void ax25_destroy_timer(struct timer_list *t) { ax25_cb *ax25 = from_timer(ax25, t, dtimer); struct sock *sk; sk=ax25->sk; bh_lock_sock(sk); sock_hold(sk); ax25_destroy_socket(ax25); bh_unlock_sock(sk); sock_put(sk); } /* * This is called from user mode and the timers. Thus it protects itself * against interrupt users but doesn't worry about being called during * work. Once it is removed from the queue no interrupt or bottom half * will touch it and we are (fairly 8-) ) safe. */ void ax25_destroy_socket(ax25_cb *ax25) { struct sk_buff *skb; ax25_cb_del(ax25); ax25_stop_heartbeat(ax25); ax25_stop_t1timer(ax25); ax25_stop_t2timer(ax25); ax25_stop_t3timer(ax25); ax25_stop_idletimer(ax25); ax25_clear_queues(ax25); /* Flush the queues */ if (ax25->sk != NULL) { while ((skb = skb_dequeue(&ax25->sk->sk_receive_queue)) != NULL) { if (skb->sk != ax25->sk) { /* A pending connection */ ax25_cb *sax25 = sk_to_ax25(skb->sk); /* Queue the unaccepted socket for death */ sock_orphan(skb->sk); /* 9A4GL: hack to release unaccepted sockets */ skb->sk->sk_state = TCP_LISTEN; ax25_start_heartbeat(sax25); sax25->state = AX25_STATE_0; } kfree_skb(skb); } skb_queue_purge(&ax25->sk->sk_write_queue); } if (ax25->sk != NULL) { if (sk_has_allocations(ax25->sk)) { /* Defer: outstanding buffers */ timer_setup(&ax25->dtimer, ax25_destroy_timer, 0); ax25->dtimer.expires = jiffies + 2 * HZ; add_timer(&ax25->dtimer); } else { struct sock *sk=ax25->sk; ax25->sk=NULL; sock_put(sk); } } else { ax25_cb_put(ax25); } } /* * dl1bke 960311: set parameters for existing AX.25 connections, * includes a KILL command to abort any connection. 
* VERY useful for debugging ;-) */ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg) { struct ax25_ctl_struct ax25_ctl; ax25_digi digi; ax25_dev *ax25_dev; ax25_cb *ax25; unsigned int k; int ret = 0; if (copy_from_user(&ax25_ctl, arg, sizeof(ax25_ctl))) return -EFAULT; if (ax25_ctl.digi_count > AX25_MAX_DIGIS) return -EINVAL; if (ax25_ctl.arg > ULONG_MAX / HZ && ax25_ctl.cmd != AX25_KILL) return -EINVAL; ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr); if (!ax25_dev) return -ENODEV; digi.ndigi = ax25_ctl.digi_count; for (k = 0; k < digi.ndigi; k++) digi.calls[k] = ax25_ctl.digi_addr[k]; ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev); if (!ax25) { ax25_dev_put(ax25_dev); return -ENOTCONN; } switch (ax25_ctl.cmd) { case AX25_KILL: ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); #ifdef CONFIG_AX25_DAMA_SLAVE if (ax25_dev->dama.slave && ax25->ax25_dev->values[AX25_VALUES_PROTOCOL] == AX25_PROTO_DAMA_SLAVE) ax25_dama_off(ax25); #endif ax25_disconnect(ax25, ENETRESET); break; case AX25_WINDOW: if (ax25->modulus == AX25_MODULUS) { if (ax25_ctl.arg < 1 || ax25_ctl.arg > 7) goto einval_put; } else { if (ax25_ctl.arg < 1 || ax25_ctl.arg > 63) goto einval_put; } ax25->window = ax25_ctl.arg; break; case AX25_T1: if (ax25_ctl.arg < 1 || ax25_ctl.arg > ULONG_MAX / HZ) goto einval_put; ax25->rtt = (ax25_ctl.arg * HZ) / 2; ax25->t1 = ax25_ctl.arg * HZ; break; case AX25_T2: if (ax25_ctl.arg < 1 || ax25_ctl.arg > ULONG_MAX / HZ) goto einval_put; ax25->t2 = ax25_ctl.arg * HZ; break; case AX25_N2: if (ax25_ctl.arg < 1 || ax25_ctl.arg > 31) goto einval_put; ax25->n2count = 0; ax25->n2 = ax25_ctl.arg; break; case AX25_T3: if (ax25_ctl.arg > ULONG_MAX / HZ) goto einval_put; ax25->t3 = ax25_ctl.arg * HZ; break; case AX25_IDLE: if (ax25_ctl.arg > ULONG_MAX / (60 * HZ)) goto einval_put; ax25->idle = ax25_ctl.arg * 60 * HZ; break; case AX25_PACLEN: if (ax25_ctl.arg < 16 || ax25_ctl.arg > 65535) goto einval_put; ax25->paclen = ax25_ctl.arg; break; default: goto einval_put; } out_put: ax25_dev_put(ax25_dev); ax25_cb_put(ax25); return ret; einval_put: ret = -EINVAL; goto out_put; } static void ax25_fillin_cb_from_dev(ax25_cb *ax25, const ax25_dev *ax25_dev) { ax25->rtt = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T1]) / 2; ax25->t1 = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T1]); ax25->t2 = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T2]); ax25->t3 = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T3]); ax25->n2 = ax25_dev->values[AX25_VALUES_N2]; ax25->paclen = ax25_dev->values[AX25_VALUES_PACLEN]; ax25->idle = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_IDLE]); ax25->backoff = ax25_dev->values[AX25_VALUES_BACKOFF]; if (ax25_dev->values[AX25_VALUES_AXDEFMODE]) { ax25->modulus = AX25_EMODULUS; ax25->window = ax25_dev->values[AX25_VALUES_EWINDOW]; } else { ax25->modulus = AX25_MODULUS; ax25->window = ax25_dev->values[AX25_VALUES_WINDOW]; } } /* * Fill in a created AX.25 created control block with the default * values for a particular device. 
*/ void ax25_fillin_cb(ax25_cb *ax25, ax25_dev *ax25_dev) { ax25->ax25_dev = ax25_dev; if (ax25->ax25_dev != NULL) { ax25_fillin_cb_from_dev(ax25, ax25_dev); return; } /* * No device, use kernel / AX.25 spec default values */ ax25->rtt = msecs_to_jiffies(AX25_DEF_T1) / 2; ax25->t1 = msecs_to_jiffies(AX25_DEF_T1); ax25->t2 = msecs_to_jiffies(AX25_DEF_T2); ax25->t3 = msecs_to_jiffies(AX25_DEF_T3); ax25->n2 = AX25_DEF_N2; ax25->paclen = AX25_DEF_PACLEN; ax25->idle = msecs_to_jiffies(AX25_DEF_IDLE); ax25->backoff = AX25_DEF_BACKOFF; if (AX25_DEF_AXDEFMODE) { ax25->modulus = AX25_EMODULUS; ax25->window = AX25_DEF_EWINDOW; } else { ax25->modulus = AX25_MODULUS; ax25->window = AX25_DEF_WINDOW; } } /* * Create an empty AX.25 control block. */ ax25_cb *ax25_create_cb(void) { ax25_cb *ax25; if ((ax25 = kzalloc(sizeof(*ax25), GFP_ATOMIC)) == NULL) return NULL; refcount_set(&ax25->refcount, 1); skb_queue_head_init(&ax25->write_queue); skb_queue_head_init(&ax25->frag_queue); skb_queue_head_init(&ax25->ack_queue); skb_queue_head_init(&ax25->reseq_queue); ax25_setup_timers(ax25); ax25_fillin_cb(ax25, NULL); ax25->state = AX25_STATE_0; return ax25; } /* * Handling for system calls applied via the various interfaces to an * AX25 socket object */ static int ax25_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; ax25_cb *ax25; struct net_device *dev; char devname[IFNAMSIZ]; unsigned int opt; int res = 0; if (level != SOL_AX25) return -ENOPROTOOPT; if (optlen < sizeof(unsigned int)) return -EINVAL; if (copy_from_sockptr(&opt, optval, sizeof(unsigned int))) return -EFAULT; lock_sock(sk); ax25 = sk_to_ax25(sk); switch (optname) { case AX25_WINDOW: if (ax25->modulus == AX25_MODULUS) { if (opt < 1 || opt > 7) { res = -EINVAL; break; } } else { if (opt < 1 || opt > 63) { res = -EINVAL; break; } } ax25->window = opt; break; case AX25_T1: if (opt < 1 || opt > UINT_MAX / HZ) { res = -EINVAL; break; } ax25->rtt = (opt * HZ) >> 1; ax25->t1 = opt * HZ; break; case AX25_T2: if (opt < 1 || opt > UINT_MAX / HZ) { res = -EINVAL; break; } ax25->t2 = opt * HZ; break; case AX25_N2: if (opt < 1 || opt > 31) { res = -EINVAL; break; } ax25->n2 = opt; break; case AX25_T3: if (opt < 1 || opt > UINT_MAX / HZ) { res = -EINVAL; break; } ax25->t3 = opt * HZ; break; case AX25_IDLE: if (opt > UINT_MAX / (60 * HZ)) { res = -EINVAL; break; } ax25->idle = opt * 60 * HZ; break; case AX25_BACKOFF: if (opt > 2) { res = -EINVAL; break; } ax25->backoff = opt; break; case AX25_EXTSEQ: ax25->modulus = opt ? AX25_EMODULUS : AX25_MODULUS; break; case AX25_PIDINCL: ax25->pidincl = opt ? 1 : 0; break; case AX25_IAMDIGI: ax25->iamdigi = opt ? 
1 : 0; break; case AX25_PACLEN: if (opt < 16 || opt > 65535) { res = -EINVAL; break; } ax25->paclen = opt; break; case SO_BINDTODEVICE: if (optlen > IFNAMSIZ - 1) optlen = IFNAMSIZ - 1; memset(devname, 0, sizeof(devname)); if (copy_from_sockptr(devname, optval, optlen)) { res = -EFAULT; break; } if (sk->sk_type == SOCK_SEQPACKET && (sock->state != SS_UNCONNECTED || sk->sk_state == TCP_LISTEN)) { res = -EADDRNOTAVAIL; break; } rcu_read_lock(); dev = dev_get_by_name_rcu(&init_net, devname); if (!dev) { rcu_read_unlock(); res = -ENODEV; break; } if (ax25->ax25_dev) { if (dev == ax25->ax25_dev->dev) { rcu_read_unlock(); break; } netdev_put(ax25->ax25_dev->dev, &ax25->dev_tracker); ax25_dev_put(ax25->ax25_dev); } ax25->ax25_dev = ax25_dev_ax25dev(dev); if (!ax25->ax25_dev) { rcu_read_unlock(); res = -ENODEV; break; } ax25_fillin_cb(ax25, ax25->ax25_dev); netdev_hold(dev, &ax25->dev_tracker, GFP_ATOMIC); ax25_dev_hold(ax25->ax25_dev); rcu_read_unlock(); break; default: res = -ENOPROTOOPT; } release_sock(sk); return res; } static int ax25_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; ax25_cb *ax25; struct ax25_dev *ax25_dev; char devname[IFNAMSIZ]; void *valptr; int val = 0; int maxlen, length; if (level != SOL_AX25) return -ENOPROTOOPT; if (get_user(maxlen, optlen)) return -EFAULT; if (maxlen < 1) return -EFAULT; valptr = &val; length = min_t(unsigned int, maxlen, sizeof(int)); lock_sock(sk); ax25 = sk_to_ax25(sk); switch (optname) { case AX25_WINDOW: val = ax25->window; break; case AX25_T1: val = ax25->t1 / HZ; break; case AX25_T2: val = ax25->t2 / HZ; break; case AX25_N2: val = ax25->n2; break; case AX25_T3: val = ax25->t3 / HZ; break; case AX25_IDLE: val = ax25->idle / (60 * HZ); break; case AX25_BACKOFF: val = ax25->backoff; break; case AX25_EXTSEQ: val = (ax25->modulus == AX25_EMODULUS); break; case AX25_PIDINCL: val = ax25->pidincl; break; case AX25_IAMDIGI: val = ax25->iamdigi; break; case AX25_PACLEN: val = ax25->paclen; break; case SO_BINDTODEVICE: ax25_dev = ax25->ax25_dev; if (ax25_dev != NULL && ax25_dev->dev != NULL) { strscpy(devname, ax25_dev->dev->name, sizeof(devname)); length = strlen(devname) + 1; } else { *devname = '\0'; length = 1; } valptr = devname; break; default: release_sock(sk); return -ENOPROTOOPT; } release_sock(sk); if (put_user(length, optlen)) return -EFAULT; return copy_to_user(optval, valptr, length) ? -EFAULT : 0; } static int ax25_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int res = 0; lock_sock(sk); if (sk->sk_type == SOCK_SEQPACKET && sk->sk_state != TCP_LISTEN) { sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; goto out; } res = -EOPNOTSUPP; out: release_sock(sk); return res; } /* * XXX: when creating ax25_sock we should update the .obj_size setting * below. 
*/ static struct proto ax25_proto = { .name = "AX25", .owner = THIS_MODULE, .obj_size = sizeof(struct ax25_sock), }; static int ax25_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; ax25_cb *ax25; if (protocol < 0 || protocol > U8_MAX) return -EINVAL; if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; switch (sock->type) { case SOCK_DGRAM: if (protocol == 0 || protocol == PF_AX25) protocol = AX25_P_TEXT; break; case SOCK_SEQPACKET: switch (protocol) { case 0: case PF_AX25: /* For CLX */ protocol = AX25_P_TEXT; break; case AX25_P_SEGMENT: #ifdef CONFIG_INET case AX25_P_ARP: case AX25_P_IP: #endif #ifdef CONFIG_NETROM case AX25_P_NETROM: #endif #ifdef CONFIG_ROSE case AX25_P_ROSE: #endif return -ESOCKTNOSUPPORT; #ifdef CONFIG_NETROM_MODULE case AX25_P_NETROM: if (ax25_protocol_is_registered(AX25_P_NETROM)) return -ESOCKTNOSUPPORT; break; #endif #ifdef CONFIG_ROSE_MODULE case AX25_P_ROSE: if (ax25_protocol_is_registered(AX25_P_ROSE)) return -ESOCKTNOSUPPORT; break; #endif default: break; } break; case SOCK_RAW: if (!capable(CAP_NET_RAW)) return -EPERM; break; default: return -ESOCKTNOSUPPORT; } sk = sk_alloc(net, PF_AX25, GFP_ATOMIC, &ax25_proto, kern); if (sk == NULL) return -ENOMEM; ax25 = ax25_sk(sk)->cb = ax25_create_cb(); if (!ax25) { sk_free(sk); return -ENOMEM; } sock_init_data(sock, sk); sk->sk_destruct = ax25_free_sock; sock->ops = &ax25_proto_ops; sk->sk_protocol = protocol; ax25->sk = sk; return 0; } struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev) { struct sock *sk; ax25_cb *ax25, *oax25; sk = sk_alloc(sock_net(osk), PF_AX25, GFP_ATOMIC, osk->sk_prot, 0); if (sk == NULL) return NULL; if ((ax25 = ax25_create_cb()) == NULL) { sk_free(sk); return NULL; } switch (osk->sk_type) { case SOCK_DGRAM: break; case SOCK_SEQPACKET: break; default: sk_free(sk); ax25_cb_put(ax25); return NULL; } sock_init_data(NULL, sk); sk->sk_type = osk->sk_type; sk->sk_priority = READ_ONCE(osk->sk_priority); sk->sk_protocol = osk->sk_protocol; sk->sk_rcvbuf = osk->sk_rcvbuf; sk->sk_sndbuf = osk->sk_sndbuf; sk->sk_state = TCP_ESTABLISHED; sock_copy_flags(sk, osk); oax25 = sk_to_ax25(osk); ax25->modulus = oax25->modulus; ax25->backoff = oax25->backoff; ax25->pidincl = oax25->pidincl; ax25->iamdigi = oax25->iamdigi; ax25->rtt = oax25->rtt; ax25->t1 = oax25->t1; ax25->t2 = oax25->t2; ax25->t3 = oax25->t3; ax25->n2 = oax25->n2; ax25->idle = oax25->idle; ax25->paclen = oax25->paclen; ax25->window = oax25->window; ax25->ax25_dev = ax25_dev; ax25->source_addr = oax25->source_addr; if (oax25->digipeat != NULL) { ax25->digipeat = kmemdup(oax25->digipeat, sizeof(ax25_digi), GFP_ATOMIC); if (ax25->digipeat == NULL) { sk_free(sk); ax25_cb_put(ax25); return NULL; } } ax25_sk(sk)->cb = ax25; sk->sk_destruct = ax25_free_sock; ax25->sk = sk; return sk; } static int ax25_release(struct socket *sock) { struct sock *sk = sock->sk; ax25_cb *ax25; ax25_dev *ax25_dev; if (sk == NULL) return 0; sock_hold(sk); lock_sock(sk); sock_orphan(sk); ax25 = sk_to_ax25(sk); ax25_dev = ax25->ax25_dev; if (sk->sk_type == SOCK_SEQPACKET) { switch (ax25->state) { case AX25_STATE_0: if (!sock_flag(ax25->sk, SOCK_DEAD)) { release_sock(sk); ax25_disconnect(ax25, 0); lock_sock(sk); } ax25_destroy_socket(ax25); break; case AX25_STATE_1: case AX25_STATE_2: ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); release_sock(sk); ax25_disconnect(ax25, 0); lock_sock(sk); if (!sock_flag(ax25->sk, SOCK_DESTROY)) ax25_destroy_socket(ax25); break; case AX25_STATE_3: case AX25_STATE_4: 
ax25_clear_queues(ax25); ax25->n2count = 0; switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { case AX25_PROTO_STD_SIMPLEX: case AX25_PROTO_STD_DUPLEX: ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); ax25_stop_t2timer(ax25); ax25_stop_t3timer(ax25); ax25_stop_idletimer(ax25); break; #ifdef CONFIG_AX25_DAMA_SLAVE case AX25_PROTO_DAMA_SLAVE: ax25_stop_t3timer(ax25); ax25_stop_idletimer(ax25); break; #endif } ax25_calculate_t1(ax25); ax25_start_t1timer(ax25); ax25->state = AX25_STATE_2; sk->sk_state = TCP_CLOSE; sk->sk_shutdown |= SEND_SHUTDOWN; sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DESTROY); break; default: break; } } else { sk->sk_state = TCP_CLOSE; sk->sk_shutdown |= SEND_SHUTDOWN; sk->sk_state_change(sk); ax25_destroy_socket(ax25); } if (ax25_dev) { if (!ax25_dev->device_up) { del_timer_sync(&ax25->timer); del_timer_sync(&ax25->t1timer); del_timer_sync(&ax25->t2timer); del_timer_sync(&ax25->t3timer); del_timer_sync(&ax25->idletimer); } netdev_put(ax25_dev->dev, &ax25->dev_tracker); ax25_dev_put(ax25_dev); } sock->sk = NULL; release_sock(sk); sock_put(sk); return 0; } /* * We support a funny extension here so you can (as root) give any callsign * digipeated via a local address as source. This hack is obsolete now * that we've implemented support for SO_BINDTODEVICE. It is however small * and trivially backward compatible. */ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr; ax25_dev *ax25_dev = NULL; ax25_uid_assoc *user; ax25_address call; ax25_cb *ax25; int err = 0; if (addr_len != sizeof(struct sockaddr_ax25) && addr_len != sizeof(struct full_sockaddr_ax25)) /* support for old structure may go away some time * ax25_bind(): uses old (6 digipeater) socket structure. */ if ((addr_len < sizeof(struct sockaddr_ax25) + sizeof(ax25_address) * 6) || (addr_len > sizeof(struct full_sockaddr_ax25))) return -EINVAL; if (addr->fsa_ax25.sax25_family != AF_AX25) return -EINVAL; user = ax25_findbyuid(current_euid()); if (user) { call = user->call; ax25_uid_put(user); } else { if (ax25_uid_policy && !capable(CAP_NET_ADMIN)) return -EACCES; call = addr->fsa_ax25.sax25_call; } lock_sock(sk); ax25 = sk_to_ax25(sk); if (!sock_flag(sk, SOCK_ZAPPED)) { err = -EINVAL; goto out; } ax25->source_addr = call; /* * User already set interface with SO_BINDTODEVICE */ if (ax25->ax25_dev != NULL) goto done; if (addr_len > sizeof(struct sockaddr_ax25) && addr->fsa_ax25.sax25_ndigis == 1) { if (ax25cmp(&addr->fsa_digipeater[0], &null_ax25_address) != 0 && (ax25_dev = ax25_addr_ax25dev(&addr->fsa_digipeater[0])) == NULL) { err = -EADDRNOTAVAIL; goto out; } } else { if ((ax25_dev = ax25_addr_ax25dev(&addr->fsa_ax25.sax25_call)) == NULL) { err = -EADDRNOTAVAIL; goto out; } } if (ax25_dev) { ax25_fillin_cb(ax25, ax25_dev); netdev_hold(ax25_dev->dev, &ax25->dev_tracker, GFP_ATOMIC); } done: ax25_cb_add(ax25); sock_reset_flag(sk, SOCK_ZAPPED); out: release_sock(sk); return err; } /* * FIXME: nonblock behaviour looks like it may have a bug. */ static int __must_check ax25_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; ax25_cb *ax25 = sk_to_ax25(sk), *ax25t; struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)uaddr; ax25_digi *digi = NULL; int ct = 0, err = 0; /* * some sanity checks. 
code further down depends on this */ if (addr_len == sizeof(struct sockaddr_ax25)) /* support for this will go away in early 2.5.x * ax25_connect(): uses obsolete socket structure */ ; else if (addr_len != sizeof(struct full_sockaddr_ax25)) /* support for old structure may go away some time * ax25_connect(): uses old (6 digipeater) socket structure. */ if ((addr_len < sizeof(struct sockaddr_ax25) + sizeof(ax25_address) * 6) || (addr_len > sizeof(struct full_sockaddr_ax25))) return -EINVAL; if (fsa->fsa_ax25.sax25_family != AF_AX25) return -EINVAL; lock_sock(sk); /* deal with restarts */ if (sock->state == SS_CONNECTING) { switch (sk->sk_state) { case TCP_SYN_SENT: /* still trying */ err = -EINPROGRESS; goto out_release; case TCP_ESTABLISHED: /* connection established */ sock->state = SS_CONNECTED; goto out_release; case TCP_CLOSE: /* connection refused */ sock->state = SS_UNCONNECTED; err = -ECONNREFUSED; goto out_release; } } if (sk->sk_state == TCP_ESTABLISHED && sk->sk_type == SOCK_SEQPACKET) { err = -EISCONN; /* No reconnect on a seqpacket socket */ goto out_release; } sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; kfree(ax25->digipeat); ax25->digipeat = NULL; /* * Handle digi-peaters to be used. */ if (addr_len > sizeof(struct sockaddr_ax25) && fsa->fsa_ax25.sax25_ndigis != 0) { /* Valid number of digipeaters ? */ if (fsa->fsa_ax25.sax25_ndigis < 1 || fsa->fsa_ax25.sax25_ndigis > AX25_MAX_DIGIS || addr_len < sizeof(struct sockaddr_ax25) + sizeof(ax25_address) * fsa->fsa_ax25.sax25_ndigis) { err = -EINVAL; goto out_release; } if ((digi = kmalloc(sizeof(ax25_digi), GFP_KERNEL)) == NULL) { err = -ENOBUFS; goto out_release; } digi->ndigi = fsa->fsa_ax25.sax25_ndigis; digi->lastrepeat = -1; while (ct < fsa->fsa_ax25.sax25_ndigis) { if ((fsa->fsa_digipeater[ct].ax25_call[6] & AX25_HBIT) && ax25->iamdigi) { digi->repeated[ct] = 1; digi->lastrepeat = ct; } else { digi->repeated[ct] = 0; } digi->calls[ct] = fsa->fsa_digipeater[ct]; ct++; } } /* * Must bind first - autobinding in this may or may not work. If * the socket is already bound, check to see if the device has * been filled in, error if it hasn't. */ if (sock_flag(sk, SOCK_ZAPPED)) { /* check if we can remove this feature. It is broken. */ printk(KERN_WARNING "ax25_connect(): %s uses autobind, please contact jreuter@yaina.de\n", current->comm); if ((err = ax25_rt_autobind(ax25, &fsa->fsa_ax25.sax25_call)) < 0) { kfree(digi); goto out_release; } ax25_fillin_cb(ax25, ax25->ax25_dev); ax25_cb_add(ax25); } else { if (ax25->ax25_dev == NULL) { kfree(digi); err = -EHOSTUNREACH; goto out_release; } } if (sk->sk_type == SOCK_SEQPACKET && (ax25t=ax25_find_cb(&ax25->source_addr, &fsa->fsa_ax25.sax25_call, digi, ax25->ax25_dev->dev))) { kfree(digi); err = -EADDRINUSE; /* Already such a connection */ ax25_cb_put(ax25t); goto out_release; } ax25->dest_addr = fsa->fsa_ax25.sax25_call; ax25->digipeat = digi; /* First the easy one */ if (sk->sk_type != SOCK_SEQPACKET) { sock->state = SS_CONNECTED; sk->sk_state = TCP_ESTABLISHED; goto out_release; } /* Move to connecting socket, ax.25 lapb WAIT_UA.. 
*/ sock->state = SS_CONNECTING; sk->sk_state = TCP_SYN_SENT; switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { case AX25_PROTO_STD_SIMPLEX: case AX25_PROTO_STD_DUPLEX: ax25_std_establish_data_link(ax25); break; #ifdef CONFIG_AX25_DAMA_SLAVE case AX25_PROTO_DAMA_SLAVE: ax25->modulus = AX25_MODULUS; ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW]; if (ax25->ax25_dev->dama.slave) ax25_ds_establish_data_link(ax25); else ax25_std_establish_data_link(ax25); break; #endif } ax25->state = AX25_STATE_1; ax25_start_heartbeat(ax25); /* Now the loop */ if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) { err = -EINPROGRESS; goto out_release; } if (sk->sk_state == TCP_SYN_SENT) { DEFINE_WAIT(wait); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (sk->sk_state != TCP_SYN_SENT) break; if (!signal_pending(current)) { release_sock(sk); schedule(); lock_sock(sk); continue; } err = -ERESTARTSYS; break; } finish_wait(sk_sleep(sk), &wait); if (err) goto out_release; } if (sk->sk_state != TCP_ESTABLISHED) { /* Not in ABM, not in WAIT_UA -> failed */ sock->state = SS_UNCONNECTED; err = sock_error(sk); /* Always set at this point */ goto out_release; } sock->state = SS_CONNECTED; err = 0; out_release: release_sock(sk); return err; } static int ax25_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sk_buff *skb; struct sock *newsk; ax25_dev *ax25_dev; DEFINE_WAIT(wait); struct sock *sk; ax25_cb *ax25; int err = 0; if (sock->state != SS_UNCONNECTED) return -EINVAL; if ((sk = sock->sk) == NULL) return -EINVAL; lock_sock(sk); if (sk->sk_type != SOCK_SEQPACKET) { err = -EOPNOTSUPP; goto out; } if (sk->sk_state != TCP_LISTEN) { err = -EINVAL; goto out; } /* * The read queue this time is holding sockets ready to use * hooked into the SABM we saved */ for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); skb = skb_dequeue(&sk->sk_receive_queue); if (skb) break; if (arg->flags & O_NONBLOCK) { err = -EWOULDBLOCK; break; } if (!signal_pending(current)) { release_sock(sk); schedule(); lock_sock(sk); continue; } err = -ERESTARTSYS; break; } finish_wait(sk_sleep(sk), &wait); if (err) goto out; newsk = skb->sk; sock_graft(newsk, newsock); /* Now attach up the new socket */ kfree_skb(skb); sk_acceptq_removed(sk); newsock->state = SS_CONNECTED; ax25 = sk_to_ax25(newsk); ax25_dev = ax25->ax25_dev; netdev_hold(ax25_dev->dev, &ax25->dev_tracker, GFP_ATOMIC); ax25_dev_hold(ax25_dev); out: release_sock(sk); return err; } static int ax25_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)uaddr; struct sock *sk = sock->sk; unsigned char ndigi, i; ax25_cb *ax25; int err = 0; memset(fsa, 0, sizeof(*fsa)); lock_sock(sk); ax25 = sk_to_ax25(sk); if (peer != 0) { if (sk->sk_state != TCP_ESTABLISHED) { err = -ENOTCONN; goto out; } fsa->fsa_ax25.sax25_family = AF_AX25; fsa->fsa_ax25.sax25_call = ax25->dest_addr; if (ax25->digipeat != NULL) { ndigi = ax25->digipeat->ndigi; fsa->fsa_ax25.sax25_ndigis = ndigi; for (i = 0; i < ndigi; i++) fsa->fsa_digipeater[i] = ax25->digipeat->calls[i]; } } else { fsa->fsa_ax25.sax25_family = AF_AX25; fsa->fsa_ax25.sax25_call = ax25->source_addr; fsa->fsa_ax25.sax25_ndigis = 1; if (ax25->ax25_dev != NULL) { memcpy(&fsa->fsa_digipeater[0], ax25->ax25_dev->dev->dev_addr, AX25_ADDR_LEN); } else { fsa->fsa_digipeater[0] = null_ax25_address; } } err = sizeof (struct full_sockaddr_ax25); out: release_sock(sk); return err; } static int 
ax25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_ax25 *, usax, msg->msg_name); struct sock *sk = sock->sk; struct sockaddr_ax25 sax; struct sk_buff *skb; ax25_digi dtmp, *dp; ax25_cb *ax25; size_t size; int lv, err, addr_len = msg->msg_namelen; if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT)) return -EINVAL; lock_sock(sk); ax25 = sk_to_ax25(sk); if (sock_flag(sk, SOCK_ZAPPED)) { err = -EADDRNOTAVAIL; goto out; } if (sk->sk_shutdown & SEND_SHUTDOWN) { send_sig(SIGPIPE, current, 0); err = -EPIPE; goto out; } if (ax25->ax25_dev == NULL) { err = -ENETUNREACH; goto out; } if (len > ax25->ax25_dev->dev->mtu) { err = -EMSGSIZE; goto out; } if (usax != NULL) { if (usax->sax25_family != AF_AX25) { err = -EINVAL; goto out; } if (addr_len == sizeof(struct sockaddr_ax25)) /* ax25_sendmsg(): uses obsolete socket structure */ ; else if (addr_len != sizeof(struct full_sockaddr_ax25)) /* support for old structure may go away some time * ax25_sendmsg(): uses old (6 digipeater) * socket structure. */ if ((addr_len < sizeof(struct sockaddr_ax25) + sizeof(ax25_address) * 6) || (addr_len > sizeof(struct full_sockaddr_ax25))) { err = -EINVAL; goto out; } if (addr_len > sizeof(struct sockaddr_ax25) && usax->sax25_ndigis != 0) { int ct = 0; struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)usax; /* Valid number of digipeaters ? */ if (usax->sax25_ndigis < 1 || usax->sax25_ndigis > AX25_MAX_DIGIS || addr_len < sizeof(struct sockaddr_ax25) + sizeof(ax25_address) * usax->sax25_ndigis) { err = -EINVAL; goto out; } dtmp.ndigi = usax->sax25_ndigis; while (ct < usax->sax25_ndigis) { dtmp.repeated[ct] = 0; dtmp.calls[ct] = fsa->fsa_digipeater[ct]; ct++; } dtmp.lastrepeat = 0; } sax = *usax; if (sk->sk_type == SOCK_SEQPACKET && ax25cmp(&ax25->dest_addr, &sax.sax25_call)) { err = -EISCONN; goto out; } if (usax->sax25_ndigis == 0) dp = NULL; else dp = &dtmp; } else { /* * FIXME: 1003.1g - if the socket is like this because * it has become closed (not started closed) and is VC * we ought to SIGPIPE, EPIPE */ if (sk->sk_state != TCP_ESTABLISHED) { err = -ENOTCONN; goto out; } sax.sax25_family = AF_AX25; sax.sax25_call = ax25->dest_addr; dp = ax25->digipeat; } /* Build a packet */ /* Assume the worst case */ size = len + ax25->ax25_dev->dev->hard_header_len; skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT, &err); if (skb == NULL) goto out; skb_reserve(skb, size - len); /* User data follows immediately after the AX.25 data */ if (memcpy_from_msg(skb_put(skb, len), msg, len)) { err = -EFAULT; kfree_skb(skb); goto out; } skb_reset_network_header(skb); /* Add the PID if one is not supplied by the user in the skb */ if (!ax25->pidincl) *(u8 *)skb_push(skb, 1) = sk->sk_protocol; if (sk->sk_type == SOCK_SEQPACKET) { /* Connected mode sockets go via the LAPB machine */ if (sk->sk_state != TCP_ESTABLISHED) { kfree_skb(skb); err = -ENOTCONN; goto out; } /* Shove it onto the queue and kick */ ax25_output(ax25, ax25->paclen, skb); err = len; goto out; } skb_push(skb, 1 + ax25_addr_size(dp)); /* Building AX.25 Header */ /* Build an AX.25 header */ lv = ax25_addr_build(skb->data, &ax25->source_addr, &sax.sax25_call, dp, AX25_COMMAND, AX25_MODULUS); skb_set_transport_header(skb, lv); *skb_transport_header(skb) = AX25_UI; /* Datagram frames go straight out of the door as UI */ ax25_queue_xmit(skb, ax25->ax25_dev->dev); err = len; out: release_sock(sk); return err; } static int ax25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int 
flags) { struct sock *sk = sock->sk; struct sk_buff *skb, *last; struct sk_buff_head *sk_queue; int copied; int err = 0; int off = 0; long timeo; lock_sock(sk); /* * This works for seqpacket too. The receiver has ordered the * queue for us! We do one quick check first though */ if (sk->sk_type == SOCK_SEQPACKET && sk->sk_state != TCP_ESTABLISHED) { err = -ENOTCONN; goto out; } /* We need support for non-blocking reads. */ sk_queue = &sk->sk_receive_queue; skb = __skb_try_recv_datagram(sk, sk_queue, flags, &off, &err, &last); /* If no packet is available, release_sock(sk) and try again. */ if (!skb) { if (err != -EAGAIN) goto out; release_sock(sk); timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); while (timeo && !__skb_wait_for_more_packets(sk, sk_queue, &err, &timeo, last)) { skb = __skb_try_recv_datagram(sk, sk_queue, flags, &off, &err, &last); if (skb) break; if (err != -EAGAIN) goto done; } if (!skb) goto done; lock_sock(sk); } if (!sk_to_ax25(sk)->pidincl) skb_pull(skb, 1); /* Remove PID */ skb_reset_transport_header(skb); copied = skb->len; if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; } skb_copy_datagram_msg(skb, 0, msg, copied); if (msg->msg_name) { ax25_digi digi; ax25_address src; const unsigned char *mac = skb_mac_header(skb); DECLARE_SOCKADDR(struct sockaddr_ax25 *, sax, msg->msg_name); memset(sax, 0, sizeof(struct full_sockaddr_ax25)); ax25_addr_parse(mac + 1, skb->data - mac - 1, &src, NULL, &digi, NULL, NULL); sax->sax25_family = AF_AX25; /* We set this correctly, even though we may not let the application know the digi calls further down (because it did NOT ask to know them). This could get political... **/ sax->sax25_ndigis = digi.ndigi; sax->sax25_call = src; if (sax->sax25_ndigis != 0) { int ct; struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)sax; for (ct = 0; ct < digi.ndigi; ct++) fsa->fsa_digipeater[ct] = digi.calls[ct]; } msg->msg_namelen = sizeof(struct full_sockaddr_ax25); } skb_free_datagram(sk, skb); err = copied; out: release_sock(sk); done: return err; } static int ax25_shutdown(struct socket *sk, int how) { /* FIXME - generate DM and RNR states */ return -EOPNOTSUPP; } static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; void __user *argp = (void __user *)arg; int res = 0; lock_sock(sk); switch (cmd) { case TIOCOUTQ: { long amount; amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); if (amount < 0) amount = 0; res = put_user(amount, (int __user *)argp); break; } case TIOCINQ: { struct sk_buff *skb; long amount = 0L; /* These two are safe on a single CPU system as only user tasks fiddle here */ if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) amount = skb->len; res = put_user(amount, (int __user *) argp); break; } case SIOCAX25ADDUID: /* Add a uid to the uid/call map table */ case SIOCAX25DELUID: /* Delete a uid from the uid/call map table */ case SIOCAX25GETUID: { struct sockaddr_ax25 sax25; if (copy_from_user(&sax25, argp, sizeof(sax25))) { res = -EFAULT; break; } res = ax25_uid_ioctl(cmd, &sax25); break; } case SIOCAX25NOUID: { /* Set the default policy (default/bar) */ long amount; if (!capable(CAP_NET_ADMIN)) { res = -EPERM; break; } if (get_user(amount, (long __user *)argp)) { res = -EFAULT; break; } if (amount < 0 || amount > AX25_NOUID_BLOCK) { res = -EINVAL; break; } ax25_uid_policy = amount; res = 0; break; } case SIOCADDRT: case SIOCDELRT: case SIOCAX25OPTRT: if (!capable(CAP_NET_ADMIN)) { res = -EPERM; break; } res = ax25_rt_ioctl(cmd, argp); break; case 
SIOCAX25CTLCON: if (!capable(CAP_NET_ADMIN)) { res = -EPERM; break; } res = ax25_ctl_ioctl(cmd, argp); break; case SIOCAX25GETINFO: case SIOCAX25GETINFOOLD: { ax25_cb *ax25 = sk_to_ax25(sk); struct ax25_info_struct ax25_info; ax25_info.t1 = ax25->t1 / HZ; ax25_info.t2 = ax25->t2 / HZ; ax25_info.t3 = ax25->t3 / HZ; ax25_info.idle = ax25->idle / (60 * HZ); ax25_info.n2 = ax25->n2; ax25_info.t1timer = ax25_display_timer(&ax25->t1timer) / HZ; ax25_info.t2timer = ax25_display_timer(&ax25->t2timer) / HZ; ax25_info.t3timer = ax25_display_timer(&ax25->t3timer) / HZ; ax25_info.idletimer = ax25_display_timer(&ax25->idletimer) / (60 * HZ); ax25_info.n2count = ax25->n2count; ax25_info.state = ax25->state; ax25_info.rcv_q = sk_rmem_alloc_get(sk); ax25_info.snd_q = sk_wmem_alloc_get(sk); ax25_info.vs = ax25->vs; ax25_info.vr = ax25->vr; ax25_info.va = ax25->va; ax25_info.vs_max = ax25->vs; /* reserved */ ax25_info.paclen = ax25->paclen; ax25_info.window = ax25->window; /* old structure? */ if (cmd == SIOCAX25GETINFOOLD) { static int warned = 0; if (!warned) { printk(KERN_INFO "%s uses old SIOCAX25GETINFO\n", current->comm); warned=1; } if (copy_to_user(argp, &ax25_info, sizeof(struct ax25_info_struct_deprecated))) { res = -EFAULT; break; } } else { if (copy_to_user(argp, &ax25_info, sizeof(struct ax25_info_struct))) { res = -EINVAL; break; } } res = 0; break; } case SIOCAX25ADDFWD: case SIOCAX25DELFWD: { struct ax25_fwd_struct ax25_fwd; if (!capable(CAP_NET_ADMIN)) { res = -EPERM; break; } if (copy_from_user(&ax25_fwd, argp, sizeof(ax25_fwd))) { res = -EFAULT; break; } res = ax25_fwd_ioctl(cmd, &ax25_fwd); break; } case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: res = -EINVAL; break; default: res = -ENOIOCTLCMD; break; } release_sock(sk); return res; } #ifdef CONFIG_PROC_FS static void *ax25_info_start(struct seq_file *seq, loff_t *pos) __acquires(ax25_list_lock) { spin_lock_bh(&ax25_list_lock); return seq_hlist_start(&ax25_list, *pos); } static void *ax25_info_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_hlist_next(v, &ax25_list, pos); } static void ax25_info_stop(struct seq_file *seq, void *v) __releases(ax25_list_lock) { spin_unlock_bh(&ax25_list_lock); } static int ax25_info_show(struct seq_file *seq, void *v) { ax25_cb *ax25 = hlist_entry(v, struct ax25_cb, ax25_node); char buf[11]; int k; /* * New format: * magic dev src_addr dest_addr,digi1,digi2,.. st vs vr va t1 t1 t2 t2 t3 t3 idle idle n2 n2 rtt window paclen Snd-Q Rcv-Q inode */ seq_printf(seq, "%p %s %s%s ", ax25, ax25->ax25_dev == NULL? "???" : ax25->ax25_dev->dev->name, ax2asc(buf, &ax25->source_addr), ax25->iamdigi? "*":""); seq_printf(seq, "%s", ax2asc(buf, &ax25->dest_addr)); for (k=0; (ax25->digipeat != NULL) && (k < ax25->digipeat->ndigi); k++) { seq_printf(seq, ",%s%s", ax2asc(buf, &ax25->digipeat->calls[k]), ax25->digipeat->repeated[k]? 
"*":""); } seq_printf(seq, " %d %d %d %d %lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %d %d", ax25->state, ax25->vs, ax25->vr, ax25->va, ax25_display_timer(&ax25->t1timer) / HZ, ax25->t1 / HZ, ax25_display_timer(&ax25->t2timer) / HZ, ax25->t2 / HZ, ax25_display_timer(&ax25->t3timer) / HZ, ax25->t3 / HZ, ax25_display_timer(&ax25->idletimer) / (60 * HZ), ax25->idle / (60 * HZ), ax25->n2count, ax25->n2, ax25->rtt / HZ, ax25->window, ax25->paclen); if (ax25->sk != NULL) { seq_printf(seq, " %d %d %lu\n", sk_wmem_alloc_get(ax25->sk), sk_rmem_alloc_get(ax25->sk), sock_i_ino(ax25->sk)); } else { seq_puts(seq, " * * *\n"); } return 0; } static const struct seq_operations ax25_info_seqops = { .start = ax25_info_start, .next = ax25_info_next, .stop = ax25_info_stop, .show = ax25_info_show, }; #endif static const struct net_proto_family ax25_family_ops = { .family = PF_AX25, .create = ax25_create, .owner = THIS_MODULE, }; static const struct proto_ops ax25_proto_ops = { .family = PF_AX25, .owner = THIS_MODULE, .release = ax25_release, .bind = ax25_bind, .connect = ax25_connect, .socketpair = sock_no_socketpair, .accept = ax25_accept, .getname = ax25_getname, .poll = datagram_poll, .ioctl = ax25_ioctl, .gettstamp = sock_gettstamp, .listen = ax25_listen, .shutdown = ax25_shutdown, .setsockopt = ax25_setsockopt, .getsockopt = ax25_getsockopt, .sendmsg = ax25_sendmsg, .recvmsg = ax25_recvmsg, .mmap = sock_no_mmap, }; /* * Called by socket.c on kernel start up */ static struct packet_type ax25_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_AX25), .func = ax25_kiss_rcv, }; static struct notifier_block ax25_dev_notifier = { .notifier_call = ax25_device_event, }; static int __init ax25_init(void) { int rc = proto_register(&ax25_proto, 0); if (rc != 0) goto out; sock_register(&ax25_family_ops); dev_add_pack(&ax25_packet_type); register_netdevice_notifier(&ax25_dev_notifier); proc_create_seq("ax25_route", 0444, init_net.proc_net, &ax25_rt_seqops); proc_create_seq("ax25", 0444, init_net.proc_net, &ax25_info_seqops); proc_create_seq("ax25_calls", 0444, init_net.proc_net, &ax25_uid_seqops); out: return rc; } module_init(ax25_init); MODULE_AUTHOR("Jonathan Naylor G4KLX <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The amateur radio AX.25 link layer protocol"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_AX25); static void __exit ax25_exit(void) { remove_proc_entry("ax25_route", init_net.proc_net); remove_proc_entry("ax25", init_net.proc_net); remove_proc_entry("ax25_calls", init_net.proc_net); unregister_netdevice_notifier(&ax25_dev_notifier); dev_remove_pack(&ax25_packet_type); sock_unregister(PF_AX25); proto_unregister(&ax25_proto); ax25_rt_free(); ax25_uid_free(); ax25_dev_free(); } module_exit(ax25_exit);
//
SPDX-License-Identifier: GPL-2.0 /* * Alarmtimer interface * * This interface provides a timer which is similar to hrtimers, * but triggers an RTC alarm if the box is suspended. * * This interface is influenced by the Android RTC Alarm timer * interface. * * Copyright (C) 2010 IBM Corporation * * Author: John Stultz <john.stultz@linaro.org> */ #include <linux/time.h> #include <linux/hrtimer.h> #include <linux/timerqueue.h> #include <linux/rtc.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> #include <linux/alarmtimer.h> #include <linux/mutex.h> #include <linux/platform_device.h> #include <linux/posix-timers.h> #include <linux/workqueue.h> #include <linux/freezer.h> #include <linux/compat.h> #include <linux/module.h> #include <linux/time_namespace.h> #include "posix-timers.h" #define CREATE_TRACE_POINTS #include <trace/events/alarmtimer.h> /** * struct alarm_base - Alarm timer bases * @lock: Lock for synchronized access to the base * @timerqueue: Timerqueue head managing the list of events * @get_ktime: Function to read the time correlating to the base * @get_timespec: Function to read the namespace time correlating to the base * @base_clockid: clockid for the base */ static struct alarm_base { spinlock_t lock; struct timerqueue_head timerqueue; ktime_t (*get_ktime)(void); void (*get_timespec)(struct timespec64 *tp); clockid_t base_clockid; } alarm_bases[ALARM_NUMTYPE]; #if defined(CONFIG_POSIX_TIMERS) || defined(CONFIG_RTC_CLASS) /* freezer information to handle clock_nanosleep triggered wakeups */ static enum alarmtimer_type freezer_alarmtype; static ktime_t freezer_expires; static ktime_t freezer_delta; static DEFINE_SPINLOCK(freezer_delta_lock); #endif #ifdef CONFIG_RTC_CLASS /* rtc timer and device for setting alarm wakeups at suspend */ static struct rtc_timer rtctimer; static struct rtc_device *rtcdev; static DEFINE_SPINLOCK(rtcdev_lock); /** * alarmtimer_get_rtcdev - Return selected rtc device * * This function returns the rtc device to use for wakealarms.
*/ struct rtc_device *alarmtimer_get_rtcdev(void) { unsigned long flags; struct rtc_device *ret; spin_lock_irqsave(&rtcdev_lock, flags); ret = rtcdev; spin_unlock_irqrestore(&rtcdev_lock, flags); return ret; } EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev); static int alarmtimer_rtc_add_device(struct device *dev) { unsigned long flags; struct rtc_device *rtc = to_rtc_device(dev); struct platform_device *pdev; int ret = 0; if (rtcdev) return -EBUSY; if (!test_bit(RTC_FEATURE_ALARM, rtc->features)) return -1; if (!device_may_wakeup(rtc->dev.parent)) return -1; pdev = platform_device_register_data(dev, "alarmtimer", PLATFORM_DEVID_AUTO, NULL, 0); if (!IS_ERR(pdev)) device_init_wakeup(&pdev->dev, true); spin_lock_irqsave(&rtcdev_lock, flags); if (!IS_ERR(pdev) && !rtcdev) { if (!try_module_get(rtc->owner)) { ret = -1; goto unlock; } rtcdev = rtc; /* hold a reference so it doesn't go away */ get_device(dev); pdev = NULL; } else { ret = -1; } unlock: spin_unlock_irqrestore(&rtcdev_lock, flags); platform_device_unregister(pdev); return ret; } static inline void alarmtimer_rtc_timer_init(void) { rtc_timer_init(&rtctimer, NULL, NULL); } static struct class_interface alarmtimer_rtc_interface = { .add_dev = &alarmtimer_rtc_add_device, }; static int alarmtimer_rtc_interface_setup(void) { alarmtimer_rtc_interface.class = &rtc_class; return class_interface_register(&alarmtimer_rtc_interface); } static void alarmtimer_rtc_interface_remove(void) { class_interface_unregister(&alarmtimer_rtc_interface); } #else static inline int alarmtimer_rtc_interface_setup(void) { return 0; } static inline void alarmtimer_rtc_interface_remove(void) { } static inline void alarmtimer_rtc_timer_init(void) { } #endif /** * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue * @base: pointer to the base where the timer is being run * @alarm: pointer to alarm being enqueued. * * Adds the alarm to an alarm_base timerqueue * * Must hold base->lock when calling. */ static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) { if (alarm->state & ALARMTIMER_STATE_ENQUEUED) timerqueue_del(&base->timerqueue, &alarm->node); timerqueue_add(&base->timerqueue, &alarm->node); alarm->state |= ALARMTIMER_STATE_ENQUEUED; } /** * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue * @base: pointer to the base where the timer is running * @alarm: pointer to alarm being removed * * Removes the alarm from an alarm_base timerqueue * * Must hold base->lock when calling. */ static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm) { if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) return; timerqueue_del(&base->timerqueue, &alarm->node); alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; } /** * alarmtimer_fired - Handles alarm hrtimer being fired. * @timer: pointer to hrtimer being run * * When an alarm timer fires, this runs through the timerqueue to * see which alarms expired, and runs those. If there are more alarm * timers queued for the future, we set the hrtimer to fire when * the next future alarm timer expires.
*/ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) { struct alarm *alarm = container_of(timer, struct alarm, timer); struct alarm_base *base = &alarm_bases[alarm->type]; scoped_guard (spinlock_irqsave, &base->lock) alarmtimer_dequeue(base, alarm); if (alarm->function) alarm->function(alarm, base->get_ktime()); trace_alarmtimer_fired(alarm, base->get_ktime()); return HRTIMER_NORESTART; } ktime_t alarm_expires_remaining(const struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; return ktime_sub(alarm->node.expires, base->get_ktime()); } EXPORT_SYMBOL_GPL(alarm_expires_remaining); #ifdef CONFIG_RTC_CLASS /** * alarmtimer_suspend - Suspend time callback * @dev: unused * * When we are going into suspend, we look through the bases * to see which is the soonest timer to expire. We then * set an rtc timer to fire that far into the future, which * will wake us from suspend. */ static int alarmtimer_suspend(struct device *dev) { ktime_t min, now, expires; int i, ret, type; struct rtc_device *rtc; unsigned long flags; struct rtc_time tm; spin_lock_irqsave(&freezer_delta_lock, flags); min = freezer_delta; expires = freezer_expires; type = freezer_alarmtype; freezer_delta = 0; spin_unlock_irqrestore(&freezer_delta_lock, flags); rtc = alarmtimer_get_rtcdev(); /* If we have no rtcdev, just return */ if (!rtc) return 0; /* Find the soonest timer to expire*/ for (i = 0; i < ALARM_NUMTYPE; i++) { struct alarm_base *base = &alarm_bases[i]; struct timerqueue_node *next; ktime_t delta; spin_lock_irqsave(&base->lock, flags); next = timerqueue_getnext(&base->timerqueue); spin_unlock_irqrestore(&base->lock, flags); if (!next) continue; delta = ktime_sub(next->expires, base->get_ktime()); if (!min || (delta < min)) { expires = next->expires; min = delta; type = i; } } if (min == 0) return 0; if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { pm_wakeup_event(dev, 2 * MSEC_PER_SEC); return -EBUSY; } trace_alarmtimer_suspend(expires, type); /* Setup an rtc timer to fire that far in the future */ rtc_timer_cancel(rtc, &rtctimer); rtc_read_time(rtc, &tm); now = rtc_tm_to_ktime(tm); /* * If the RTC alarm timer only supports a limited time offset, set the * alarm time to the maximum supported value. * The system may wake up earlier (possibly much earlier) than expected * when the alarmtimer runs. This is the best the kernel can do if * the alarmtimer exceeds the time that the rtc device can be programmed * for. 
*/ min = rtc_bound_alarmtime(rtc, min); now = ktime_add(now, min); /* Set alarm, if in the past reject suspend briefly to handle */ ret = rtc_timer_start(rtc, &rtctimer, now, 0); if (ret < 0) pm_wakeup_event(dev, MSEC_PER_SEC); return ret; } static int alarmtimer_resume(struct device *dev) { struct rtc_device *rtc; rtc = alarmtimer_get_rtcdev(); if (rtc) rtc_timer_cancel(rtc, &rtctimer); return 0; } #else static int alarmtimer_suspend(struct device *dev) { return 0; } static int alarmtimer_resume(struct device *dev) { return 0; } #endif static void __alarm_init(struct alarm *alarm, enum alarmtimer_type type, void (*function)(struct alarm *, ktime_t)) { timerqueue_init(&alarm->node); alarm->function = function; alarm->type = type; alarm->state = ALARMTIMER_STATE_INACTIVE; } /** * alarm_init - Initialize an alarm structure * @alarm: ptr to alarm to be initialized * @type: the type of the alarm * @function: callback that is run when the alarm fires */ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, void (*function)(struct alarm *, ktime_t)) { hrtimer_setup(&alarm->timer, alarmtimer_fired, alarm_bases[type].base_clockid, HRTIMER_MODE_ABS); __alarm_init(alarm, type, function); } EXPORT_SYMBOL_GPL(alarm_init); /** * alarm_start - Sets an absolute alarm to fire * @alarm: ptr to alarm to set * @start: time to run the alarm */ void alarm_start(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; spin_lock_irqsave(&base->lock, flags); alarm->node.expires = start; alarmtimer_enqueue(base, alarm); hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); spin_unlock_irqrestore(&base->lock, flags); trace_alarmtimer_start(alarm, base->get_ktime()); } EXPORT_SYMBOL_GPL(alarm_start); /** * alarm_start_relative - Sets a relative alarm to fire * @alarm: ptr to alarm to set * @start: time relative to now to run the alarm */ void alarm_start_relative(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; start = ktime_add_safe(start, base->get_ktime()); alarm_start(alarm, start); } EXPORT_SYMBOL_GPL(alarm_start_relative); void alarm_restart(struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; spin_lock_irqsave(&base->lock, flags); hrtimer_set_expires(&alarm->timer, alarm->node.expires); hrtimer_restart(&alarm->timer); alarmtimer_enqueue(base, alarm); spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(alarm_restart); /** * alarm_try_to_cancel - Tries to cancel an alarm timer * @alarm: ptr to alarm to be canceled * * Returns 1 if the timer was canceled, 0 if it was not running, * and -1 if the callback was running */ int alarm_try_to_cancel(struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; int ret; spin_lock_irqsave(&base->lock, flags); ret = hrtimer_try_to_cancel(&alarm->timer); if (ret >= 0) alarmtimer_dequeue(base, alarm); spin_unlock_irqrestore(&base->lock, flags); trace_alarmtimer_cancel(alarm, base->get_ktime()); return ret; } EXPORT_SYMBOL_GPL(alarm_try_to_cancel); /** * alarm_cancel - Spins trying to cancel an alarm timer until it is done * @alarm: ptr to alarm to be canceled * * Returns 1 if the timer was canceled, 0 if it was not active. 
*/ int alarm_cancel(struct alarm *alarm) { for (;;) { int ret = alarm_try_to_cancel(alarm); if (ret >= 0) return ret; hrtimer_cancel_wait_running(&alarm->timer); } } EXPORT_SYMBOL_GPL(alarm_cancel); u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) { u64 overrun = 1; ktime_t delta; delta = ktime_sub(now, alarm->node.expires); if (delta < 0) return 0; if (unlikely(delta >= interval)) { s64 incr = ktime_to_ns(interval); overrun = ktime_divns(delta, incr); alarm->node.expires = ktime_add_ns(alarm->node.expires, incr*overrun); if (alarm->node.expires > now) return overrun; /* * This (and the ktime_add() below) is the * correction for exact: */ overrun++; } alarm->node.expires = ktime_add_safe(alarm->node.expires, interval); return overrun; } EXPORT_SYMBOL_GPL(alarm_forward); u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) { struct alarm_base *base = &alarm_bases[alarm->type]; return alarm_forward(alarm, base->get_ktime(), interval); } EXPORT_SYMBOL_GPL(alarm_forward_now); #ifdef CONFIG_POSIX_TIMERS static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) { struct alarm_base *base; unsigned long flags; ktime_t delta; switch(type) { case ALARM_REALTIME: base = &alarm_bases[ALARM_REALTIME]; type = ALARM_REALTIME_FREEZER; break; case ALARM_BOOTTIME: base = &alarm_bases[ALARM_BOOTTIME]; type = ALARM_BOOTTIME_FREEZER; break; default: WARN_ONCE(1, "Invalid alarm type: %d\n", type); return; } delta = ktime_sub(absexp, base->get_ktime()); spin_lock_irqsave(&freezer_delta_lock, flags); if (!freezer_delta || (delta < freezer_delta)) { freezer_delta = delta; freezer_expires = absexp; freezer_alarmtype = type; } spin_unlock_irqrestore(&freezer_delta_lock, flags); } /** * clock2alarm - helper that converts from clockid to alarmtypes * @clockid: clockid. */ static enum alarmtimer_type clock2alarm(clockid_t clockid) { if (clockid == CLOCK_REALTIME_ALARM) return ALARM_REALTIME; if (clockid == CLOCK_BOOTTIME_ALARM) return ALARM_BOOTTIME; return -1; } /** * alarm_handle_timer - Callback for posix timers * @alarm: alarm that fired * @now: time at the timer expiration * * Posix timer callback for expired alarm timers. 
* * Return: whether the timer is to be restarted */ static void alarm_handle_timer(struct alarm *alarm, ktime_t now) { struct k_itimer *ptr = container_of(alarm, struct k_itimer, it.alarm.alarmtimer); guard(spinlock_irqsave)(&ptr->it_lock); posix_timer_queue_signal(ptr); } /** * alarm_timer_rearm - Posix timer callback for rearming timer * @timr: Pointer to the posixtimer data struct */ static void alarm_timer_rearm(struct k_itimer *timr) { struct alarm *alarm = &timr->it.alarm.alarmtimer; timr->it_overrun += alarm_forward_now(alarm, timr->it_interval); alarm_start(alarm, alarm->node.expires); } /** * alarm_timer_forward - Posix timer callback for forwarding timer * @timr: Pointer to the posixtimer data struct * @now: Current time to forward the timer against */ static s64 alarm_timer_forward(struct k_itimer *timr, ktime_t now) { struct alarm *alarm = &timr->it.alarm.alarmtimer; return alarm_forward(alarm, timr->it_interval, now); } /** * alarm_timer_remaining - Posix timer callback to retrieve remaining time * @timr: Pointer to the posixtimer data struct * @now: Current time to calculate against */ static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now) { struct alarm *alarm = &timr->it.alarm.alarmtimer; return ktime_sub(alarm->node.expires, now); } /** * alarm_timer_try_to_cancel - Posix timer callback to cancel a timer * @timr: Pointer to the posixtimer data struct */ static int alarm_timer_try_to_cancel(struct k_itimer *timr) { return alarm_try_to_cancel(&timr->it.alarm.alarmtimer); } /** * alarm_timer_wait_running - Posix timer callback to wait for a timer * @timr: Pointer to the posixtimer data struct * * Called from the core code when timer cancel detected that the callback * is running. @timr is unlocked and rcu read lock is held to prevent it * from being freed. */ static void alarm_timer_wait_running(struct k_itimer *timr) { hrtimer_cancel_wait_running(&timr->it.alarm.alarmtimer.timer); } /** * alarm_timer_arm - Posix timer callback to arm a timer * @timr: Pointer to the posixtimer data struct * @expires: The new expiry time * @absolute: Expiry value is absolute time * @sigev_none: Posix timer does not deliver signals */ static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires, bool absolute, bool sigev_none) { struct alarm *alarm = &timr->it.alarm.alarmtimer; struct alarm_base *base = &alarm_bases[alarm->type]; if (!absolute) expires = ktime_add_safe(expires, base->get_ktime()); if (sigev_none) alarm->node.expires = expires; else alarm_start(&timr->it.alarm.alarmtimer, expires); } /** * alarm_clock_getres - posix getres interface * @which_clock: clockid * @tp: timespec to fill * * Returns the granularity of underlying alarm base clock */ static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp) { if (!alarmtimer_get_rtcdev()) return -EINVAL; tp->tv_sec = 0; tp->tv_nsec = hrtimer_resolution; return 0; } /** * alarm_clock_get_timespec - posix clock_get_timespec interface * @which_clock: clockid * @tp: timespec to fill. * * Provides the underlying alarm base time in a tasks time namespace. */ static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp) { struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; if (!alarmtimer_get_rtcdev()) return -EINVAL; base->get_timespec(tp); return 0; } /** * alarm_clock_get_ktime - posix clock_get_ktime interface * @which_clock: clockid * * Provides the underlying alarm base time in the root namespace. 
*/ static ktime_t alarm_clock_get_ktime(clockid_t which_clock) { struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; if (!alarmtimer_get_rtcdev()) return -EINVAL; return base->get_ktime(); } /** * alarm_timer_create - posix timer_create interface * @new_timer: k_itimer pointer to manage * * Initializes the k_itimer structure. */ static int alarm_timer_create(struct k_itimer *new_timer) { enum alarmtimer_type type; if (!alarmtimer_get_rtcdev()) return -EOPNOTSUPP; if (!capable(CAP_WAKE_ALARM)) return -EPERM; type = clock2alarm(new_timer->it_clock); alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); return 0; } /** * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep * @alarm: ptr to alarm that fired * @now: time at the timer expiration * * Wakes up the task that set the alarmtimer */ static void alarmtimer_nsleep_wakeup(struct alarm *alarm, ktime_t now) { struct task_struct *task = alarm->data; alarm->data = NULL; if (task) wake_up_process(task); } /** * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation * @alarm: ptr to alarmtimer * @absexp: absolute expiration time * @type: alarm type (BOOTTIME/REALTIME). * * Sets the alarm timer and sleeps until it is fired or interrupted. */ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, enum alarmtimer_type type) { struct restart_block *restart; alarm->data = (void *)current; do { set_current_state(TASK_INTERRUPTIBLE); alarm_start(alarm, absexp); if (likely(alarm->data)) schedule(); alarm_cancel(alarm); } while (alarm->data && !signal_pending(current)); __set_current_state(TASK_RUNNING); destroy_hrtimer_on_stack(&alarm->timer); if (!alarm->data) return 0; if (freezing(current)) alarmtimer_freezerset(absexp, type); restart = &current->restart_block; if (restart->nanosleep.type != TT_NONE) { struct timespec64 rmt; ktime_t rem; rem = ktime_sub(absexp, alarm_bases[type].get_ktime()); if (rem <= 0) return 0; rmt = ktime_to_timespec64(rem); return nanosleep_copyout(restart, &rmt); } return -ERESTART_RESTARTBLOCK; } static void alarm_init_on_stack(struct alarm *alarm, enum alarmtimer_type type, void (*function)(struct alarm *, ktime_t)) { hrtimer_setup_on_stack(&alarm->timer, alarmtimer_fired, alarm_bases[type].base_clockid, HRTIMER_MODE_ABS); __alarm_init(alarm, type, function); } /** * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep * @restart: ptr to restart block * * Handles restarted clock_nanosleep calls */ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) { enum alarmtimer_type type = restart->nanosleep.clockid; ktime_t exp = restart->nanosleep.expires; struct alarm alarm; alarm_init_on_stack(&alarm, type, alarmtimer_nsleep_wakeup); return alarmtimer_do_nsleep(&alarm, exp, type); } /** * alarm_timer_nsleep - alarmtimer nanosleep * @which_clock: clockid * @flags: determines abstime or relative * @tsreq: requested sleep time (abs or rel) * * Handles clock_nanosleep calls against _ALARM clockids */ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, const struct timespec64 *tsreq) { enum alarmtimer_type type = clock2alarm(which_clock); struct restart_block *restart = &current->restart_block; struct alarm alarm; ktime_t exp; int ret; if (!alarmtimer_get_rtcdev()) return -EOPNOTSUPP; if (flags & ~TIMER_ABSTIME) return -EINVAL; if (!capable(CAP_WAKE_ALARM)) return -EPERM; alarm_init_on_stack(&alarm, type, alarmtimer_nsleep_wakeup); exp = timespec64_to_ktime(*tsreq); /* Convert (if necessary) to absolute time */ if 
(flags != TIMER_ABSTIME) { ktime_t now = alarm_bases[type].get_ktime(); exp = ktime_add_safe(now, exp); } else { exp = timens_ktime_to_host(which_clock, exp); } ret = alarmtimer_do_nsleep(&alarm, exp, type); if (ret != -ERESTART_RESTARTBLOCK) return ret; /* abs timers don't set remaining time or restart */ if (flags == TIMER_ABSTIME) return -ERESTARTNOHAND; restart->nanosleep.clockid = type; restart->nanosleep.expires = exp; set_restart_fn(restart, alarm_timer_nsleep_restart); return ret; } const struct k_clock alarm_clock = { .clock_getres = alarm_clock_getres, .clock_get_ktime = alarm_clock_get_ktime, .clock_get_timespec = alarm_clock_get_timespec, .timer_create = alarm_timer_create, .timer_set = common_timer_set, .timer_del = common_timer_del, .timer_get = common_timer_get, .timer_arm = alarm_timer_arm, .timer_rearm = alarm_timer_rearm, .timer_forward = alarm_timer_forward, .timer_remaining = alarm_timer_remaining, .timer_try_to_cancel = alarm_timer_try_to_cancel, .timer_wait_running = alarm_timer_wait_running, .nsleep = alarm_timer_nsleep, }; #endif /* CONFIG_POSIX_TIMERS */ /* Suspend hook structures */ static const struct dev_pm_ops alarmtimer_pm_ops = { .suspend = alarmtimer_suspend, .resume = alarmtimer_resume, }; static struct platform_driver alarmtimer_driver = { .driver = { .name = "alarmtimer", .pm = &alarmtimer_pm_ops, } }; static void get_boottime_timespec(struct timespec64 *tp) { ktime_get_boottime_ts64(tp); timens_add_boottime(tp); } /** * alarmtimer_init - Initialize alarm timer code * * This function initializes the alarm bases and registers * the posix clock ids. */ static int __init alarmtimer_init(void) { int error; int i; alarmtimer_rtc_timer_init(); /* Initialize alarm bases */ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; alarm_bases[ALARM_REALTIME].get_ktime = &ktime_get_real; alarm_bases[ALARM_REALTIME].get_timespec = ktime_get_real_ts64; alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; alarm_bases[ALARM_BOOTTIME].get_ktime = &ktime_get_boottime; alarm_bases[ALARM_BOOTTIME].get_timespec = get_boottime_timespec; for (i = 0; i < ALARM_NUMTYPE; i++) { timerqueue_init_head(&alarm_bases[i].timerqueue); spin_lock_init(&alarm_bases[i].lock); } error = alarmtimer_rtc_interface_setup(); if (error) return error; error = platform_driver_register(&alarmtimer_driver); if (error) goto out_if; return 0; out_if: alarmtimer_rtc_interface_remove(); return error; } device_initcall(alarmtimer_init);
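/*
 * Editor's note: illustrative userspace sketch, not part of the kernel sources
 * above.  It exercises the alarm_timer_nsleep() path by doing an absolute
 * clock_nanosleep() on CLOCK_BOOTTIME_ALARM, which can wake the system from
 * suspend through the RTC.  The caller needs CAP_WAKE_ALARM and a
 * wakeup-capable RTC registered with the alarmtimer core, otherwise the call
 * fails with EPERM/EOPNOTSUPP; the 30 second delay is an arbitrary example
 * value.
 */
#include <stdio.h>
#include <string.h>
#include <time.h>

int main(void)
{
	struct timespec ts;
	int err;

	/* Read the alarm base's current time (alarm_clock_get_timespec()). */
	if (clock_gettime(CLOCK_BOOTTIME_ALARM, &ts)) {
		perror("clock_gettime(CLOCK_BOOTTIME_ALARM)");
		return 1;
	}
	ts.tv_sec += 30;	/* absolute expiry, 30s from now */

	/* Absolute sleep; serviced by alarmtimer_do_nsleep() in the kernel. */
	err = clock_nanosleep(CLOCK_BOOTTIME_ALARM, TIMER_ABSTIME, &ts, NULL);
	if (err) {
		fprintf(stderr, "clock_nanosleep: %s\n", strerror(err));
		return 1;
	}
	printf("alarm expired (the box may have been woken from suspend)\n");
	return 0;
}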
// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/syscalls.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "../fs/internal.h" #include "io_uring.h" #include "truncate.h" struct io_ftrunc { struct file *file; loff_t len; }; int io_ftruncate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ftrunc *ft = io_kiocb_to_cmd(req, struct io_ftrunc); if (sqe->rw_flags || sqe->addr || sqe->len || sqe->buf_index || sqe->splice_fd_in || sqe->addr3) return -EINVAL; ft->len = READ_ONCE(sqe->off); req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_ftruncate(struct io_kiocb *req, unsigned int issue_flags) { struct io_ftrunc *ft = io_kiocb_to_cmd(req, struct io_ftrunc); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_ftruncate(req->file, ft->len, 1); io_req_set_res(req, ret, 0); return IOU_OK; }
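/*
 * Editor's note: illustrative userspace sketch, not part of the kernel sources
 * above.  It submits the IORING_OP_FTRUNCATE request that io_ftruncate_prep()
 * and io_ftruncate() implement: as the prep routine shows, the new length is
 * carried in sqe->off and the addr/len fields must stay zero, which is what
 * io_uring_prep_rw() produces here.  A kernel and liburing that know
 * IORING_OP_FTRUNCATE (Linux 6.9+) are assumed, and "data.bin" plus the
 * 4096-byte length are arbitrary example values; newer liburing releases also
 * provide an io_uring_prep_ftruncate() helper for the same thing.
 */
#include <fcntl.h>
#include <stdio.h>
#include <liburing.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int fd, ret;

	fd = open("data.bin", O_WRONLY | O_CREAT, 0644);
	if (fd < 0 || io_uring_queue_init(8, &ring, 0) < 0)
		return 1;

	sqe = io_uring_get_sqe(&ring);
	/* opcode + fd; addr = NULL, len = 0, off = new file length */
	io_uring_prep_rw(IORING_OP_FTRUNCATE, sqe, fd, NULL, 0, 4096);

	io_uring_submit(&ring);
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (!ret) {
		printf("ftruncate cqe->res = %d\n", cqe->res);	/* 0 on success */
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return ret ? 1 : 0;
}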
// SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * * Macros and functions to access KVM PTEs (also known as SPTEs) * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2020 Red Hat, Inc. and/or its affiliates.
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include "mmu.h" #include "mmu_internal.h" #include "x86.h" #include "spte.h" #include <asm/e820/api.h> #include <asm/memtype.h> #include <asm/vmx.h> bool __read_mostly enable_mmio_caching = true; static bool __ro_after_init allow_mmio_caching; module_param_named(mmio_caching, enable_mmio_caching, bool, 0444); EXPORT_SYMBOL_GPL(enable_mmio_caching); bool __read_mostly kvm_ad_enabled; u64 __read_mostly shadow_host_writable_mask; u64 __read_mostly shadow_mmu_writable_mask; u64 __read_mostly shadow_nx_mask; u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ u64 __read_mostly shadow_user_mask; u64 __read_mostly shadow_accessed_mask; u64 __read_mostly shadow_dirty_mask; u64 __read_mostly shadow_mmio_value; u64 __read_mostly shadow_mmio_mask; u64 __read_mostly shadow_mmio_access_mask; u64 __read_mostly shadow_present_mask; u64 __read_mostly shadow_memtype_mask; u64 __read_mostly shadow_me_value; u64 __read_mostly shadow_me_mask; u64 __read_mostly shadow_acc_track_mask; u64 __read_mostly shadow_nonpresent_or_rsvd_mask; u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; static u8 __init kvm_get_host_maxphyaddr(void) { /* * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected * in CPU detection code, but the processor treats those reduced bits as * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at * the physical address bits reported by CPUID, i.e. the raw MAXPHYADDR, * when reasoning about CPU behavior with respect to MAXPHYADDR. */ if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) return cpuid_eax(0x80000008) & 0xff; /* * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with * custom CPUID. Proceed with whatever the kernel found since these features * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). */ return boot_cpu_data.x86_phys_bits; } void __init kvm_mmu_spte_module_init(void) { /* * Snapshot userspace's desire to allow MMIO caching. Whether or not * KVM can actually enable MMIO caching depends on vendor-specific * hardware capabilities and other module params that can't be resolved * until the vendor module is loaded, i.e. enable_mmio_caching can and * will change when the vendor module is (re)loaded. */ allow_mmio_caching = enable_mmio_caching; kvm_host.maxphyaddr = kvm_get_host_maxphyaddr(); } static u64 generation_mmio_spte_mask(u64 gen) { u64 mask; WARN_ON_ONCE(gen & ~MMIO_SPTE_GEN_MASK); mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK; mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK; return mask; } u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) { u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; u64 spte = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; WARN_ON_ONCE(!vcpu->kvm->arch.shadow_mmio_value); access &= shadow_mmio_access_mask; spte |= vcpu->kvm->arch.shadow_mmio_value | access; spte |= gpa | shadow_nonpresent_or_rsvd_mask; spte |= (gpa & shadow_nonpresent_or_rsvd_mask) << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN; return spte; } static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) { if (pfn_valid(pfn)) return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) && /* * Some reserved pages, such as those from NVDIMM * DAX devices, are not for MMIO, and can be mapped * with cached memory type for better performance. 
* However, the above check misconceives those pages * as MMIO, and results in KVM mapping them with UC * memory type, which would hurt the performance. * Therefore, we check the host memory type in addition * and only treat UC/UC-/WC pages as MMIO. */ (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn)); return !e820__mapped_raw_any(pfn_to_hpa(pfn), pfn_to_hpa(pfn + 1) - 1, E820_TYPE_RAM); } /* * Returns true if the SPTE has bits that may be set without holding mmu_lock. * The caller is responsible for checking if the SPTE is shadow-present, and * for determining whether or not the caller cares about non-leaf SPTEs. */ bool spte_has_volatile_bits(u64 spte) { if (!is_writable_pte(spte) && is_mmu_writable_spte(spte)) return true; if (is_access_track_spte(spte)) return true; if (spte_ad_enabled(spte)) { if (!(spte & shadow_accessed_mask) || (is_writable_pte(spte) && !(spte & shadow_dirty_mask))) return true; } return false; } bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool synchronizing, bool host_writable, u64 *new_spte) { int level = sp->role.level; u64 spte = SPTE_MMU_PRESENT_MASK; bool wrprot = false; /* * For the EPT case, shadow_present_mask has no RWX bits set if * exec-only page table entries are supported. In that case, * ACC_USER_MASK and shadow_user_mask are used to represent * read access. See FNAME(gpte_access) in paging_tmpl.h. */ WARN_ON_ONCE((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE); if (sp->role.ad_disabled) spte |= SPTE_TDP_AD_DISABLED; else if (kvm_mmu_page_ad_need_write_protect(sp)) spte |= SPTE_TDP_AD_WRPROT_ONLY; spte |= shadow_present_mask; if (!prefetch || synchronizing) spte |= shadow_accessed_mask; /* * For simplicity, enforce the NX huge page mitigation even if not * strictly necessary. KVM could ignore the mitigation if paging is * disabled in the guest, as the guest doesn't have any page tables to * abuse. But to safely ignore the mitigation, KVM would have to * ensure a new MMU is loaded (or all shadow pages zapped) when CR0.PG * is toggled on, and that's a net negative for performance when TDP is * enabled. When TDP is disabled, KVM will always switch to a new MMU * when CR0.PG is toggled, but leveraging that to ignore the mitigation * would tie make_spte() further to vCPU/MMU state, and add complexity * just to optimize a mode that is anything but performance critical. */ if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(vcpu->kvm)) { pte_access &= ~ACC_EXEC_MASK; } if (pte_access & ACC_EXEC_MASK) spte |= shadow_x_mask; else spte |= shadow_nx_mask; if (pte_access & ACC_USER_MASK) spte |= shadow_user_mask; if (level > PG_LEVEL_4K) spte |= PT_PAGE_SIZE_MASK; if (shadow_memtype_mask) spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn, kvm_is_mmio_pfn(pfn)); if (host_writable) spte |= shadow_host_writable_mask; else pte_access &= ~ACC_WRITE_MASK; if (shadow_me_value && !kvm_is_mmio_pfn(pfn)) spte |= shadow_me_value; spte |= (u64)pfn << PAGE_SHIFT; if (pte_access & ACC_WRITE_MASK) { /* * Unsync shadow pages that are reachable by the new, writable * SPTE. Write-protect the SPTE if the page can't be unsync'd, * e.g. it's write-tracked (upper-level SPs) or has one or more * shadow pages and unsync'ing pages is not allowed. * * When overwriting an existing leaf SPTE, and the old SPTE was * writable, skip trying to unsync shadow pages as any relevant * shadow pages must already be unsync, i.e. 
the hash lookup is * unnecessary (and expensive). Note, this relies on KVM not * changing PFNs without first zapping the old SPTE, which is * guaranteed by both the shadow MMU and the TDP MMU. */ if ((!is_last_spte(old_spte, level) || !is_writable_pte(old_spte)) && mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, synchronizing, prefetch)) wrprot = true; else spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask | shadow_dirty_mask; } if (prefetch && !synchronizing) spte = mark_spte_for_access_track(spte); WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level), "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level, get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level)); /* * Mark the memslot dirty *after* modifying it for access tracking. * Unlike folios, memslots can be safely marked dirty out of mmu_lock, * i.e. in the fast page fault handler. */ if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) { /* Enforced by kvm_mmu_hugepage_adjust. */ WARN_ON_ONCE(level > PG_LEVEL_4K); mark_page_dirty_in_slot(vcpu->kvm, slot, gfn); } *new_spte = spte; return wrprot; } static u64 modify_spte_protections(u64 spte, u64 set, u64 clear) { bool is_access_track = is_access_track_spte(spte); if (is_access_track) spte = restore_acc_track_spte(spte); KVM_MMU_WARN_ON(set & clear); spte = (spte | set) & ~clear; if (is_access_track) spte = mark_spte_for_access_track(spte); return spte; } static u64 make_spte_executable(u64 spte) { return modify_spte_protections(spte, shadow_x_mask, shadow_nx_mask); } static u64 make_spte_nonexecutable(u64 spte) { return modify_spte_protections(spte, shadow_nx_mask, shadow_x_mask); } /* * Construct an SPTE that maps a sub-page of the given huge page SPTE where * `index` identifies which sub-page. * * This is used during huge page splitting to build the SPTEs that make up the * new page table. */ u64 make_small_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role, int index) { u64 child_spte = huge_spte; KVM_BUG_ON(!is_shadow_present_pte(huge_spte) || !is_large_pte(huge_spte), kvm); /* * The child_spte already has the base address of the huge page being * split. So we just have to OR in the offset to the page at the next * lower level for the given index. */ child_spte |= (index * KVM_PAGES_PER_HPAGE(role.level)) << PAGE_SHIFT; if (role.level == PG_LEVEL_4K) { child_spte &= ~PT_PAGE_SIZE_MASK; /* * When splitting to a 4K page where execution is allowed, mark * the page executable as the NX hugepage mitigation no longer * applies. */ if ((role.access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(kvm)) child_spte = make_spte_executable(child_spte); } return child_spte; } u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level) { u64 huge_spte; KVM_BUG_ON(!is_shadow_present_pte(small_spte) || level == PG_LEVEL_4K, kvm); huge_spte = small_spte | PT_PAGE_SIZE_MASK; /* * huge_spte already has the address of the sub-page being collapsed * from small_spte, so just clear the lower address bits to create the * huge page address. 
*/ huge_spte &= KVM_HPAGE_MASK(level) | ~PAGE_MASK; if (is_nx_huge_page_enabled(kvm)) huge_spte = make_spte_nonexecutable(huge_spte); return huge_spte; } u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) { u64 spte = SPTE_MMU_PRESENT_MASK; spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK | shadow_user_mask | shadow_x_mask | shadow_me_value; if (ad_disabled) spte |= SPTE_TDP_AD_DISABLED; else spte |= shadow_accessed_mask; return spte; } u64 mark_spte_for_access_track(u64 spte) { if (spte_ad_enabled(spte)) return spte & ~shadow_accessed_mask; if (is_access_track_spte(spte)) return spte; check_spte_writable_invariants(spte); WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT), "Access Tracking saved bit locations are not zero\n"); spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT; spte &= ~(shadow_acc_track_mask | shadow_accessed_mask); return spte; } void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) { BUG_ON((u64)(unsigned)access_mask != access_mask); WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); /* * Reset to the original module param value to honor userspace's desire * to (dis)allow MMIO caching. Update the param itself so that * userspace can see whether or not KVM is actually using MMIO caching. */ enable_mmio_caching = allow_mmio_caching; if (!enable_mmio_caching) mmio_value = 0; /* * The mask must contain only bits that are carved out specifically for * the MMIO SPTE mask, e.g. to ensure there's no overlap with the MMIO * generation. */ if (WARN_ON(mmio_mask & ~SPTE_MMIO_ALLOWED_MASK)) mmio_value = 0; /* * Disable MMIO caching if the MMIO value collides with the bits that * are used to hold the relocated GFN when the L1TF mitigation is * enabled. This should never fire as there is no known hardware that * can trigger this condition, e.g. SME/SEV CPUs that require a custom * MMIO value are not susceptible to L1TF. */ if (WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN))) mmio_value = 0; /* * The masked MMIO value must obviously match itself and a frozen SPTE * must not get a false positive. Frozen SPTEs and MMIO SPTEs should * never collide as MMIO must set some RWX bits, and frozen SPTEs must * not set any RWX bits. */ if (WARN_ON((mmio_value & mmio_mask) != mmio_value) || WARN_ON(mmio_value && (FROZEN_SPTE & mmio_mask) == mmio_value)) mmio_value = 0; if (!mmio_value) enable_mmio_caching = false; shadow_mmio_value = mmio_value; shadow_mmio_mask = mmio_mask; shadow_mmio_access_mask = access_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask) { /* shadow_me_value must be a subset of shadow_me_mask */ if (WARN_ON(me_value & ~me_mask)) me_value = me_mask = 0; shadow_me_value = me_value; shadow_me_mask = me_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_me_spte_mask); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) { kvm_ad_enabled = has_ad_bits; shadow_user_mask = VMX_EPT_READABLE_MASK; shadow_accessed_mask = VMX_EPT_ACCESS_BIT; shadow_dirty_mask = VMX_EPT_DIRTY_BIT; shadow_nx_mask = 0ull; shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */ shadow_present_mask = (has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT; /* * EPT overrides the host MTRRs, and so KVM must program the desired * memtype directly into the SPTEs. 
Note, this mask is just the mask * of all bits that factor into the memtype, the actual memtype must be * dynamically calculated, e.g. to ensure host MMIO is mapped UC. */ shadow_memtype_mask = VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT; shadow_acc_track_mask = VMX_EPT_RWX_MASK; shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE; /* * EPT Misconfigurations are generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). */ kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, VMX_EPT_RWX_MASK | VMX_EPT_SUPPRESS_VE_BIT, 0); } EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks); void kvm_mmu_reset_all_pte_masks(void) { u8 low_phys_bits; u64 mask; kvm_ad_enabled = true; /* * If the CPU has 46 or less physical address bits, then set an * appropriate mask to guard against L1TF attacks. Otherwise, it is * assumed that the CPU is not vulnerable to L1TF. * * Some Intel CPUs address the L1 cache using more PA bits than are * reported by CPUID. Use the PA width of the L1 cache when possible * to achieve more effective mitigation, e.g. if system RAM overlaps * the most significant bits of legal physical address space. */ shadow_nonpresent_or_rsvd_mask = 0; low_phys_bits = boot_cpu_data.x86_phys_bits; if (boot_cpu_has_bug(X86_BUG_L1TF) && !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >= 52 - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)) { low_phys_bits = boot_cpu_data.x86_cache_bits - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN; shadow_nonpresent_or_rsvd_mask = rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1); } shadow_nonpresent_or_rsvd_lower_gfn_mask = GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); shadow_user_mask = PT_USER_MASK; shadow_accessed_mask = PT_ACCESSED_MASK; shadow_dirty_mask = PT_DIRTY_MASK; shadow_nx_mask = PT64_NX_MASK; shadow_x_mask = 0; shadow_present_mask = PT_PRESENT_MASK; /* * For shadow paging and NPT, KVM uses PAT entry '0' to encode WB * memtype in the SPTEs, i.e. relies on host MTRRs to provide the * correct memtype (WB is the "weakest" memtype). */ shadow_memtype_mask = 0; shadow_acc_track_mask = 0; shadow_me_mask = 0; shadow_me_value = 0; shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITABLE; shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITABLE; /* * Set a reserved PA bit in MMIO SPTEs to generate page faults with * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports * 52-bit physical addresses then there are no reserved PA bits in the * PTEs and so the reserved PA approach must be disabled. */ if (kvm_host.maxphyaddr < 52) mask = BIT_ULL(51) | PT_PRESENT_MASK; else mask = 0; kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); }
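/*
 * Illustrative sketch, not part of spte.c: the L1TF-mitigation mask arithmetic
 * that kvm_mmu_reset_all_pte_masks() above performs, evaluated in user space
 * for a hypothetical CPU.  EXAMPLE_RSVD_MASK_LEN and the cache-bit width are
 * assumed stand-ins, not necessarily the kernel's values.
 */
#include <stdio.h>
#include <stdint.h>

#define EXAMPLE_RSVD_MASK_LEN	5	/* assumed stand-in for SHADOW_NONPRESENT_OR_RSVD_MASK_LEN */
#define PAGE_SHIFT		12
#define GENMASK_ULL(h, l)	((~0ULL << (l)) & (~0ULL >> (63 - (h))))

/* same contract as the kernel's rsvd_bits() helper */
static uint64_t rsvd_bits(int s, int e)
{
	return e < s ? 0 : GENMASK_ULL(e, s);
}

int main(void)
{
	int cache_bits = 46;			/* hypothetical x86_cache_bits on an L1TF-affected CPU */
	int low_phys_bits = cache_bits - EXAMPLE_RSVD_MASK_LEN;

	/* reserved bits above the usable PA range hold the relocated GFN */
	uint64_t nonpresent_or_rsvd = rsvd_bits(low_phys_bits, cache_bits - 1);
	/* the bits of the GFN that stay in place */
	uint64_t lower_gfn = GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);

	printf("shadow_nonpresent_or_rsvd_mask           = 0x%016llx\n",
	       (unsigned long long)nonpresent_or_rsvd);
	printf("shadow_nonpresent_or_rsvd_lower_gfn_mask = 0x%016llx\n",
	       (unsigned long long)lower_gfn);
	return 0;
}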
212 198 196 20 26 145 21 4 168 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 /* SPDX-License-Identifier: GPL-2.0-only */ /* * sha1_base.h - core logic for SHA-1 implementations * * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org> */ #ifndef _CRYPTO_SHA1_BASE_H #define _CRYPTO_SHA1_BASE_H #include <crypto/internal/hash.h> #include <crypto/sha1.h> #include <linux/crypto.h> #include <linux/module.h> #include <linux/string.h> #include <linux/unaligned.h> typedef void (sha1_block_fn)(struct sha1_state *sst, u8 const *src, int blocks); static inline int sha1_base_init(struct shash_desc *desc) { struct sha1_state *sctx = shash_desc_ctx(desc); sctx->state[0] = SHA1_H0; sctx->state[1] = SHA1_H1; sctx->state[2] = SHA1_H2; sctx->state[3] = SHA1_H3; sctx->state[4] = SHA1_H4; sctx->count = 0; return 0; } static inline int sha1_base_do_update(struct shash_desc *desc, const u8 *data, unsigned int len, sha1_block_fn *block_fn) { struct sha1_state *sctx = shash_desc_ctx(desc); unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; sctx->count += len; if (unlikely((partial + len) >= SHA1_BLOCK_SIZE)) { int blocks; if (partial) { int p = SHA1_BLOCK_SIZE - partial; memcpy(sctx->buffer + partial, data, p); data += p; len -= p; block_fn(sctx, sctx->buffer, 1); } blocks = len / SHA1_BLOCK_SIZE; len %= SHA1_BLOCK_SIZE; if (blocks) { block_fn(sctx, data, blocks); data += blocks * SHA1_BLOCK_SIZE; } partial = 0; } if (len) memcpy(sctx->buffer + partial, data, len); return 0; } static inline int sha1_base_do_finalize(struct shash_desc *desc, sha1_block_fn *block_fn) { const int bit_offset = SHA1_BLOCK_SIZE - sizeof(__be64); struct sha1_state *sctx = shash_desc_ctx(desc); __be64 *bits = (__be64 *)(sctx->buffer + bit_offset); unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; sctx->buffer[partial++] = 0x80; if (partial > bit_offset) { memset(sctx->buffer + partial, 0x0, SHA1_BLOCK_SIZE - partial); partial = 0; block_fn(sctx, sctx->buffer, 1); } memset(sctx->buffer + partial, 0x0, bit_offset - partial); *bits = cpu_to_be64(sctx->count << 3); block_fn(sctx, sctx->buffer, 1); return 0; } static inline int sha1_base_finish(struct shash_desc *desc, u8 *out) { struct sha1_state *sctx = shash_desc_ctx(desc); __be32 *digest = (__be32 *)out; int i; for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++) put_unaligned_be32(sctx->state[i], digest++); memzero_explicit(sctx, sizeof(*sctx)); return 0; } #endif /* _CRYPTO_SHA1_BASE_H */
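/*
 * Illustrative sketch, not part of sha1_base.h: the final-block layout that
 * sha1_base_do_finalize() above produces, computed in user space for a
 * hypothetical 3-byte message.  Only the padding arithmetic is shown; the
 * compression function itself is omitted.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define BLOCK_SIZE	64
#define BIT_OFFSET	(BLOCK_SIZE - sizeof(uint64_t))	/* where the bit count lives */

int main(void)
{
	uint64_t count = 3;			/* e.g. hashing "abc" */
	unsigned int partial = count % BLOCK_SIZE;
	unsigned char block[BLOCK_SIZE] = { 0 };

	block[partial++] = 0x80;		/* mandatory 1-bit end marker */
	if (partial > BIT_OFFSET) {
		/* no room left for the length: pad this block out and put the
		 * length into one extra all-padding block (not shown here) */
		memset(block + partial, 0, BLOCK_SIZE - partial);
		partial = 0;
	}
	memset(block + partial, 0, BIT_OFFSET - partial);

	/* message length in bits, stored big endian in the last 8 bytes */
	uint64_t bits = count << 3;
	for (int i = 0; i < 8; i++)
		block[BIT_OFFSET + i] = (unsigned char)(bits >> (56 - 8 * i));

	for (int i = 0; i < BLOCK_SIZE; i++)
		printf("%02x%s", block[i], (i % 16 == 15) ? "\n" : " ");
	return 0;
}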
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _IP_SET_GETPORT_H #define _IP_SET_GETPORT_H #include <linux/skbuff.h> #include <linux/types.h> #include <uapi/linux/in.h> extern bool ip_set_get_ip4_port(const struct sk_buff *skb, bool src, __be16 *port, u8 *proto); #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) extern bool ip_set_get_ip6_port(const struct sk_buff *skb, bool src, __be16 *port, u8 *proto); #else static inline bool ip_set_get_ip6_port(const struct sk_buff *skb, bool src, __be16 *port, u8 *proto) { return false; } #endif static inline bool ip_set_proto_with_ports(u8 proto) { switch (proto) { case IPPROTO_TCP: case IPPROTO_SCTP: case IPPROTO_UDP: case IPPROTO_UDPLITE: return true; } return false; } #endif /*_IP_SET_GETPORT_H*/
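/*
 * Illustrative sketch, not from the kernel tree: how a hypothetical caller
 * might combine the helpers declared above.  example_flow_port() is a made-up
 * name, it assumes an IPv4 skb and relies only on the declarations in this
 * header.
 */
static bool example_flow_port(const struct sk_buff *skb, bool src, __be16 *port)
{
	u8 proto;

	/* extract the transport protocol and, if present, the requested port */
	if (!ip_set_get_ip4_port(skb, src, port, &proto))
		return false;

	/* the port is only meaningful for TCP, SCTP, UDP and UDP-Lite */
	return ip_set_proto_with_ports(proto);
}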
1 1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 // SPDX-License-Identifier: GPL-2.0-only /* * QNX6 file system, Linux implementation. * * Version : 1.0.0 * * History : * * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release. 
* 16-02-2012 pagemap extension by Al Viro * */ #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/statfs.h> #include <linux/seq_file.h> #include <linux/crc32.h> #include <linux/mpage.h> #include <linux/fs_parser.h> #include <linux/fs_context.h> #include "qnx6.h" static const struct super_operations qnx6_sops; static void qnx6_put_super(struct super_block *sb); static struct inode *qnx6_alloc_inode(struct super_block *sb); static void qnx6_free_inode(struct inode *inode); static int qnx6_reconfigure(struct fs_context *fc); static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf); static int qnx6_show_options(struct seq_file *seq, struct dentry *root); static const struct super_operations qnx6_sops = { .alloc_inode = qnx6_alloc_inode, .free_inode = qnx6_free_inode, .put_super = qnx6_put_super, .statfs = qnx6_statfs, .show_options = qnx6_show_options, }; static int qnx6_show_options(struct seq_file *seq, struct dentry *root) { struct super_block *sb = root->d_sb; struct qnx6_sb_info *sbi = QNX6_SB(sb); if (sbi->s_mount_opt & QNX6_MOUNT_MMI_FS) seq_puts(seq, ",mmi_fs"); return 0; } static int qnx6_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; sync_filesystem(sb); fc->sb_flags |= SB_RDONLY; return 0; } static unsigned qnx6_get_devblock(struct super_block *sb, __fs32 block) { struct qnx6_sb_info *sbi = QNX6_SB(sb); return fs32_to_cpu(sbi, block) + sbi->s_blks_off; } static unsigned qnx6_block_map(struct inode *inode, unsigned iblock); static int qnx6_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { unsigned phys; pr_debug("qnx6_get_block inode=[%ld] iblock=[%ld]\n", inode->i_ino, (unsigned long)iblock); phys = qnx6_block_map(inode, iblock); if (phys) { /* logical block is before EOF */ map_bh(bh, inode->i_sb, phys); } return 0; } static int qnx6_check_blockptr(__fs32 ptr) { if (ptr == ~(__fs32)0) { pr_err("hit unused blockpointer.\n"); return 0; } return 1; } static int qnx6_read_folio(struct file *file, struct folio *folio) { return mpage_read_folio(folio, qnx6_get_block); } static void qnx6_readahead(struct readahead_control *rac) { mpage_readahead(rac, qnx6_get_block); } /* * returns the block number for the no-th element in the tree * inodebits requred as there are multiple inodes in one inode block */ static unsigned qnx6_block_map(struct inode *inode, unsigned no) { struct super_block *s = inode->i_sb; struct qnx6_sb_info *sbi = QNX6_SB(s); struct qnx6_inode_info *ei = QNX6_I(inode); unsigned block = 0; struct buffer_head *bh; __fs32 ptr; int levelptr; int ptrbits = sbi->s_ptrbits; int bitdelta; u32 mask = (1 << ptrbits) - 1; int depth = ei->di_filelevels; int i; bitdelta = ptrbits * depth; levelptr = no >> bitdelta; if (levelptr > QNX6_NO_DIRECT_POINTERS - 1) { pr_err("Requested file block number (%u) too big.", no); return 0; } block = qnx6_get_devblock(s, ei->di_block_ptr[levelptr]); for (i = 0; i < depth; i++) { bh = sb_bread(s, block); if (!bh) { pr_err("Error reading block (%u)\n", block); return 0; } bitdelta -= ptrbits; levelptr = (no >> bitdelta) & mask; ptr = ((__fs32 *)bh->b_data)[levelptr]; if (!qnx6_check_blockptr(ptr)) return 0; block = qnx6_get_devblock(s, ptr); brelse(bh); } return block; } static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct qnx6_sb_info *sbi = QNX6_SB(sb); u64 id = 
huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = sb->s_magic; buf->f_bsize = sb->s_blocksize; buf->f_blocks = fs32_to_cpu(sbi, sbi->sb->sb_num_blocks); buf->f_bfree = fs32_to_cpu(sbi, sbi->sb->sb_free_blocks); buf->f_files = fs32_to_cpu(sbi, sbi->sb->sb_num_inodes); buf->f_ffree = fs32_to_cpu(sbi, sbi->sb->sb_free_inodes); buf->f_bavail = buf->f_bfree; buf->f_namelen = QNX6_LONG_NAME_MAX; buf->f_fsid = u64_to_fsid(id); return 0; } /* * Check the root directory of the filesystem to make sure * it really _is_ a qnx6 filesystem, and to check the size * of the directory entry. */ static const char *qnx6_checkroot(struct super_block *s) { int error = 0; struct qnx6_dir_entry *dir_entry; struct inode *root = d_inode(s->s_root); struct address_space *mapping = root->i_mapping; struct folio *folio = read_mapping_folio(mapping, 0, NULL); if (IS_ERR(folio)) return "error reading root directory"; dir_entry = kmap_local_folio(folio, 0); if (memcmp(dir_entry[0].de_fname, ".", 2) || memcmp(dir_entry[1].de_fname, "..", 3)) error = 1; folio_release_kmap(folio, dir_entry); if (error) return "error reading root directory."; return NULL; } #ifdef CONFIG_QNX6FS_DEBUG void qnx6_superblock_debug(struct qnx6_super_block *sb, struct super_block *s) { struct qnx6_sb_info *sbi = QNX6_SB(s); pr_debug("magic: %08x\n", fs32_to_cpu(sbi, sb->sb_magic)); pr_debug("checksum: %08x\n", fs32_to_cpu(sbi, sb->sb_checksum)); pr_debug("serial: %llx\n", fs64_to_cpu(sbi, sb->sb_serial)); pr_debug("flags: %08x\n", fs32_to_cpu(sbi, sb->sb_flags)); pr_debug("blocksize: %08x\n", fs32_to_cpu(sbi, sb->sb_blocksize)); pr_debug("num_inodes: %08x\n", fs32_to_cpu(sbi, sb->sb_num_inodes)); pr_debug("free_inodes: %08x\n", fs32_to_cpu(sbi, sb->sb_free_inodes)); pr_debug("num_blocks: %08x\n", fs32_to_cpu(sbi, sb->sb_num_blocks)); pr_debug("free_blocks: %08x\n", fs32_to_cpu(sbi, sb->sb_free_blocks)); pr_debug("inode_levels: %02x\n", sb->Inode.levels); } #endif enum { Opt_mmifs }; struct qnx6_context { unsigned long s_mount_opts; }; static const struct fs_parameter_spec qnx6_param_spec[] = { fsparam_flag ("mmi_fs", Opt_mmifs), {} }; static int qnx6_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct qnx6_context *ctx = fc->fs_private; struct fs_parse_result result; int opt; opt = fs_parse(fc, qnx6_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_mmifs: ctx->s_mount_opts |= QNX6_MOUNT_MMI_FS; break; default: return -EINVAL; } return 0; } static struct buffer_head *qnx6_check_first_superblock(struct super_block *s, int offset, int silent) { struct qnx6_sb_info *sbi = QNX6_SB(s); struct buffer_head *bh; struct qnx6_super_block *sb; /* Check the superblock signatures start with the first superblock */ bh = sb_bread(s, offset); if (!bh) { pr_err("unable to read the first superblock\n"); return NULL; } sb = (struct qnx6_super_block *)bh->b_data; if (fs32_to_cpu(sbi, sb->sb_magic) != QNX6_SUPER_MAGIC) { sbi->s_bytesex = BYTESEX_BE; if (fs32_to_cpu(sbi, sb->sb_magic) == QNX6_SUPER_MAGIC) { /* we got a big endian fs */ pr_debug("fs got different endianness.\n"); return bh; } else sbi->s_bytesex = BYTESEX_LE; if (!silent) { if (offset == 0) { pr_err("wrong signature (magic) in superblock #1.\n"); } else { pr_info("wrong signature (magic) at position (0x%lx) - will try alternative position (0x0000).\n", offset * s->s_blocksize); } } brelse(bh); return NULL; } return bh; } static struct inode *qnx6_private_inode(struct super_block *s, struct qnx6_root_node *p); static int qnx6_fill_super(struct super_block 
*s, struct fs_context *fc) { struct buffer_head *bh1 = NULL, *bh2 = NULL; struct qnx6_super_block *sb1 = NULL, *sb2 = NULL; struct qnx6_sb_info *sbi; struct qnx6_context *ctx = fc->fs_private; struct inode *root; const char *errmsg; struct qnx6_sb_info *qs; int ret = -EINVAL; u64 offset; int bootblock_offset = QNX6_BOOTBLOCK_SIZE; int silent = fc->sb_flags & SB_SILENT; qs = kzalloc(sizeof(struct qnx6_sb_info), GFP_KERNEL); if (!qs) return -ENOMEM; s->s_fs_info = qs; qs->s_mount_opt = ctx->s_mount_opts; /* Superblock always is 512 Byte long */ if (!sb_set_blocksize(s, QNX6_SUPERBLOCK_SIZE)) { pr_err("unable to set blocksize\n"); goto outnobh; } if (qs->s_mount_opt == QNX6_MOUNT_MMI_FS) { sb1 = qnx6_mmi_fill_super(s, silent); if (sb1) goto mmi_success; else goto outnobh; } sbi = QNX6_SB(s); sbi->s_bytesex = BYTESEX_LE; /* Check the superblock signatures start with the first superblock */ bh1 = qnx6_check_first_superblock(s, bootblock_offset / QNX6_SUPERBLOCK_SIZE, silent); if (!bh1) { /* try again without bootblock offset */ bh1 = qnx6_check_first_superblock(s, 0, silent); if (!bh1) { pr_err("unable to read the first superblock\n"); goto outnobh; } /* seems that no bootblock at partition start */ bootblock_offset = 0; } sb1 = (struct qnx6_super_block *)bh1->b_data; #ifdef CONFIG_QNX6FS_DEBUG qnx6_superblock_debug(sb1, s); #endif /* checksum check - start at byte 8 and end at byte 512 */ if (fs32_to_cpu(sbi, sb1->sb_checksum) != crc32_be(0, (char *)(bh1->b_data + 8), 504)) { pr_err("superblock #1 checksum error\n"); goto out; } /* set new blocksize */ if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) { pr_err("unable to set blocksize\n"); goto out; } /* blocksize invalidates bh - pull it back in */ brelse(bh1); bh1 = sb_bread(s, bootblock_offset >> s->s_blocksize_bits); if (!bh1) goto outnobh; sb1 = (struct qnx6_super_block *)bh1->b_data; /* calculate second superblock blocknumber */ offset = fs32_to_cpu(sbi, sb1->sb_num_blocks) + (bootblock_offset >> s->s_blocksize_bits) + (QNX6_SUPERBLOCK_AREA >> s->s_blocksize_bits); /* set bootblock offset */ sbi->s_blks_off = (bootblock_offset >> s->s_blocksize_bits) + (QNX6_SUPERBLOCK_AREA >> s->s_blocksize_bits); /* next the second superblock */ bh2 = sb_bread(s, offset); if (!bh2) { pr_err("unable to read the second superblock\n"); goto out; } sb2 = (struct qnx6_super_block *)bh2->b_data; if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) { if (!silent) pr_err("wrong signature (magic) in superblock #2.\n"); goto out; } /* checksum check - start at byte 8 and end at byte 512 */ if (fs32_to_cpu(sbi, sb2->sb_checksum) != crc32_be(0, (char *)(bh2->b_data + 8), 504)) { pr_err("superblock #2 checksum error\n"); goto out; } if (fs64_to_cpu(sbi, sb1->sb_serial) >= fs64_to_cpu(sbi, sb2->sb_serial)) { /* superblock #1 active */ sbi->sb_buf = bh1; sbi->sb = (struct qnx6_super_block *)bh1->b_data; brelse(bh2); pr_info("superblock #1 active\n"); } else { /* superblock #2 active */ sbi->sb_buf = bh2; sbi->sb = (struct qnx6_super_block *)bh2->b_data; brelse(bh1); pr_info("superblock #2 active\n"); } mmi_success: /* sanity check - limit maximum indirect pointer levels */ if (sb1->Inode.levels > QNX6_PTR_MAX_LEVELS) { pr_err("too many inode levels (max %i, sb %i)\n", QNX6_PTR_MAX_LEVELS, sb1->Inode.levels); goto out; } if (sb1->Longfile.levels > QNX6_PTR_MAX_LEVELS) { pr_err("too many longfilename levels (max %i, sb %i)\n", QNX6_PTR_MAX_LEVELS, sb1->Longfile.levels); goto out; } s->s_op = &qnx6_sops; s->s_magic = QNX6_SUPER_MAGIC; s->s_flags |= 
SB_RDONLY; /* Yup, read-only yet */ s->s_time_min = 0; s->s_time_max = U32_MAX; /* ease the later tree level calculations */ sbi = QNX6_SB(s); sbi->s_ptrbits = ilog2(s->s_blocksize / 4); sbi->inodes = qnx6_private_inode(s, &sb1->Inode); if (!sbi->inodes) goto out; sbi->longfile = qnx6_private_inode(s, &sb1->Longfile); if (!sbi->longfile) goto out1; /* prefetch root inode */ root = qnx6_iget(s, QNX6_ROOT_INO); if (IS_ERR(root)) { pr_err("get inode failed\n"); ret = PTR_ERR(root); goto out2; } ret = -ENOMEM; s->s_root = d_make_root(root); if (!s->s_root) goto out2; ret = -EINVAL; errmsg = qnx6_checkroot(s); if (errmsg != NULL) { if (!silent) pr_err("%s\n", errmsg); goto out3; } return 0; out3: dput(s->s_root); s->s_root = NULL; out2: iput(sbi->longfile); out1: iput(sbi->inodes); out: brelse(bh1); brelse(bh2); outnobh: kfree(qs); s->s_fs_info = NULL; return ret; } static void qnx6_put_super(struct super_block *sb) { struct qnx6_sb_info *qs = QNX6_SB(sb); brelse(qs->sb_buf); iput(qs->longfile); iput(qs->inodes); kfree(qs); sb->s_fs_info = NULL; return; } static sector_t qnx6_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping, block, qnx6_get_block); } static const struct address_space_operations qnx6_aops = { .read_folio = qnx6_read_folio, .readahead = qnx6_readahead, .bmap = qnx6_bmap }; static struct inode *qnx6_private_inode(struct super_block *s, struct qnx6_root_node *p) { struct inode *inode = new_inode(s); if (inode) { struct qnx6_inode_info *ei = QNX6_I(inode); struct qnx6_sb_info *sbi = QNX6_SB(s); inode->i_size = fs64_to_cpu(sbi, p->size); memcpy(ei->di_block_ptr, p->ptr, sizeof(p->ptr)); ei->di_filelevels = p->levels; inode->i_mode = S_IFREG | S_IRUSR; /* probably wrong */ inode->i_mapping->a_ops = &qnx6_aops; } return inode; } struct inode *qnx6_iget(struct super_block *sb, unsigned ino) { struct qnx6_sb_info *sbi = QNX6_SB(sb); struct qnx6_inode_entry *raw_inode; struct inode *inode; struct qnx6_inode_info *ei; struct address_space *mapping; struct folio *folio; u32 n, offs; inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; ei = QNX6_I(inode); inode->i_mode = 0; if (ino == 0) { pr_err("bad inode number on dev %s: %u is out of range\n", sb->s_id, ino); iget_failed(inode); return ERR_PTR(-EIO); } n = (ino - 1) >> (PAGE_SHIFT - QNX6_INODE_SIZE_BITS); mapping = sbi->inodes->i_mapping; folio = read_mapping_folio(mapping, n, NULL); if (IS_ERR(folio)) { pr_err("major problem: unable to read inode from dev %s\n", sb->s_id); iget_failed(inode); return ERR_CAST(folio); } offs = offset_in_folio(folio, (ino - 1) << QNX6_INODE_SIZE_BITS); raw_inode = kmap_local_folio(folio, offs); inode->i_mode = fs16_to_cpu(sbi, raw_inode->di_mode); i_uid_write(inode, (uid_t)fs32_to_cpu(sbi, raw_inode->di_uid)); i_gid_write(inode, (gid_t)fs32_to_cpu(sbi, raw_inode->di_gid)); inode->i_size = fs64_to_cpu(sbi, raw_inode->di_size); inode_set_mtime(inode, fs32_to_cpu(sbi, raw_inode->di_mtime), 0); inode_set_atime(inode, fs32_to_cpu(sbi, raw_inode->di_atime), 0); inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->di_ctime), 0); /* calc blocks based on 512 byte blocksize */ inode->i_blocks = (inode->i_size + 511) >> 9; memcpy(&ei->di_block_ptr, &raw_inode->di_block_ptr, sizeof(raw_inode->di_block_ptr)); ei->di_filelevels = raw_inode->di_filelevels; if (S_ISREG(inode->i_mode)) { inode->i_fop = &generic_ro_fops; inode->i_mapping->a_ops = &qnx6_aops; } else if (S_ISDIR(inode->i_mode)) { inode->i_op = 
&qnx6_dir_inode_operations; inode->i_fop = &qnx6_dir_operations; inode->i_mapping->a_ops = &qnx6_aops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &page_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &qnx6_aops; } else init_special_inode(inode, inode->i_mode, 0); folio_release_kmap(folio, raw_inode); unlock_new_inode(inode); return inode; } static struct kmem_cache *qnx6_inode_cachep; static struct inode *qnx6_alloc_inode(struct super_block *sb) { struct qnx6_inode_info *ei; ei = alloc_inode_sb(sb, qnx6_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; } static void qnx6_free_inode(struct inode *inode) { kmem_cache_free(qnx6_inode_cachep, QNX6_I(inode)); } static void init_once(void *foo) { struct qnx6_inode_info *ei = (struct qnx6_inode_info *) foo; inode_init_once(&ei->vfs_inode); } static int init_inodecache(void) { qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache", sizeof(struct qnx6_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT), init_once); if (!qnx6_inode_cachep) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(qnx6_inode_cachep); } static int qnx6_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, qnx6_fill_super); } static void qnx6_free_fc(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations qnx6_context_ops = { .parse_param = qnx6_parse_param, .get_tree = qnx6_get_tree, .reconfigure = qnx6_reconfigure, .free = qnx6_free_fc, }; static int qnx6_init_fs_context(struct fs_context *fc) { struct qnx6_context *ctx; ctx = kzalloc(sizeof(struct qnx6_context), GFP_KERNEL); if (!ctx) return -ENOMEM; fc->ops = &qnx6_context_ops; fc->fs_private = ctx; return 0; } static struct file_system_type qnx6_fs_type = { .owner = THIS_MODULE, .name = "qnx6", .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = qnx6_init_fs_context, .parameters = qnx6_param_spec, }; MODULE_ALIAS_FS("qnx6"); static int __init init_qnx6_fs(void) { int err; err = init_inodecache(); if (err) return err; err = register_filesystem(&qnx6_fs_type); if (err) { destroy_inodecache(); return err; } pr_info("QNX6 filesystem 1.0.0 registered.\n"); return 0; } static void __exit exit_qnx6_fs(void) { unregister_filesystem(&qnx6_fs_type); destroy_inodecache(); } module_init(init_qnx6_fs) module_exit(exit_qnx6_fs) MODULE_DESCRIPTION("QNX6 file system"); MODULE_LICENSE("GPL");
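/*
 * Illustrative sketch, not part of the qnx6 driver: how qnx6_block_map() above
 * slices a file-relative block number into one index per indirect-pointer
 * level.  The ptrbits/depth/block-number values are hypothetical.
 */
#include <stdio.h>

int main(void)
{
	int ptrbits = 8;		/* e.g. 1024-byte blocks: ilog2(1024 / 4) */
	int depth = 2;			/* di_filelevels */
	unsigned int no = 0x1a2b3;	/* file-relative block number */
	unsigned int mask = (1u << ptrbits) - 1;
	int bitdelta = ptrbits * depth;

	/* the top-level index picks one of the direct pointers in the inode */
	printf("direct pointer index: %u\n", no >> bitdelta);

	/* each lower level consumes another ptrbits-wide slice of 'no' */
	for (int i = 0; i < depth; i++) {
		bitdelta -= ptrbits;
		printf("level %d index       : %u\n", i + 1, (no >> bitdelta) & mask);
	}
	return 0;
}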
116 116 5 5 3 3 3 1 10 10 2 2 2 6 1 1 5 3 5 5 10 10 104 103 103 104 103 103 104 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 // SPDX-License-Identifier: GPL-2.0-or-later /* * drm gem framebuffer helper functions * * Copyright (C) 2017 Noralf Trønnes */ #include <linux/slab.h> #include <linux/module.h> #include <drm/drm_damage_helper.h> #include <drm/drm_drv.h> #include <drm/drm_fourcc.h> #include <drm/drm_framebuffer.h> #include <drm/drm_gem.h> #include <drm/drm_gem_framebuffer_helper.h> #include <drm/drm_modeset_helper.h> #include "drm_internal.h" MODULE_IMPORT_NS("DMA_BUF"); #define AFBC_HEADER_SIZE 16 #define AFBC_TH_LAYOUT_ALIGNMENT 8 #define AFBC_HDR_ALIGN 64 #define AFBC_SUPERBLOCK_PIXELS 256 #define AFBC_SUPERBLOCK_ALIGNMENT 128 #define AFBC_TH_BODY_START_ALIGNMENT 4096 /** * DOC: overview * * This library provides helpers for drivers that don't subclass * &drm_framebuffer and use &drm_gem_object for their backing storage. * * Drivers without additional needs to validate framebuffers can simply use * drm_gem_fb_create() and everything is wired up automatically. Other drivers * can use all parts independently. 
*/ /** * drm_gem_fb_get_obj() - Get GEM object backing the framebuffer * @fb: Framebuffer * @plane: Plane index * * No additional reference is taken beyond the one that the &drm_frambuffer * already holds. * * Returns: * Pointer to &drm_gem_object for the given framebuffer and plane index or NULL * if it does not exist. */ struct drm_gem_object *drm_gem_fb_get_obj(struct drm_framebuffer *fb, unsigned int plane) { struct drm_device *dev = fb->dev; if (drm_WARN_ON_ONCE(dev, plane >= ARRAY_SIZE(fb->obj))) return NULL; else if (drm_WARN_ON_ONCE(dev, !fb->obj[plane])) return NULL; return fb->obj[plane]; } EXPORT_SYMBOL_GPL(drm_gem_fb_get_obj); static int drm_gem_fb_init(struct drm_device *dev, struct drm_framebuffer *fb, const struct drm_mode_fb_cmd2 *mode_cmd, struct drm_gem_object **obj, unsigned int num_planes, const struct drm_framebuffer_funcs *funcs) { unsigned int i; int ret; drm_helper_mode_fill_fb_struct(dev, fb, mode_cmd); for (i = 0; i < num_planes; i++) fb->obj[i] = obj[i]; ret = drm_framebuffer_init(dev, fb, funcs); if (ret) drm_err(dev, "Failed to init framebuffer: %d\n", ret); return ret; } /** * drm_gem_fb_destroy - Free GEM backed framebuffer * @fb: Framebuffer * * Frees a GEM backed framebuffer with its backing buffer(s) and the structure * itself. Drivers can use this as their &drm_framebuffer_funcs->destroy * callback. */ void drm_gem_fb_destroy(struct drm_framebuffer *fb) { unsigned int i; for (i = 0; i < fb->format->num_planes; i++) drm_gem_object_put(fb->obj[i]); drm_framebuffer_cleanup(fb); kfree(fb); } EXPORT_SYMBOL(drm_gem_fb_destroy); /** * drm_gem_fb_create_handle - Create handle for GEM backed framebuffer * @fb: Framebuffer * @file: DRM file to register the handle for * @handle: Pointer to return the created handle * * This function creates a handle for the GEM object backing the framebuffer. * Drivers can use this as their &drm_framebuffer_funcs->create_handle * callback. The GETFB IOCTL calls into this callback. * * Returns: * 0 on success or a negative error code on failure. */ int drm_gem_fb_create_handle(struct drm_framebuffer *fb, struct drm_file *file, unsigned int *handle) { return drm_gem_handle_create(file, fb->obj[0], handle); } EXPORT_SYMBOL(drm_gem_fb_create_handle); /** * drm_gem_fb_init_with_funcs() - Helper function for implementing * &drm_mode_config_funcs.fb_create * callback in cases when the driver * allocates a subclass of * struct drm_framebuffer * @dev: DRM device * @fb: framebuffer object * @file: DRM file that holds the GEM handle(s) backing the framebuffer * @mode_cmd: Metadata from the userspace framebuffer creation request * @funcs: vtable to be used for the new framebuffer object * * This function can be used to set &drm_framebuffer_funcs for drivers that need * custom framebuffer callbacks. Use drm_gem_fb_create() if you don't need to * change &drm_framebuffer_funcs. The function does buffer size validation. * The buffer size validation is for a general case, though, so users should * pay attention to the checks being appropriate for them or, at least, * non-conflicting. * * Returns: * Zero or a negative error code. 
*/ int drm_gem_fb_init_with_funcs(struct drm_device *dev, struct drm_framebuffer *fb, struct drm_file *file, const struct drm_mode_fb_cmd2 *mode_cmd, const struct drm_framebuffer_funcs *funcs) { const struct drm_format_info *info; struct drm_gem_object *objs[DRM_FORMAT_MAX_PLANES]; unsigned int i; int ret; info = drm_get_format_info(dev, mode_cmd); if (!info) { drm_dbg_kms(dev, "Failed to get FB format info\n"); return -EINVAL; } if (drm_drv_uses_atomic_modeset(dev) && !drm_any_plane_has_format(dev, mode_cmd->pixel_format, mode_cmd->modifier[0])) { drm_dbg_kms(dev, "Unsupported pixel format %p4cc / modifier 0x%llx\n", &mode_cmd->pixel_format, mode_cmd->modifier[0]); return -EINVAL; } for (i = 0; i < info->num_planes; i++) { unsigned int width = mode_cmd->width / (i ? info->hsub : 1); unsigned int height = mode_cmd->height / (i ? info->vsub : 1); unsigned int min_size; objs[i] = drm_gem_object_lookup(file, mode_cmd->handles[i]); if (!objs[i]) { drm_dbg_kms(dev, "Failed to lookup GEM object\n"); ret = -ENOENT; goto err_gem_object_put; } min_size = (height - 1) * mode_cmd->pitches[i] + drm_format_info_min_pitch(info, i, width) + mode_cmd->offsets[i]; if (objs[i]->size < min_size) { drm_dbg_kms(dev, "GEM object size (%zu) smaller than minimum size (%u) for plane %d\n", objs[i]->size, min_size, i); drm_gem_object_put(objs[i]); ret = -EINVAL; goto err_gem_object_put; } } ret = drm_gem_fb_init(dev, fb, mode_cmd, objs, i, funcs); if (ret) goto err_gem_object_put; return 0; err_gem_object_put: while (i > 0) { --i; drm_gem_object_put(objs[i]); } return ret; } EXPORT_SYMBOL_GPL(drm_gem_fb_init_with_funcs); /** * drm_gem_fb_create_with_funcs() - Helper function for the * &drm_mode_config_funcs.fb_create * callback * @dev: DRM device * @file: DRM file that holds the GEM handle(s) backing the framebuffer * @mode_cmd: Metadata from the userspace framebuffer creation request * @funcs: vtable to be used for the new framebuffer object * * This function can be used to set &drm_framebuffer_funcs for drivers that need * custom framebuffer callbacks. Use drm_gem_fb_create() if you don't need to * change &drm_framebuffer_funcs. The function does buffer size validation. * * Returns: * Pointer to a &drm_framebuffer on success or an error pointer on failure. */ struct drm_framebuffer * drm_gem_fb_create_with_funcs(struct drm_device *dev, struct drm_file *file, const struct drm_mode_fb_cmd2 *mode_cmd, const struct drm_framebuffer_funcs *funcs) { struct drm_framebuffer *fb; int ret; fb = kzalloc(sizeof(*fb), GFP_KERNEL); if (!fb) return ERR_PTR(-ENOMEM); ret = drm_gem_fb_init_with_funcs(dev, fb, file, mode_cmd, funcs); if (ret) { kfree(fb); return ERR_PTR(ret); } return fb; } EXPORT_SYMBOL_GPL(drm_gem_fb_create_with_funcs); static const struct drm_framebuffer_funcs drm_gem_fb_funcs = { .destroy = drm_gem_fb_destroy, .create_handle = drm_gem_fb_create_handle, }; /** * drm_gem_fb_create() - Helper function for the * &drm_mode_config_funcs.fb_create callback * @dev: DRM device * @file: DRM file that holds the GEM handle(s) backing the framebuffer * @mode_cmd: Metadata from the userspace framebuffer creation request * * This function creates a new framebuffer object described by * &drm_mode_fb_cmd2. This description includes handles for the buffer(s) * backing the framebuffer. * * If your hardware has special alignment or pitch requirements these should be * checked before calling this function. The function does buffer size * validation. Use drm_gem_fb_create_with_dirty() if you need framebuffer * flushing. 
* * Drivers can use this as their &drm_mode_config_funcs.fb_create callback. * The ADDFB2 IOCTL calls into this callback. * * Returns: * Pointer to a &drm_framebuffer on success or an error pointer on failure. */ struct drm_framebuffer * drm_gem_fb_create(struct drm_device *dev, struct drm_file *file, const struct drm_mode_fb_cmd2 *mode_cmd) { return drm_gem_fb_create_with_funcs(dev, file, mode_cmd, &drm_gem_fb_funcs); } EXPORT_SYMBOL_GPL(drm_gem_fb_create); static const struct drm_framebuffer_funcs drm_gem_fb_funcs_dirtyfb = { .destroy = drm_gem_fb_destroy, .create_handle = drm_gem_fb_create_handle, .dirty = drm_atomic_helper_dirtyfb, }; /** * drm_gem_fb_create_with_dirty() - Helper function for the * &drm_mode_config_funcs.fb_create callback * @dev: DRM device * @file: DRM file that holds the GEM handle(s) backing the framebuffer * @mode_cmd: Metadata from the userspace framebuffer creation request * * This function creates a new framebuffer object described by * &drm_mode_fb_cmd2. This description includes handles for the buffer(s) * backing the framebuffer. drm_atomic_helper_dirtyfb() is used for the dirty * callback giving framebuffer flushing through the atomic machinery. Use * drm_gem_fb_create() if you don't need the dirty callback. * The function does buffer size validation. * * Drivers should also call drm_plane_enable_fb_damage_clips() on all planes * to enable userspace to use damage clips also with the ATOMIC IOCTL. * * Drivers can use this as their &drm_mode_config_funcs.fb_create callback. * The ADDFB2 IOCTL calls into this callback. * * Returns: * Pointer to a &drm_framebuffer on success or an error pointer on failure. */ struct drm_framebuffer * drm_gem_fb_create_with_dirty(struct drm_device *dev, struct drm_file *file, const struct drm_mode_fb_cmd2 *mode_cmd) { return drm_gem_fb_create_with_funcs(dev, file, mode_cmd, &drm_gem_fb_funcs_dirtyfb); } EXPORT_SYMBOL_GPL(drm_gem_fb_create_with_dirty); /** * drm_gem_fb_vmap - maps all framebuffer BOs into kernel address space * @fb: the framebuffer * @map: returns the mapping's address for each BO * @data: returns the data address for each BO, can be NULL * * This function maps all buffer objects of the given framebuffer into * kernel address space and stores them in struct iosys_map. If the * mapping operation fails for one of the BOs, the function unmaps the * already established mappings automatically. * * Callers that want to access a BO's stored data should pass @data. * The argument returns the addresses of the data stored in each BO. This * is different from @map if the framebuffer's offsets field is non-zero. * * Both, @map and @data, must each refer to arrays with at least * fb->format->num_planes elements. * * See drm_gem_fb_vunmap() for unmapping. * * Returns: * 0 on success, or a negative errno code otherwise. 
*/ int drm_gem_fb_vmap(struct drm_framebuffer *fb, struct iosys_map *map, struct iosys_map *data) { struct drm_gem_object *obj; unsigned int i; int ret; for (i = 0; i < fb->format->num_planes; ++i) { obj = drm_gem_fb_get_obj(fb, i); if (!obj) { ret = -EINVAL; goto err_drm_gem_vunmap; } ret = drm_gem_vmap_unlocked(obj, &map[i]); if (ret) goto err_drm_gem_vunmap; } if (data) { for (i = 0; i < fb->format->num_planes; ++i) { memcpy(&data[i], &map[i], sizeof(data[i])); if (iosys_map_is_null(&data[i])) continue; iosys_map_incr(&data[i], fb->offsets[i]); } } return 0; err_drm_gem_vunmap: while (i) { --i; obj = drm_gem_fb_get_obj(fb, i); if (!obj) continue; drm_gem_vunmap_unlocked(obj, &map[i]); } return ret; } EXPORT_SYMBOL(drm_gem_fb_vmap); /** * drm_gem_fb_vunmap - unmaps framebuffer BOs from kernel address space * @fb: the framebuffer * @map: mapping addresses as returned by drm_gem_fb_vmap() * * This function unmaps all buffer objects of the given framebuffer. * * See drm_gem_fb_vmap() for more information. */ void drm_gem_fb_vunmap(struct drm_framebuffer *fb, struct iosys_map *map) { unsigned int i = fb->format->num_planes; struct drm_gem_object *obj; while (i) { --i; obj = drm_gem_fb_get_obj(fb, i); if (!obj) continue; if (iosys_map_is_null(&map[i])) continue; drm_gem_vunmap_unlocked(obj, &map[i]); } } EXPORT_SYMBOL(drm_gem_fb_vunmap); static void __drm_gem_fb_end_cpu_access(struct drm_framebuffer *fb, enum dma_data_direction dir, unsigned int num_planes) { struct dma_buf_attachment *import_attach; struct drm_gem_object *obj; int ret; while (num_planes) { --num_planes; obj = drm_gem_fb_get_obj(fb, num_planes); if (!obj) continue; import_attach = obj->import_attach; if (!import_attach) continue; ret = dma_buf_end_cpu_access(import_attach->dmabuf, dir); if (ret) drm_err(fb->dev, "dma_buf_end_cpu_access(%u, %d) failed: %d\n", ret, num_planes, dir); } } /** * drm_gem_fb_begin_cpu_access - prepares GEM buffer objects for CPU access * @fb: the framebuffer * @dir: access mode * * Prepares a framebuffer's GEM buffer objects for CPU access. This function * must be called before accessing the BO data within the kernel. For imported * BOs, the function calls dma_buf_begin_cpu_access(). * * See drm_gem_fb_end_cpu_access() for signalling the end of CPU access. * * Returns: * 0 on success, or a negative errno code otherwise. */ int drm_gem_fb_begin_cpu_access(struct drm_framebuffer *fb, enum dma_data_direction dir) { struct dma_buf_attachment *import_attach; struct drm_gem_object *obj; unsigned int i; int ret; for (i = 0; i < fb->format->num_planes; ++i) { obj = drm_gem_fb_get_obj(fb, i); if (!obj) { ret = -EINVAL; goto err___drm_gem_fb_end_cpu_access; } import_attach = obj->import_attach; if (!import_attach) continue; ret = dma_buf_begin_cpu_access(import_attach->dmabuf, dir); if (ret) goto err___drm_gem_fb_end_cpu_access; } return 0; err___drm_gem_fb_end_cpu_access: __drm_gem_fb_end_cpu_access(fb, dir, i); return ret; } EXPORT_SYMBOL(drm_gem_fb_begin_cpu_access); /** * drm_gem_fb_end_cpu_access - signals end of CPU access to GEM buffer objects * @fb: the framebuffer * @dir: access mode * * Signals the end of CPU access to the given framebuffer's GEM buffer objects. This * function must be paired with a corresponding call to drm_gem_fb_begin_cpu_access(). * For imported BOs, the function calls dma_buf_end_cpu_access(). * * See also drm_gem_fb_begin_cpu_access(). 
*/ void drm_gem_fb_end_cpu_access(struct drm_framebuffer *fb, enum dma_data_direction dir) { __drm_gem_fb_end_cpu_access(fb, dir, fb->format->num_planes); } EXPORT_SYMBOL(drm_gem_fb_end_cpu_access); // TODO Drop this function and replace by drm_format_info_bpp() once all // DRM_FORMAT_* provide proper block info in drivers/gpu/drm/drm_fourcc.c static __u32 drm_gem_afbc_get_bpp(struct drm_device *dev, const struct drm_mode_fb_cmd2 *mode_cmd) { const struct drm_format_info *info; info = drm_get_format_info(dev, mode_cmd); switch (info->format) { case DRM_FORMAT_YUV420_8BIT: return 12; case DRM_FORMAT_YUV420_10BIT: return 15; case DRM_FORMAT_VUY101010: return 30; default: return drm_format_info_bpp(info, 0); } } static int drm_gem_afbc_min_size(struct drm_device *dev, const struct drm_mode_fb_cmd2 *mode_cmd, struct drm_afbc_framebuffer *afbc_fb) { __u32 n_blocks, w_alignment, h_alignment, hdr_alignment; /* remove bpp when all users properly encode cpp in drm_format_info */ __u32 bpp; switch (mode_cmd->modifier[0] & AFBC_FORMAT_MOD_BLOCK_SIZE_MASK) { case AFBC_FORMAT_MOD_BLOCK_SIZE_16x16: afbc_fb->block_width = 16; afbc_fb->block_height = 16; break; case AFBC_FORMAT_MOD_BLOCK_SIZE_32x8: afbc_fb->block_width = 32; afbc_fb->block_height = 8; break; /* no user exists yet - fall through */ case AFBC_FORMAT_MOD_BLOCK_SIZE_64x4: case AFBC_FORMAT_MOD_BLOCK_SIZE_32x8_64x4: default: drm_dbg_kms(dev, "Invalid AFBC_FORMAT_MOD_BLOCK_SIZE: %lld.\n", mode_cmd->modifier[0] & AFBC_FORMAT_MOD_BLOCK_SIZE_MASK); return -EINVAL; } /* tiled header afbc */ w_alignment = afbc_fb->block_width; h_alignment = afbc_fb->block_height; hdr_alignment = AFBC_HDR_ALIGN; if (mode_cmd->modifier[0] & AFBC_FORMAT_MOD_TILED) { w_alignment *= AFBC_TH_LAYOUT_ALIGNMENT; h_alignment *= AFBC_TH_LAYOUT_ALIGNMENT; hdr_alignment = AFBC_TH_BODY_START_ALIGNMENT; } afbc_fb->aligned_width = ALIGN(mode_cmd->width, w_alignment); afbc_fb->aligned_height = ALIGN(mode_cmd->height, h_alignment); afbc_fb->offset = mode_cmd->offsets[0]; bpp = drm_gem_afbc_get_bpp(dev, mode_cmd); if (!bpp) { drm_dbg_kms(dev, "Invalid AFBC bpp value: %d\n", bpp); return -EINVAL; } n_blocks = (afbc_fb->aligned_width * afbc_fb->aligned_height) / AFBC_SUPERBLOCK_PIXELS; afbc_fb->afbc_size = ALIGN(n_blocks * AFBC_HEADER_SIZE, hdr_alignment); afbc_fb->afbc_size += n_blocks * ALIGN(bpp * AFBC_SUPERBLOCK_PIXELS / 8, AFBC_SUPERBLOCK_ALIGNMENT); return 0; } /** * drm_gem_fb_afbc_init() - Helper function for drivers using afbc to * fill and validate all the afbc-specific * struct drm_afbc_framebuffer members * * @dev: DRM device * @afbc_fb: afbc-specific framebuffer * @mode_cmd: Metadata from the userspace framebuffer creation request * @afbc_fb: afbc framebuffer * * This function can be used by drivers which support afbc to complete * the preparation of struct drm_afbc_framebuffer. It must be called after * allocating the said struct and calling drm_gem_fb_init_with_funcs(). * It is caller's responsibility to put afbc_fb->base.obj objects in case * the call is unsuccessful. * * Returns: * Zero on success or a negative error value on failure. 
*/ int drm_gem_fb_afbc_init(struct drm_device *dev, const struct drm_mode_fb_cmd2 *mode_cmd, struct drm_afbc_framebuffer *afbc_fb) { const struct drm_format_info *info; struct drm_gem_object **objs; int ret; objs = afbc_fb->base.obj; info = drm_get_format_info(dev, mode_cmd); if (!info) return -EINVAL; ret = drm_gem_afbc_min_size(dev, mode_cmd, afbc_fb); if (ret < 0) return ret; if (objs[0]->size < afbc_fb->afbc_size) return -EINVAL; return 0; } EXPORT_SYMBOL_GPL(drm_gem_fb_afbc_init);
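/*
 * Illustrative sketch, not part of the helper: the size arithmetic performed
 * by drm_gem_afbc_min_size() above, evaluated in user space for a hypothetical
 * 1920x1080 32-bpp buffer with 16x16 superblocks and no tiled header.  The
 * AFBC_* constants are copied from this file; ALIGN() is a simplified
 * stand-in for the kernel macro.
 */
#include <stdio.h>

#define AFBC_HEADER_SIZE		16
#define AFBC_HDR_ALIGN			64
#define AFBC_SUPERBLOCK_PIXELS		256
#define AFBC_SUPERBLOCK_ALIGNMENT	128
#define ALIGN(x, a)			(((x) + (a) - 1) / (a) * (a))

int main(void)
{
	unsigned int width = 1920, height = 1080, bpp = 32;
	unsigned int block_w = 16, block_h = 16;

	/* pixel dimensions rounded up to whole superblocks */
	unsigned int aligned_w = ALIGN(width, block_w);
	unsigned int aligned_h = ALIGN(height, block_h);
	unsigned int n_blocks = (aligned_w * aligned_h) / AFBC_SUPERBLOCK_PIXELS;

	/* header area, then one body allocation per superblock */
	unsigned int size = ALIGN(n_blocks * AFBC_HEADER_SIZE, AFBC_HDR_ALIGN);
	size += n_blocks * ALIGN(bpp * AFBC_SUPERBLOCK_PIXELS / 8,
				 AFBC_SUPERBLOCK_ALIGNMENT);

	printf("afbc_size = %u bytes for %u superblocks\n", size, n_blocks);
	return 0;
}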
4 32 1 3 31 36 36 30 2 29 30 2 11 3 9 1 10 13 13 9 9 4 2 2 2 2 3 3 87 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> * (C) 2006-2012 Patrick McHardy <kaber@trash.net> */ #include <linux/types.h> #include <linux/timer.h> #include <linux/module.h> #include <linux/udp.h> #include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <net/ip6_checksum.h> #include <net/checksum.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_log.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> static const unsigned int udp_timeouts[UDP_CT_MAX] = { [UDP_CT_UNREPLIED] = 30*HZ, [UDP_CT_REPLIED] = 120*HZ, }; static unsigned int *udp_get_timeouts(struct net *net) { return nf_udp_pernet(net)->timeouts; } static void udp_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { nf_l4proto_log_invalid(skb, state, IPPROTO_UDP, "%s", msg); } static bool udp_error(struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { unsigned int udplen = skb->len - dataoff; const struct udphdr *hdr; struct udphdr _hdr; /* Header is too small? */ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (!hdr) { udp_error_log(skb, state, "short packet"); return true; } /* Truncated/malformed packets */ if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { udp_error_log(skb, state, "truncated/malformed packet"); return true; } /* Packet with no checksum */ if (!hdr->check) return false; /* Checksum invalid? Ignore. * We skip checking packets on the outgoing path * because the checksum is assumed to be correct. 
* FIXME: Source route IP option packets --RR */ if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && nf_checksum(skb, state->hook, dataoff, IPPROTO_UDP, state->pf)) { udp_error_log(skb, state, "bad checksum"); return true; } return false; } /* Returns verdict for packet, and may modify conntracktype */ int nf_conntrack_udp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { unsigned int *timeouts; unsigned long status; if (udp_error(skb, dataoff, state)) return -NF_ACCEPT; timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = udp_get_timeouts(nf_ct_net(ct)); status = READ_ONCE(ct->status); if ((status & IPS_CONFIRMED) == 0) ct->proto.udp.stream_ts = 2 * HZ + jiffies; /* If we've seen traffic both ways, this is some kind of UDP * stream. Set Assured. */ if (status & IPS_SEEN_REPLY) { unsigned long extra = timeouts[UDP_CT_UNREPLIED]; bool stream = false; /* Still active after two seconds? Extend timeout. */ if (time_after(jiffies, ct->proto.udp.stream_ts)) { extra = timeouts[UDP_CT_REPLIED]; stream = (status & IPS_ASSURED) == 0; } nf_ct_refresh_acct(ct, ctinfo, skb, extra); /* never set ASSURED for IPS_NAT_CLASH, they time out soon */ if (unlikely((status & IPS_NAT_CLASH))) return NF_ACCEPT; /* Also, more likely to be important, and not a probe */ if (stream && !test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]); } return NF_ACCEPT; } #ifdef CONFIG_NF_CT_PROTO_UDPLITE static void udplite_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { nf_l4proto_log_invalid(skb, state, IPPROTO_UDPLITE, "%s", msg); } static bool udplite_error(struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { unsigned int udplen = skb->len - dataoff; const struct udphdr *hdr; struct udphdr _hdr; unsigned int cscov; /* Header is too small? */ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (!hdr) { udplite_error_log(skb, state, "short packet"); return true; } cscov = ntohs(hdr->len); if (cscov == 0) { cscov = udplen; } else if (cscov < sizeof(*hdr) || cscov > udplen) { udplite_error_log(skb, state, "invalid checksum coverage"); return true; } /* UDPLITE mandates checksums */ if (!hdr->check) { udplite_error_log(skb, state, "checksum missing"); return true; } /* Checksum invalid? Ignore. */ if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && nf_checksum_partial(skb, state->hook, dataoff, cscov, IPPROTO_UDP, state->pf)) { udplite_error_log(skb, state, "bad checksum"); return true; } return false; } /* Returns verdict for packet, and may modify conntracktype */ int nf_conntrack_udplite_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { unsigned int *timeouts; if (udplite_error(skb, dataoff, state)) return -NF_ACCEPT; timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = udp_get_timeouts(nf_ct_net(ct)); /* If we've seen traffic both ways, this is some kind of UDP stream. Extend timeout. 
*/ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_REPLIED]); if (unlikely((ct->status & IPS_NAT_CLASH))) return NF_ACCEPT; /* Also, more likely to be important, and not a probe */ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]); } return NF_ACCEPT; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int udp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; struct nf_udp_net *un = nf_udp_pernet(net); if (!timeouts) timeouts = un->timeouts; /* set default timeouts for UDP. */ timeouts[UDP_CT_UNREPLIED] = un->timeouts[UDP_CT_UNREPLIED]; timeouts[UDP_CT_REPLIED] = un->timeouts[UDP_CT_REPLIED]; if (tb[CTA_TIMEOUT_UDP_UNREPLIED]) { timeouts[UDP_CT_UNREPLIED] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDP_UNREPLIED])) * HZ; } if (tb[CTA_TIMEOUT_UDP_REPLIED]) { timeouts[UDP_CT_REPLIED] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDP_REPLIED])) * HZ; } return 0; } static int udp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeouts = data; if (nla_put_be32(skb, CTA_TIMEOUT_UDP_UNREPLIED, htonl(timeouts[UDP_CT_UNREPLIED] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_UDP_REPLIED, htonl(timeouts[UDP_CT_REPLIED] / HZ))) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy udp_timeout_nla_policy[CTA_TIMEOUT_UDP_MAX+1] = { [CTA_TIMEOUT_UDP_UNREPLIED] = { .type = NLA_U32 }, [CTA_TIMEOUT_UDP_REPLIED] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_udp_init_net(struct net *net) { struct nf_udp_net *un = nf_udp_pernet(net); int i; for (i = 0; i < UDP_CT_MAX; i++) un->timeouts[i] = udp_timeouts[i]; #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) un->offload_timeout = 30 * HZ; #endif } const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp = { .l4proto = IPPROTO_UDP, .allow_clash = true, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = udp_timeout_nlattr_to_obj, .obj_to_nlattr = udp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_UDP_MAX, .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, .nla_policy = udp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ }; #ifdef CONFIG_NF_CT_PROTO_UDPLITE const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite = { .l4proto = IPPROTO_UDPLITE, .allow_clash = true, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = udp_timeout_nlattr_to_obj, .obj_to_nlattr = udp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_UDP_MAX, .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, .nla_policy = udp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ }; #endif
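/*
 * Illustrative sketch, not kernel code: the timeout/ASSURED decision that
 * nf_conntrack_udp_packet() above applies, reduced to plain C.  All state
 * values are hypothetical, and the kernel's time_after() wraparound handling
 * and locking are omitted.
 */
#include <stdio.h>
#include <stdbool.h>

#define HZ 250
enum { UDP_CT_UNREPLIED, UDP_CT_REPLIED };
static const unsigned int timeouts[] = { 30 * HZ, 120 * HZ };

int main(void)
{
	unsigned long jiffies = 10000;	/* "now" */
	unsigned long stream_ts = 9000;	/* jiffies + 2*HZ when the flow was confirmed */
	bool seen_reply = true;		/* IPS_SEEN_REPLY */
	bool nat_clash = false;		/* IPS_NAT_CLASH */
	bool assured = false;		/* IPS_ASSURED */

	if (!seen_reply) {
		printf("unreplied: refresh timeout to %u jiffies\n",
		       timeouts[UDP_CT_UNREPLIED]);
		return 0;
	}

	unsigned int extra = timeouts[UDP_CT_UNREPLIED];
	bool stream = false;

	/* still active two seconds after confirmation: treat it as a stream */
	if (jiffies > stream_ts) {
		extra = timeouts[UDP_CT_REPLIED];
		stream = !assured;
	}
	printf("replied: refresh timeout to %u jiffies\n", extra);

	/* NAT clashes time out soon, so they never become ASSURED */
	if (stream && !nat_clash)
		printf("mark connection ASSURED\n");
	return 0;
}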
/* * Mu-Law conversion Plug-In Interface * Copyright (c) 1999 by Jaroslav Kysela <perex@perex.cz> * Uros Bizjak <uros@kss-loka.si> * * Based on reference implementation by Sun Microsystems, Inc. * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Library General Public License as * published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ #include <linux/time.h> #include <sound/core.h> #include <sound/pcm.h> #include "pcm_plugin.h" #define SIGN_BIT (0x80) /* Sign bit for a u-law byte. */ #define QUANT_MASK (0xf) /* Quantization field mask. */ #define NSEGS (8) /* Number of u-law segments. */ #define SEG_SHIFT (4) /* Left shift for segment number. */ #define SEG_MASK (0x70) /* Segment field mask. */ static inline int val_seg(int val) { int r = 0; val >>= 7; if (val & 0xf0) { val >>= 4; r += 4; } if (val & 0x0c) { val >>= 2; r += 2; } if (val & 0x02) r += 1; return r; } #define BIAS (0x84) /* Bias for linear code. */ /* * linear2ulaw() - Convert a linear PCM value to u-law * * In order to simplify the encoding process, the original linear magnitude * is biased by adding 33 which shifts the encoding range from (0 - 8158) to * (33 - 8191). The result can be seen in the following encoding table: * * Biased Linear Input Code Compressed Code * ------------------------ --------------- * 00000001wxyza 000wxyz * 0000001wxyzab 001wxyz * 000001wxyzabc 010wxyz * 00001wxyzabcd 011wxyz * 0001wxyzabcde 100wxyz * 001wxyzabcdef 101wxyz * 01wxyzabcdefg 110wxyz * 1wxyzabcdefgh 111wxyz * * Each biased linear code has a leading 1 which identifies the segment * number.
The value of the segment number is equal to 7 minus the number * of leading 0's. The quantization interval is directly available as the * four bits wxyz. * The trailing bits (a - h) are ignored. * * Ordinarily the complement of the resulting code word is used for * transmission, and so the code word is complemented before it is returned. * * For further information see John C. Bellamy's Digital Telephony, 1982, * John Wiley & Sons, pps 98-111 and 472-476. */ static unsigned char linear2ulaw(int pcm_val) /* 2's complement (16-bit range) */ { int mask; int seg; unsigned char uval; /* Get the sign and the magnitude of the value. */ if (pcm_val < 0) { pcm_val = BIAS - pcm_val; mask = 0x7F; } else { pcm_val += BIAS; mask = 0xFF; } if (pcm_val > 0x7FFF) pcm_val = 0x7FFF; /* Convert the scaled magnitude to segment number. */ seg = val_seg(pcm_val); /* * Combine the sign, segment, quantization bits; * and complement the code word. */ uval = (seg << 4) | ((pcm_val >> (seg + 3)) & 0xF); return uval ^ mask; } /* * ulaw2linear() - Convert a u-law value to 16-bit linear PCM * * First, a biased linear code is derived from the code word. An unbiased * output can then be obtained by subtracting 33 from the biased code. * * Note that this function expects to be passed the complement of the * original code word. This is in keeping with ISDN conventions. */ static int ulaw2linear(unsigned char u_val) { int t; /* Complement to obtain normal u-law value. */ u_val = ~u_val; /* * Extract and bias the quantization bits. Then * shift up by the segment number and subtract out the bias. */ t = ((u_val & QUANT_MASK) << 3) + BIAS; t <<= ((unsigned)u_val & SEG_MASK) >> SEG_SHIFT; return ((u_val & SIGN_BIT) ? (BIAS - t) : (t - BIAS)); } /* * Basic Mu-Law plugin */ typedef void (*mulaw_f)(struct snd_pcm_plugin *plugin, const struct snd_pcm_plugin_channel *src_channels, struct snd_pcm_plugin_channel *dst_channels, snd_pcm_uframes_t frames); struct mulaw_priv { mulaw_f func; int cvt_endian; /* need endian conversion? 
*/ unsigned int native_ofs; /* byte offset in native format */ unsigned int copy_ofs; /* byte offset in s16 format */ unsigned int native_bytes; /* byte size of the native format */ unsigned int copy_bytes; /* bytes to copy per conversion */ u16 flip; /* MSB flip for signedness, done after endian conversion */ }; static inline void cvt_s16_to_native(struct mulaw_priv *data, unsigned char *dst, u16 sample) { sample ^= data->flip; if (data->cvt_endian) sample = swab16(sample); if (data->native_bytes > data->copy_bytes) memset(dst, 0, data->native_bytes); memcpy(dst + data->native_ofs, (char *)&sample + data->copy_ofs, data->copy_bytes); } static void mulaw_decode(struct snd_pcm_plugin *plugin, const struct snd_pcm_plugin_channel *src_channels, struct snd_pcm_plugin_channel *dst_channels, snd_pcm_uframes_t frames) { struct mulaw_priv *data = (struct mulaw_priv *)plugin->extra_data; int channel; int nchannels = plugin->src_format.channels; for (channel = 0; channel < nchannels; ++channel) { char *src; char *dst; int src_step, dst_step; snd_pcm_uframes_t frames1; if (!src_channels[channel].enabled) { if (dst_channels[channel].wanted) snd_pcm_area_silence(&dst_channels[channel].area, 0, frames, plugin->dst_format.format); dst_channels[channel].enabled = 0; continue; } dst_channels[channel].enabled = 1; src = src_channels[channel].area.addr + src_channels[channel].area.first / 8; dst = dst_channels[channel].area.addr + dst_channels[channel].area.first / 8; src_step = src_channels[channel].area.step / 8; dst_step = dst_channels[channel].area.step / 8; frames1 = frames; while (frames1-- > 0) { signed short sample = ulaw2linear(*src); cvt_s16_to_native(data, dst, sample); src += src_step; dst += dst_step; } } } static inline signed short cvt_native_to_s16(struct mulaw_priv *data, unsigned char *src) { u16 sample = 0; memcpy((char *)&sample + data->copy_ofs, src + data->native_ofs, data->copy_bytes); if (data->cvt_endian) sample = swab16(sample); sample ^= data->flip; return (signed short)sample; } static void mulaw_encode(struct snd_pcm_plugin *plugin, const struct snd_pcm_plugin_channel *src_channels, struct snd_pcm_plugin_channel *dst_channels, snd_pcm_uframes_t frames) { struct mulaw_priv *data = (struct mulaw_priv *)plugin->extra_data; int channel; int nchannels = plugin->src_format.channels; for (channel = 0; channel < nchannels; ++channel) { char *src; char *dst; int src_step, dst_step; snd_pcm_uframes_t frames1; if (!src_channels[channel].enabled) { if (dst_channels[channel].wanted) snd_pcm_area_silence(&dst_channels[channel].area, 0, frames, plugin->dst_format.format); dst_channels[channel].enabled = 0; continue; } dst_channels[channel].enabled = 1; src = src_channels[channel].area.addr + src_channels[channel].area.first / 8; dst = dst_channels[channel].area.addr + dst_channels[channel].area.first / 8; src_step = src_channels[channel].area.step / 8; dst_step = dst_channels[channel].area.step / 8; frames1 = frames; while (frames1-- > 0) { signed short sample = cvt_native_to_s16(data, src); *dst = linear2ulaw(sample); src += src_step; dst += dst_step; } } } static snd_pcm_sframes_t mulaw_transfer(struct snd_pcm_plugin *plugin, const struct snd_pcm_plugin_channel *src_channels, struct snd_pcm_plugin_channel *dst_channels, snd_pcm_uframes_t frames) { struct mulaw_priv *data; if (snd_BUG_ON(!plugin || !src_channels || !dst_channels)) return -ENXIO; if (frames == 0) return 0; #ifdef CONFIG_SND_DEBUG { unsigned int channel; for (channel = 0; channel < plugin->src_format.channels; channel++) { if 
(snd_BUG_ON(src_channels[channel].area.first % 8 || src_channels[channel].area.step % 8)) return -ENXIO; if (snd_BUG_ON(dst_channels[channel].area.first % 8 || dst_channels[channel].area.step % 8)) return -ENXIO; } } #endif if (frames > dst_channels[0].frames) frames = dst_channels[0].frames; data = (struct mulaw_priv *)plugin->extra_data; data->func(plugin, src_channels, dst_channels, frames); return frames; } static void init_data(struct mulaw_priv *data, snd_pcm_format_t format) { #ifdef SNDRV_LITTLE_ENDIAN data->cvt_endian = snd_pcm_format_big_endian(format) > 0; #else data->cvt_endian = snd_pcm_format_little_endian(format) > 0; #endif if (!snd_pcm_format_signed(format)) data->flip = 0x8000; data->native_bytes = snd_pcm_format_physical_width(format) / 8; data->copy_bytes = data->native_bytes < 2 ? 1 : 2; if (snd_pcm_format_little_endian(format)) { data->native_ofs = data->native_bytes - data->copy_bytes; data->copy_ofs = 2 - data->copy_bytes; } else { /* S24 in 4bytes need an 1 byte offset */ data->native_ofs = data->native_bytes - snd_pcm_format_width(format) / 8; } } int snd_pcm_plugin_build_mulaw(struct snd_pcm_substream *plug, struct snd_pcm_plugin_format *src_format, struct snd_pcm_plugin_format *dst_format, struct snd_pcm_plugin **r_plugin) { int err; struct mulaw_priv *data; struct snd_pcm_plugin *plugin; struct snd_pcm_plugin_format *format; mulaw_f func; if (snd_BUG_ON(!r_plugin)) return -ENXIO; *r_plugin = NULL; if (snd_BUG_ON(src_format->rate != dst_format->rate)) return -ENXIO; if (snd_BUG_ON(src_format->channels != dst_format->channels)) return -ENXIO; if (dst_format->format == SNDRV_PCM_FORMAT_MU_LAW) { format = src_format; func = mulaw_encode; } else if (src_format->format == SNDRV_PCM_FORMAT_MU_LAW) { format = dst_format; func = mulaw_decode; } else { snd_BUG(); return -EINVAL; } if (!snd_pcm_format_linear(format->format)) return -EINVAL; err = snd_pcm_plugin_build(plug, "Mu-Law<->linear conversion", src_format, dst_format, sizeof(struct mulaw_priv), &plugin); if (err < 0) return err; data = (struct mulaw_priv *)plugin->extra_data; data->func = func; init_data(data, format->format); plugin->transfer = mulaw_transfer; *r_plugin = plugin; return 0; }
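/*
 * Standalone sanity check (not part of the plugin above): it repeats the two
 * converters in plain userspace C and prints a few encode/decode round trips.
 * Because u-law keeps only the segment and four mantissa bits, the decoded
 * value is expected to match the input only to within one quantization step.
 * The macro and function names here are local to this sketch.
 */
#include <stdio.h>

#define ULAW_BIAS	0x84
#define ULAW_QUANT_MASK	0xf
#define ULAW_SEG_MASK	0x70
#define ULAW_SEG_SHIFT	4
#define ULAW_SIGN_BIT	0x80

static int seg_of(int val)
{
	int r = 0;

	val >>= 7;
	if (val & 0xf0) { val >>= 4; r += 4; }
	if (val & 0x0c) { val >>= 2; r += 2; }
	if (val & 0x02) r += 1;
	return r;
}

static unsigned char lin_to_ulaw(int pcm)
{
	int mask, seg;

	/* Bias the magnitude and remember the sign, as linear2ulaw() does. */
	if (pcm < 0) { pcm = ULAW_BIAS - pcm; mask = 0x7f; }
	else { pcm += ULAW_BIAS; mask = 0xff; }
	if (pcm > 0x7fff) pcm = 0x7fff;
	seg = seg_of(pcm);
	return ((seg << 4) | ((pcm >> (seg + 3)) & 0xf)) ^ mask;
}

static int ulaw_to_lin(unsigned char u)
{
	int t;

	u = ~u;	/* code words are transmitted complemented */
	t = ((u & ULAW_QUANT_MASK) << 3) + ULAW_BIAS;
	t <<= (u & ULAW_SEG_MASK) >> ULAW_SEG_SHIFT;
	return (u & ULAW_SIGN_BIT) ? (ULAW_BIAS - t) : (t - ULAW_BIAS);
}

int main(void)
{
	const int samples[] = { 0, 100, -100, 1000, -8000, 32000 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		unsigned char u = lin_to_ulaw(samples[i]);
		printf("%6d -> 0x%02x -> %6d\n", samples[i], u, ulaw_to_lin(u));
	}
	return 0;
}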
// SPDX-License-Identifier: GPL-2.0-only /* * binfmt_misc.c * * Copyright (C) 1997 Richard Günther * * binfmt_misc detects binaries via a magic or filename extension and invokes * a specified wrapper. See Documentation/admin-guide/binfmt-misc.rst for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/magic.h> #include <linux/binfmts.h> #include <linux/slab.h> #include <linux/ctype.h> #include <linux/string_helpers.h> #include <linux/file.h> #include <linux/pagemap.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/fs_context.h> #include <linux/syscalls.h> #include <linux/fs.h> #include <linux/uaccess.h> #include "internal.h" #ifdef DEBUG # define USE_DEBUG 1 #else # define USE_DEBUG 0 #endif enum { VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */ }; enum {Enabled, Magic}; #define MISC_FMT_PRESERVE_ARGV0 (1UL << 31) #define MISC_FMT_OPEN_BINARY (1UL << 30) #define MISC_FMT_CREDENTIALS (1UL << 29) #define MISC_FMT_OPEN_FILE (1UL << 28) typedef struct { struct list_head list; unsigned long flags; /* type, status, etc. */ int offset; /* offset of magic */ int size; /* size of magic/mask */ char *magic; /* magic or filename extension */ char *mask; /* mask, NULL for exact match */ const char *interpreter; /* filename of interpreter */ char *name; struct dentry *dentry; struct file *interp_file; refcount_t users; /* sync removal with load_misc_binary() */ } Node; static struct file_system_type bm_fs_type; /* * Max length of the register string. Determined by: * - 7 delimiters * - name: ~50 bytes * - type: 1 byte * - offset: 3 bytes (has to be smaller than BINPRM_BUF_SIZE) * - magic: 128 bytes (512 in escaped form) * - mask: 128 bytes (512 in escaped form) * - interp: ~50 bytes * - flags: 5 bytes * Round that up a bit, and then back off to hold the internal data * (like struct Node). */ #define MAX_REGISTER_LENGTH 1920 /** * search_binfmt_handler - search for a binary handler for @bprm * @misc: handle to binfmt_misc instance * @bprm: binary for which we are looking for a handler * * Search for a binary type handler for @bprm in the list of registered binary * type handlers.
* * Return: binary type list entry on success, NULL on failure */ static Node *search_binfmt_handler(struct binfmt_misc *misc, struct linux_binprm *bprm) { char *p = strrchr(bprm->interp, '.'); Node *e; /* Walk all the registered handlers. */ list_for_each_entry(e, &misc->entries, list) { char *s; int j; /* Make sure this one is currently enabled. */ if (!test_bit(Enabled, &e->flags)) continue; /* Do matching based on extension if applicable. */ if (!test_bit(Magic, &e->flags)) { if (p && !strcmp(e->magic, p + 1)) return e; continue; } /* Do matching based on magic & mask. */ s = bprm->buf + e->offset; if (e->mask) { for (j = 0; j < e->size; j++) if ((*s++ ^ e->magic[j]) & e->mask[j]) break; } else { for (j = 0; j < e->size; j++) if ((*s++ ^ e->magic[j])) break; } if (j == e->size) return e; } return NULL; } /** * get_binfmt_handler - try to find a binary type handler * @misc: handle to binfmt_misc instance * @bprm: binary for which we are looking for a handler * * Try to find a binfmt handler for the binary type. If one is found take a * reference to protect against removal via bm_{entry,status}_write(). * * Return: binary type list entry on success, NULL on failure */ static Node *get_binfmt_handler(struct binfmt_misc *misc, struct linux_binprm *bprm) { Node *e; read_lock(&misc->entries_lock); e = search_binfmt_handler(misc, bprm); if (e) refcount_inc(&e->users); read_unlock(&misc->entries_lock); return e; } /** * put_binfmt_handler - put binary handler node * @e: node to put * * Free node syncing with load_misc_binary() and defer final free to * load_misc_binary() in case it is using the binary type handler we were * requested to remove. */ static void put_binfmt_handler(Node *e) { if (refcount_dec_and_test(&e->users)) { if (e->flags & MISC_FMT_OPEN_FILE) filp_close(e->interp_file, NULL); kfree(e); } } /** * load_binfmt_misc - load the binfmt_misc of the caller's user namespace * * To be called in load_misc_binary() to load the relevant struct binfmt_misc. * If a user namespace doesn't have its own binfmt_misc mount it can make use * of its ancestor's binfmt_misc handlers. This mimics the behavior of * pre-namespaced binfmt_misc where all registered binfmt_misc handlers were * available to all users and user namespaces on the system. * * Return: the binfmt_misc instance of the caller's user namespace */ static struct binfmt_misc *load_binfmt_misc(void) { const struct user_namespace *user_ns; struct binfmt_misc *misc; user_ns = current_user_ns(); while (user_ns) { /* Pairs with smp_store_release() in bm_fill_super().
*/ misc = smp_load_acquire(&user_ns->binfmt_misc); if (misc) return misc; user_ns = user_ns->parent; } return &init_binfmt_misc; } /* * the loader itself */ static int load_misc_binary(struct linux_binprm *bprm) { Node *fmt; struct file *interp_file = NULL; int retval = -ENOEXEC; struct binfmt_misc *misc; misc = load_binfmt_misc(); if (!misc->enabled) return retval; fmt = get_binfmt_handler(misc, bprm); if (!fmt) return retval; /* Need to be able to load the file after exec */ retval = -ENOENT; if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) goto ret; if (fmt->flags & MISC_FMT_PRESERVE_ARGV0) { bprm->interp_flags |= BINPRM_FLAGS_PRESERVE_ARGV0; } else { retval = remove_arg_zero(bprm); if (retval) goto ret; } if (fmt->flags & MISC_FMT_OPEN_BINARY) bprm->have_execfd = 1; /* make argv[1] be the path to the binary */ retval = copy_string_kernel(bprm->interp, bprm); if (retval < 0) goto ret; bprm->argc++; /* add the interp as argv[0] */ retval = copy_string_kernel(fmt->interpreter, bprm); if (retval < 0) goto ret; bprm->argc++; /* Update interp in case binfmt_script needs it. */ retval = bprm_change_interp(fmt->interpreter, bprm); if (retval < 0) goto ret; if (fmt->flags & MISC_FMT_OPEN_FILE) { interp_file = file_clone_open(fmt->interp_file); if (!IS_ERR(interp_file)) deny_write_access(interp_file); } else { interp_file = open_exec(fmt->interpreter); } retval = PTR_ERR(interp_file); if (IS_ERR(interp_file)) goto ret; bprm->interpreter = interp_file; if (fmt->flags & MISC_FMT_CREDENTIALS) bprm->execfd_creds = 1; retval = 0; ret: /* * If we actually put the node here all concurrent calls to * load_misc_binary() will have finished. We also know * that for the refcount to be zero someone must have concurently * removed the binary type handler from the list and it's our job to * free it. */ put_binfmt_handler(fmt); return retval; } /* Command parsers */ /* * parses and copies one argument enclosed in del from *sp to *dp, * recognising the \x special. * returns pointer to the copied argument or NULL in case of an * error (and sets err) or null argument length. 
*/ static char *scanarg(char *s, char del) { char c; while ((c = *s++) != del) { if (c == '\\' && *s == 'x') { s++; if (!isxdigit(*s++)) return NULL; if (!isxdigit(*s++)) return NULL; } } s[-1] ='\0'; return s; } static char *check_special_flags(char *sfs, Node *e) { char *p = sfs; int cont = 1; /* special flags */ while (cont) { switch (*p) { case 'P': pr_debug("register: flag: P (preserve argv0)\n"); p++; e->flags |= MISC_FMT_PRESERVE_ARGV0; break; case 'O': pr_debug("register: flag: O (open binary)\n"); p++; e->flags |= MISC_FMT_OPEN_BINARY; break; case 'C': pr_debug("register: flag: C (preserve creds)\n"); p++; /* this flags also implies the open-binary flag */ e->flags |= (MISC_FMT_CREDENTIALS | MISC_FMT_OPEN_BINARY); break; case 'F': pr_debug("register: flag: F: open interpreter file now\n"); p++; e->flags |= MISC_FMT_OPEN_FILE; break; default: cont = 0; } } return p; } /* * This registers a new binary format, it recognises the syntax * ':name:type:offset:magic:mask:interpreter:flags' * where the ':' is the IFS, that can be chosen with the first char */ static Node *create_entry(const char __user *buffer, size_t count) { Node *e; int memsize, err; char *buf, *p; char del; pr_debug("register: received %zu bytes\n", count); /* some sanity checks */ err = -EINVAL; if ((count < 11) || (count > MAX_REGISTER_LENGTH)) goto out; err = -ENOMEM; memsize = sizeof(Node) + count + 8; e = kmalloc(memsize, GFP_KERNEL_ACCOUNT); if (!e) goto out; p = buf = (char *)e + sizeof(Node); memset(e, 0, sizeof(Node)); if (copy_from_user(buf, buffer, count)) goto efault; del = *p++; /* delimeter */ pr_debug("register: delim: %#x {%c}\n", del, del); /* Pad the buffer with the delim to simplify parsing below. */ memset(buf + count, del, 8); /* Parse the 'name' field. */ e->name = p; p = strchr(p, del); if (!p) goto einval; *p++ = '\0'; if (!e->name[0] || !strcmp(e->name, ".") || !strcmp(e->name, "..") || strchr(e->name, '/')) goto einval; pr_debug("register: name: {%s}\n", e->name); /* Parse the 'type' field. */ switch (*p++) { case 'E': pr_debug("register: type: E (extension)\n"); e->flags = 1 << Enabled; break; case 'M': pr_debug("register: type: M (magic)\n"); e->flags = (1 << Enabled) | (1 << Magic); break; default: goto einval; } if (*p++ != del) goto einval; if (test_bit(Magic, &e->flags)) { /* Handle the 'M' (magic) format. */ char *s; /* Parse the 'offset' field. */ s = strchr(p, del); if (!s) goto einval; *s = '\0'; if (p != s) { int r = kstrtoint(p, 10, &e->offset); if (r != 0 || e->offset < 0) goto einval; } p = s; if (*p++) goto einval; pr_debug("register: offset: %#x\n", e->offset); /* Parse the 'magic' field. */ e->magic = p; p = scanarg(p, del); if (!p) goto einval; if (!e->magic[0]) goto einval; if (USE_DEBUG) print_hex_dump_bytes( KBUILD_MODNAME ": register: magic[raw]: ", DUMP_PREFIX_NONE, e->magic, p - e->magic); /* Parse the 'mask' field. */ e->mask = p; p = scanarg(p, del); if (!p) goto einval; if (!e->mask[0]) { e->mask = NULL; pr_debug("register: mask[raw]: none\n"); } else if (USE_DEBUG) print_hex_dump_bytes( KBUILD_MODNAME ": register: mask[raw]: ", DUMP_PREFIX_NONE, e->mask, p - e->mask); /* * Decode the magic & mask fields. * Note: while we might have accepted embedded NUL bytes from * above, the unescape helpers here will stop at the first one * it encounters. 
*/ e->size = string_unescape_inplace(e->magic, UNESCAPE_HEX); if (e->mask && string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size) goto einval; if (e->size > BINPRM_BUF_SIZE || BINPRM_BUF_SIZE - e->size < e->offset) goto einval; pr_debug("register: magic/mask length: %i\n", e->size); if (USE_DEBUG) { print_hex_dump_bytes( KBUILD_MODNAME ": register: magic[decoded]: ", DUMP_PREFIX_NONE, e->magic, e->size); if (e->mask) { int i; char *masked = kmalloc(e->size, GFP_KERNEL_ACCOUNT); print_hex_dump_bytes( KBUILD_MODNAME ": register: mask[decoded]: ", DUMP_PREFIX_NONE, e->mask, e->size); if (masked) { for (i = 0; i < e->size; ++i) masked[i] = e->magic[i] & e->mask[i]; print_hex_dump_bytes( KBUILD_MODNAME ": register: magic[masked]: ", DUMP_PREFIX_NONE, masked, e->size); kfree(masked); } } } } else { /* Handle the 'E' (extension) format. */ /* Skip the 'offset' field. */ p = strchr(p, del); if (!p) goto einval; *p++ = '\0'; /* Parse the 'magic' field. */ e->magic = p; p = strchr(p, del); if (!p) goto einval; *p++ = '\0'; if (!e->magic[0] || strchr(e->magic, '/')) goto einval; pr_debug("register: extension: {%s}\n", e->magic); /* Skip the 'mask' field. */ p = strchr(p, del); if (!p) goto einval; *p++ = '\0'; } /* Parse the 'interpreter' field. */ e->interpreter = p; p = strchr(p, del); if (!p) goto einval; *p++ = '\0'; if (!e->interpreter[0]) goto einval; pr_debug("register: interpreter: {%s}\n", e->interpreter); /* Parse the 'flags' field. */ p = check_special_flags(p, e); if (*p == '\n') p++; if (p != buf + count) goto einval; return e; out: return ERR_PTR(err); efault: kfree(e); return ERR_PTR(-EFAULT); einval: kfree(e); return ERR_PTR(-EINVAL); } /* * Set status of entry/binfmt_misc: * '1' enables, '0' disables and '-1' clears entry/binfmt_misc */ static int parse_command(const char __user *buffer, size_t count) { char s[4]; if (count > 3) return -EINVAL; if (copy_from_user(s, buffer, count)) return -EFAULT; if (!count) return 0; if (s[count - 1] == '\n') count--; if (count == 1 && s[0] == '0') return 1; if (count == 1 && s[0] == '1') return 2; if (count == 2 && s[0] == '-' && s[1] == '1') return 3; return -EINVAL; } /* generic stuff */ static void entry_status(Node *e, char *page) { char *dp = page; const char *status = "disabled"; if (test_bit(Enabled, &e->flags)) status = "enabled"; if (!VERBOSE_STATUS) { sprintf(page, "%s\n", status); return; } dp += sprintf(dp, "%s\ninterpreter %s\n", status, e->interpreter); /* print the special flags */ dp += sprintf(dp, "flags: "); if (e->flags & MISC_FMT_PRESERVE_ARGV0) *dp++ = 'P'; if (e->flags & MISC_FMT_OPEN_BINARY) *dp++ = 'O'; if (e->flags & MISC_FMT_CREDENTIALS) *dp++ = 'C'; if (e->flags & MISC_FMT_OPEN_FILE) *dp++ = 'F'; *dp++ = '\n'; if (!test_bit(Magic, &e->flags)) { sprintf(dp, "extension .%s\n", e->magic); } else { dp += sprintf(dp, "offset %i\nmagic ", e->offset); dp = bin2hex(dp, e->magic, e->size); if (e->mask) { dp += sprintf(dp, "\nmask "); dp = bin2hex(dp, e->mask, e->size); } *dp++ = '\n'; *dp = '\0'; } } static struct inode *bm_get_inode(struct super_block *sb, int mode) { struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); inode->i_mode = mode; simple_inode_init_ts(inode); } return inode; } /** * i_binfmt_misc - retrieve struct binfmt_misc from a binfmt_misc inode * @inode: inode of the relevant binfmt_misc instance * * This helper retrieves struct binfmt_misc from a binfmt_misc inode. 
This can * be done without any memory barriers because we are guaranteed that * user_ns->binfmt_misc is fully initialized. It was fully initialized when the * binfmt_misc mount was first created. * * Return: struct binfmt_misc of the relevant binfmt_misc instance */ static struct binfmt_misc *i_binfmt_misc(struct inode *inode) { return inode->i_sb->s_user_ns->binfmt_misc; } /** * bm_evict_inode - cleanup data associated with @inode * @inode: inode to which the data is attached * * Cleanup the binary type handler data associated with @inode if a binary type * entry is removed or the filesystem is unmounted and the super block is * shutdown. * * If the ->evict call was not caused by a super block shutdown but by a write * to remove the entry or all entries via bm_{entry,status}_write() the entry * will have already been removed from the list. We keep the list_empty() check * to make that explicit. */ static void bm_evict_inode(struct inode *inode) { Node *e = inode->i_private; clear_inode(inode); if (e) { struct binfmt_misc *misc; misc = i_binfmt_misc(inode); write_lock(&misc->entries_lock); if (!list_empty(&e->list)) list_del_init(&e->list); write_unlock(&misc->entries_lock); put_binfmt_handler(e); } } /** * unlink_binfmt_dentry - remove the dentry for the binary type handler * @dentry: dentry associated with the binary type handler * * Do the actual filesystem work to remove a dentry for a registered binary * type handler. Since binfmt_misc only allows simple files to be created * directly under the root dentry of the filesystem we ensure that we are * indeed passed a dentry directly beneath the root dentry, that the inode * associated with the root dentry is locked, and that it is a regular file we * are asked to remove. */ static void unlink_binfmt_dentry(struct dentry *dentry) { struct dentry *parent = dentry->d_parent; struct inode *inode, *parent_inode; /* All entries are immediate descendants of the root dentry. */ if (WARN_ON_ONCE(dentry->d_sb->s_root != parent)) return; /* We only expect to be called on regular files. */ inode = d_inode(dentry); if (WARN_ON_ONCE(!S_ISREG(inode->i_mode))) return; /* The parent inode must be locked. */ parent_inode = d_inode(parent); if (WARN_ON_ONCE(!inode_is_locked(parent_inode))) return; if (simple_positive(dentry)) { dget(dentry); simple_unlink(parent_inode, dentry); d_delete(dentry); dput(dentry); } } /** * remove_binfmt_handler - remove a binary type handler * @misc: handle to binfmt_misc instance * @e: binary type handler to remove * * Remove a binary type handler from the list of binary type handlers and * remove its associated dentry. This is called from * binfmt_{entry,status}_write(). In the future, we might want to think about * adding a proper ->unlink() method to binfmt_misc instead of forcing callers * to use writes to files in order to delete binary type handlers. But it has * worked for so long that it's not a pressing issue.
*/ static void remove_binfmt_handler(struct binfmt_misc *misc, Node *e) { write_lock(&misc->entries_lock); list_del_init(&e->list); write_unlock(&misc->entries_lock); unlink_binfmt_dentry(e->dentry); } /* /<entry> */ static ssize_t bm_entry_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { Node *e = file_inode(file)->i_private; ssize_t res; char *page; page = (char *) __get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; entry_status(e, page); res = simple_read_from_buffer(buf, nbytes, ppos, page, strlen(page)); free_page((unsigned long) page); return res; } static ssize_t bm_entry_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct inode *inode = file_inode(file); Node *e = inode->i_private; int res = parse_command(buffer, count); switch (res) { case 1: /* Disable this handler. */ clear_bit(Enabled, &e->flags); break; case 2: /* Enable this handler. */ set_bit(Enabled, &e->flags); break; case 3: /* Delete this handler. */ inode = d_inode(inode->i_sb->s_root); inode_lock(inode); /* * In order to add new element or remove elements from the list * via bm_{entry,register,status}_write() inode_lock() on the * root inode must be held. * The lock is exclusive ensuring that the list can't be * modified. Only load_misc_binary() can access but does so * read-only. So we only need to take the write lock when we * actually remove the entry from the list. */ if (!list_empty(&e->list)) remove_binfmt_handler(i_binfmt_misc(inode), e); inode_unlock(inode); break; default: return res; } return count; } static const struct file_operations bm_entry_operations = { .read = bm_entry_read, .write = bm_entry_write, .llseek = default_llseek, }; /* /register */ static ssize_t bm_register_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { Node *e; struct inode *inode; struct super_block *sb = file_inode(file)->i_sb; struct dentry *root = sb->s_root, *dentry; struct binfmt_misc *misc; int err = 0; struct file *f = NULL; e = create_entry(buffer, count); if (IS_ERR(e)) return PTR_ERR(e); if (e->flags & MISC_FMT_OPEN_FILE) { const struct cred *old_cred; /* * Now that we support unprivileged binfmt_misc mounts make * sure we use the credentials that the register @file was * opened with to also open the interpreter. Before that this * didn't matter much as only a privileged process could open * the register file. 
*/ old_cred = override_creds(file->f_cred); f = open_exec(e->interpreter); revert_creds(old_cred); if (IS_ERR(f)) { pr_notice("register: failed to install interpreter file %s\n", e->interpreter); kfree(e); return PTR_ERR(f); } e->interp_file = f; } inode_lock(d_inode(root)); dentry = lookup_one_len(e->name, root, strlen(e->name)); err = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out; err = -EEXIST; if (d_really_is_positive(dentry)) goto out2; inode = bm_get_inode(sb, S_IFREG | 0644); err = -ENOMEM; if (!inode) goto out2; refcount_set(&e->users, 1); e->dentry = dget(dentry); inode->i_private = e; inode->i_fop = &bm_entry_operations; d_instantiate(dentry, inode); misc = i_binfmt_misc(inode); write_lock(&misc->entries_lock); list_add(&e->list, &misc->entries); write_unlock(&misc->entries_lock); err = 0; out2: dput(dentry); out: inode_unlock(d_inode(root)); if (err) { if (f) filp_close(f, NULL); kfree(e); return err; } return count; } static const struct file_operations bm_register_operations = { .write = bm_register_write, .llseek = noop_llseek, }; /* /status */ static ssize_t bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { struct binfmt_misc *misc; char *s; misc = i_binfmt_misc(file_inode(file)); s = misc->enabled ? "enabled\n" : "disabled\n"; return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); } static ssize_t bm_status_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct binfmt_misc *misc; int res = parse_command(buffer, count); Node *e, *next; struct inode *inode; misc = i_binfmt_misc(file_inode(file)); switch (res) { case 1: /* Disable all handlers. */ misc->enabled = false; break; case 2: /* Enable all handlers. */ misc->enabled = true; break; case 3: /* Delete all handlers. */ inode = d_inode(file_inode(file)->i_sb->s_root); inode_lock(inode); /* * In order to add new element or remove elements from the list * via bm_{entry,register,status}_write() inode_lock() on the * root inode must be held. * The lock is exclusive ensuring that the list can't be * modified. Only load_misc_binary() can access but does so * read-only. So we only need to take the write lock when we * actually remove the entry from the list. */ list_for_each_entry_safe(e, next, &misc->entries, list) remove_binfmt_handler(misc, e); inode_unlock(inode); break; default: return res; } return count; } static const struct file_operations bm_status_operations = { .read = bm_status_read, .write = bm_status_write, .llseek = default_llseek, }; /* Superblock handling */ static void bm_put_super(struct super_block *sb) { struct user_namespace *user_ns = sb->s_fs_info; sb->s_fs_info = NULL; put_user_ns(user_ns); } static const struct super_operations s_ops = { .statfs = simple_statfs, .evict_inode = bm_evict_inode, .put_super = bm_put_super, }; static int bm_fill_super(struct super_block *sb, struct fs_context *fc) { int err; struct user_namespace *user_ns = sb->s_user_ns; struct binfmt_misc *misc; static const struct tree_descr bm_files[] = { [2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO}, [3] = {"register", &bm_register_operations, S_IWUSR}, /* last one */ {""} }; if (WARN_ON(user_ns != current_user_ns())) return -EINVAL; /* * Lazily allocate a new binfmt_misc instance for this namespace, i.e. * do it here during the first mount of binfmt_misc. We don't need to * waste memory for every user namespace allocation. It's likely much * more common to not mount a separate binfmt_misc instance than it is * to mount one. 
* * While multiple superblocks can exist they are keyed by userns in * s_fs_info for binfmt_misc. Hence, the vfs guarantees that * bm_fill_super() is called exactly once whenever a binfmt_misc * superblock for a userns is created. This in turn lets us conclude * that when a binfmt_misc superblock is created for the first time for * a userns there's no one racing us. Therefore we don't need any * barriers when we dereference binfmt_misc. */ misc = user_ns->binfmt_misc; if (!misc) { /* * If it turns out that most user namespaces actually want to * register their own binary type handler and therefore all * create their own separate binfmt_misc mounts we should * consider turning this into a kmem cache. */ misc = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL); if (!misc) return -ENOMEM; INIT_LIST_HEAD(&misc->entries); rwlock_init(&misc->entries_lock); /* Pairs with smp_load_acquire() in load_binfmt_misc(). */ smp_store_release(&user_ns->binfmt_misc, misc); } /* * When the binfmt_misc superblock for this userns is shut down * ->enabled might have been set to false and we don't reinitialize * ->enabled again in put_super() as someone might already be mounting * binfmt_misc again. It also would be pointless since by the time * ->put_super() is called we know that the binary type list for this * binfmt_misc mount is empty making load_misc_binary() return * -ENOEXEC independent of whether ->enabled is true. Instead, if * someone mounts binfmt_misc for the first time or again we simply * reset ->enabled to true. */ misc->enabled = true; err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files); if (!err) sb->s_op = &s_ops; return err; } static void bm_free(struct fs_context *fc) { if (fc->s_fs_info) put_user_ns(fc->s_fs_info); } static int bm_get_tree(struct fs_context *fc) { return get_tree_keyed(fc, bm_fill_super, get_user_ns(fc->user_ns)); } static const struct fs_context_operations bm_context_ops = { .free = bm_free, .get_tree = bm_get_tree, }; static int bm_init_fs_context(struct fs_context *fc) { fc->ops = &bm_context_ops; return 0; } static struct linux_binfmt misc_format = { .module = THIS_MODULE, .load_binary = load_misc_binary, }; static struct file_system_type bm_fs_type = { .owner = THIS_MODULE, .name = "binfmt_misc", .init_fs_context = bm_init_fs_context, .fs_flags = FS_USERNS_MOUNT, .kill_sb = kill_litter_super, }; MODULE_ALIAS_FS("binfmt_misc"); static int __init init_misc_binfmt(void) { int err = register_filesystem(&bm_fs_type); if (!err) insert_binfmt(&misc_format); return err; } static void __exit exit_misc_binfmt(void) { unregister_binfmt(&misc_format); unregister_filesystem(&bm_fs_type); } core_initcall(init_misc_binfmt); module_exit(exit_misc_binfmt); MODULE_DESCRIPTION("Kernel support for miscellaneous binaries"); MODULE_LICENSE("GPL");
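/*
 * Illustrative userspace sketch of the register syntax parsed by
 * create_entry() above: ':name:type:offset:magic:mask:interpreter:flags'.
 * The entry written here (an extension handler named "sample" that hands
 * *.smpl files to /usr/local/bin/sample-run) is made up for the example, and
 * the write only succeeds if binfmt_misc is mounted at the usual
 * /proc/sys/fs/binfmt_misc location and the caller may register handlers.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* 'E' type: offset and mask fields stay empty, magic is the extension. */
	const char entry[] = ":sample:E::smpl::/usr/local/bin/sample-run:";
	int fd = open("/proc/sys/fs/binfmt_misc/register", O_WRONLY);

	if (fd < 0) {
		perror("open register");
		return 1;
	}
	if (write(fd, entry, strlen(entry)) != (ssize_t)strlen(entry)) {
		perror("write register");
		close(fd);
		return 1;
	}
	close(fd);
	puts("registered; the new entry appears as /proc/sys/fs/binfmt_misc/sample");
	return 0;
}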
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/stddef.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/pkt_sched.h> #include <net/caif/caif_layer.h> #include <net/caif/cfpkt.h> #include <net/caif/cfctrl.h> #define container_obj(layr) container_of(layr, struct cfctrl, serv.layer) #define UTILITY_NAME_LENGTH 16 #define CFPKT_CTRL_PKT_LEN 20 #ifdef CAIF_NO_LOOP static int handle_loop(struct cfctrl *ctrl, int cmd, struct cfpkt *pkt){ return -1; } #else static int handle_loop(struct cfctrl *ctrl, int cmd, struct cfpkt *pkt); #endif static int cfctrl_recv(struct cflayer *layr, struct cfpkt *pkt); static void cfctrl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid); struct cflayer *cfctrl_create(void) { struct dev_info dev_info; struct cfctrl *this = kzalloc(sizeof(struct cfctrl), GFP_ATOMIC); if (!this) return NULL; caif_assert(offsetof(struct cfctrl, serv.layer) == 0); memset(&dev_info, 0, sizeof(dev_info));
dev_info.id = 0xff; cfsrvl_init(&this->serv, 0, &dev_info, false); atomic_set(&this->req_seq_no, 1); atomic_set(&this->rsp_seq_no, 1); this->serv.layer.receive = cfctrl_recv; sprintf(this->serv.layer.name, "ctrl"); this->serv.layer.ctrlcmd = cfctrl_ctrlcmd; #ifndef CAIF_NO_LOOP spin_lock_init(&this->loop_linkid_lock); this->loop_linkid = 1; #endif spin_lock_init(&this->info_list_lock); INIT_LIST_HEAD(&this->list); return &this->serv.layer; } void cfctrl_remove(struct cflayer *layer) { struct cfctrl_request_info *p, *tmp; struct cfctrl *ctrl = container_obj(layer); spin_lock_bh(&ctrl->info_list_lock); list_for_each_entry_safe(p, tmp, &ctrl->list, list) { list_del(&p->list); kfree(p); } spin_unlock_bh(&ctrl->info_list_lock); kfree(layer); } static bool param_eq(const struct cfctrl_link_param *p1, const struct cfctrl_link_param *p2) { bool eq = p1->linktype == p2->linktype && p1->priority == p2->priority && p1->phyid == p2->phyid && p1->endpoint == p2->endpoint && p1->chtype == p2->chtype; if (!eq) return false; switch (p1->linktype) { case CFCTRL_SRV_VEI: return true; case CFCTRL_SRV_DATAGRAM: return p1->u.datagram.connid == p2->u.datagram.connid; case CFCTRL_SRV_RFM: return p1->u.rfm.connid == p2->u.rfm.connid && strcmp(p1->u.rfm.volume, p2->u.rfm.volume) == 0; case CFCTRL_SRV_UTIL: return p1->u.utility.fifosize_kb == p2->u.utility.fifosize_kb && p1->u.utility.fifosize_bufs == p2->u.utility.fifosize_bufs && strcmp(p1->u.utility.name, p2->u.utility.name) == 0 && p1->u.utility.paramlen == p2->u.utility.paramlen && memcmp(p1->u.utility.params, p2->u.utility.params, p1->u.utility.paramlen) == 0; case CFCTRL_SRV_VIDEO: return p1->u.video.connid == p2->u.video.connid; case CFCTRL_SRV_DBG: return true; case CFCTRL_SRV_DECM: return false; default: return false; } return false; } static bool cfctrl_req_eq(const struct cfctrl_request_info *r1, const struct cfctrl_request_info *r2) { if (r1->cmd != r2->cmd) return false; if (r1->cmd == CFCTRL_CMD_LINK_SETUP) return param_eq(&r1->param, &r2->param); else return r1->channel_id == r2->channel_id; } /* Insert request at the end */ static void cfctrl_insert_req(struct cfctrl *ctrl, struct cfctrl_request_info *req) { spin_lock_bh(&ctrl->info_list_lock); atomic_inc(&ctrl->req_seq_no); req->sequence_no = atomic_read(&ctrl->req_seq_no); list_add_tail(&req->list, &ctrl->list); spin_unlock_bh(&ctrl->info_list_lock); } /* Compare and remove request */ static struct cfctrl_request_info *cfctrl_remove_req(struct cfctrl *ctrl, struct cfctrl_request_info *req) { struct cfctrl_request_info *p, *tmp, *first; first = list_first_entry(&ctrl->list, struct cfctrl_request_info, list); list_for_each_entry_safe(p, tmp, &ctrl->list, list) { if (cfctrl_req_eq(req, p)) { if (p != first) pr_warn("Requests are not received in order\n"); atomic_set(&ctrl->rsp_seq_no, p->sequence_no); list_del(&p->list); goto out; } } p = NULL; out: return p; } struct cfctrl_rsp *cfctrl_get_respfuncs(struct cflayer *layer) { struct cfctrl *this = container_obj(layer); return &this->res; } static void init_info(struct caif_payload_info *info, struct cfctrl *cfctrl) { info->hdr_len = 0; info->channel_id = cfctrl->serv.layer.id; info->dev_info = &cfctrl->serv.dev_info; } void cfctrl_enum_req(struct cflayer *layer, u8 physlinkid) { struct cfpkt *pkt; struct cfctrl *cfctrl = container_obj(layer); struct cflayer *dn = cfctrl->serv.layer.dn; if (!dn) { pr_debug("not able to send enum request\n"); return; } pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN); if (!pkt) return; caif_assert(offsetof(struct cfctrl, 
serv.layer) == 0); init_info(cfpkt_info(pkt), cfctrl); cfpkt_info(pkt)->dev_info->id = physlinkid; cfctrl->serv.dev_info.id = physlinkid; cfpkt_addbdy(pkt, CFCTRL_CMD_ENUM); cfpkt_addbdy(pkt, physlinkid); cfpkt_set_prio(pkt, TC_PRIO_CONTROL); dn->transmit(dn, pkt); } int cfctrl_linkup_request(struct cflayer *layer, struct cfctrl_link_param *param, struct cflayer *user_layer) { struct cfctrl *cfctrl = container_obj(layer); struct cflayer *dn = cfctrl->serv.layer.dn; char utility_name[UTILITY_NAME_LENGTH]; struct cfctrl_request_info *req; struct cfpkt *pkt; u32 tmp32; u16 tmp16; u8 tmp8; int ret; if (!dn) { pr_debug("not able to send linkup request\n"); return -ENODEV; } if (cfctrl_cancel_req(layer, user_layer) > 0) { /* Slight Paranoia, check if already connecting */ pr_err("Duplicate connect request for same client\n"); WARN_ON(1); return -EALREADY; } pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN); if (!pkt) return -ENOMEM; cfpkt_addbdy(pkt, CFCTRL_CMD_LINK_SETUP); cfpkt_addbdy(pkt, (param->chtype << 4) | param->linktype); cfpkt_addbdy(pkt, (param->priority << 3) | param->phyid); cfpkt_addbdy(pkt, param->endpoint & 0x03); switch (param->linktype) { case CFCTRL_SRV_VEI: break; case CFCTRL_SRV_VIDEO: cfpkt_addbdy(pkt, (u8) param->u.video.connid); break; case CFCTRL_SRV_DBG: break; case CFCTRL_SRV_DATAGRAM: tmp32 = cpu_to_le32(param->u.datagram.connid); cfpkt_add_body(pkt, &tmp32, 4); break; case CFCTRL_SRV_RFM: /* Construct a frame, convert DatagramConnectionID to network * format long and copy it out... */ tmp32 = cpu_to_le32(param->u.rfm.connid); cfpkt_add_body(pkt, &tmp32, 4); /* Add volume name, including zero termination... */ cfpkt_add_body(pkt, param->u.rfm.volume, strlen(param->u.rfm.volume) + 1); break; case CFCTRL_SRV_UTIL: tmp16 = cpu_to_le16(param->u.utility.fifosize_kb); cfpkt_add_body(pkt, &tmp16, 2); tmp16 = cpu_to_le16(param->u.utility.fifosize_bufs); cfpkt_add_body(pkt, &tmp16, 2); memset(utility_name, 0, sizeof(utility_name)); strscpy(utility_name, param->u.utility.name, UTILITY_NAME_LENGTH); cfpkt_add_body(pkt, utility_name, UTILITY_NAME_LENGTH); tmp8 = param->u.utility.paramlen; cfpkt_add_body(pkt, &tmp8, 1); cfpkt_add_body(pkt, param->u.utility.params, param->u.utility.paramlen); break; default: pr_warn("Request setup of bad link type = %d\n", param->linktype); cfpkt_destroy(pkt); return -EINVAL; } req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) { cfpkt_destroy(pkt); return -ENOMEM; } req->client_layer = user_layer; req->cmd = CFCTRL_CMD_LINK_SETUP; req->param = *param; cfctrl_insert_req(cfctrl, req); init_info(cfpkt_info(pkt), cfctrl); /* * NOTE:Always send linkup and linkdown request on the same * device as the payload. Otherwise old queued up payload * might arrive with the newly allocated channel ID. 
*/ cfpkt_info(pkt)->dev_info->id = param->phyid; cfpkt_set_prio(pkt, TC_PRIO_CONTROL); ret = dn->transmit(dn, pkt); if (ret < 0) { int count; count = cfctrl_cancel_req(&cfctrl->serv.layer, user_layer); if (count != 1) { pr_err("Could not remove request (%d)", count); return -ENODEV; } } return 0; } int cfctrl_linkdown_req(struct cflayer *layer, u8 channelid, struct cflayer *client) { int ret; struct cfpkt *pkt; struct cfctrl *cfctrl = container_obj(layer); struct cflayer *dn = cfctrl->serv.layer.dn; if (!dn) { pr_debug("not able to send link-down request\n"); return -ENODEV; } pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN); if (!pkt) return -ENOMEM; cfpkt_addbdy(pkt, CFCTRL_CMD_LINK_DESTROY); cfpkt_addbdy(pkt, channelid); init_info(cfpkt_info(pkt), cfctrl); cfpkt_set_prio(pkt, TC_PRIO_CONTROL); ret = dn->transmit(dn, pkt); #ifndef CAIF_NO_LOOP cfctrl->loop_linkused[channelid] = 0; #endif return ret; } int cfctrl_cancel_req(struct cflayer *layr, struct cflayer *adap_layer) { struct cfctrl_request_info *p, *tmp; struct cfctrl *ctrl = container_obj(layr); int found = 0; spin_lock_bh(&ctrl->info_list_lock); list_for_each_entry_safe(p, tmp, &ctrl->list, list) { if (p->client_layer == adap_layer) { list_del(&p->list); kfree(p); found++; } } spin_unlock_bh(&ctrl->info_list_lock); return found; } static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt) { u8 cmdrsp; u8 cmd; int ret = -1; u8 len; u8 param[255]; u8 linkid = 0; struct cfctrl *cfctrl = container_obj(layer); struct cfctrl_request_info rsp, *req; cmdrsp = cfpkt_extr_head_u8(pkt); cmd = cmdrsp & CFCTRL_CMD_MASK; if (cmd != CFCTRL_CMD_LINK_ERR && CFCTRL_RSP_BIT != (CFCTRL_RSP_BIT & cmdrsp) && CFCTRL_ERR_BIT != (CFCTRL_ERR_BIT & cmdrsp)) { if (handle_loop(cfctrl, cmd, pkt) != 0) cmdrsp |= CFCTRL_ERR_BIT; } switch (cmd) { case CFCTRL_CMD_LINK_SETUP: { enum cfctrl_srv serv; enum cfctrl_srv servtype; u8 endpoint; u8 physlinkid; u8 prio; u8 tmp; u8 *cp; int i; struct cfctrl_link_param linkparam; memset(&linkparam, 0, sizeof(linkparam)); tmp = cfpkt_extr_head_u8(pkt); serv = tmp & CFCTRL_SRV_MASK; linkparam.linktype = serv; servtype = tmp >> 4; linkparam.chtype = servtype; tmp = cfpkt_extr_head_u8(pkt); physlinkid = tmp & 0x07; prio = tmp >> 3; linkparam.priority = prio; linkparam.phyid = physlinkid; endpoint = cfpkt_extr_head_u8(pkt); linkparam.endpoint = endpoint & 0x03; switch (serv) { case CFCTRL_SRV_VEI: case CFCTRL_SRV_DBG: if (CFCTRL_ERR_BIT & cmdrsp) break; /* Link ID */ linkid = cfpkt_extr_head_u8(pkt); break; case CFCTRL_SRV_VIDEO: tmp = cfpkt_extr_head_u8(pkt); linkparam.u.video.connid = tmp; if (CFCTRL_ERR_BIT & cmdrsp) break; /* Link ID */ linkid = cfpkt_extr_head_u8(pkt); break; case CFCTRL_SRV_DATAGRAM: linkparam.u.datagram.connid = cfpkt_extr_head_u32(pkt); if (CFCTRL_ERR_BIT & cmdrsp) break; /* Link ID */ linkid = cfpkt_extr_head_u8(pkt); break; case CFCTRL_SRV_RFM: /* Construct a frame, convert * DatagramConnectionID * to network format long and copy it out... */ linkparam.u.rfm.connid = cfpkt_extr_head_u32(pkt); cp = (u8 *) linkparam.u.rfm.volume; for (tmp = cfpkt_extr_head_u8(pkt); cfpkt_more(pkt) && tmp != '\0'; tmp = cfpkt_extr_head_u8(pkt)) *cp++ = tmp; *cp = '\0'; if (CFCTRL_ERR_BIT & cmdrsp) break; /* Link ID */ linkid = cfpkt_extr_head_u8(pkt); break; case CFCTRL_SRV_UTIL: /* Construct a frame, convert * DatagramConnectionID * to network format long and copy it out... 
*/ /* Fifosize KB */ linkparam.u.utility.fifosize_kb = cfpkt_extr_head_u16(pkt); /* Fifosize bufs */ linkparam.u.utility.fifosize_bufs = cfpkt_extr_head_u16(pkt); /* name */ cp = (u8 *) linkparam.u.utility.name; caif_assert(sizeof(linkparam.u.utility.name) >= UTILITY_NAME_LENGTH); for (i = 0; i < UTILITY_NAME_LENGTH && cfpkt_more(pkt); i++) { tmp = cfpkt_extr_head_u8(pkt); *cp++ = tmp; } /* Length */ len = cfpkt_extr_head_u8(pkt); linkparam.u.utility.paramlen = len; /* Param Data */ cp = linkparam.u.utility.params; while (cfpkt_more(pkt) && len--) { tmp = cfpkt_extr_head_u8(pkt); *cp++ = tmp; } if (CFCTRL_ERR_BIT & cmdrsp) break; /* Link ID */ linkid = cfpkt_extr_head_u8(pkt); /* Length */ len = cfpkt_extr_head_u8(pkt); /* Param Data */ cfpkt_extr_head(pkt, &param, len); break; default: pr_warn("Request setup, invalid type (%d)\n", serv); goto error; } rsp.cmd = cmd; rsp.param = linkparam; spin_lock_bh(&cfctrl->info_list_lock); req = cfctrl_remove_req(cfctrl, &rsp); if (CFCTRL_ERR_BIT == (CFCTRL_ERR_BIT & cmdrsp) || cfpkt_erroneous(pkt)) { pr_err("Invalid O/E bit or parse error " "on CAIF control channel\n"); cfctrl->res.reject_rsp(cfctrl->serv.layer.up, 0, req ? req->client_layer : NULL); } else { cfctrl->res.linksetup_rsp(cfctrl->serv. layer.up, linkid, serv, physlinkid, req ? req-> client_layer : NULL); } kfree(req); spin_unlock_bh(&cfctrl->info_list_lock); } break; case CFCTRL_CMD_LINK_DESTROY: linkid = cfpkt_extr_head_u8(pkt); cfctrl->res.linkdestroy_rsp(cfctrl->serv.layer.up, linkid); break; case CFCTRL_CMD_LINK_ERR: pr_err("Frame Error Indication received\n"); cfctrl->res.linkerror_ind(); break; case CFCTRL_CMD_ENUM: cfctrl->res.enum_rsp(); break; case CFCTRL_CMD_SLEEP: cfctrl->res.sleep_rsp(); break; case CFCTRL_CMD_WAKE: cfctrl->res.wake_rsp(); break; case CFCTRL_CMD_LINK_RECONF: cfctrl->res.restart_rsp(); break; case CFCTRL_CMD_RADIO_SET: cfctrl->res.radioset_rsp(); break; default: pr_err("Unrecognized Control Frame\n"); goto error; } ret = 0; error: cfpkt_destroy(pkt); return ret; } static void cfctrl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid) { struct cfctrl *this = container_obj(layr); switch (ctrl) { case _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND: case CAIF_CTRLCMD_FLOW_OFF_IND: spin_lock_bh(&this->info_list_lock); if (!list_empty(&this->list)) pr_debug("Received flow off in control layer\n"); spin_unlock_bh(&this->info_list_lock); break; case _CAIF_CTRLCMD_PHYIF_DOWN_IND: { struct cfctrl_request_info *p, *tmp; /* Find all connect request and report failure */ spin_lock_bh(&this->info_list_lock); list_for_each_entry_safe(p, tmp, &this->list, list) { if (p->param.phyid == phyid) { list_del(&p->list); p->client_layer->ctrlcmd(p->client_layer, CAIF_CTRLCMD_INIT_FAIL_RSP, phyid); kfree(p); } } spin_unlock_bh(&this->info_list_lock); break; } default: break; } } #ifndef CAIF_NO_LOOP static int handle_loop(struct cfctrl *ctrl, int cmd, struct cfpkt *pkt) { static int last_linkid; static int dec; u8 linkid, linktype, tmp; switch (cmd) { case CFCTRL_CMD_LINK_SETUP: spin_lock_bh(&ctrl->loop_linkid_lock); if (!dec) { for (linkid = last_linkid + 1; linkid < 254; linkid++) if (!ctrl->loop_linkused[linkid]) goto found; } dec = 1; for (linkid = last_linkid - 1; linkid > 1; linkid--) if (!ctrl->loop_linkused[linkid]) goto found; spin_unlock_bh(&ctrl->loop_linkid_lock); return -1; found: if (linkid < 10) dec = 0; if (!ctrl->loop_linkused[linkid]) ctrl->loop_linkused[linkid] = 1; last_linkid = linkid; cfpkt_add_trail(pkt, &linkid, 1); spin_unlock_bh(&ctrl->loop_linkid_lock); 
cfpkt_peek_head(pkt, &linktype, 1); if (linktype == CFCTRL_SRV_UTIL) { tmp = 0x01; cfpkt_add_trail(pkt, &tmp, 1); cfpkt_add_trail(pkt, &tmp, 1); } break; case CFCTRL_CMD_LINK_DESTROY: spin_lock_bh(&ctrl->loop_linkid_lock); cfpkt_peek_head(pkt, &linkid, 1); ctrl->loop_linkused[linkid] = 0; spin_unlock_bh(&ctrl->loop_linkid_lock); break; default: break; } return 0; } #endif
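/*
 * Illustration (not part of the kernel source): a minimal, self-contained
 * sketch of the bit-field unpacking cfctrl_recv() performs on the first
 * three payload bytes of a CFCTRL_CMD_LINK_SETUP response. The shifts and
 * masks mirror the code above; the 0x0f value for the service mask is an
 * assumption, since CFCTRL_SRV_MASK itself is defined elsewhere.
 */
#include <stdint.h>
#include <stdio.h>

struct linkparam_sketch {
	uint8_t linktype;	/* service type: low nibble of byte 0 */
	uint8_t chtype;		/* channel type: high nibble of byte 0 */
	uint8_t phyid;		/* physical link id: low 3 bits of byte 1 */
	uint8_t priority;	/* priority: remaining bits of byte 1 */
	uint8_t endpoint;	/* endpoint: low 2 bits of byte 2 */
};

static void parse_setup_hdr(const uint8_t hdr[3], struct linkparam_sketch *p)
{
	p->linktype = hdr[0] & 0x0f;	/* CFCTRL_SRV_MASK assumed to be 0x0f */
	p->chtype   = hdr[0] >> 4;
	p->phyid    = hdr[1] & 0x07;
	p->priority = hdr[1] >> 3;
	p->endpoint = hdr[2] & 0x03;
}

int main(void)
{
	const uint8_t hdr[3] = { 0x31, 0x0a, 0x02 };	/* arbitrary test bytes */
	struct linkparam_sketch p;

	parse_setup_hdr(hdr, &p);
	printf("srv=%u chtype=%u phyid=%u prio=%u endpoint=%u\n",
	       p.linktype, p.chtype, p.phyid, p.priority, p.endpoint);
	return 0;
}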
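/*
 * Illustration (not part of the kernel source): cfctrl_cancel_req() above
 * walks the pending-request list under a spinlock and removes every entry
 * owned by a given client layer, returning how many were dropped; the
 * transmit-failure path then expects exactly one removal. This hedged
 * userspace sketch shows the same remove-and-count pattern with a plain
 * singly linked list instead of the kernel list/spinlock primitives.
 */
#include <stdio.h>
#include <stdlib.h>

struct req {
	int client_id;		/* stands in for the client_layer pointer */
	struct req *next;
};

/* Remove every request owned by 'client_id'; return how many were freed. */
static int cancel_requests(struct req **head, int client_id)
{
	struct req **pp = head;
	int found = 0;

	while (*pp) {
		if ((*pp)->client_id == client_id) {
			struct req *victim = *pp;

			*pp = victim->next;
			free(victim);
			found++;
		} else {
			pp = &(*pp)->next;
		}
	}
	return found;
}

int main(void)
{
	struct req *head = NULL;
	int ids[] = { 1, 2, 1 };

	for (int i = 0; i < 3; i++) {
		struct req *r = malloc(sizeof(*r));

		r->client_id = ids[i];
		r->next = head;
		head = r;
	}
	printf("cancelled %d request(s) for client 1\n",
	       cancel_requests(&head, 1));	/* prints 2 */
	return 0;
}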
// SPDX-License-Identifier: GPL-2.0 /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. * * This code builds two trees of free clusters extents. * Trees are sorted by start of extent and by length of extent. * NTFS_MAX_WND_EXTENTS defines the maximum number of elements in trees. * In extreme case code reads on-disk bitmap to find free clusters. 
* */ #include <linux/buffer_head.h> #include <linux/fs.h> #include <linux/kernel.h> #include "ntfs.h" #include "ntfs_fs.h" /* * Maximum number of extents in tree. */ #define NTFS_MAX_WND_EXTENTS (32u * 1024u) struct rb_node_key { struct rb_node node; size_t key; }; struct e_node { struct rb_node_key start; /* Tree sorted by start. */ struct rb_node_key count; /* Tree sorted by len. */ }; static int wnd_rescan(struct wnd_bitmap *wnd); static struct buffer_head *wnd_map(struct wnd_bitmap *wnd, size_t iw); static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits); static struct kmem_cache *ntfs_enode_cachep; int __init ntfs3_init_bitmap(void) { ntfs_enode_cachep = kmem_cache_create("ntfs3_enode_cache", sizeof(struct e_node), 0, SLAB_RECLAIM_ACCOUNT, NULL); return ntfs_enode_cachep ? 0 : -ENOMEM; } void ntfs3_exit_bitmap(void) { kmem_cache_destroy(ntfs_enode_cachep); } /* * wnd_scan * * b_pos + b_len - biggest fragment. * Scan range [wpos wbits) window @buf. * * Return: -1 if not found. */ static size_t wnd_scan(const void *buf, size_t wbit, u32 wpos, u32 wend, size_t to_alloc, size_t *prev_tail, size_t *b_pos, size_t *b_len) { while (wpos < wend) { size_t free_len; u32 free_bits, end; u32 used = find_next_zero_bit_le(buf, wend, wpos); if (used >= wend) { if (*b_len < *prev_tail) { *b_pos = wbit - *prev_tail; *b_len = *prev_tail; } *prev_tail = 0; return -1; } if (used > wpos) { wpos = used; if (*b_len < *prev_tail) { *b_pos = wbit - *prev_tail; *b_len = *prev_tail; } *prev_tail = 0; } /* * Now we have a fragment [wpos, wend) staring with 0. */ end = wpos + to_alloc - *prev_tail; free_bits = find_next_bit_le(buf, min(end, wend), wpos); free_len = *prev_tail + free_bits - wpos; if (*b_len < free_len) { *b_pos = wbit + wpos - *prev_tail; *b_len = free_len; } if (free_len >= to_alloc) return wbit + wpos - *prev_tail; if (free_bits >= wend) { *prev_tail += free_bits - wpos; return -1; } wpos = free_bits + 1; *prev_tail = 0; } return -1; } /* * wnd_close - Frees all resources. */ void wnd_close(struct wnd_bitmap *wnd) { struct rb_node *node, *next; kvfree(wnd->free_bits); wnd->free_bits = NULL; run_close(&wnd->run); node = rb_first(&wnd->start_tree); while (node) { next = rb_next(node); rb_erase(node, &wnd->start_tree); kmem_cache_free(ntfs_enode_cachep, rb_entry(node, struct e_node, start.node)); node = next; } } static struct rb_node *rb_lookup(struct rb_root *root, size_t v) { struct rb_node **p = &root->rb_node; struct rb_node *r = NULL; while (*p) { struct rb_node_key *k; k = rb_entry(*p, struct rb_node_key, node); if (v < k->key) { p = &(*p)->rb_left; } else if (v > k->key) { r = &k->node; p = &(*p)->rb_right; } else { return &k->node; } } return r; } /* * rb_insert_count - Helper function to insert special kind of 'count' tree. */ static inline bool rb_insert_count(struct rb_root *root, struct e_node *e) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; size_t e_ckey = e->count.key; size_t e_skey = e->start.key; while (*p) { struct e_node *k = rb_entry(parent = *p, struct e_node, count.node); if (e_ckey > k->count.key) { p = &(*p)->rb_left; } else if (e_ckey < k->count.key) { p = &(*p)->rb_right; } else if (e_skey < k->start.key) { p = &(*p)->rb_left; } else if (e_skey > k->start.key) { p = &(*p)->rb_right; } else { WARN_ON(1); return false; } } rb_link_node(&e->count.node, parent, p); rb_insert_color(&e->count.node, root); return true; } /* * rb_insert_start - Helper function to insert special kind of 'count' tree. 
*/ static inline bool rb_insert_start(struct rb_root *root, struct e_node *e) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; size_t e_skey = e->start.key; while (*p) { struct e_node *k; parent = *p; k = rb_entry(parent, struct e_node, start.node); if (e_skey < k->start.key) { p = &(*p)->rb_left; } else if (e_skey > k->start.key) { p = &(*p)->rb_right; } else { WARN_ON(1); return false; } } rb_link_node(&e->start.node, parent, p); rb_insert_color(&e->start.node, root); return true; } /* * wnd_add_free_ext - Adds a new extent of free space. * @build: 1 when building tree. */ static void wnd_add_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len, bool build) { struct e_node *e, *e0 = NULL; size_t ib, end_in = bit + len; struct rb_node *n; if (build) { /* Use extent_min to filter too short extents. */ if (wnd->count >= NTFS_MAX_WND_EXTENTS && len <= wnd->extent_min) { wnd->uptodated = -1; return; } } else { /* Try to find extent before 'bit'. */ n = rb_lookup(&wnd->start_tree, bit); if (!n) { n = rb_first(&wnd->start_tree); } else { e = rb_entry(n, struct e_node, start.node); n = rb_next(n); if (e->start.key + e->count.key == bit) { /* Remove left. */ bit = e->start.key; len += e->count.key; rb_erase(&e->start.node, &wnd->start_tree); rb_erase(&e->count.node, &wnd->count_tree); wnd->count -= 1; e0 = e; } } while (n) { size_t next_end; e = rb_entry(n, struct e_node, start.node); next_end = e->start.key + e->count.key; if (e->start.key > end_in) break; /* Remove right. */ n = rb_next(n); len += next_end - end_in; end_in = next_end; rb_erase(&e->start.node, &wnd->start_tree); rb_erase(&e->count.node, &wnd->count_tree); wnd->count -= 1; if (!e0) e0 = e; else kmem_cache_free(ntfs_enode_cachep, e); } if (wnd->uptodated != 1) { /* Check bits before 'bit'. */ ib = wnd->zone_bit == wnd->zone_end || bit < wnd->zone_end ? 0 : wnd->zone_end; while (bit > ib && wnd_is_free_hlp(wnd, bit - 1, 1)) { bit -= 1; len += 1; } /* Check bits after 'end_in'. */ ib = wnd->zone_bit == wnd->zone_end || end_in > wnd->zone_bit ? wnd->nbits : wnd->zone_bit; while (end_in < ib && wnd_is_free_hlp(wnd, end_in, 1)) { end_in += 1; len += 1; } } } /* Insert new fragment. */ if (wnd->count >= NTFS_MAX_WND_EXTENTS) { if (e0) kmem_cache_free(ntfs_enode_cachep, e0); wnd->uptodated = -1; /* Compare with smallest fragment. */ n = rb_last(&wnd->count_tree); e = rb_entry(n, struct e_node, count.node); if (len <= e->count.key) goto out; /* Do not insert small fragments. */ if (build) { struct e_node *e2; n = rb_prev(n); e2 = rb_entry(n, struct e_node, count.node); /* Smallest fragment will be 'e2->count.key'. */ wnd->extent_min = e2->count.key; } /* Replace smallest fragment by new one. */ rb_erase(&e->start.node, &wnd->start_tree); rb_erase(&e->count.node, &wnd->count_tree); wnd->count -= 1; } else { e = e0 ? e0 : kmem_cache_alloc(ntfs_enode_cachep, GFP_ATOMIC); if (!e) { wnd->uptodated = -1; goto out; } if (build && len <= wnd->extent_min) wnd->extent_min = len; } e->start.key = bit; e->count.key = len; if (len > wnd->extent_max) wnd->extent_max = len; rb_insert_start(&wnd->start_tree, e); rb_insert_count(&wnd->count_tree, e); wnd->count += 1; out:; } /* * wnd_remove_free_ext - Remove a run from the cached free space. */ static void wnd_remove_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len) { struct rb_node *n, *n3; struct e_node *e, *e3; size_t end_in = bit + len; size_t end3, end, new_key, new_len, max_new_len; /* Try to find extent before 'bit'. 
*/ n = rb_lookup(&wnd->start_tree, bit); if (!n) return; e = rb_entry(n, struct e_node, start.node); end = e->start.key + e->count.key; new_key = new_len = 0; len = e->count.key; /* Range [bit,end_in) must be inside 'e' or outside 'e' and 'n'. */ if (e->start.key > bit) ; else if (end_in <= end) { /* Range [bit,end_in) inside 'e'. */ new_key = end_in; new_len = end - end_in; len = bit - e->start.key; } else if (bit > end) { bool bmax = false; n3 = rb_next(n); while (n3) { e3 = rb_entry(n3, struct e_node, start.node); if (e3->start.key >= end_in) break; if (e3->count.key == wnd->extent_max) bmax = true; end3 = e3->start.key + e3->count.key; if (end3 > end_in) { e3->start.key = end_in; rb_erase(&e3->count.node, &wnd->count_tree); e3->count.key = end3 - end_in; rb_insert_count(&wnd->count_tree, e3); break; } n3 = rb_next(n3); rb_erase(&e3->start.node, &wnd->start_tree); rb_erase(&e3->count.node, &wnd->count_tree); wnd->count -= 1; kmem_cache_free(ntfs_enode_cachep, e3); } if (!bmax) return; n3 = rb_first(&wnd->count_tree); wnd->extent_max = n3 ? rb_entry(n3, struct e_node, count.node)->count.key : 0; return; } if (e->count.key != wnd->extent_max) { ; } else if (rb_prev(&e->count.node)) { ; } else { n3 = rb_next(&e->count.node); max_new_len = max(len, new_len); if (!n3) { wnd->extent_max = max_new_len; } else { e3 = rb_entry(n3, struct e_node, count.node); wnd->extent_max = max(e3->count.key, max_new_len); } } if (!len) { if (new_len) { e->start.key = new_key; rb_erase(&e->count.node, &wnd->count_tree); e->count.key = new_len; rb_insert_count(&wnd->count_tree, e); } else { rb_erase(&e->start.node, &wnd->start_tree); rb_erase(&e->count.node, &wnd->count_tree); wnd->count -= 1; kmem_cache_free(ntfs_enode_cachep, e); } goto out; } rb_erase(&e->count.node, &wnd->count_tree); e->count.key = len; rb_insert_count(&wnd->count_tree, e); if (!new_len) goto out; if (wnd->count >= NTFS_MAX_WND_EXTENTS) { wnd->uptodated = -1; /* Get minimal extent. */ e = rb_entry(rb_last(&wnd->count_tree), struct e_node, count.node); if (e->count.key > new_len) goto out; /* Replace minimum. */ rb_erase(&e->start.node, &wnd->start_tree); rb_erase(&e->count.node, &wnd->count_tree); wnd->count -= 1; } else { e = kmem_cache_alloc(ntfs_enode_cachep, GFP_ATOMIC); if (!e) wnd->uptodated = -1; } if (e) { e->start.key = new_key; e->count.key = new_len; rb_insert_start(&wnd->start_tree, e); rb_insert_count(&wnd->count_tree, e); wnd->count += 1; } out: if (!wnd->count && 1 != wnd->uptodated) wnd_rescan(wnd); } /* * wnd_rescan - Scan all bitmap. Used while initialization. */ static int wnd_rescan(struct wnd_bitmap *wnd) { int err = 0; size_t prev_tail = 0; struct super_block *sb = wnd->sb; struct ntfs_sb_info *sbi = sb->s_fs_info; u64 lbo, len = 0; u32 blocksize = sb->s_blocksize; u8 cluster_bits = sbi->cluster_bits; u32 wbits = 8 * sb->s_blocksize; u32 used, frb; size_t wpos, wbit, iw, vbo; struct buffer_head *bh = NULL; CLST lcn, clen; wnd->uptodated = 0; wnd->extent_max = 0; wnd->extent_min = MINUS_ONE_T; wnd->total_zeroes = 0; vbo = 0; for (iw = 0; iw < wnd->nwnd; iw++) { if (iw + 1 == wnd->nwnd) wbits = wnd->bits_last; if (wnd->inited) { if (!wnd->free_bits[iw]) { /* All ones. */ if (prev_tail) { wnd_add_free_ext(wnd, vbo * 8 - prev_tail, prev_tail, true); prev_tail = 0; } goto next_wnd; } if (wbits == wnd->free_bits[iw]) { /* All zeroes. 
*/ prev_tail += wbits; wnd->total_zeroes += wbits; goto next_wnd; } } if (!len) { u32 off = vbo & sbi->cluster_mask; if (!run_lookup_entry(&wnd->run, vbo >> cluster_bits, &lcn, &clen, NULL)) { err = -ENOENT; goto out; } lbo = ((u64)lcn << cluster_bits) + off; len = ((u64)clen << cluster_bits) - off; } bh = ntfs_bread(sb, lbo >> sb->s_blocksize_bits); if (!bh) { err = -EIO; goto out; } used = ntfs_bitmap_weight_le(bh->b_data, wbits); if (used < wbits) { frb = wbits - used; wnd->free_bits[iw] = frb; wnd->total_zeroes += frb; } wpos = 0; wbit = vbo * 8; if (wbit + wbits > wnd->nbits) wbits = wnd->nbits - wbit; do { used = find_next_zero_bit_le(bh->b_data, wbits, wpos); if (used > wpos && prev_tail) { wnd_add_free_ext(wnd, wbit + wpos - prev_tail, prev_tail, true); prev_tail = 0; } wpos = used; if (wpos >= wbits) { /* No free blocks. */ prev_tail = 0; break; } frb = find_next_bit_le(bh->b_data, wbits, wpos); if (frb >= wbits) { /* Keep last free block. */ prev_tail += frb - wpos; break; } wnd_add_free_ext(wnd, wbit + wpos - prev_tail, frb + prev_tail - wpos, true); /* Skip free block and first '1'. */ wpos = frb + 1; /* Reset previous tail. */ prev_tail = 0; } while (wpos < wbits); next_wnd: if (bh) put_bh(bh); bh = NULL; vbo += blocksize; if (len) { len -= blocksize; lbo += blocksize; } } /* Add last block. */ if (prev_tail) wnd_add_free_ext(wnd, wnd->nbits - prev_tail, prev_tail, true); /* * Before init cycle wnd->uptodated was 0. * If any errors or limits occurs while initialization then * wnd->uptodated will be -1. * If 'uptodated' is still 0 then Tree is really updated. */ if (!wnd->uptodated) wnd->uptodated = 1; if (wnd->zone_bit != wnd->zone_end) { size_t zlen = wnd->zone_end - wnd->zone_bit; wnd->zone_end = wnd->zone_bit; wnd_zone_set(wnd, wnd->zone_bit, zlen); } out: return err; } int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits) { int err; u32 blocksize = sb->s_blocksize; u32 wbits = blocksize * 8; init_rwsem(&wnd->rw_lock); wnd->sb = sb; wnd->nbits = nbits; wnd->total_zeroes = nbits; wnd->extent_max = MINUS_ONE_T; wnd->zone_bit = wnd->zone_end = 0; wnd->nwnd = bytes_to_block(sb, ntfs3_bitmap_size(nbits)); wnd->bits_last = nbits & (wbits - 1); if (!wnd->bits_last) wnd->bits_last = wbits; wnd->free_bits = kvmalloc_array(wnd->nwnd, sizeof(u16), GFP_KERNEL | __GFP_ZERO); if (!wnd->free_bits) return -ENOMEM; err = wnd_rescan(wnd); if (err) return err; wnd->inited = true; return 0; } /* * wnd_map - Call sb_bread for requested window. */ static struct buffer_head *wnd_map(struct wnd_bitmap *wnd, size_t iw) { size_t vbo; CLST lcn, clen; struct super_block *sb = wnd->sb; struct ntfs_sb_info *sbi; struct buffer_head *bh; u64 lbo; sbi = sb->s_fs_info; vbo = (u64)iw << sb->s_blocksize_bits; if (!run_lookup_entry(&wnd->run, vbo >> sbi->cluster_bits, &lcn, &clen, NULL)) { return ERR_PTR(-ENOENT); } lbo = ((u64)lcn << sbi->cluster_bits) + (vbo & sbi->cluster_mask); bh = ntfs_bread(wnd->sb, lbo >> sb->s_blocksize_bits); if (!bh) return ERR_PTR(-EIO); return bh; } /* * wnd_set_free - Mark the bits range from bit to bit + bits as free. 
*/ int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits) { int err = 0; struct super_block *sb = wnd->sb; u32 wbits = 8 * sb->s_blocksize; size_t iw = bit >> (sb->s_blocksize_bits + 3); u32 wbit = bit & (wbits - 1); struct buffer_head *bh; u32 op; for (; iw < wnd->nwnd && bits; iw++, bit += op, bits -= op, wbit = 0) { if (iw + 1 == wnd->nwnd) wbits = wnd->bits_last; op = min_t(u32, wbits - wbit, bits); bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { err = PTR_ERR(bh); break; } lock_buffer(bh); ntfs_bitmap_clear_le(bh->b_data, wbit, op); wnd->free_bits[iw] += op; wnd->total_zeroes += op; set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); put_bh(bh); wnd_add_free_ext(wnd, bit, op, false); } return err; } /* * wnd_set_used - Mark the bits range from bit to bit + bits as used. */ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) { int err = 0; struct super_block *sb = wnd->sb; size_t iw = bit >> (sb->s_blocksize_bits + 3); u32 wbits = 8 * sb->s_blocksize; u32 wbit = bit & (wbits - 1); struct buffer_head *bh; u32 op; for (; iw < wnd->nwnd && bits; iw++, bit += op, bits -= op, wbit = 0) { if (unlikely(iw + 1 == wnd->nwnd)) wbits = wnd->bits_last; op = min_t(u32, wbits - wbit, bits); bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { err = PTR_ERR(bh); break; } lock_buffer(bh); ntfs_bitmap_set_le(bh->b_data, wbit, op); wnd->free_bits[iw] -= op; wnd->total_zeroes -= op; set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); put_bh(bh); if (!RB_EMPTY_ROOT(&wnd->start_tree)) wnd_remove_free_ext(wnd, bit, op); } return err; } /* * wnd_set_used_safe - Mark the bits range from bit to bit + bits as used. * * Unlikely wnd_set_used/wnd_set_free this function is not full trusted. * It scans every bit in bitmap and marks free bit as used. * @done - how many bits were marked as used. * * NOTE: normally *done should be 0. */ int wnd_set_used_safe(struct wnd_bitmap *wnd, size_t bit, size_t bits, size_t *done) { size_t i, from = 0, len = 0; int err = 0; *done = 0; for (i = 0; i < bits; i++) { if (wnd_is_free(wnd, bit + i, 1)) { if (!len) from = bit + i; len += 1; } else if (len) { err = wnd_set_used(wnd, from, len); *done += len; len = 0; if (err) break; } } if (len) { /* last fragment. */ err = wnd_set_used(wnd, from, len); *done += len; } return err; } /* * wnd_is_free_hlp * * Return: True if all clusters [bit, bit+bits) are free (bitmap only). */ static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits) { struct super_block *sb = wnd->sb; size_t iw = bit >> (sb->s_blocksize_bits + 3); u32 wbits = 8 * sb->s_blocksize; u32 wbit = bit & (wbits - 1); u32 op; for (; iw < wnd->nwnd && bits; iw++, bits -= op, wbit = 0) { if (unlikely(iw + 1 == wnd->nwnd)) wbits = wnd->bits_last; op = min_t(u32, wbits - wbit, bits); if (wbits != wnd->free_bits[iw]) { bool ret; struct buffer_head *bh = wnd_map(wnd, iw); if (IS_ERR(bh)) return false; ret = are_bits_clear(bh->b_data, wbit, op); put_bh(bh); if (!ret) return false; } } return true; } /* * wnd_is_free * * Return: True if all clusters [bit, bit+bits) are free. 
*/ bool wnd_is_free(struct wnd_bitmap *wnd, size_t bit, size_t bits) { bool ret; struct rb_node *n; size_t end; struct e_node *e; if (RB_EMPTY_ROOT(&wnd->start_tree)) goto use_wnd; n = rb_lookup(&wnd->start_tree, bit); if (!n) goto use_wnd; e = rb_entry(n, struct e_node, start.node); end = e->start.key + e->count.key; if (bit < end && bit + bits <= end) return true; use_wnd: ret = wnd_is_free_hlp(wnd, bit, bits); return ret; } /* * wnd_is_used * * Return: True if all clusters [bit, bit+bits) are used. */ bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) { bool ret = false; struct super_block *sb = wnd->sb; size_t iw = bit >> (sb->s_blocksize_bits + 3); u32 wbits = 8 * sb->s_blocksize; u32 wbit = bit & (wbits - 1); u32 op; size_t end; struct rb_node *n; struct e_node *e; if (RB_EMPTY_ROOT(&wnd->start_tree)) goto use_wnd; end = bit + bits; n = rb_lookup(&wnd->start_tree, end - 1); if (!n) goto use_wnd; e = rb_entry(n, struct e_node, start.node); if (e->start.key + e->count.key > bit) return false; use_wnd: for (; iw < wnd->nwnd && bits; iw++, bits -= op, wbit = 0) { if (unlikely(iw + 1 == wnd->nwnd)) wbits = wnd->bits_last; op = min_t(u32, wbits - wbit, bits); if (wnd->free_bits[iw]) { bool ret; struct buffer_head *bh = wnd_map(wnd, iw); if (IS_ERR(bh)) goto out; ret = are_bits_set(bh->b_data, wbit, op); put_bh(bh); if (!ret) goto out; } } ret = true; out: return ret; } /* * wnd_find - Look for free space. * * - flags - BITMAP_FIND_XXX flags * * Return: 0 if not found. */ size_t wnd_find(struct wnd_bitmap *wnd, size_t to_alloc, size_t hint, size_t flags, size_t *allocated) { struct super_block *sb; u32 wbits, wpos, wzbit, wzend; size_t fnd, max_alloc, b_len, b_pos; size_t iw, prev_tail, nwnd, wbit, ebit, zbit, zend; size_t to_alloc0 = to_alloc; const struct e_node *e; const struct rb_node *pr, *cr; u8 log2_bits; bool fbits_valid; struct buffer_head *bh; /* Fast checking for available free space. */ if (flags & BITMAP_FIND_FULL) { size_t zeroes = wnd_zeroes(wnd); zeroes -= wnd->zone_end - wnd->zone_bit; if (zeroes < to_alloc0) goto no_space; if (to_alloc0 > wnd->extent_max) goto no_space; } else { if (to_alloc > wnd->extent_max) to_alloc = wnd->extent_max; } if (wnd->zone_bit <= hint && hint < wnd->zone_end) hint = wnd->zone_end; max_alloc = wnd->nbits; b_len = b_pos = 0; if (hint >= max_alloc) hint = 0; if (RB_EMPTY_ROOT(&wnd->start_tree)) { if (wnd->uptodated == 1) { /* Extents tree is updated -> No free space. */ goto no_space; } goto scan_bitmap; } e = NULL; if (!hint) goto allocate_biggest; /* Use hint: Enumerate extents by start >= hint. */ pr = NULL; cr = wnd->start_tree.rb_node; for (;;) { e = rb_entry(cr, struct e_node, start.node); if (e->start.key == hint) break; if (e->start.key < hint) { pr = cr; cr = cr->rb_right; if (!cr) break; continue; } cr = cr->rb_left; if (!cr) { e = pr ? rb_entry(pr, struct e_node, start.node) : NULL; break; } } if (!e) goto allocate_biggest; if (e->start.key + e->count.key > hint) { /* We have found extension with 'hint' inside. */ size_t len = e->start.key + e->count.key - hint; if (len >= to_alloc && hint + to_alloc <= max_alloc) { fnd = hint; goto found; } if (!(flags & BITMAP_FIND_FULL)) { if (len > to_alloc) len = to_alloc; if (hint + len <= max_alloc) { fnd = hint; to_alloc = len; goto found; } } } allocate_biggest: /* Allocate from biggest free extent. 
*/ e = rb_entry(rb_first(&wnd->count_tree), struct e_node, count.node); if (e->count.key != wnd->extent_max) wnd->extent_max = e->count.key; if (e->count.key < max_alloc) { if (e->count.key >= to_alloc) { ; } else if (flags & BITMAP_FIND_FULL) { if (e->count.key < to_alloc0) { /* Biggest free block is less then requested. */ goto no_space; } to_alloc = e->count.key; } else if (-1 != wnd->uptodated) { to_alloc = e->count.key; } else { /* Check if we can use more bits. */ size_t op, max_check; struct rb_root start_tree; memcpy(&start_tree, &wnd->start_tree, sizeof(struct rb_root)); memset(&wnd->start_tree, 0, sizeof(struct rb_root)); max_check = e->start.key + to_alloc; if (max_check > max_alloc) max_check = max_alloc; for (op = e->start.key + e->count.key; op < max_check; op++) { if (!wnd_is_free(wnd, op, 1)) break; } memcpy(&wnd->start_tree, &start_tree, sizeof(struct rb_root)); to_alloc = op - e->start.key; } /* Prepare to return. */ fnd = e->start.key; if (e->start.key + to_alloc > max_alloc) to_alloc = max_alloc - e->start.key; goto found; } if (wnd->uptodated == 1) { /* Extents tree is updated -> no free space. */ goto no_space; } b_len = e->count.key; b_pos = e->start.key; scan_bitmap: sb = wnd->sb; log2_bits = sb->s_blocksize_bits + 3; /* At most two ranges [hint, max_alloc) + [0, hint). */ Again: /* TODO: Optimize request for case nbits > wbits. */ iw = hint >> log2_bits; wbits = sb->s_blocksize * 8; wpos = hint & (wbits - 1); prev_tail = 0; fbits_valid = true; if (max_alloc == wnd->nbits) { nwnd = wnd->nwnd; } else { size_t t = max_alloc + wbits - 1; nwnd = likely(t > max_alloc) ? (t >> log2_bits) : wnd->nwnd; } /* Enumerate all windows. */ for (; iw < nwnd; iw++) { wbit = iw << log2_bits; if (!wnd->free_bits[iw]) { if (prev_tail > b_len) { b_pos = wbit - prev_tail; b_len = prev_tail; } /* Skip full used window. */ prev_tail = 0; wpos = 0; continue; } if (unlikely(iw + 1 == nwnd)) { if (max_alloc == wnd->nbits) { wbits = wnd->bits_last; } else { size_t t = max_alloc & (wbits - 1); if (t) { wbits = t; fbits_valid = false; } } } if (wnd->zone_end > wnd->zone_bit) { ebit = wbit + wbits; zbit = max(wnd->zone_bit, wbit); zend = min(wnd->zone_end, ebit); /* Here we have a window [wbit, ebit) and zone [zbit, zend). */ if (zend <= zbit) { /* Zone does not overlap window. */ } else { wzbit = zbit - wbit; wzend = zend - wbit; /* Zone overlaps window. */ if (wnd->free_bits[iw] == wzend - wzbit) { prev_tail = 0; wpos = 0; continue; } /* Scan two ranges window: [wbit, zbit) and [zend, ebit). */ bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { /* TODO: Error */ prev_tail = 0; wpos = 0; continue; } /* Scan range [wbit, zbit). */ if (wpos < wzbit) { /* Scan range [wpos, zbit). */ fnd = wnd_scan(bh->b_data, wbit, wpos, wzbit, to_alloc, &prev_tail, &b_pos, &b_len); if (fnd != MINUS_ONE_T) { put_bh(bh); goto found; } } prev_tail = 0; /* Scan range [zend, ebit). */ if (wzend < wbits) { fnd = wnd_scan(bh->b_data, wbit, max(wzend, wpos), wbits, to_alloc, &prev_tail, &b_pos, &b_len); if (fnd != MINUS_ONE_T) { put_bh(bh); goto found; } } wpos = 0; put_bh(bh); continue; } } /* Current window does not overlap zone. */ if (!wpos && fbits_valid && wnd->free_bits[iw] == wbits) { /* Window is empty. */ if (prev_tail + wbits >= to_alloc) { fnd = wbit + wpos - prev_tail; goto found; } /* Increase 'prev_tail' and process next window. */ prev_tail += wbits; wpos = 0; continue; } /* Read window. */ bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { // TODO: Error. prev_tail = 0; wpos = 0; continue; } /* Scan range [wpos, eBits). 
*/ fnd = wnd_scan(bh->b_data, wbit, wpos, wbits, to_alloc, &prev_tail, &b_pos, &b_len); put_bh(bh); if (fnd != MINUS_ONE_T) goto found; } if (b_len < prev_tail) { /* The last fragment. */ b_len = prev_tail; b_pos = max_alloc - prev_tail; } if (hint) { /* * We have scanned range [hint max_alloc). * Prepare to scan range [0 hint + to_alloc). */ size_t nextmax = hint + to_alloc; if (likely(nextmax >= hint) && nextmax < max_alloc) max_alloc = nextmax; hint = 0; goto Again; } if (!b_len) goto no_space; wnd->extent_max = b_len; if (flags & BITMAP_FIND_FULL) goto no_space; fnd = b_pos; to_alloc = b_len; found: if (flags & BITMAP_FIND_MARK_AS_USED) { /* TODO: Optimize remove extent (pass 'e'?). */ if (wnd_set_used(wnd, fnd, to_alloc)) goto no_space; } else if (wnd->extent_max != MINUS_ONE_T && to_alloc > wnd->extent_max) { wnd->extent_max = to_alloc; } *allocated = fnd; return to_alloc; no_space: return 0; } /* * wnd_extend - Extend bitmap ($MFT bitmap). */ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits) { int err; struct super_block *sb = wnd->sb; struct ntfs_sb_info *sbi = sb->s_fs_info; u32 blocksize = sb->s_blocksize; u32 wbits = blocksize * 8; u32 b0, new_last; size_t bits, iw, new_wnd; size_t old_bits = wnd->nbits; u16 *new_free; if (new_bits <= old_bits) return -EINVAL; /* Align to 8 byte boundary. */ new_wnd = bytes_to_block(sb, ntfs3_bitmap_size(new_bits)); new_last = new_bits & (wbits - 1); if (!new_last) new_last = wbits; if (new_wnd != wnd->nwnd) { new_free = kmalloc_array(new_wnd, sizeof(u16), GFP_NOFS); if (!new_free) return -ENOMEM; memcpy(new_free, wnd->free_bits, wnd->nwnd * sizeof(short)); memset(new_free + wnd->nwnd, 0, (new_wnd - wnd->nwnd) * sizeof(short)); kvfree(wnd->free_bits); wnd->free_bits = new_free; } /* Zero bits [old_bits,new_bits). */ bits = new_bits - old_bits; b0 = old_bits & (wbits - 1); for (iw = old_bits >> (sb->s_blocksize_bits + 3); bits; iw += 1) { u32 op; size_t frb; u64 vbo, lbo, bytes; struct buffer_head *bh; if (iw + 1 == new_wnd) wbits = new_last; op = b0 + bits > wbits ? 
wbits - b0 : bits; vbo = (u64)iw * blocksize; err = ntfs_vbo_to_lbo(sbi, &wnd->run, vbo, &lbo, &bytes); if (err) return err; bh = ntfs_bread(sb, lbo >> sb->s_blocksize_bits); if (!bh) return -EIO; lock_buffer(bh); ntfs_bitmap_clear_le(bh->b_data, b0, blocksize * 8 - b0); frb = wbits - ntfs_bitmap_weight_le(bh->b_data, wbits); wnd->total_zeroes += frb - wnd->free_bits[iw]; wnd->free_bits[iw] = frb; set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); /* err = sync_dirty_buffer(bh); */ b0 = 0; bits -= op; } wnd->nbits = new_bits; wnd->nwnd = new_wnd; wnd->bits_last = new_last; wnd_add_free_ext(wnd, old_bits, new_bits - old_bits, false); return 0; } void wnd_zone_set(struct wnd_bitmap *wnd, size_t lcn, size_t len) { size_t zlen = wnd->zone_end - wnd->zone_bit; if (zlen) wnd_add_free_ext(wnd, wnd->zone_bit, zlen, false); if (!RB_EMPTY_ROOT(&wnd->start_tree) && len) wnd_remove_free_ext(wnd, lcn, len); wnd->zone_bit = lcn; wnd->zone_end = lcn + len; } int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range) { int err = 0; struct super_block *sb = sbi->sb; struct wnd_bitmap *wnd = &sbi->used.bitmap; u32 wbits = 8 * sb->s_blocksize; CLST len = 0, lcn = 0, done = 0; CLST minlen = bytes_to_cluster(sbi, range->minlen); CLST lcn_from = bytes_to_cluster(sbi, range->start); size_t iw = lcn_from >> (sb->s_blocksize_bits + 3); u32 wbit = lcn_from & (wbits - 1); CLST lcn_to; if (!minlen) minlen = 1; if (range->len == (u64)-1) lcn_to = wnd->nbits; else lcn_to = bytes_to_cluster(sbi, range->start + range->len); down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); for (; iw < wnd->nwnd; iw++, wbit = 0) { CLST lcn_wnd = iw * wbits; struct buffer_head *bh; if (lcn_wnd > lcn_to) break; if (!wnd->free_bits[iw]) continue; if (iw + 1 == wnd->nwnd) wbits = wnd->bits_last; if (lcn_wnd + wbits > lcn_to) wbits = lcn_to - lcn_wnd; bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { err = PTR_ERR(bh); break; } for (; wbit < wbits; wbit++) { if (!test_bit_le(wbit, bh->b_data)) { if (!len) lcn = lcn_wnd + wbit; len += 1; continue; } if (len >= minlen) { err = ntfs_discard(sbi, lcn, len); if (err) goto out; done += len; } len = 0; } put_bh(bh); } /* Process the last fragment. 
*/ if (len >= minlen) { err = ntfs_discard(sbi, lcn, len); if (err) goto out; done += len; } out: range->len = (u64)done << sbi->cluster_bits; up_read(&wnd->rw_lock); return err; } #if BITS_PER_LONG == 64 typedef __le64 bitmap_ulong; #define cpu_to_ul(x) cpu_to_le64(x) #define ul_to_cpu(x) le64_to_cpu(x) #else typedef __le32 bitmap_ulong; #define cpu_to_ul(x) cpu_to_le32(x) #define ul_to_cpu(x) le32_to_cpu(x) #endif void ntfs_bitmap_set_le(void *map, unsigned int start, int len) { bitmap_ulong *p = (bitmap_ulong *)map + BIT_WORD(start); const unsigned int size = start + len; int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); bitmap_ulong mask_to_set = cpu_to_ul(BITMAP_FIRST_WORD_MASK(start)); while (len - bits_to_set >= 0) { *p |= mask_to_set; len -= bits_to_set; bits_to_set = BITS_PER_LONG; mask_to_set = cpu_to_ul(~0UL); p++; } if (len) { mask_to_set &= cpu_to_ul(BITMAP_LAST_WORD_MASK(size)); *p |= mask_to_set; } } void ntfs_bitmap_clear_le(void *map, unsigned int start, int len) { bitmap_ulong *p = (bitmap_ulong *)map + BIT_WORD(start); const unsigned int size = start + len; int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); bitmap_ulong mask_to_clear = cpu_to_ul(BITMAP_FIRST_WORD_MASK(start)); while (len - bits_to_clear >= 0) { *p &= ~mask_to_clear; len -= bits_to_clear; bits_to_clear = BITS_PER_LONG; mask_to_clear = cpu_to_ul(~0UL); p++; } if (len) { mask_to_clear &= cpu_to_ul(BITMAP_LAST_WORD_MASK(size)); *p &= ~mask_to_clear; } } unsigned int ntfs_bitmap_weight_le(const void *bitmap, int bits) { const ulong *bmp = bitmap; unsigned int k, lim = bits / BITS_PER_LONG; unsigned int w = 0; for (k = 0; k < lim; k++) w += hweight_long(bmp[k]); if (bits % BITS_PER_LONG) { w += hweight_long(ul_to_cpu(((bitmap_ulong *)bitmap)[k]) & BITMAP_LAST_WORD_MASK(bits)); } return w; }
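/*
 * Illustration (not part of the kernel source): the fast path of
 * wnd_is_free() relies on rb_lookup() being a floor search (the extent with
 * the greatest start key <= bit); the range [bit, bit + bits) is free if it
 * fits inside that cached extent, otherwise the code falls back to reading
 * the bitmap. This hedged sketch shows the same check over a sorted array
 * instead of the rbtree, purely to keep it self-contained.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct extent { size_t start, count; };

/* Floor search: index of the extent with greatest start <= bit, or -1. */
static long floor_lookup(const struct extent *e, size_t n, size_t bit)
{
	long lo = 0, hi = (long)n - 1, res = -1;

	while (lo <= hi) {
		long mid = lo + (hi - lo) / 2;

		if (e[mid].start <= bit) {
			res = mid;
			lo = mid + 1;
		} else {
			hi = mid - 1;
		}
	}
	return res;
}

static bool range_cached_free(const struct extent *e, size_t n,
			      size_t bit, size_t bits)
{
	long i = floor_lookup(e, n, bit);

	if (i < 0)
		return false;	/* not cached: would fall back to the bitmap */
	return bit + bits <= e[i].start + e[i].count;
}

int main(void)
{
	/* Extents sorted by start, mirroring the 'start' tree. */
	const struct extent cache[] = { { 8, 4 }, { 100, 32 }, { 512, 8 } };

	printf("%d\n", range_cached_free(cache, 3, 104, 16));	/* 1: inside */
	printf("%d\n", range_cached_free(cache, 3, 10, 8));	/* 0: overflows */
	return 0;
}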
// SPDX-License-Identifier: GPL-2.0-only /* * * Author Karsten Keil <kkeil@novell.com> * * Copyright 2008 by Karsten Keil <kkeil@novell.com> */ #include <linux/mISDNif.h> #include <linux/slab.h> #include <linux/export.h> #include "core.h" static u_int 
*debug; static struct proto mISDN_proto = { .name = "misdn", .owner = THIS_MODULE, .obj_size = sizeof(struct mISDN_sock) }; #define _pms(sk) ((struct mISDN_sock *)sk) static struct mISDN_sock_list data_sockets = { .lock = __RW_LOCK_UNLOCKED(data_sockets.lock) }; static struct mISDN_sock_list base_sockets = { .lock = __RW_LOCK_UNLOCKED(base_sockets.lock) }; #define L2_HEADER_LEN 4 static inline struct sk_buff * _l2_alloc_skb(unsigned int len, gfp_t gfp_mask) { struct sk_buff *skb; skb = alloc_skb(len + L2_HEADER_LEN, gfp_mask); if (likely(skb)) skb_reserve(skb, L2_HEADER_LEN); return skb; } static void mISDN_sock_link(struct mISDN_sock_list *l, struct sock *sk) { write_lock_bh(&l->lock); sk_add_node(sk, &l->head); write_unlock_bh(&l->lock); } static void mISDN_sock_unlink(struct mISDN_sock_list *l, struct sock *sk) { write_lock_bh(&l->lock); sk_del_node_init(sk); write_unlock_bh(&l->lock); } static int mISDN_send(struct mISDNchannel *ch, struct sk_buff *skb) { struct mISDN_sock *msk; int err; msk = container_of(ch, struct mISDN_sock, ch); if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s len %d %p\n", __func__, skb->len, skb); if (msk->sk.sk_state == MISDN_CLOSED) return -EUNATCH; __net_timestamp(skb); err = sock_queue_rcv_skb(&msk->sk, skb); if (err) printk(KERN_WARNING "%s: error %d\n", __func__, err); return err; } static int mISDN_ctrl(struct mISDNchannel *ch, u_int cmd, void *arg) { struct mISDN_sock *msk; msk = container_of(ch, struct mISDN_sock, ch); if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p, %x, %p)\n", __func__, ch, cmd, arg); switch (cmd) { case CLOSE_CHANNEL: msk->sk.sk_state = MISDN_CLOSED; break; } return 0; } static inline void mISDN_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { struct __kernel_old_timeval tv; if (_pms(sk)->cmask & MISDN_TIME_STAMP) { skb_get_timestamp(skb, &tv); put_cmsg(msg, SOL_MISDN, MISDN_TIME_STAMP, sizeof(tv), &tv); } } static int mISDN_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sk_buff *skb; struct sock *sk = sock->sk; int copied, err; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s: len %d, flags %x ch.nr %d, proto %x\n", __func__, (int)len, flags, _pms(sk)->ch.nr, sk->sk_protocol); if (flags & (MSG_OOB)) return -EOPNOTSUPP; if (sk->sk_state == MISDN_CLOSED) return 0; skb = skb_recv_datagram(sk, flags, &err); if (!skb) return err; if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_mISDN *, maddr, msg->msg_name); maddr->family = AF_ISDN; maddr->dev = _pms(sk)->dev->id; if ((sk->sk_protocol == ISDN_P_LAPD_TE) || (sk->sk_protocol == ISDN_P_LAPD_NT)) { maddr->channel = (mISDN_HEAD_ID(skb) >> 16) & 0xff; maddr->tei = (mISDN_HEAD_ID(skb) >> 8) & 0xff; maddr->sapi = mISDN_HEAD_ID(skb) & 0xff; } else { maddr->channel = _pms(sk)->ch.nr; maddr->sapi = _pms(sk)->ch.addr & 0xFF; maddr->tei = (_pms(sk)->ch.addr >> 8) & 0xFF; } msg->msg_namelen = sizeof(*maddr); } copied = skb->len + MISDN_HEADER_LEN; if (len < copied) { if (flags & MSG_PEEK) refcount_dec(&skb->users); else skb_queue_head(&sk->sk_receive_queue, skb); return -ENOSPC; } memcpy(skb_push(skb, MISDN_HEADER_LEN), mISDN_HEAD_P(skb), MISDN_HEADER_LEN); err = skb_copy_datagram_msg(skb, 0, msg, copied); mISDN_sock_cmsg(sk, msg, skb); skb_free_datagram(sk, skb); return err ? 
: copied; } static int mISDN_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sk_buff *skb; int err = -ENOMEM; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s: len %d flags %x ch %d proto %x\n", __func__, (int)len, msg->msg_flags, _pms(sk)->ch.nr, sk->sk_protocol); if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_NOSIGNAL | MSG_ERRQUEUE)) return -EINVAL; if (len < MISDN_HEADER_LEN) return -EINVAL; if (sk->sk_state != MISDN_BOUND) return -EBADFD; lock_sock(sk); skb = _l2_alloc_skb(len, GFP_KERNEL); if (!skb) goto done; if (memcpy_from_msg(skb_put(skb, len), msg, len)) { err = -EFAULT; goto done; } memcpy(mISDN_HEAD_P(skb), skb->data, MISDN_HEADER_LEN); skb_pull(skb, MISDN_HEADER_LEN); if (msg->msg_namelen >= sizeof(struct sockaddr_mISDN)) { /* if we have a address, we use it */ DECLARE_SOCKADDR(struct sockaddr_mISDN *, maddr, msg->msg_name); mISDN_HEAD_ID(skb) = maddr->channel; } else { /* use default for L2 messages */ if ((sk->sk_protocol == ISDN_P_LAPD_TE) || (sk->sk_protocol == ISDN_P_LAPD_NT)) mISDN_HEAD_ID(skb) = _pms(sk)->ch.nr; } if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s: ID:%x\n", __func__, mISDN_HEAD_ID(skb)); err = -ENODEV; if (!_pms(sk)->ch.peer) goto done; err = _pms(sk)->ch.recv(_pms(sk)->ch.peer, skb); if (err) goto done; else { skb = NULL; err = len; } done: kfree_skb(skb); release_sock(sk); return err; } static int data_sock_release(struct socket *sock) { struct sock *sk = sock->sk; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p) sk=%p\n", __func__, sock, sk); if (!sk) return 0; switch (sk->sk_protocol) { case ISDN_P_TE_S0: case ISDN_P_NT_S0: case ISDN_P_TE_E1: case ISDN_P_NT_E1: if (sk->sk_state == MISDN_BOUND) delete_channel(&_pms(sk)->ch); else mISDN_sock_unlink(&data_sockets, sk); break; case ISDN_P_LAPD_TE: case ISDN_P_LAPD_NT: case ISDN_P_B_RAW: case ISDN_P_B_HDLC: case ISDN_P_B_X75SLP: case ISDN_P_B_L2DTMF: case ISDN_P_B_L2DSP: case ISDN_P_B_L2DSPHDLC: delete_channel(&_pms(sk)->ch); mISDN_sock_unlink(&data_sockets, sk); break; } lock_sock(sk); sock_orphan(sk); skb_queue_purge(&sk->sk_receive_queue); release_sock(sk); sock_put(sk); return 0; } static int data_sock_ioctl_bound(struct sock *sk, unsigned int cmd, void __user *p) { struct mISDN_ctrl_req cq; int err = -EINVAL, val[2]; struct mISDNchannel *bchan, *next; lock_sock(sk); if (!_pms(sk)->dev) { err = -ENODEV; goto done; } switch (cmd) { case IMCTRLREQ: if (copy_from_user(&cq, p, sizeof(cq))) { err = -EFAULT; break; } if ((sk->sk_protocol & ~ISDN_P_B_MASK) == ISDN_P_B_START) { list_for_each_entry_safe(bchan, next, &_pms(sk)->dev->bchannels, list) { if (bchan->nr == cq.channel) { err = bchan->ctrl(bchan, CONTROL_CHANNEL, &cq); break; } } } else err = _pms(sk)->dev->D.ctrl(&_pms(sk)->dev->D, CONTROL_CHANNEL, &cq); if (err) break; if (copy_to_user(p, &cq, sizeof(cq))) err = -EFAULT; break; case IMCLEAR_L2: if (sk->sk_protocol != ISDN_P_LAPD_NT) { err = -EINVAL; break; } val[0] = cmd; if (get_user(val[1], (int __user *)p)) { err = -EFAULT; break; } err = _pms(sk)->dev->teimgr->ctrl(_pms(sk)->dev->teimgr, CONTROL_CHANNEL, val); break; case IMHOLD_L1: if (sk->sk_protocol != ISDN_P_LAPD_NT && sk->sk_protocol != ISDN_P_LAPD_TE) { err = -EINVAL; break; } val[0] = cmd; if (get_user(val[1], (int __user *)p)) { err = -EFAULT; break; } err = _pms(sk)->dev->teimgr->ctrl(_pms(sk)->dev->teimgr, CONTROL_CHANNEL, val); break; default: err = -EINVAL; break; } done: release_sock(sk); return err; } static int 
data_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int err = 0, id; struct sock *sk = sock->sk; struct mISDNdevice *dev; struct mISDNversion ver; switch (cmd) { case IMGETVERSION: ver.major = MISDN_MAJOR_VERSION; ver.minor = MISDN_MINOR_VERSION; ver.release = MISDN_RELEASE; if (copy_to_user((void __user *)arg, &ver, sizeof(ver))) err = -EFAULT; break; case IMGETCOUNT: id = get_mdevice_count(); if (put_user(id, (int __user *)arg)) err = -EFAULT; break; case IMGETDEVINFO: if (get_user(id, (int __user *)arg)) { err = -EFAULT; break; } dev = get_mdevice(id); if (dev) { struct mISDN_devinfo di; memset(&di, 0, sizeof(di)); di.id = dev->id; di.Dprotocols = dev->Dprotocols; di.Bprotocols = dev->Bprotocols | get_all_Bprotocols(); di.protocol = dev->D.protocol; memcpy(di.channelmap, dev->channelmap, sizeof(di.channelmap)); di.nrbchan = dev->nrbchan; strscpy(di.name, dev_name(&dev->dev), sizeof(di.name)); if (copy_to_user((void __user *)arg, &di, sizeof(di))) err = -EFAULT; } else err = -ENODEV; break; default: if (sk->sk_state == MISDN_BOUND) err = data_sock_ioctl_bound(sk, cmd, (void __user *)arg); else err = -ENOTCONN; } return err; } static int data_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int err = 0, opt = 0; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p, %d, %x, optval, %d)\n", __func__, sock, level, optname, optlen); lock_sock(sk); switch (optname) { case MISDN_TIME_STAMP: err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt) _pms(sk)->cmask |= MISDN_TIME_STAMP; else _pms(sk)->cmask &= ~MISDN_TIME_STAMP; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int data_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int len, opt; if (get_user(len, optlen)) return -EFAULT; if (len != sizeof(char)) return -EINVAL; switch (optname) { case MISDN_TIME_STAMP: if (_pms(sk)->cmask & MISDN_TIME_STAMP) opt = 1; else opt = 0; if (put_user(opt, optval)) return -EFAULT; break; default: return -ENOPROTOOPT; } return 0; } static int data_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; struct sock *csk; int err = 0; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p) sk=%p\n", __func__, sock, sk); if (addr_len != sizeof(struct sockaddr_mISDN)) return -EINVAL; if (!maddr || maddr->family != AF_ISDN) return -EINVAL; lock_sock(sk); if (_pms(sk)->dev) { err = -EALREADY; goto done; } _pms(sk)->dev = get_mdevice(maddr->dev); if (!_pms(sk)->dev) { err = -ENODEV; goto done; } if (sk->sk_protocol < ISDN_P_B_START) { read_lock_bh(&data_sockets.lock); sk_for_each(csk, &data_sockets.head) { if (sk == csk) continue; if (_pms(csk)->dev != _pms(sk)->dev) continue; if (csk->sk_protocol >= ISDN_P_B_START) continue; if (IS_ISDN_P_TE(csk->sk_protocol) == IS_ISDN_P_TE(sk->sk_protocol)) continue; read_unlock_bh(&data_sockets.lock); err = -EBUSY; goto done; } read_unlock_bh(&data_sockets.lock); } _pms(sk)->ch.send = mISDN_send; _pms(sk)->ch.ctrl = mISDN_ctrl; switch (sk->sk_protocol) { case ISDN_P_TE_S0: case ISDN_P_NT_S0: case ISDN_P_TE_E1: case ISDN_P_NT_E1: mISDN_sock_unlink(&data_sockets, sk); err = connect_layer1(_pms(sk)->dev, &_pms(sk)->ch, sk->sk_protocol, maddr); if (err) mISDN_sock_link(&data_sockets, sk); break; case ISDN_P_LAPD_TE: case 
ISDN_P_LAPD_NT: err = create_l2entity(_pms(sk)->dev, &_pms(sk)->ch, sk->sk_protocol, maddr); break; case ISDN_P_B_RAW: case ISDN_P_B_HDLC: case ISDN_P_B_X75SLP: case ISDN_P_B_L2DTMF: case ISDN_P_B_L2DSP: case ISDN_P_B_L2DSPHDLC: err = connect_Bstack(_pms(sk)->dev, &_pms(sk)->ch, sk->sk_protocol, maddr); break; default: err = -EPROTONOSUPPORT; } if (err) goto done; sk->sk_state = MISDN_BOUND; _pms(sk)->ch.protocol = sk->sk_protocol; done: release_sock(sk); return err; } static int data_sock_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; if (!_pms(sk)->dev) return -EBADFD; lock_sock(sk); maddr->family = AF_ISDN; maddr->dev = _pms(sk)->dev->id; maddr->channel = _pms(sk)->ch.nr; maddr->sapi = _pms(sk)->ch.addr & 0xff; maddr->tei = (_pms(sk)->ch.addr >> 8) & 0xff; release_sock(sk); return sizeof(*maddr); } static const struct proto_ops data_sock_ops = { .family = PF_ISDN, .owner = THIS_MODULE, .release = data_sock_release, .ioctl = data_sock_ioctl, .bind = data_sock_bind, .getname = data_sock_getname, .sendmsg = mISDN_sock_sendmsg, .recvmsg = mISDN_sock_recvmsg, .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = data_sock_setsockopt, .getsockopt = data_sock_getsockopt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .mmap = sock_no_mmap }; static int data_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (sock->type != SOCK_DGRAM) return -ESOCKTNOSUPPORT; sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); sock->ops = &data_sock_ops; sock->state = SS_UNCONNECTED; sock_reset_flag(sk, SOCK_ZAPPED); sk->sk_protocol = protocol; sk->sk_state = MISDN_OPEN; mISDN_sock_link(&data_sockets, sk); return 0; } static int base_sock_release(struct socket *sock) { struct sock *sk = sock->sk; printk(KERN_DEBUG "%s(%p) sk=%p\n", __func__, sock, sk); if (!sk) return 0; mISDN_sock_unlink(&base_sockets, sk); sock_orphan(sk); sock_put(sk); return 0; } static int base_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int err = 0, id; struct mISDNdevice *dev; struct mISDNversion ver; switch (cmd) { case IMGETVERSION: ver.major = MISDN_MAJOR_VERSION; ver.minor = MISDN_MINOR_VERSION; ver.release = MISDN_RELEASE; if (copy_to_user((void __user *)arg, &ver, sizeof(ver))) err = -EFAULT; break; case IMGETCOUNT: id = get_mdevice_count(); if (put_user(id, (int __user *)arg)) err = -EFAULT; break; case IMGETDEVINFO: if (get_user(id, (int __user *)arg)) { err = -EFAULT; break; } dev = get_mdevice(id); if (dev) { struct mISDN_devinfo di; memset(&di, 0, sizeof(di)); di.id = dev->id; di.Dprotocols = dev->Dprotocols; di.Bprotocols = dev->Bprotocols | get_all_Bprotocols(); di.protocol = dev->D.protocol; memcpy(di.channelmap, dev->channelmap, sizeof(di.channelmap)); di.nrbchan = dev->nrbchan; strscpy(di.name, dev_name(&dev->dev), sizeof(di.name)); if (copy_to_user((void __user *)arg, &di, sizeof(di))) err = -EFAULT; } else err = -ENODEV; break; case IMSETDEVNAME: { struct mISDN_devrename dn; if (copy_from_user(&dn, (void __user *)arg, sizeof(dn))) { err = -EFAULT; break; } dn.name[sizeof(dn.name) - 1] = '\0'; dev = get_mdevice(dn.id); if (dev) err = device_rename(&dev->dev, dn.name); else err = -ENODEV; } break; default: err = -EINVAL; } return err; } static int base_sock_bind(struct socket *sock, struct sockaddr *addr, int 
addr_len) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; int err = 0; if (addr_len < sizeof(struct sockaddr_mISDN)) return -EINVAL; if (!maddr || maddr->family != AF_ISDN) return -EINVAL; lock_sock(sk); if (_pms(sk)->dev) { err = -EALREADY; goto done; } _pms(sk)->dev = get_mdevice(maddr->dev); if (!_pms(sk)->dev) { err = -ENODEV; goto done; } sk->sk_state = MISDN_BOUND; done: release_sock(sk); return err; } static const struct proto_ops base_sock_ops = { .family = PF_ISDN, .owner = THIS_MODULE, .release = base_sock_release, .ioctl = base_sock_ioctl, .bind = base_sock_bind, .getname = sock_no_getname, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .mmap = sock_no_mmap }; static int base_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; if (!capable(CAP_NET_RAW)) return -EPERM; sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); sock->ops = &base_sock_ops; sock->state = SS_UNCONNECTED; sock_reset_flag(sk, SOCK_ZAPPED); sk->sk_protocol = protocol; sk->sk_state = MISDN_OPEN; mISDN_sock_link(&base_sockets, sk); return 0; } static int mISDN_sock_create(struct net *net, struct socket *sock, int proto, int kern) { int err = -EPROTONOSUPPORT; switch (proto) { case ISDN_P_BASE: err = base_sock_create(net, sock, proto, kern); break; case ISDN_P_TE_S0: case ISDN_P_NT_S0: case ISDN_P_TE_E1: case ISDN_P_NT_E1: case ISDN_P_LAPD_TE: case ISDN_P_LAPD_NT: case ISDN_P_B_RAW: case ISDN_P_B_HDLC: case ISDN_P_B_X75SLP: case ISDN_P_B_L2DTMF: case ISDN_P_B_L2DSP: case ISDN_P_B_L2DSPHDLC: err = data_sock_create(net, sock, proto, kern); break; default: return err; } return err; } static const struct net_proto_family mISDN_sock_family_ops = { .owner = THIS_MODULE, .family = PF_ISDN, .create = mISDN_sock_create, }; int misdn_sock_init(u_int *deb) { int err; debug = deb; err = sock_register(&mISDN_sock_family_ops); if (err) printk(KERN_ERR "%s: error(%d)\n", __func__, err); return err; } void misdn_sock_cleanup(void) { sock_unregister(PF_ISDN); }
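/*
 * Illustration (not part of the kernel source): a hedged userspace sketch of
 * talking to the base socket implemented above. base_sock_create() only
 * accepts SOCK_RAW (and requires CAP_NET_RAW), and base_sock_ioctl() answers
 * IMGETVERSION and IMGETCOUNT. It assumes the uapi header <linux/mISDNif.h>
 * exposes ISDN_P_BASE, the ioctl numbers and struct mISDNversion.
 */
#include <linux/mISDNif.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct mISDNversion ver;
	int devcount;
	int sk = socket(PF_ISDN, SOCK_RAW, ISDN_P_BASE);

	if (sk < 0) {
		perror("socket(PF_ISDN)");
		return 1;
	}
	if (ioctl(sk, IMGETVERSION, &ver) == 0)
		printf("mISDN %u.%u.%u\n", (unsigned)ver.major,
		       (unsigned)ver.minor, (unsigned)ver.release);
	if (ioctl(sk, IMGETCOUNT, &devcount) == 0)
		printf("%d device(s)\n", devcount);
	close(sk);
	return 0;
}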
/* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/module.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/netdevice.h> #include <net/net_namespace.h> #include <linux/security.h> #include <linux/notifier.h> #include <linux/hashtable.h> #include <rdma/rdma_netlink.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> #include <rdma/rdma_counter.h> #include "core_priv.h" #include "restrack.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("core kernel InfiniBand API"); MODULE_LICENSE("Dual BSD/GPL"); struct workqueue_struct *ib_comp_wq; struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); static struct workqueue_struct *ib_unreg_wq; /* * Each of the three rwsem locks (devices, clients, client_data) protects the * xarray of the same name. Specifically it allows the caller to assert that * the MARK will/will not be changing under the lock, and for devices and * clients, that the value in the xarray is still a valid pointer. Change of * the MARK is linked to the object state, so holding the lock and testing the * MARK also asserts that the contained object is in a certain state. * * This is used to build a two stage register/unregister flow where objects * can continue to be in the xarray even though they are still in progress to * register/unregister. * * The xarray itself provides additional locking, and restartable iteration, * which is also relied on. * * Locks should not be nested, with the exception of client_data, which is * allowed to nest under the read side of the other two locks. * * The devices_rwsem also protects the device name list, any change or * assignment of device name must also hold the write side to guarantee unique * names. */ /* * devices contains devices that have had their names assigned. The * devices may not be registered. 
Users that care about the registration * status need to call ib_device_try_get() on the device to ensure it is * registered, and keep it registered, for the required duration. * */ static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC); static DECLARE_RWSEM(devices_rwsem); #define DEVICE_REGISTERED XA_MARK_1 static u32 highest_client_id; #define CLIENT_REGISTERED XA_MARK_1 static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC); static DECLARE_RWSEM(clients_rwsem); static void ib_client_put(struct ib_client *client) { if (refcount_dec_and_test(&client->uses)) complete(&client->uses_zero); } /* * If client_data is registered then the corresponding client must also still * be registered. */ #define CLIENT_DATA_REGISTERED XA_MARK_1 unsigned int rdma_dev_net_id; /* * A list of net namespaces is maintained in an xarray. This is necessary * because we can't get the locking right using the existing net ns list. We * would require an init_net callback after the list is updated. */ static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC); /* * rwsem to protect accessing the rdma_nets xarray entries. */ static DECLARE_RWSEM(rdma_nets_rwsem); bool ib_devices_shared_netns = true; module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444); MODULE_PARM_DESC(netns_mode, "Share device among net namespaces; default=1 (shared)"); /** * rdma_dev_access_netns() - Return whether an rdma device can be accessed * from a specified net namespace or not. * @dev: Pointer to rdma device which needs to be checked * @net: Pointer to net namespace for which access is to be checked * * When the rdma device is in shared mode, it ignores the net namespace. * When the rdma device is exclusive to a net namespace, rdma device net * namespace is checked against the specified one. */ bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net) { return (ib_devices_shared_netns || net_eq(read_pnet(&dev->coredev.rdma_net), net)); } EXPORT_SYMBOL(rdma_dev_access_netns); /* * xarray has this behavior where it won't iterate over NULL values stored in * allocated arrays. So we need our own iterator to see all values stored in * the array. This does the same thing as xa_for_each except that it also * returns NULL valued entries if the array is allocating. Simplified to only * work on simple xarrays. 
*/ static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, xa_mark_t filter) { XA_STATE(xas, xa, *indexp); void *entry; rcu_read_lock(); do { entry = xas_find_marked(&xas, ULONG_MAX, filter); if (xa_is_zero(entry)) break; } while (xas_retry(&xas, entry)); rcu_read_unlock(); if (entry) { *indexp = xas.xa_index; if (xa_is_zero(entry)) return NULL; return entry; } return XA_ERROR(-ENOENT); } #define xan_for_each_marked(xa, index, entry, filter) \ for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ !xa_is_err(entry); \ (index)++, entry = xan_find_marked(xa, &(index), filter)) /* RCU hash table mapping netdevice pointers to struct ib_port_data */ static DEFINE_SPINLOCK(ndev_hash_lock); static DECLARE_HASHTABLE(ndev_hash, 5); static void free_netdevs(struct ib_device *ib_dev); static void ib_unregister_work(struct work_struct *work); static void __ib_unregister_device(struct ib_device *device); static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data); static void ib_policy_change_task(struct work_struct *work); static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); static void __ibdev_printk(const char *level, const struct ib_device *ibdev, struct va_format *vaf) { if (ibdev && ibdev->dev.parent) dev_printk_emit(level[1] - '0', ibdev->dev.parent, "%s %s %s: %pV", dev_driver_string(ibdev->dev.parent), dev_name(ibdev->dev.parent), dev_name(&ibdev->dev), vaf); else if (ibdev) printk("%s%s: %pV", level, dev_name(&ibdev->dev), vaf); else printk("%s(NULL ib_device): %pV", level, vaf); } #define define_ibdev_printk_level(func, level) \ void func(const struct ib_device *ibdev, const char *fmt, ...) \ { \ struct va_format vaf; \ va_list args; \ \ va_start(args, fmt); \ \ vaf.fmt = fmt; \ vaf.va = &args; \ \ __ibdev_printk(level, ibdev, &vaf); \ \ va_end(args); \ } \ EXPORT_SYMBOL(func); define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); define_ibdev_printk_level(ibdev_alert, KERN_ALERT); define_ibdev_printk_level(ibdev_crit, KERN_CRIT); define_ibdev_printk_level(ibdev_err, KERN_ERR); define_ibdev_printk_level(ibdev_warn, KERN_WARNING); define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); define_ibdev_printk_level(ibdev_info, KERN_INFO); static struct notifier_block ibdev_lsm_nb = { .notifier_call = ib_security_change, }; static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, struct net *net); /* Pointer to the RCU head at the start of the ib_port_data array */ struct ib_port_data_rcu { struct rcu_head rcu_head; struct ib_port_data pdata[]; }; static void ib_device_check_mandatory(struct ib_device *device) { #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } static const struct { size_t offset; char *name; } mandatory_table[] = { IB_MANDATORY_FUNC(query_device), IB_MANDATORY_FUNC(query_port), IB_MANDATORY_FUNC(alloc_pd), IB_MANDATORY_FUNC(dealloc_pd), IB_MANDATORY_FUNC(create_qp), IB_MANDATORY_FUNC(modify_qp), IB_MANDATORY_FUNC(destroy_qp), IB_MANDATORY_FUNC(post_send), IB_MANDATORY_FUNC(post_recv), IB_MANDATORY_FUNC(create_cq), IB_MANDATORY_FUNC(destroy_cq), IB_MANDATORY_FUNC(poll_cq), IB_MANDATORY_FUNC(req_notify_cq), IB_MANDATORY_FUNC(get_dma_mr), IB_MANDATORY_FUNC(reg_user_mr), IB_MANDATORY_FUNC(dereg_mr), IB_MANDATORY_FUNC(get_port_immutable) }; int i; device->kverbs_provider = true; for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((void *) &device->ops + mandatory_table[i].offset)) { device->kverbs_provider = false; break; } } } /* * Caller must perform 
ib_device_put() to return the device reference count * when ib_device_get_by_index() returns valid device pointer. */ struct ib_device *ib_device_get_by_index(const struct net *net, u32 index) { struct ib_device *device; down_read(&devices_rwsem); device = xa_load(&devices, index); if (device) { if (!rdma_dev_access_netns(device, net)) { device = NULL; goto out; } if (!ib_device_try_get(device)) device = NULL; } out: up_read(&devices_rwsem); return device; } /** * ib_device_put - Release IB device reference * @device: device whose reference to be released * * ib_device_put() releases reference to the IB device to allow it to be * unregistered and eventually free. */ void ib_device_put(struct ib_device *device) { if (refcount_dec_and_test(&device->refcount)) complete(&device->unreg_completion); } EXPORT_SYMBOL(ib_device_put); static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; unsigned long index; xa_for_each (&devices, index, device) if (!strcmp(name, dev_name(&device->dev))) return device; return NULL; } /** * ib_device_get_by_name - Find an IB device by name * @name: The name to look for * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) * * Find and hold an ib_device by its name. The caller must call * ib_device_put() on the returned pointer. */ struct ib_device *ib_device_get_by_name(const char *name, enum rdma_driver_id driver_id) { struct ib_device *device; down_read(&devices_rwsem); device = __ib_device_get_by_name(name); if (device && driver_id != RDMA_DRIVER_UNKNOWN && device->ops.driver_id != driver_id) device = NULL; if (device) { if (!ib_device_try_get(device)) device = NULL; } up_read(&devices_rwsem); return device; } EXPORT_SYMBOL(ib_device_get_by_name); static int rename_compat_devs(struct ib_device *device) { struct ib_core_device *cdev; unsigned long index; int ret = 0; mutex_lock(&device->compat_devs_mutex); xa_for_each (&device->compat_devs, index, cdev) { ret = device_rename(&cdev->dev, dev_name(&device->dev)); if (ret) { dev_warn(&cdev->dev, "Fail to rename compatdev to new name %s\n", dev_name(&device->dev)); break; } } mutex_unlock(&device->compat_devs_mutex); return ret; } int ib_device_rename(struct ib_device *ibdev, const char *name) { unsigned long index; void *client_data; int ret; down_write(&devices_rwsem); if (!strcmp(name, dev_name(&ibdev->dev))) { up_write(&devices_rwsem); return 0; } if (__ib_device_get_by_name(name)) { up_write(&devices_rwsem); return -EEXIST; } ret = device_rename(&ibdev->dev, name); if (ret) { up_write(&devices_rwsem); return ret; } strscpy(ibdev->name, name, IB_DEVICE_NAME_MAX); ret = rename_compat_devs(ibdev); downgrade_write(&devices_rwsem); down_read(&ibdev->client_data_rwsem); xan_for_each_marked(&ibdev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); if (!client || !client->rename) continue; client->rename(ibdev, client_data); } up_read(&ibdev->client_data_rwsem); rdma_nl_notify_event(ibdev, 0, RDMA_RENAME_EVENT); up_read(&devices_rwsem); return 0; } int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim) { if (use_dim > 1) return -EINVAL; ibdev->use_cq_dim = use_dim; return 0; } static int alloc_name(struct ib_device *ibdev, const char *name) { struct ib_device *device; unsigned long index; struct ida inuse; int rc; int i; lockdep_assert_held_write(&devices_rwsem); ida_init(&inuse); xa_for_each (&devices, index, device) { char buf[IB_DEVICE_NAME_MAX]; if (sscanf(dev_name(&device->dev), name, &i) 
!= 1) continue; if (i < 0 || i >= INT_MAX) continue; snprintf(buf, sizeof buf, name, i); if (strcmp(buf, dev_name(&device->dev)) != 0) continue; rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL); if (rc < 0) goto out; } rc = ida_alloc(&inuse, GFP_KERNEL); if (rc < 0) goto out; rc = dev_set_name(&ibdev->dev, name, rc); out: ida_destroy(&inuse); return rc; } static void ib_device_release(struct device *device) { struct ib_device *dev = container_of(device, struct ib_device, dev); free_netdevs(dev); WARN_ON(refcount_read(&dev->refcount)); if (dev->hw_stats_data) ib_device_release_hw_stats(dev->hw_stats_data); if (dev->port_data) { ib_cache_release_one(dev); ib_security_release_port_pkey_list(dev); rdma_counter_release(dev); kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, pdata[0]), rcu_head); } mutex_destroy(&dev->subdev_lock); mutex_destroy(&dev->unregistration_lock); mutex_destroy(&dev->compat_devs_mutex); xa_destroy(&dev->compat_devs); xa_destroy(&dev->client_data); kfree_rcu(dev, rcu_head); } static int ib_device_uevent(const struct device *device, struct kobj_uevent_env *env) { if (add_uevent_var(env, "NAME=%s", dev_name(device))) return -ENOMEM; /* * It would be nice to pass the node GUID with the event... */ return 0; } static const void *net_namespace(const struct device *d) { const struct ib_core_device *coredev = container_of(d, struct ib_core_device, dev); return read_pnet(&coredev->rdma_net); } static struct class ib_class = { .name = "infiniband", .dev_release = ib_device_release, .dev_uevent = ib_device_uevent, .ns_type = &net_ns_type_operations, .namespace = net_namespace, }; static void rdma_init_coredev(struct ib_core_device *coredev, struct ib_device *dev, struct net *net) { /* This BUILD_BUG_ON is intended to catch layout change * of union of ib_core_device and device. * dev must be the first element as ib_core and providers * driver uses it. Adding anything in ib_core_device before * device will break this assumption. */ BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != offsetof(struct ib_device, dev)); coredev->dev.class = &ib_class; coredev->dev.groups = dev->groups; device_initialize(&coredev->dev); coredev->owner = dev; INIT_LIST_HEAD(&coredev->port_list); write_pnet(&coredev->rdma_net, net); } /** * _ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate * * Low-level drivers should use ib_alloc_device() to allocate &struct * ib_device. @size is the size of the structure to be allocated, * including any private data used by the low-level driver. * ib_dealloc_device() must be used to free structures allocated with * ib_alloc_device(). */ struct ib_device *_ib_alloc_device(size_t size) { struct ib_device *device; unsigned int i; if (WARN_ON(size < sizeof(struct ib_device))) return NULL; device = kzalloc(size, GFP_KERNEL); if (!device) return NULL; if (rdma_restrack_init(device)) { kfree(device); return NULL; } rdma_init_coredev(&device->coredev, device, &init_net); INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->qp_open_list_lock); init_rwsem(&device->event_handler_rwsem); mutex_init(&device->unregistration_lock); /* * client_data needs to be alloc because we don't want our mark to be * destroyed if the user stores NULL in the client data. 
*/ xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); init_rwsem(&device->client_data_rwsem); xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); mutex_init(&device->compat_devs_mutex); init_completion(&device->unreg_completion); INIT_WORK(&device->unregistration_work, ib_unregister_work); spin_lock_init(&device->cq_pools_lock); for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++) INIT_LIST_HEAD(&device->cq_pools[i]); rwlock_init(&device->cache_lock); device->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_ALLOC_MW) | BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) | BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) | BIT_ULL(IB_USER_VERBS_CMD_CLOSE_XRCD) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_XSRQ) | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_MW) | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) | BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) | BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_OPEN_QP) | BIT_ULL(IB_USER_VERBS_CMD_OPEN_XRCD) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_REG_MR) | BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) | BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ); mutex_init(&device->subdev_lock); INIT_LIST_HEAD(&device->subdev_list_head); INIT_LIST_HEAD(&device->subdev_list); return device; } EXPORT_SYMBOL(_ib_alloc_device); /** * ib_dealloc_device - free an IB device struct * @device:structure to free * * Free a structure allocated with ib_alloc_device(). */ void ib_dealloc_device(struct ib_device *device) { if (device->ops.dealloc_driver) device->ops.dealloc_driver(device); /* * ib_unregister_driver() requires all devices to remain in the xarray * while their ops are callable. The last op we call is dealloc_driver * above. This is needed to create a fence on op callbacks prior to * allowing the driver module to unload. */ down_write(&devices_rwsem); if (xa_load(&devices, device->index) == device) xa_erase(&devices, device->index); up_write(&devices_rwsem); /* Expedite releasing netdev references */ free_netdevs(device); WARN_ON(!xa_empty(&device->compat_devs)); WARN_ON(!xa_empty(&device->client_data)); WARN_ON(refcount_read(&device->refcount)); rdma_restrack_clean(device); /* Balances with device_initialize */ put_device(&device->dev); } EXPORT_SYMBOL(ib_dealloc_device); /* * add_client_context() and remove_client_context() must be safe against * parallel calls on the same device - registration/unregistration of both the * device and client can be occurring in parallel. * * The routines need to be a fence, any caller must not return until the add * or remove is fully completed. */ static int add_client_context(struct ib_device *device, struct ib_client *client) { int ret = 0; if (!device->kverbs_provider && !client->no_kverbs_req) return 0; down_write(&device->client_data_rwsem); /* * So long as the client is registered hold both the client and device * unregistration locks. 
*/ if (!refcount_inc_not_zero(&client->uses)) goto out_unlock; refcount_inc(&device->refcount); /* * Another caller to add_client_context got here first and has already * completely initialized context. */ if (xa_get_mark(&device->client_data, client->client_id, CLIENT_DATA_REGISTERED)) goto out; ret = xa_err(xa_store(&device->client_data, client->client_id, NULL, GFP_KERNEL)); if (ret) goto out; downgrade_write(&device->client_data_rwsem); if (client->add) { if (client->add(device)) { /* * If a client fails to add then the error code is * ignored, but we won't call any more ops on this * client. */ xa_erase(&device->client_data, client->client_id); up_read(&device->client_data_rwsem); ib_device_put(device); ib_client_put(client); return 0; } } /* Readers shall not see a client until add has been completed */ xa_set_mark(&device->client_data, client->client_id, CLIENT_DATA_REGISTERED); up_read(&device->client_data_rwsem); return 0; out: ib_device_put(device); ib_client_put(client); out_unlock: up_write(&device->client_data_rwsem); return ret; } static void remove_client_context(struct ib_device *device, unsigned int client_id) { struct ib_client *client; void *client_data; down_write(&device->client_data_rwsem); if (!xa_get_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED)) { up_write(&device->client_data_rwsem); return; } client_data = xa_load(&device->client_data, client_id); xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED); client = xa_load(&clients, client_id); up_write(&device->client_data_rwsem); /* * Notice we cannot be holding any exclusive locks when calling the * remove callback as the remove callback can recurse back into any * public functions in this module and thus try for any locks those * functions take. * * For this reason clients and drivers should not call the * unregistration functions while holding any locks. */ if (client->remove) client->remove(device, client_data); xa_erase(&device->client_data, client_id); ib_device_put(device); ib_client_put(client); } static int alloc_port_data(struct ib_device *device) { struct ib_port_data_rcu *pdata_rcu; u32 port; if (device->port_data) return 0; /* This can only be called once the physical port range is defined */ if (WARN_ON(!device->phys_port_cnt)) return -EINVAL; /* Reserve U32_MAX so the logic to go over all the ports is sane */ if (WARN_ON(device->phys_port_cnt == U32_MAX)) return -EINVAL; /* * device->port_data is indexed directly by the port number to make * access to this data as efficient as possible. * * Therefore port_data is declared as a 1 based array with potential * empty slots at the beginning. */ pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata, size_add(rdma_end_port(device), 1)), GFP_KERNEL); if (!pdata_rcu) return -ENOMEM; /* * The rcu_head is put in front of the port data array and the stored * pointer is adjusted since we never need to see that member until * kfree_rcu. 
*/ device->port_data = pdata_rcu->pdata; rdma_for_each_port (device, port) { struct ib_port_data *pdata = &device->port_data[port]; pdata->ib_dev = device; spin_lock_init(&pdata->pkey_list_lock); INIT_LIST_HEAD(&pdata->pkey_list); spin_lock_init(&pdata->netdev_lock); INIT_HLIST_NODE(&pdata->ndev_hash_link); } return 0; } static int verify_immutable(const struct ib_device *dev, u32 port) { return WARN_ON(!rdma_cap_ib_mad(dev, port) && rdma_max_mad_size(dev, port) != 0); } static int setup_port_data(struct ib_device *device) { u32 port; int ret; ret = alloc_port_data(device); if (ret) return ret; rdma_for_each_port (device, port) { struct ib_port_data *pdata = &device->port_data[port]; ret = device->ops.get_port_immutable(device, port, &pdata->immutable); if (ret) return ret; if (verify_immutable(device, port)) return -EINVAL; } return 0; } /** * ib_port_immutable_read() - Read rdma port's immutable data * @dev: IB device * @port: port number whose immutable data to read. It starts with index 1 and * is valid up to and including rdma_end_port(). */ const struct ib_port_immutable* ib_port_immutable_read(struct ib_device *dev, unsigned int port) { WARN_ON(!rdma_is_port_valid(dev, port)); return &dev->port_data[port].immutable; } EXPORT_SYMBOL(ib_port_immutable_read); void ib_get_device_fw_str(struct ib_device *dev, char *str) { if (dev->ops.get_dev_fw_str) dev->ops.get_dev_fw_str(dev, str); else str[0] = '\0'; } EXPORT_SYMBOL(ib_get_device_fw_str); static void ib_policy_change_task(struct work_struct *work) { struct ib_device *dev; unsigned long index; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { unsigned int i; rdma_for_each_port (dev, i) { u64 sp; ib_get_cached_subnet_prefix(dev, i, &sp); ib_security_cache_change(dev, i, sp); } } up_read(&devices_rwsem); } static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data) { if (event != LSM_POLICY_CHANGE) return NOTIFY_DONE; schedule_work(&ib_policy_change_work); ib_mad_agent_security_change(); return NOTIFY_OK; } static void compatdev_release(struct device *dev) { struct ib_core_device *cdev = container_of(dev, struct ib_core_device, dev); kfree(cdev); } static int add_one_compat_dev(struct ib_device *device, struct rdma_dev_net *rnet) { struct ib_core_device *cdev; int ret; lockdep_assert_held(&rdma_nets_rwsem); if (!ib_devices_shared_netns) return 0; /* * Create and add compat device in all namespaces other than where it * is currently bound to. */ if (net_eq(read_pnet(&rnet->net), read_pnet(&device->coredev.rdma_net))) return 0; /* * The first of init_net() or ib_register_device() to take the * compat_devs_mutex wins and gets to add the device. Others will wait * for completion here. 
*/ mutex_lock(&device->compat_devs_mutex); cdev = xa_load(&device->compat_devs, rnet->id); if (cdev) { ret = 0; goto done; } ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); if (ret) goto done; cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); if (!cdev) { ret = -ENOMEM; goto cdev_err; } cdev->dev.parent = device->dev.parent; rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); cdev->dev.release = compatdev_release; ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); if (ret) goto add_err; ret = device_add(&cdev->dev); if (ret) goto add_err; ret = ib_setup_port_attrs(cdev); if (ret) goto port_err; ret = xa_err(xa_store(&device->compat_devs, rnet->id, cdev, GFP_KERNEL)); if (ret) goto insert_err; mutex_unlock(&device->compat_devs_mutex); return 0; insert_err: ib_free_port_attrs(cdev); port_err: device_del(&cdev->dev); add_err: put_device(&cdev->dev); cdev_err: xa_release(&device->compat_devs, rnet->id); done: mutex_unlock(&device->compat_devs_mutex); return ret; } static void remove_one_compat_dev(struct ib_device *device, u32 id) { struct ib_core_device *cdev; mutex_lock(&device->compat_devs_mutex); cdev = xa_erase(&device->compat_devs, id); mutex_unlock(&device->compat_devs_mutex); if (cdev) { ib_free_port_attrs(cdev); device_del(&cdev->dev); put_device(&cdev->dev); } } static void remove_compat_devs(struct ib_device *device) { struct ib_core_device *cdev; unsigned long index; xa_for_each (&device->compat_devs, index, cdev) remove_one_compat_dev(device, index); } static int add_compat_devs(struct ib_device *device) { struct rdma_dev_net *rnet; unsigned long index; int ret = 0; lockdep_assert_held(&devices_rwsem); down_read(&rdma_nets_rwsem); xa_for_each (&rdma_nets, index, rnet) { ret = add_one_compat_dev(device, rnet); if (ret) break; } up_read(&rdma_nets_rwsem); return ret; } static void remove_all_compat_devs(void) { struct ib_compat_device *cdev; struct ib_device *dev; unsigned long index; down_read(&devices_rwsem); xa_for_each (&devices, index, dev) { unsigned long c_index = 0; /* Hold nets_rwsem so that any other thread modifying this * system param can sync with this thread. */ down_read(&rdma_nets_rwsem); xa_for_each (&dev->compat_devs, c_index, cdev) remove_one_compat_dev(dev, c_index); up_read(&rdma_nets_rwsem); } up_read(&devices_rwsem); } static int add_all_compat_devs(void) { struct rdma_dev_net *rnet; struct ib_device *dev; unsigned long index; int ret = 0; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { unsigned long net_index = 0; /* Hold nets_rwsem so that any other thread modifying this * system param can sync with this thread. */ down_read(&rdma_nets_rwsem); xa_for_each (&rdma_nets, net_index, rnet) { ret = add_one_compat_dev(dev, rnet); if (ret) break; } up_read(&rdma_nets_rwsem); } up_read(&devices_rwsem); if (ret) remove_all_compat_devs(); return ret; } int rdma_compatdev_set(u8 enable) { struct rdma_dev_net *rnet; unsigned long index; int ret = 0; down_write(&rdma_nets_rwsem); if (ib_devices_shared_netns == enable) { up_write(&rdma_nets_rwsem); return 0; } /* enable/disable of compat devices is not supported * when more than default init_net exists. 
*/ xa_for_each (&rdma_nets, index, rnet) { ret++; break; } if (!ret) ib_devices_shared_netns = enable; up_write(&rdma_nets_rwsem); if (ret) return -EBUSY; if (enable) ret = add_all_compat_devs(); else remove_all_compat_devs(); return ret; } static void rdma_dev_exit_net(struct net *net) { struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); struct ib_device *dev; unsigned long index; int ret; down_write(&rdma_nets_rwsem); /* * Prevent the ID from being re-used and hide the id from xa_for_each. */ ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL)); WARN_ON(ret); up_write(&rdma_nets_rwsem); down_read(&devices_rwsem); xa_for_each (&devices, index, dev) { get_device(&dev->dev); /* * Release the devices_rwsem so that potentially blocking * device_del() doesn't hold the devices_rwsem for too long. */ up_read(&devices_rwsem); remove_one_compat_dev(dev, rnet->id); /* * If the real device is in the NS then move it back to init. */ rdma_dev_change_netns(dev, net, &init_net); put_device(&dev->dev); down_read(&devices_rwsem); } up_read(&devices_rwsem); rdma_nl_net_exit(rnet); xa_erase(&rdma_nets, rnet->id); } static __net_init int rdma_dev_init_net(struct net *net) { struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); unsigned long index; struct ib_device *dev; int ret; write_pnet(&rnet->net, net); ret = rdma_nl_net_init(rnet); if (ret) return ret; /* No need to create any compat devices in default init_net. */ if (net_eq(net, &init_net)) return 0; ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); if (ret) { rdma_nl_net_exit(rnet); return ret; } down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { /* Hold nets_rwsem so that netlink command cannot change * system configuration for device sharing mode. */ down_read(&rdma_nets_rwsem); ret = add_one_compat_dev(dev, rnet); up_read(&rdma_nets_rwsem); if (ret) break; } up_read(&devices_rwsem); if (ret) rdma_dev_exit_net(net); return ret; } /* * Assign the unique string device name and the unique device index. This is * undone by ib_dealloc_device. */ static int assign_name(struct ib_device *device, const char *name) { static u32 last_id; int ret; down_write(&devices_rwsem); /* Assign a unique name to the device */ if (strchr(name, '%')) ret = alloc_name(device, name); else ret = dev_set_name(&device->dev, name); if (ret) goto out; if (__ib_device_get_by_name(dev_name(&device->dev))) { ret = -ENFILE; goto out; } strscpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b, &last_id, GFP_KERNEL); if (ret > 0) ret = 0; out: up_write(&devices_rwsem); return ret; } /* * setup_device() allocates memory and sets up data that requires calling the * device ops, this is the only reason these actions are not done during * ib_alloc_device. It is undone by ib_dealloc_device(). 
*/ static int setup_device(struct ib_device *device) { struct ib_udata uhw = {.outlen = 0, .inlen = 0}; int ret; ib_device_check_mandatory(device); ret = setup_port_data(device); if (ret) { dev_warn(&device->dev, "Couldn't create per-port data\n"); return ret; } memset(&device->attrs, 0, sizeof(device->attrs)); ret = device->ops.query_device(device, &device->attrs, &uhw); if (ret) { dev_warn(&device->dev, "Couldn't query the device attributes\n"); return ret; } return 0; } static void disable_device(struct ib_device *device) { u32 cid; WARN_ON(!refcount_read(&device->refcount)); down_write(&devices_rwsem); xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); up_write(&devices_rwsem); /* * Remove clients in LIFO order, see assign_client_id. This could be * more efficient if xarray learns to reverse iterate. Since no new * clients can be added to this ib_device past this point we only need * the maximum possible client_id value here. */ down_read(&clients_rwsem); cid = highest_client_id; up_read(&clients_rwsem); while (cid) { cid--; remove_client_context(device, cid); } ib_cq_pool_cleanup(device); /* Pairs with refcount_set in enable_device */ ib_device_put(device); wait_for_completion(&device->unreg_completion); /* * compat devices must be removed after device refcount drops to zero. * Otherwise init_net() may add more compatdevs after removing compat * devices and before device is disabled. */ remove_compat_devs(device); } /* * An enabled device is visible to all clients and to all the public facing * APIs that return a device pointer. This always returns with a new get, even * if it fails. */ static int enable_device_and_get(struct ib_device *device) { struct ib_client *client; unsigned long index; int ret = 0; /* * One ref belongs to the xa and the other belongs to this * thread. This is needed to guard against parallel unregistration. */ refcount_set(&device->refcount, 2); down_write(&devices_rwsem); xa_set_mark(&devices, device->index, DEVICE_REGISTERED); /* * By using downgrade_write() we ensure that no other thread can clear * DEVICE_REGISTERED while we are completing the client setup. */ downgrade_write(&devices_rwsem); if (device->ops.enable_driver) { ret = device->ops.enable_driver(device); if (ret) goto out; } down_read(&clients_rwsem); xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { ret = add_client_context(device, client); if (ret) break; } up_read(&clients_rwsem); if (!ret) ret = add_compat_devs(device); out: up_read(&devices_rwsem); return ret; } static void prevent_dealloc_device(struct ib_device *ib_dev) { } static void ib_device_notify_register(struct ib_device *device) { struct net_device *netdev; u32 port; int ret; ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT); if (ret) return; rdma_for_each_port(device, port) { netdev = ib_device_get_netdev(device, port); if (!netdev) continue; ret = rdma_nl_notify_event(device, port, RDMA_NETDEV_ATTACH_EVENT); dev_put(netdev); if (ret) return; } } /** * ib_register_device - Register an IB device with IB core * @device: Device to register * @name: unique string device name. This may include a '%' which will * cause a unique index to be added to the passed device name. * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB * device will be used. In this case the caller should fully * setup the ibdev for DMA. This usually means using dma_virt_ops. * * Low-level drivers use ib_register_device() to register their * devices with the IB core. 
All registered clients will receive a * callback for each device that is added. @device must be allocated * with ib_alloc_device(). * * If the driver uses ops.dealloc_driver and calls any ib_unregister_device() * asynchronously then the device pointer may become freed as soon as this * function returns. */ int ib_register_device(struct ib_device *device, const char *name, struct device *dma_device) { int ret; ret = assign_name(device, name); if (ret) return ret; /* * If the caller does not provide a DMA capable device then the IB core * will set up ib_sge and scatterlist structures that stash the kernel * virtual address into the address field. */ WARN_ON(dma_device && !dma_device->dma_parms); device->dma_device = dma_device; ret = setup_device(device); if (ret) return ret; ret = ib_cache_setup_one(device); if (ret) { dev_warn(&device->dev, "Couldn't set up InfiniBand P_Key/GID cache\n"); return ret; } device->groups[0] = &ib_dev_attr_group; device->groups[1] = device->ops.device_group; ret = ib_setup_device_attrs(device); if (ret) goto cache_cleanup; ib_device_register_rdmacg(device); rdma_counter_init(device); /* * Ensure that ADD uevent is not fired because it * is too early and the device is not initialized yet. */ dev_set_uevent_suppress(&device->dev, true); ret = device_add(&device->dev); if (ret) goto cg_cleanup; ret = ib_setup_port_attrs(&device->coredev); if (ret) { dev_warn(&device->dev, "Couldn't register device with driver model\n"); goto dev_cleanup; } ret = enable_device_and_get(device); if (ret) { void (*dealloc_fn)(struct ib_device *); /* * If we hit this error flow then we don't want to * automatically dealloc the device since the caller is * expected to call ib_dealloc_device() after * ib_register_device() fails. This is tricky due to the * possibility for a parallel unregistration along with this * error flow. Since we have a refcount here we know any * parallel flow is stopped in disable_device and will see the * special dealloc_driver pointer, causing the responsibility to * ib_dealloc_device() to revert back to this thread. */ dealloc_fn = device->ops.dealloc_driver; device->ops.dealloc_driver = prevent_dealloc_device; ib_device_put(device); __ib_unregister_device(device); device->ops.dealloc_driver = dealloc_fn; dev_set_uevent_suppress(&device->dev, false); return ret; } dev_set_uevent_suppress(&device->dev, false); /* Mark for userspace that device is ready */ kobject_uevent(&device->dev.kobj, KOBJ_ADD); ib_device_notify_register(device); ib_device_put(device); return 0; dev_cleanup: device_del(&device->dev); cg_cleanup: dev_set_uevent_suppress(&device->dev, false); ib_device_unregister_rdmacg(device); cache_cleanup: ib_cache_cleanup_one(device); return ret; } EXPORT_SYMBOL(ib_register_device); /* Callers must hold a get on the device. */ static void __ib_unregister_device(struct ib_device *ib_dev) { struct ib_device *sub, *tmp; mutex_lock(&ib_dev->subdev_lock); list_for_each_entry_safe_reverse(sub, tmp, &ib_dev->subdev_list_head, subdev_list) { list_del(&sub->subdev_list); ib_dev->ops.del_sub_dev(sub); ib_device_put(ib_dev); } mutex_unlock(&ib_dev->subdev_lock); /* * We have a registration lock so that all the calls to unregister are * fully fenced, once any unregister returns the device is truly * unregistered even if multiple callers are unregistering it at the * same time. This also interacts with the registration flow and * provides sane semantics if register and unregister are racing. 
*/ mutex_lock(&ib_dev->unregistration_lock); if (!refcount_read(&ib_dev->refcount)) goto out; disable_device(ib_dev); rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT); /* Expedite removing unregistered pointers from the hash table */ free_netdevs(ib_dev); ib_free_port_attrs(&ib_dev->coredev); device_del(&ib_dev->dev); ib_device_unregister_rdmacg(ib_dev); ib_cache_cleanup_one(ib_dev); /* * Drivers using the new flow may not call ib_dealloc_device except * in error unwind prior to registration success. */ if (ib_dev->ops.dealloc_driver && ib_dev->ops.dealloc_driver != prevent_dealloc_device) { WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1); ib_dealloc_device(ib_dev); } out: mutex_unlock(&ib_dev->unregistration_lock); } /** * ib_unregister_device - Unregister an IB device * @ib_dev: The device to unregister * * Unregister an IB device. All clients will receive a remove callback. * * Callers should call this routine only once, and protect against races with * registration. Typically it should only be called as part of a remove * callback in an implementation of driver core's struct device_driver and * related. * * If ops.dealloc_driver is used then ib_dev will be freed upon return from * this function. */ void ib_unregister_device(struct ib_device *ib_dev) { get_device(&ib_dev->dev); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device); /** * ib_unregister_device_and_put - Unregister a device while holding a 'get' * @ib_dev: The device to unregister * * This is the same as ib_unregister_device(), except it includes an internal * ib_device_put() that should match a 'get' obtained by the caller. * * It is safe to call this routine concurrently from multiple threads while * holding the 'get'. When the function returns the device is fully * unregistered. * * Drivers using this flow MUST use the driver_unregister callback to clean up * their resources associated with the device and dealloc it. */ void ib_unregister_device_and_put(struct ib_device *ib_dev) { WARN_ON(!ib_dev->ops.dealloc_driver); get_device(&ib_dev->dev); ib_device_put(ib_dev); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device_and_put); /** * ib_unregister_driver - Unregister all IB devices for a driver * @driver_id: The driver to unregister * * This implements a fence for device unregistration. It only returns once all * devices associated with the driver_id have fully completed their * unregistration and returned from ib_unregister_device*(). * * If devices are not yet unregistered it goes ahead and starts unregistering * them. * * This does not block creation of new devices with the given driver_id, that * is the responsibility of the caller. 
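 *
 * A minimal usage sketch of a driver module exit path (my_stop_new_devices()
 * and RDMA_DRIVER_MYDRV are hypothetical placeholders, not symbols from this
 * file; a real driver passes its own enum rdma_driver_id value):
 *
 *	static void __exit my_driver_exit(void)
 *	{
 *		my_stop_new_devices();			(caller's responsibility, see above)
 *		ib_unregister_driver(RDMA_DRIVER_MYDRV);
 *	}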
*/ void ib_unregister_driver(enum rdma_driver_id driver_id) { struct ib_device *ib_dev; unsigned long index; down_read(&devices_rwsem); xa_for_each (&devices, index, ib_dev) { if (ib_dev->ops.driver_id != driver_id) continue; get_device(&ib_dev->dev); up_read(&devices_rwsem); WARN_ON(!ib_dev->ops.dealloc_driver); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); down_read(&devices_rwsem); } up_read(&devices_rwsem); } EXPORT_SYMBOL(ib_unregister_driver); static void ib_unregister_work(struct work_struct *work) { struct ib_device *ib_dev = container_of(work, struct ib_device, unregistration_work); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); } /** * ib_unregister_device_queued - Unregister a device using a work queue * @ib_dev: The device to unregister * * This schedules an asynchronous unregistration using a WQ for the device. A * driver should use this to avoid holding locks while doing unregistration, * such as holding the RTNL lock. * * Drivers using this API must use ib_unregister_driver before module unload * to ensure that all scheduled unregistrations have completed. */ void ib_unregister_device_queued(struct ib_device *ib_dev) { WARN_ON(!refcount_read(&ib_dev->refcount)); WARN_ON(!ib_dev->ops.dealloc_driver); get_device(&ib_dev->dev); if (!queue_work(ib_unreg_wq, &ib_dev->unregistration_work)) put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device_queued); /* * The caller must pass in a device that has the kref held and the refcount * released. If the device is in cur_net and still registered then it is moved * into net. */ static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, struct net *net) { int ret2 = -EINVAL; int ret; mutex_lock(&device->unregistration_lock); /* * If a device not under ib_device_get() or if the unregistration_lock * is not held, the namespace can be changed, or it can be unregistered. * Check again under the lock. */ if (refcount_read(&device->refcount) == 0 || !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) { ret = -ENODEV; goto out; } kobject_uevent(&device->dev.kobj, KOBJ_REMOVE); disable_device(device); /* * At this point no one can be using the device, so it is safe to * change the namespace. */ write_pnet(&device->coredev.rdma_net, net); down_read(&devices_rwsem); /* * Currently rdma devices are system wide unique. So the device name * is guaranteed free in the new namespace. Publish the new namespace * at the sysfs level. */ ret = device_rename(&device->dev, dev_name(&device->dev)); up_read(&devices_rwsem); if (ret) { dev_warn(&device->dev, "%s: Couldn't rename device after namespace change\n", __func__); /* Try and put things back and re-enable the device */ write_pnet(&device->coredev.rdma_net, cur_net); } ret2 = enable_device_and_get(device); if (ret2) { /* * This shouldn't really happen, but if it does, let the user * retry at later point. So don't disable the device. 
*/ dev_warn(&device->dev, "%s: Couldn't re-enable device after namespace change\n", __func__); } kobject_uevent(&device->dev.kobj, KOBJ_ADD); ib_device_put(device); out: mutex_unlock(&device->unregistration_lock); if (ret) return ret; return ret2; } int ib_device_set_netns_put(struct sk_buff *skb, struct ib_device *dev, u32 ns_fd) { struct net *net; int ret; net = get_net_ns_by_fd(ns_fd); if (IS_ERR(net)) { ret = PTR_ERR(net); goto net_err; } if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; goto ns_err; } /* * All the ib_clients, including uverbs, are reset when the namespace is * changed and this cannot be blocked waiting for userspace to do * something, so disassociation is mandatory. */ if (!dev->ops.disassociate_ucontext || ib_devices_shared_netns) { ret = -EOPNOTSUPP; goto ns_err; } get_device(&dev->dev); ib_device_put(dev); ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net); put_device(&dev->dev); put_net(net); return ret; ns_err: put_net(net); net_err: ib_device_put(dev); return ret; } static struct pernet_operations rdma_dev_net_ops = { .init = rdma_dev_init_net, .exit = rdma_dev_exit_net, .id = &rdma_dev_net_id, .size = sizeof(struct rdma_dev_net), }; static int assign_client_id(struct ib_client *client) { int ret; lockdep_assert_held(&clients_rwsem); /* * The add/remove callbacks must be called in FIFO/LIFO order. To * achieve this we assign client_ids so they are sorted in * registration order. */ client->client_id = highest_client_id; ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); if (ret) return ret; highest_client_id++; xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); return 0; } static void remove_client_id(struct ib_client *client) { down_write(&clients_rwsem); xa_erase(&clients, client->client_id); for (; highest_client_id; highest_client_id--) if (xa_load(&clients, highest_client_id - 1)) break; up_write(&clients_rwsem); } /** * ib_register_client - Register an IB client * @client:Client to register * * Upper level users of the IB drivers can use ib_register_client() to * register callbacks for IB device addition and removal. When an IB * device is added, each registered client's add method will be called * (in the order the clients were registered), and when a device is * removed, each client's remove method will be called (in the reverse * order that clients were registered). In addition, when * ib_register_client() is called, the client will receive an add * callback for all devices already registered. */ int ib_register_client(struct ib_client *client) { struct ib_device *device; unsigned long index; bool need_unreg = false; int ret; refcount_set(&client->uses, 1); init_completion(&client->uses_zero); /* * The devices_rwsem is held in write mode to ensure that a racing * ib_register_device() sees a consistent view of clients and devices. */ down_write(&devices_rwsem); down_write(&clients_rwsem); ret = assign_client_id(client); if (ret) goto out; need_unreg = true; xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { ret = add_client_context(device, client); if (ret) goto out; } ret = 0; out: up_write(&clients_rwsem); up_write(&devices_rwsem); if (need_unreg && ret) ib_unregister_client(client); return ret; } EXPORT_SYMBOL(ib_register_client); /** * ib_unregister_client - Unregister an IB client * @client:Client to unregister * * Upper level users use ib_unregister_client() to remove their client * registration. 
When ib_unregister_client() is called, the client * will receive a remove callback for each IB device still registered. * * This is a full fence, once it returns no client callbacks will be called, * or are running in another thread. */ void ib_unregister_client(struct ib_client *client) { struct ib_device *device; unsigned long index; down_write(&clients_rwsem); ib_client_put(client); xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); up_write(&clients_rwsem); /* We do not want to have locks while calling client->remove() */ rcu_read_lock(); xa_for_each (&devices, index, device) { if (!ib_device_try_get(device)) continue; rcu_read_unlock(); remove_client_context(device, client->client_id); ib_device_put(device); rcu_read_lock(); } rcu_read_unlock(); /* * remove_client_context() is not a fence, it can return even though a * removal is ongoing. Wait until all removals are completed. */ wait_for_completion(&client->uses_zero); remove_client_id(client); } EXPORT_SYMBOL(ib_unregister_client); static int __ib_get_global_client_nl_info(const char *client_name, struct ib_client_nl_info *res) { struct ib_client *client; unsigned long index; int ret = -ENOENT; down_read(&clients_rwsem); xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { if (strcmp(client->name, client_name) != 0) continue; if (!client->get_global_nl_info) { ret = -EOPNOTSUPP; break; } ret = client->get_global_nl_info(res); if (WARN_ON(ret == -ENOENT)) ret = -EINVAL; if (!ret && res->cdev) get_device(res->cdev); break; } up_read(&clients_rwsem); return ret; } static int __ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, struct ib_client_nl_info *res) { unsigned long index; void *client_data; int ret = -ENOENT; down_read(&ibdev->client_data_rwsem); xan_for_each_marked (&ibdev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); if (!client || strcmp(client->name, client_name) != 0) continue; if (!client->get_nl_info) { ret = -EOPNOTSUPP; break; } ret = client->get_nl_info(ibdev, client_data, res); if (WARN_ON(ret == -ENOENT)) ret = -EINVAL; /* * The cdev is guaranteed valid as long as we are inside the * client_data_rwsem as remove_one can't be called. Keep it * valid for the caller. */ if (!ret && res->cdev) get_device(res->cdev); break; } up_read(&ibdev->client_data_rwsem); return ret; } /** * ib_get_client_nl_info - Fetch the nl_info from a client * @ibdev: IB device * @client_name: Name of the client * @res: Result of the query */ int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, struct ib_client_nl_info *res) { int ret; if (ibdev) ret = __ib_get_client_nl_info(ibdev, client_name, res); else ret = __ib_get_global_client_nl_info(client_name, res); #ifdef CONFIG_MODULES if (ret == -ENOENT) { request_module("rdma-client-%s", client_name); if (ibdev) ret = __ib_get_client_nl_info(ibdev, client_name, res); else ret = __ib_get_global_client_nl_info(client_name, res); } #endif if (ret) { if (ret == -ENOENT) return -EOPNOTSUPP; return ret; } if (WARN_ON(!res->cdev)) return -EINVAL; return 0; } /** * ib_set_client_data - Set IB client context * @device:Device to set context for * @client:Client to set context for * @data:Context to set * * ib_set_client_data() sets client context data that can be retrieved with * ib_get_client_data(). This can only be called while the client is * registered to the device, once the ib_client remove() callback returns this * cannot be called. 
*/ void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data) { void *rc; if (WARN_ON(IS_ERR(data))) data = NULL; rc = xa_store(&device->client_data, client->client_id, data, GFP_KERNEL); WARN_ON(xa_is_err(rc)); } EXPORT_SYMBOL(ib_set_client_data); /** * ib_register_event_handler - Register an IB event handler * @event_handler:Handler to register * * ib_register_event_handler() registers an event handler that will be * called back when asynchronous IB events occur (as defined in * chapter 11 of the InfiniBand Architecture Specification). This * callback occurs in workqueue context. */ void ib_register_event_handler(struct ib_event_handler *event_handler) { down_write(&event_handler->device->event_handler_rwsem); list_add_tail(&event_handler->list, &event_handler->device->event_handler_list); up_write(&event_handler->device->event_handler_rwsem); } EXPORT_SYMBOL(ib_register_event_handler); /** * ib_unregister_event_handler - Unregister an event handler * @event_handler:Handler to unregister * * Unregister an event handler registered with * ib_register_event_handler(). */ void ib_unregister_event_handler(struct ib_event_handler *event_handler) { down_write(&event_handler->device->event_handler_rwsem); list_del(&event_handler->list); up_write(&event_handler->device->event_handler_rwsem); } EXPORT_SYMBOL(ib_unregister_event_handler); void ib_dispatch_event_clients(struct ib_event *event) { struct ib_event_handler *handler; down_read(&event->device->event_handler_rwsem); list_for_each_entry(handler, &event->device->event_handler_list, list) handler->handler(handler, event); up_read(&event->device->event_handler_rwsem); } static int iw_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr) { struct in_device *inetdev; struct net_device *netdev; memset(port_attr, 0, sizeof(*port_attr)); netdev = ib_device_get_netdev(device, port_num); if (!netdev) return -ENODEV; port_attr->max_mtu = IB_MTU_4096; port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu); if (!netif_carrier_ok(netdev)) { port_attr->state = IB_PORT_DOWN; port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; } else { rcu_read_lock(); inetdev = __in_dev_get_rcu(netdev); if (inetdev && inetdev->ifa_list) { port_attr->state = IB_PORT_ACTIVE; port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; } else { port_attr->state = IB_PORT_INIT; port_attr->phys_state = IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING; } rcu_read_unlock(); } dev_put(netdev); return device->ops.query_port(device, port_num, port_attr); } static int __ib_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr) { int err; memset(port_attr, 0, sizeof(*port_attr)); err = device->ops.query_port(device, port_num, port_attr); if (err || port_attr->subnet_prefix) return err; if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND) return 0; ib_get_cached_subnet_prefix(device, port_num, &port_attr->subnet_prefix); return 0; } /** * ib_query_port - Query IB port attributes * @device:Device to query * @port_num:Port number to query * @port_attr:Port attributes * * ib_query_port() returns the attributes of a port through the * @port_attr pointer. 
*/ int ib_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr) { if (!rdma_is_port_valid(device, port_num)) return -EINVAL; if (rdma_protocol_iwarp(device, port_num)) return iw_query_port(device, port_num, port_attr); else return __ib_query_port(device, port_num, port_attr); } EXPORT_SYMBOL(ib_query_port); static void add_ndev_hash(struct ib_port_data *pdata) { unsigned long flags; might_sleep(); spin_lock_irqsave(&ndev_hash_lock, flags); if (hash_hashed(&pdata->ndev_hash_link)) { hash_del_rcu(&pdata->ndev_hash_link); spin_unlock_irqrestore(&ndev_hash_lock, flags); /* * We cannot do hash_add_rcu after a hash_del_rcu until the * grace period */ synchronize_rcu(); spin_lock_irqsave(&ndev_hash_lock, flags); } if (pdata->netdev) hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, (uintptr_t)pdata->netdev); spin_unlock_irqrestore(&ndev_hash_lock, flags); } /** * ib_device_set_netdev - Associate the ib_dev with an underlying net_device * @ib_dev: Device to modify * @ndev: net_device to affiliate, may be NULL * @port: IB port the net_device is connected to * * Drivers should use this to link the ib_device to a netdev so the netdev * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be * affiliated with any port. * * The caller must ensure that the given ndev is not unregistered or * unregistering, and that either the ib_device is unregistered or * ib_device_set_netdev() is called with NULL when the ndev sends a * NETDEV_UNREGISTER event. */ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, u32 port) { enum rdma_nl_notify_event_type etype; struct net_device *old_ndev; struct ib_port_data *pdata; unsigned long flags; int ret; if (!rdma_is_port_valid(ib_dev, port)) return -EINVAL; /* * Drivers wish to call this before ib_register_driver, so we have to * setup the port data early. */ ret = alloc_port_data(ib_dev); if (ret) return ret; pdata = &ib_dev->port_data[port]; spin_lock_irqsave(&pdata->netdev_lock, flags); old_ndev = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); if (old_ndev == ndev) { spin_unlock_irqrestore(&pdata->netdev_lock, flags); return 0; } rcu_assign_pointer(pdata->netdev, ndev); netdev_put(old_ndev, &pdata->netdev_tracker); netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC); spin_unlock_irqrestore(&pdata->netdev_lock, flags); add_ndev_hash(pdata); /* Make sure that the device is registered before we send events */ if (xa_load(&devices, ib_dev->index) != ib_dev) return 0; etype = ndev ? 
RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT; rdma_nl_notify_event(ib_dev, port, etype); return 0; } EXPORT_SYMBOL(ib_device_set_netdev); static void free_netdevs(struct ib_device *ib_dev) { unsigned long flags; u32 port; if (!ib_dev->port_data) return; rdma_for_each_port (ib_dev, port) { struct ib_port_data *pdata = &ib_dev->port_data[port]; struct net_device *ndev; spin_lock_irqsave(&pdata->netdev_lock, flags); ndev = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); if (ndev) { spin_lock(&ndev_hash_lock); hash_del_rcu(&pdata->ndev_hash_link); spin_unlock(&ndev_hash_lock); /* * If this is the last dev_put there is still a * synchronize_rcu before the netdev is kfreed, so we * can continue to rely on unlocked pointer * comparisons after the put */ rcu_assign_pointer(pdata->netdev, NULL); netdev_put(ndev, &pdata->netdev_tracker); } spin_unlock_irqrestore(&pdata->netdev_lock, flags); } } struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, u32 port) { struct ib_port_data *pdata; struct net_device *res; if (!rdma_is_port_valid(ib_dev, port)) return NULL; if (!ib_dev->port_data) return NULL; pdata = &ib_dev->port_data[port]; /* * New drivers should use ib_device_set_netdev() not the legacy * get_netdev(). */ if (ib_dev->ops.get_netdev) res = ib_dev->ops.get_netdev(ib_dev, port); else { spin_lock(&pdata->netdev_lock); res = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); dev_hold(res); spin_unlock(&pdata->netdev_lock); } return res; } EXPORT_SYMBOL(ib_device_get_netdev); /** * ib_query_netdev_port - Query the port number of a net_device * associated with an ibdev * @ibdev: IB device * @ndev: Network device * @port: IB port the net_device is connected to */ int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev, u32 *port) { struct net_device *ib_ndev; u32 port_num; rdma_for_each_port(ibdev, port_num) { ib_ndev = ib_device_get_netdev(ibdev, port_num); if (ndev == ib_ndev) { *port = port_num; dev_put(ib_ndev); return 0; } dev_put(ib_ndev); } return -ENOENT; } EXPORT_SYMBOL(ib_query_netdev_port); /** * ib_device_get_by_netdev - Find an IB device associated with a netdev * @ndev: netdev to locate * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) * * Find and hold an ib_device that is associated with a netdev via * ib_device_set_netdev(). The caller must call ib_device_put() on the * returned pointer. */ struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, enum rdma_driver_id driver_id) { struct ib_device *res = NULL; struct ib_port_data *cur; rcu_read_lock(); hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, (uintptr_t)ndev) { if (rcu_access_pointer(cur->netdev) == ndev && (driver_id == RDMA_DRIVER_UNKNOWN || cur->ib_dev->ops.driver_id == driver_id) && ib_device_try_get(cur->ib_dev)) { res = cur->ib_dev; break; } } rcu_read_unlock(); return res; } EXPORT_SYMBOL(ib_device_get_by_netdev); /** * ib_enum_roce_netdev - enumerate all RoCE ports * @ib_dev : IB device we want to query * @filter: Should we call the callback? * @filter_cookie: Cookie passed to filter * @cb: Callback to call for each found RoCE ports * @cookie: Cookie passed back to the callback * * Enumerates all of the physical RoCE ports of ib_dev * which are related to netdevice and calls callback() on each * device for which filter() function returns non zero. 
*/ void ib_enum_roce_netdev(struct ib_device *ib_dev, roce_netdev_filter filter, void *filter_cookie, roce_netdev_callback cb, void *cookie) { u32 port; rdma_for_each_port (ib_dev, port) if (rdma_protocol_roce(ib_dev, port)) { struct net_device *idev = ib_device_get_netdev(ib_dev, port); if (filter(ib_dev, port, idev, filter_cookie)) cb(ib_dev, port, idev, cookie); dev_put(idev); } } /** * ib_enum_all_roce_netdevs - enumerate all RoCE devices * @filter: Should we call the callback? * @filter_cookie: Cookie passed to filter * @cb: Callback to call for each found RoCE ports * @cookie: Cookie passed back to the callback * * Enumerates all RoCE devices' physical ports which are related * to netdevices and calls callback() on each device for which * filter() function returns non zero. */ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, void *filter_cookie, roce_netdev_callback cb, void *cookie) { struct ib_device *dev; unsigned long index; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); up_read(&devices_rwsem); } /* * ib_enum_all_devs - enumerate all ib_devices * @cb: Callback to call for each found ib_device * * Enumerates all ib_devices and calls callback() on each device. */ int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, struct netlink_callback *cb) { unsigned long index; struct ib_device *dev; unsigned int idx = 0; int ret = 0; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { if (!rdma_dev_access_netns(dev, sock_net(skb->sk))) continue; ret = nldev_cb(dev, skb, cb, idx); if (ret) break; idx++; } up_read(&devices_rwsem); return ret; } /** * ib_query_pkey - Get P_Key table entry * @device:Device to query * @port_num:Port number to query * @index:P_Key table index to query * @pkey:Returned P_Key * * ib_query_pkey() fetches the specified P_Key table entry. */ int ib_query_pkey(struct ib_device *device, u32 port_num, u16 index, u16 *pkey) { if (!rdma_is_port_valid(device, port_num)) return -EINVAL; if (!device->ops.query_pkey) return -EOPNOTSUPP; return device->ops.query_pkey(device, port_num, index, pkey); } EXPORT_SYMBOL(ib_query_pkey); /** * ib_modify_device - Change IB device attributes * @device:Device to modify * @device_modify_mask:Mask of attributes to change * @device_modify:New attribute values * * ib_modify_device() changes a device's attributes as specified by * the @device_modify_mask and @device_modify structure. */ int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify) { if (!device->ops.modify_device) return -EOPNOTSUPP; return device->ops.modify_device(device, device_modify_mask, device_modify); } EXPORT_SYMBOL(ib_modify_device); /** * ib_modify_port - Modifies the attributes for the specified port. * @device: The device to modify. * @port_num: The number of the port to modify. * @port_modify_mask: Mask used to specify which attributes of the port * to change. * @port_modify: New attribute values for the port. * * ib_modify_port() changes a port's attributes as specified by the * @port_modify_mask and @port_modify structure. 
*/ int ib_modify_port(struct ib_device *device, u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify) { int rc; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; if (device->ops.modify_port) rc = device->ops.modify_port(device, port_num, port_modify_mask, port_modify); else if (rdma_protocol_roce(device, port_num) && ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 || (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0)) rc = 0; else rc = -EOPNOTSUPP; return rc; } EXPORT_SYMBOL(ib_modify_port); /** * ib_find_gid - Returns the port number and GID table index where * a specified GID value occurs. Its searches only for IB link layer. * @device: The device to query. * @gid: The GID value to search for. * @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. */ int ib_find_gid(struct ib_device *device, union ib_gid *gid, u32 *port_num, u16 *index) { union ib_gid tmp_gid; u32 port; int ret, i; rdma_for_each_port (device, port) { if (!rdma_protocol_ib(device, port)) continue; for (i = 0; i < device->port_data[port].immutable.gid_tbl_len; ++i) { ret = rdma_query_gid(device, port, i, &tmp_gid); if (ret) continue; if (!memcmp(&tmp_gid, gid, sizeof *gid)) { *port_num = port; if (index) *index = i; return 0; } } } return -ENOENT; } EXPORT_SYMBOL(ib_find_gid); /** * ib_find_pkey - Returns the PKey table index where a specified * PKey value occurs. * @device: The device to query. * @port_num: The port number of the device to search for the PKey. * @pkey: The PKey value to search for. * @index: The index into the PKey table where the PKey was found. */ int ib_find_pkey(struct ib_device *device, u32 port_num, u16 pkey, u16 *index) { int ret, i; u16 tmp_pkey; int partial_ix = -1; for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len; ++i) { ret = ib_query_pkey(device, port_num, i, &tmp_pkey); if (ret) return ret; if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { /* if there is full-member pkey take it.*/ if (tmp_pkey & 0x8000) { *index = i; return 0; } if (partial_ix < 0) partial_ix = i; } } /*no full-member, if exists take the limited*/ if (partial_ix >= 0) { *index = partial_ix; return 0; } return -ENOENT; } EXPORT_SYMBOL(ib_find_pkey); /** * ib_get_net_dev_by_params() - Return the appropriate net_dev * for a received CM request * @dev: An RDMA device on which the request has been received. * @port: Port number on the RDMA device. * @pkey: The Pkey the request came on. * @gid: A GID that the net_dev uses to communicate. * @addr: Contains the IP address that the request specified as its * destination. 
* */ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr) { struct net_device *net_dev = NULL; unsigned long index; void *client_data; if (!rdma_protocol_ib(dev, port)) return NULL; /* * Holding the read side guarantees that the client will not become * unregistered while we are calling get_net_dev_by_params() */ down_read(&dev->client_data_rwsem); xan_for_each_marked (&dev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); if (!client || !client->get_net_dev_by_params) continue; net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, addr, client_data); if (net_dev) break; } up_read(&dev->client_data_rwsem); return net_dev; } EXPORT_SYMBOL(ib_get_net_dev_by_params); void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) { struct ib_device_ops *dev_ops = &dev->ops; #define SET_DEVICE_OP(ptr, name) \ do { \ if (ops->name) \ if (!((ptr)->name)) \ (ptr)->name = ops->name; \ } while (0) #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && dev_ops->driver_id != ops->driver_id); dev_ops->driver_id = ops->driver_id; } if (ops->owner) { WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); dev_ops->owner = ops->owner; } if (ops->uverbs_abi_ver) dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; dev_ops->uverbs_no_driver_id_binding |= ops->uverbs_no_driver_id_binding; SET_DEVICE_OP(dev_ops, add_gid); SET_DEVICE_OP(dev_ops, add_sub_dev); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); SET_DEVICE_OP(dev_ops, alloc_hw_port_stats); SET_DEVICE_OP(dev_ops, alloc_mr); SET_DEVICE_OP(dev_ops, alloc_mr_integrity); SET_DEVICE_OP(dev_ops, alloc_mw); SET_DEVICE_OP(dev_ops, alloc_pd); SET_DEVICE_OP(dev_ops, alloc_rdma_netdev); SET_DEVICE_OP(dev_ops, alloc_ucontext); SET_DEVICE_OP(dev_ops, alloc_xrcd); SET_DEVICE_OP(dev_ops, attach_mcast); SET_DEVICE_OP(dev_ops, check_mr_status); SET_DEVICE_OP(dev_ops, counter_alloc_stats); SET_DEVICE_OP(dev_ops, counter_bind_qp); SET_DEVICE_OP(dev_ops, counter_dealloc); SET_DEVICE_OP(dev_ops, counter_unbind_qp); SET_DEVICE_OP(dev_ops, counter_update_stats); SET_DEVICE_OP(dev_ops, create_ah); SET_DEVICE_OP(dev_ops, create_counters); SET_DEVICE_OP(dev_ops, create_cq); SET_DEVICE_OP(dev_ops, create_flow); SET_DEVICE_OP(dev_ops, create_qp); SET_DEVICE_OP(dev_ops, create_rwq_ind_table); SET_DEVICE_OP(dev_ops, create_srq); SET_DEVICE_OP(dev_ops, create_user_ah); SET_DEVICE_OP(dev_ops, create_wq); SET_DEVICE_OP(dev_ops, dealloc_dm); SET_DEVICE_OP(dev_ops, dealloc_driver); SET_DEVICE_OP(dev_ops, dealloc_mw); SET_DEVICE_OP(dev_ops, dealloc_pd); SET_DEVICE_OP(dev_ops, dealloc_ucontext); SET_DEVICE_OP(dev_ops, dealloc_xrcd); SET_DEVICE_OP(dev_ops, del_gid); SET_DEVICE_OP(dev_ops, del_sub_dev); SET_DEVICE_OP(dev_ops, dereg_mr); SET_DEVICE_OP(dev_ops, destroy_ah); SET_DEVICE_OP(dev_ops, destroy_counters); SET_DEVICE_OP(dev_ops, destroy_cq); SET_DEVICE_OP(dev_ops, destroy_flow); SET_DEVICE_OP(dev_ops, destroy_flow_action); SET_DEVICE_OP(dev_ops, destroy_qp); SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); SET_DEVICE_OP(dev_ops, destroy_srq); SET_DEVICE_OP(dev_ops, destroy_wq); SET_DEVICE_OP(dev_ops, device_group); SET_DEVICE_OP(dev_ops, detach_mcast); SET_DEVICE_OP(dev_ops, disassociate_ucontext); SET_DEVICE_OP(dev_ops, drain_rq); 
SET_DEVICE_OP(dev_ops, drain_sq); SET_DEVICE_OP(dev_ops, enable_driver); SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry); SET_DEVICE_OP(dev_ops, fill_res_cq_entry); SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_mr_entry); SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_qp_entry); SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_srq_entry); SET_DEVICE_OP(dev_ops, fill_res_srq_entry_raw); SET_DEVICE_OP(dev_ops, fill_stat_mr_entry); SET_DEVICE_OP(dev_ops, get_dev_fw_str); SET_DEVICE_OP(dev_ops, get_dma_mr); SET_DEVICE_OP(dev_ops, get_hw_stats); SET_DEVICE_OP(dev_ops, get_link_layer); SET_DEVICE_OP(dev_ops, get_netdev); SET_DEVICE_OP(dev_ops, get_numa_node); SET_DEVICE_OP(dev_ops, get_port_immutable); SET_DEVICE_OP(dev_ops, get_vector_affinity); SET_DEVICE_OP(dev_ops, get_vf_config); SET_DEVICE_OP(dev_ops, get_vf_guid); SET_DEVICE_OP(dev_ops, get_vf_stats); SET_DEVICE_OP(dev_ops, iw_accept); SET_DEVICE_OP(dev_ops, iw_add_ref); SET_DEVICE_OP(dev_ops, iw_connect); SET_DEVICE_OP(dev_ops, iw_create_listen); SET_DEVICE_OP(dev_ops, iw_destroy_listen); SET_DEVICE_OP(dev_ops, iw_get_qp); SET_DEVICE_OP(dev_ops, iw_reject); SET_DEVICE_OP(dev_ops, iw_rem_ref); SET_DEVICE_OP(dev_ops, map_mr_sg); SET_DEVICE_OP(dev_ops, map_mr_sg_pi); SET_DEVICE_OP(dev_ops, mmap); SET_DEVICE_OP(dev_ops, mmap_free); SET_DEVICE_OP(dev_ops, modify_ah); SET_DEVICE_OP(dev_ops, modify_cq); SET_DEVICE_OP(dev_ops, modify_device); SET_DEVICE_OP(dev_ops, modify_hw_stat); SET_DEVICE_OP(dev_ops, modify_port); SET_DEVICE_OP(dev_ops, modify_qp); SET_DEVICE_OP(dev_ops, modify_srq); SET_DEVICE_OP(dev_ops, modify_wq); SET_DEVICE_OP(dev_ops, peek_cq); SET_DEVICE_OP(dev_ops, poll_cq); SET_DEVICE_OP(dev_ops, port_groups); SET_DEVICE_OP(dev_ops, post_recv); SET_DEVICE_OP(dev_ops, post_send); SET_DEVICE_OP(dev_ops, post_srq_recv); SET_DEVICE_OP(dev_ops, process_mad); SET_DEVICE_OP(dev_ops, query_ah); SET_DEVICE_OP(dev_ops, query_device); SET_DEVICE_OP(dev_ops, query_gid); SET_DEVICE_OP(dev_ops, query_pkey); SET_DEVICE_OP(dev_ops, query_port); SET_DEVICE_OP(dev_ops, query_qp); SET_DEVICE_OP(dev_ops, query_srq); SET_DEVICE_OP(dev_ops, query_ucontext); SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); SET_DEVICE_OP(dev_ops, read_counters); SET_DEVICE_OP(dev_ops, reg_dm_mr); SET_DEVICE_OP(dev_ops, reg_user_mr); SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf); SET_DEVICE_OP(dev_ops, req_notify_cq); SET_DEVICE_OP(dev_ops, rereg_user_mr); SET_DEVICE_OP(dev_ops, resize_cq); SET_DEVICE_OP(dev_ops, set_vf_guid); SET_DEVICE_OP(dev_ops, set_vf_link_state); SET_DEVICE_OP(dev_ops, ufile_hw_cleanup); SET_DEVICE_OP(dev_ops, report_port_event); SET_OBJ_SIZE(dev_ops, ib_ah); SET_OBJ_SIZE(dev_ops, ib_counters); SET_OBJ_SIZE(dev_ops, ib_cq); SET_OBJ_SIZE(dev_ops, ib_mw); SET_OBJ_SIZE(dev_ops, ib_pd); SET_OBJ_SIZE(dev_ops, ib_qp); SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table); SET_OBJ_SIZE(dev_ops, ib_srq); SET_OBJ_SIZE(dev_ops, ib_ucontext); SET_OBJ_SIZE(dev_ops, ib_xrcd); } EXPORT_SYMBOL(ib_set_device_ops); int ib_add_sub_device(struct ib_device *parent, enum rdma_nl_dev_type type, const char *name) { struct ib_device *sub; int ret = 0; if (!parent->ops.add_sub_dev || !parent->ops.del_sub_dev) return -EOPNOTSUPP; if (!ib_device_try_get(parent)) return -EINVAL; sub = parent->ops.add_sub_dev(parent, type, name); if (IS_ERR(sub)) { ib_device_put(parent); return PTR_ERR(sub); } sub->type = type; sub->parent = parent; mutex_lock(&parent->subdev_lock); list_add_tail(&parent->subdev_list_head, 
&sub->subdev_list); mutex_unlock(&parent->subdev_lock); return ret; } EXPORT_SYMBOL(ib_add_sub_device); int ib_del_sub_device_and_put(struct ib_device *sub) { struct ib_device *parent = sub->parent; if (!parent) return -EOPNOTSUPP; mutex_lock(&parent->subdev_lock); list_del(&sub->subdev_list); mutex_unlock(&parent->subdev_lock); ib_device_put(sub); parent->ops.del_sub_dev(sub); ib_device_put(parent); return 0; } EXPORT_SYMBOL(ib_del_sub_device_and_put); #ifdef CONFIG_INFINIBAND_VIRT_DMA int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) { struct scatterlist *s; int i; for_each_sg(sg, s, nents, i) { sg_dma_address(s) = (uintptr_t)sg_virt(s); sg_dma_len(s) = s->length; } return nents; } EXPORT_SYMBOL(ib_dma_virt_map_sg); #endif /* CONFIG_INFINIBAND_VIRT_DMA */ static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { [RDMA_NL_LS_OP_RESOLVE] = { .doit = ib_nl_handle_resolve_resp, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NL_LS_OP_SET_TIMEOUT] = { .doit = ib_nl_handle_set_timeout, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NL_LS_OP_IP_RESOLVE] = { .doit = ib_nl_handle_ip_res_resp, .flags = RDMA_NL_ADMIN_PERM, }, }; void ib_dispatch_port_state_event(struct ib_device *ibdev, struct net_device *ndev) { enum ib_port_state curr_state; struct ib_event ibevent = {}; u32 port; if (ib_query_netdev_port(ibdev, ndev, &port)) return; curr_state = ib_get_curr_port_state(ndev); write_lock_irq(&ibdev->cache_lock); if (ibdev->port_data[port].cache.last_port_state == curr_state) { write_unlock_irq(&ibdev->cache_lock); return; } ibdev->port_data[port].cache.last_port_state = curr_state; write_unlock_irq(&ibdev->cache_lock); ibevent.event = (curr_state == IB_PORT_DOWN) ? IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE; ibevent.device = ibdev; ibevent.element.port_num = port; ib_dispatch_event(&ibevent); } EXPORT_SYMBOL(ib_dispatch_port_state_event); static void handle_port_event(struct net_device *ndev, unsigned long event) { struct ib_device *ibdev; /* Currently, link events in bonding scenarios are still * reported by drivers that support bonding. 
*/ if (netif_is_lag_master(ndev) || netif_is_lag_port(ndev)) return; ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); if (!ibdev) return; if (ibdev->ops.report_port_event) { ibdev->ops.report_port_event(ibdev, ndev, event); goto put_ibdev; } ib_dispatch_port_state_event(ibdev, ndev); put_ibdev: ib_device_put(ibdev); }; static int ib_netdevice_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct ib_device *ibdev; u32 port; switch (event) { case NETDEV_CHANGENAME: ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); if (!ibdev) return NOTIFY_DONE; if (ib_query_netdev_port(ibdev, ndev, &port)) { ib_device_put(ibdev); break; } rdma_nl_notify_event(ibdev, port, RDMA_NETDEV_RENAME_EVENT); ib_device_put(ibdev); break; case NETDEV_UP: case NETDEV_CHANGE: case NETDEV_DOWN: handle_port_event(ndev, event); break; default: break; } return NOTIFY_DONE; } static struct notifier_block nb_netdevice = { .notifier_call = ib_netdevice_event, }; static int __init ib_core_init(void) { int ret = -ENOMEM; ib_wq = alloc_workqueue("infiniband", 0, 0); if (!ib_wq) return -ENOMEM; ib_unreg_wq = alloc_workqueue("ib-unreg-wq", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE); if (!ib_unreg_wq) goto err; ib_comp_wq = alloc_workqueue("ib-comp-wq", WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); if (!ib_comp_wq) goto err_unbound; ib_comp_unbound_wq = alloc_workqueue("ib-comp-unb-wq", WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); if (!ib_comp_unbound_wq) goto err_comp; ret = class_register(&ib_class); if (ret) { pr_warn("Couldn't create InfiniBand device class\n"); goto err_comp_unbound; } rdma_nl_init(); ret = addr_init(); if (ret) { pr_warn("Couldn't init IB address resolution\n"); goto err_ibnl; } ret = ib_mad_init(); if (ret) { pr_warn("Couldn't init IB MAD\n"); goto err_addr; } ret = ib_sa_init(); if (ret) { pr_warn("Couldn't init SA\n"); goto err_mad; } ret = register_blocking_lsm_notifier(&ibdev_lsm_nb); if (ret) { pr_warn("Couldn't register LSM notifier. ret %d\n", ret); goto err_sa; } ret = register_pernet_device(&rdma_dev_net_ops); if (ret) { pr_warn("Couldn't init compat dev. ret %d\n", ret); goto err_compat; } nldev_init(); rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); ret = roce_gid_mgmt_init(); if (ret) { pr_warn("Couldn't init RoCE GID management\n"); goto err_parent; } register_netdevice_notifier(&nb_netdevice); return 0; err_parent: rdma_nl_unregister(RDMA_NL_LS); nldev_exit(); unregister_pernet_device(&rdma_dev_net_ops); err_compat: unregister_blocking_lsm_notifier(&ibdev_lsm_nb); err_sa: ib_sa_cleanup(); err_mad: ib_mad_cleanup(); err_addr: addr_cleanup(); err_ibnl: class_unregister(&ib_class); err_comp_unbound: destroy_workqueue(ib_comp_unbound_wq); err_comp: destroy_workqueue(ib_comp_wq); err_unbound: destroy_workqueue(ib_unreg_wq); err: destroy_workqueue(ib_wq); return ret; } static void __exit ib_core_cleanup(void) { unregister_netdevice_notifier(&nb_netdevice); roce_gid_mgmt_cleanup(); rdma_nl_unregister(RDMA_NL_LS); nldev_exit(); unregister_pernet_device(&rdma_dev_net_ops); unregister_blocking_lsm_notifier(&ibdev_lsm_nb); ib_sa_cleanup(); ib_mad_cleanup(); addr_cleanup(); rdma_nl_exit(); class_unregister(&ib_class); destroy_workqueue(ib_comp_unbound_wq); destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. 
*/ destroy_workqueue(ib_wq); destroy_workqueue(ib_unreg_wq); WARN_ON(!xa_empty(&clients)); WARN_ON(!xa_empty(&devices)); } MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); /* ib core relies on netdev stack to first register net_ns_type_operations * ns kobject type before ib_core initialization. */ fs_initcall(ib_core_init); module_exit(ib_core_cleanup);
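/*
 * Example (not part of device.c): a minimal sketch of how an upper-level
 * module might consume the client API documented above
 * (ib_register_client(), ib_unregister_client(), ib_set_client_data()).
 * The "demo" client, its private struct and the function names are
 * illustrative assumptions, not an existing in-tree client.
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <rdma/ib_verbs.h>

static struct ib_client demo_client;

struct demo_client_data {
	struct ib_device *ibdev;
};

static int demo_add_one(struct ib_device *ibdev)
{
	struct demo_client_data *priv;

	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	if (!priv)
		return -ENOMEM;

	priv->ibdev = ibdev;
	/*
	 * Stash per-device state; it can be fetched with
	 * ib_get_client_data() until remove() returns.
	 */
	ib_set_client_data(ibdev, &demo_client, priv);
	return 0;
}

static void demo_remove_one(struct ib_device *ibdev, void *client_data)
{
	/* client_data is the pointer passed to ib_set_client_data(). */
	kfree(client_data);
}

static struct ib_client demo_client = {
	.name   = "demo",
	.add    = demo_add_one,
	.remove = demo_remove_one,
};

static int __init demo_init(void)
{
	/* Devices already registered get an add() callback here. */
	return ib_register_client(&demo_client);
}

static void __exit demo_exit(void)
{
	/* Full fence: no client callbacks run after this returns. */
	ib_unregister_client(&demo_client);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");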
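/*
 * Example (also not part of device.c): a hedged sketch of the asynchronous
 * event interface described above.  demo_handle_event() and the global
 * handler object are assumptions made for illustration; only
 * ib_register_event_handler(), ib_unregister_event_handler() and the
 * INIT_IB_EVENT_HANDLER() helper from <rdma/ib_verbs.h> are real interfaces.
 * In practice the handler would live in a client's per-device data rather
 * than in a single static variable.
 */
static void demo_handle_event(struct ib_event_handler *handler,
			      struct ib_event *event)
{
	/* Called from workqueue context while event_handler_rwsem is read-held. */
	if (event->event == IB_EVENT_PORT_ACTIVE ||
	    event->event == IB_EVENT_PORT_ERR)
		pr_info("port %u changed state on %s\n",
			event->element.port_num,
			dev_name(&event->device->dev));
}

static struct ib_event_handler demo_event_handler;

static void demo_watch_device(struct ib_device *ibdev)
{
	INIT_IB_EVENT_HANDLER(&demo_event_handler, ibdev, demo_handle_event);
	ib_register_event_handler(&demo_event_handler);
}

static void demo_unwatch_device(void)
{
	ib_unregister_event_handler(&demo_event_handler);
}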
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine (KVM) Hypervisor
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */

#include <kvm/iodev.h>

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
#include <linux/io.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/suspend.h>

#include <asm/processor.h>
#include <asm/ioctl.h>
#include <linux/uaccess.h>

#include "coalesced_mmio.h"
#include "async_pf.h"
#include "kvm_mm.h"
#include "vfio.h"

#include <trace/events/ipi.h>

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

#include <linux/kvm_dirty_ring.h>

/* Worst case buffer size needed for holding an integer. */
#define ITOA_MAX_LEN 12

MODULE_AUTHOR("Qumranet");
MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
MODULE_LICENSE("GPL");

/* Architectures should define their poll value according to the halt latency */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* Default doubles per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

/* The start value to grow halt_poll_ns from */
unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

/* Default halves per-vcpu halt_poll_ns.
*/ unsigned int halt_poll_ns_shrink = 2; module_param(halt_poll_ns_shrink, uint, 0644); EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); /* * Allow direct access (from KVM or the CPU) without MMU notifier protection * to unpinned pages. */ static bool allow_unsafe_mappings; module_param(allow_unsafe_mappings, bool, 0444); /* * Ordering of locks: * * kvm->lock --> kvm->slots_lock --> kvm->irq_lock */ DEFINE_MUTEX(kvm_lock); LIST_HEAD(vm_list); static struct kmem_cache *kvm_vcpu_cache; static __read_mostly struct preempt_ops kvm_preempt_ops; static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu); static struct dentry *kvm_debugfs_dir; static const struct file_operations stat_fops_per_vm; static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, unsigned long arg); #ifdef CONFIG_KVM_COMPAT static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, unsigned long arg); #define KVM_COMPAT(c) .compat_ioctl = (c) #else /* * For architectures that don't implement a compat infrastructure, * adopt a double line of defense: * - Prevent a compat task from opening /dev/kvm * - If the open has been done by a 64bit task, and the KVM fd * passed to a compat task, let the ioctls fail. */ static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) { return -EINVAL; } static int kvm_no_compat_open(struct inode *inode, struct file *file) { return is_compat_task() ? -ENODEV : 0; } #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \ .open = kvm_no_compat_open #endif static int kvm_enable_virtualization(void); static void kvm_disable_virtualization(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); #define KVM_EVENT_CREATE_VM 0 #define KVM_EVENT_DESTROY_VM 1 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); static unsigned long long kvm_createvm_count; static unsigned long long kvm_active_vms; static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask); __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) { } /* * Switches to specified vcpu, until a matching vcpu_put() */ void vcpu_load(struct kvm_vcpu *vcpu) { int cpu = get_cpu(); __this_cpu_write(kvm_running_vcpu, vcpu); preempt_notifier_register(&vcpu->preempt_notifier); kvm_arch_vcpu_load(vcpu, cpu); put_cpu(); } EXPORT_SYMBOL_GPL(vcpu_load); void vcpu_put(struct kvm_vcpu *vcpu) { preempt_disable(); kvm_arch_vcpu_put(vcpu); preempt_notifier_unregister(&vcpu->preempt_notifier); __this_cpu_write(kvm_running_vcpu, NULL); preempt_enable(); } EXPORT_SYMBOL_GPL(vcpu_put); /* TODO: merge with kvm_arch_vcpu_should_kick */ static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req) { int mode = kvm_vcpu_exiting_guest_mode(vcpu); /* * We need to wait for the VCPU to reenable interrupts and get out of * READING_SHADOW_PAGE_TABLES mode. */ if (req & KVM_REQUEST_WAIT) return mode != OUTSIDE_GUEST_MODE; /* * Need to kick a running VCPU, but otherwise there is nothing to do. 
*/ return mode == IN_GUEST_MODE; } static void ack_kick(void *_completed) { } static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait) { if (cpumask_empty(cpus)) return false; smp_call_function_many(cpus, ack_kick, NULL, wait); return true; } static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req, struct cpumask *tmp, int current_cpu) { int cpu; if (likely(!(req & KVM_REQUEST_NO_ACTION))) __kvm_make_request(req, vcpu); if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu)) return; /* * Note, the vCPU could get migrated to a different pCPU at any point * after kvm_request_needs_ipi(), which could result in sending an IPI * to the previous pCPU. But, that's OK because the purpose of the IPI * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES * after this point is also OK, as the requirement is only that KVM wait * for vCPUs that were reading SPTEs _before_ any changes were * finalized. See kvm_vcpu_kick() for more details on handling requests. */ if (kvm_request_needs_ipi(vcpu, req)) { cpu = READ_ONCE(vcpu->cpu); if (cpu != -1 && cpu != current_cpu) __cpumask_set_cpu(cpu, tmp); } } bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, unsigned long *vcpu_bitmap) { struct kvm_vcpu *vcpu; struct cpumask *cpus; int i, me; bool called; me = get_cpu(); cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask); cpumask_clear(cpus); for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) { vcpu = kvm_get_vcpu(kvm, i); if (!vcpu) continue; kvm_make_vcpu_request(vcpu, req, cpus, me); } called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT)); put_cpu(); return called; } bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) { struct kvm_vcpu *vcpu; struct cpumask *cpus; unsigned long i; bool called; int me; me = get_cpu(); cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask); cpumask_clear(cpus); kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_vcpu_request(vcpu, req, cpus, me); called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT)); put_cpu(); return called; } EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request); void kvm_flush_remote_tlbs(struct kvm *kvm) { ++kvm->stat.generic.remote_tlb_flush_requests; /* * We want to publish modifications to the page tables before reading * mode. Pairs with a memory barrier in arch-specific code. * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest * and smp_mb in walk_shadow_page_lockless_begin/end. * - powerpc: smp_mb in kvmppc_prepare_to_enter. * * There is already an smp_mb__after_atomic() before * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that * barrier here. */ if (!kvm_arch_flush_remote_tlbs(kvm) || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.generic.remote_tlb_flush; } EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) { if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages)) return; /* * Fall back to a flushing entire TLBs if the architecture range-based * TLB invalidation is unsupported or can't be performed for whatever * reason. */ kvm_flush_remote_tlbs(kvm); } void kvm_flush_remote_tlbs_memslot(struct kvm *kvm, const struct kvm_memory_slot *memslot) { /* * All current use cases for flushing the TLBs for a specific memslot * are related to dirty logging, and many do the TLB flush out of * mmu_lock. 
The interaction between the various operations on memslot * must be serialized by slots_locks to ensure the TLB flush from one * operation is observed by any other operation on the same memslot. */ lockdep_assert_held(&kvm->slots_lock); kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages); } static void kvm_flush_shadow_all(struct kvm *kvm) { kvm_arch_flush_shadow_all(kvm); kvm_arch_guest_memory_reclaimed(kvm); } #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc, gfp_t gfp_flags) { void *page; gfp_flags |= mc->gfp_zero; if (mc->kmem_cache) return kmem_cache_alloc(mc->kmem_cache, gfp_flags); page = (void *)__get_free_page(gfp_flags); if (page && mc->init_value) memset64(page, mc->init_value, PAGE_SIZE / sizeof(u64)); return page; } int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min) { gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT; void *obj; if (mc->nobjs >= min) return 0; if (unlikely(!mc->objects)) { if (WARN_ON_ONCE(!capacity)) return -EIO; /* * Custom init values can be used only for page allocations, * and obviously conflict with __GFP_ZERO. */ if (WARN_ON_ONCE(mc->init_value && (mc->kmem_cache || mc->gfp_zero))) return -EIO; mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp); if (!mc->objects) return -ENOMEM; mc->capacity = capacity; } /* It is illegal to request a different capacity across topups. */ if (WARN_ON_ONCE(mc->capacity != capacity)) return -EIO; while (mc->nobjs < mc->capacity) { obj = mmu_memory_cache_alloc_obj(mc, gfp); if (!obj) return mc->nobjs >= min ? 0 : -ENOMEM; mc->objects[mc->nobjs++] = obj; } return 0; } int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min) { return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min); } int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc) { return mc->nobjs; } void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) { while (mc->nobjs) { if (mc->kmem_cache) kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]); else free_page((unsigned long)mc->objects[--mc->nobjs]); } kvfree(mc->objects); mc->objects = NULL; mc->capacity = 0; } void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) { void *p; if (WARN_ON(!mc->nobjs)) p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT); else p = mc->objects[--mc->nobjs]; BUG_ON(!p); return p; } #endif static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) { mutex_init(&vcpu->mutex); vcpu->cpu = -1; vcpu->kvm = kvm; vcpu->vcpu_id = id; vcpu->pid = NULL; rwlock_init(&vcpu->pid_lock); #ifndef __KVM_HAVE_ARCH_WQP rcuwait_init(&vcpu->wait); #endif kvm_async_pf_vcpu_init(vcpu); kvm_vcpu_set_in_spin_loop(vcpu, false); kvm_vcpu_set_dy_eligible(vcpu, false); vcpu->preempted = false; vcpu->ready = false; preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); vcpu->last_used_slot = NULL; /* Fill the stats id string for the vcpu */ snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d", task_pid_nr(current), id); } static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) { kvm_arch_vcpu_destroy(vcpu); kvm_dirty_ring_free(&vcpu->dirty_ring); /* * No need for rcu_read_lock as VCPU_RUN is the only place that changes * the vcpu->pid pointer, and at destruction time all file descriptors * are already gone. 
*/ put_pid(vcpu->pid); free_page((unsigned long)vcpu->run); kmem_cache_free(kvm_vcpu_cache, vcpu); } void kvm_destroy_vcpus(struct kvm *kvm) { unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { kvm_vcpu_destroy(vcpu); xa_erase(&kvm->vcpu_array, i); } atomic_set(&kvm->online_vcpus, 0); } EXPORT_SYMBOL_GPL(kvm_destroy_vcpus); #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) { return container_of(mn, struct kvm, mmu_notifier); } typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); typedef void (*on_lock_fn_t)(struct kvm *kvm); struct kvm_mmu_notifier_range { /* * 64-bit addresses, as KVM notifiers can operate on host virtual * addresses (unsigned long) and guest physical addresses (64-bit). */ u64 start; u64 end; union kvm_mmu_notifier_arg arg; gfn_handler_t handler; on_lock_fn_t on_lock; bool flush_on_ret; bool may_block; }; /* * The inner-most helper returns a tuple containing the return value from the * arch- and action-specific handler, plus a flag indicating whether or not at * least one memslot was found, i.e. if the handler found guest memory. * * Note, most notifiers are averse to booleans, so even though KVM tracks the * return from arch code as a bool, outer helpers will cast it to an int. :-( */ typedef struct kvm_mmu_notifier_return { bool ret; bool found_memslot; } kvm_mn_ret_t; /* * Use a dedicated stub instead of NULL to indicate that there is no callback * function/handler. The compiler technically can't guarantee that a real * function will have a non-zero address, and so it will generate code to * check for !NULL, whereas comparing against a stub will be elided at compile * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9). */ static void kvm_null_fn(void) { } #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn) /* Iterate over each memslot intersecting [start, last] (inclusive) range */ #define kvm_for_each_memslot_in_hva_range(node, slots, start, last) \ for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \ node; \ node = interval_tree_iter_next(node, start, last)) \ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, const struct kvm_mmu_notifier_range *range) { struct kvm_mmu_notifier_return r = { .ret = false, .found_memslot = false, }; struct kvm_gfn_range gfn_range; struct kvm_memory_slot *slot; struct kvm_memslots *slots; int i, idx; if (WARN_ON_ONCE(range->end <= range->start)) return r; /* A null handler is allowed if and only if on_lock() is provided. */ if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) && IS_KVM_NULL_FN(range->handler))) return r; idx = srcu_read_lock(&kvm->srcu); for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { struct interval_tree_node *node; slots = __kvm_memslots(kvm, i); kvm_for_each_memslot_in_hva_range(node, slots, range->start, range->end - 1) { unsigned long hva_start, hva_end; slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]); hva_start = max_t(unsigned long, range->start, slot->userspace_addr); hva_end = min_t(unsigned long, range->end, slot->userspace_addr + (slot->npages << PAGE_SHIFT)); /* * To optimize for the likely case where the address * range is covered by zero or one memslots, don't * bother making these conditional (to avoid writes on * the second or later invocation of the handler). 
*/ gfn_range.arg = range->arg; gfn_range.may_block = range->may_block; /* * HVA-based notifications aren't relevant to private * mappings as they don't have a userspace mapping. */ gfn_range.attr_filter = KVM_FILTER_SHARED; /* * {gfn(page) | page intersects with [hva_start, hva_end)} = * {gfn_start, gfn_start+1, ..., gfn_end-1}. */ gfn_range.start = hva_to_gfn_memslot(hva_start, slot); gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot); gfn_range.slot = slot; if (!r.found_memslot) { r.found_memslot = true; KVM_MMU_LOCK(kvm); if (!IS_KVM_NULL_FN(range->on_lock)) range->on_lock(kvm); if (IS_KVM_NULL_FN(range->handler)) goto mmu_unlock; } r.ret |= range->handler(kvm, &gfn_range); } } if (range->flush_on_ret && r.ret) kvm_flush_remote_tlbs(kvm); mmu_unlock: if (r.found_memslot) KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); return r; } static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, unsigned long start, unsigned long end, gfn_handler_t handler, bool flush_on_ret) { struct kvm *kvm = mmu_notifier_to_kvm(mn); const struct kvm_mmu_notifier_range range = { .start = start, .end = end, .handler = handler, .on_lock = (void *)kvm_null_fn, .flush_on_ret = flush_on_ret, .may_block = false, }; return __kvm_handle_hva_range(kvm, &range).ret; } static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn, unsigned long start, unsigned long end, gfn_handler_t handler) { return kvm_handle_hva_range(mn, start, end, handler, false); } void kvm_mmu_invalidate_begin(struct kvm *kvm) { lockdep_assert_held_write(&kvm->mmu_lock); /* * The count increase must become visible at unlock time as no * spte can be established without taking the mmu_lock and * count is also read inside the mmu_lock critical section. */ kvm->mmu_invalidate_in_progress++; if (likely(kvm->mmu_invalidate_in_progress == 1)) { kvm->mmu_invalidate_range_start = INVALID_GPA; kvm->mmu_invalidate_range_end = INVALID_GPA; } } void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end) { lockdep_assert_held_write(&kvm->mmu_lock); WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress); if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) { kvm->mmu_invalidate_range_start = start; kvm->mmu_invalidate_range_end = end; } else { /* * Fully tracking multiple concurrent ranges has diminishing * returns. Keep things simple and just find the minimal range * which includes the current and new ranges. As there won't be * enough information to subtract a range after its invalidate * completes, any ranges invalidated concurrently will * accumulate and persist until all outstanding invalidates * complete. 
*/ kvm->mmu_invalidate_range_start = min(kvm->mmu_invalidate_range_start, start); kvm->mmu_invalidate_range_end = max(kvm->mmu_invalidate_range_end, end); } } bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { kvm_mmu_invalidate_range_add(kvm, range->start, range->end); return kvm_unmap_gfn_range(kvm, range); } static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, const struct mmu_notifier_range *range) { struct kvm *kvm = mmu_notifier_to_kvm(mn); const struct kvm_mmu_notifier_range hva_range = { .start = range->start, .end = range->end, .handler = kvm_mmu_unmap_gfn_range, .on_lock = kvm_mmu_invalidate_begin, .flush_on_ret = true, .may_block = mmu_notifier_range_blockable(range), }; trace_kvm_unmap_hva_range(range->start, range->end); /* * Prevent memslot modification between range_start() and range_end() * so that conditionally locking provides the same result in both * functions. Without that guarantee, the mmu_invalidate_in_progress * adjustments will be imbalanced. * * Pairs with the decrement in range_end(). */ spin_lock(&kvm->mn_invalidate_lock); kvm->mn_active_invalidate_count++; spin_unlock(&kvm->mn_invalidate_lock); /* * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e. * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring * each cache's lock. There are relatively few caches in existence at * any given time, and the caches themselves can check for hva overlap, * i.e. don't need to rely on memslot overlap checks for performance. * Because this runs without holding mmu_lock, the pfn caches must use * mn_active_invalidate_count (see above) instead of * mmu_invalidate_in_progress. */ gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end); /* * If one or more memslots were found and thus zapped, notify arch code * that guest memory has been reclaimed. This needs to be done *after* * dropping mmu_lock, as x86's reclaim path is slooooow. */ if (__kvm_handle_hva_range(kvm, &hva_range).found_memslot) kvm_arch_guest_memory_reclaimed(kvm); return 0; } void kvm_mmu_invalidate_end(struct kvm *kvm) { lockdep_assert_held_write(&kvm->mmu_lock); /* * This sequence increase will notify the kvm page fault that * the page that is going to be mapped in the spte could have * been freed. */ kvm->mmu_invalidate_seq++; smp_wmb(); /* * The above sequence increase must be visible before the * below count decrease, which is ensured by the smp_wmb above * in conjunction with the smp_rmb in mmu_invalidate_retry(). */ kvm->mmu_invalidate_in_progress--; KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm); /* * Assert that at least one range was added between start() and end(). * Not adding a range isn't fatal, but it is a KVM bug. */ WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA); } static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, const struct mmu_notifier_range *range) { struct kvm *kvm = mmu_notifier_to_kvm(mn); const struct kvm_mmu_notifier_range hva_range = { .start = range->start, .end = range->end, .handler = (void *)kvm_null_fn, .on_lock = kvm_mmu_invalidate_end, .flush_on_ret = false, .may_block = mmu_notifier_range_blockable(range), }; bool wake; __kvm_handle_hva_range(kvm, &hva_range); /* Pairs with the increment in range_start(). 
*/ spin_lock(&kvm->mn_invalidate_lock); if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count)) --kvm->mn_active_invalidate_count; wake = !kvm->mn_active_invalidate_count; spin_unlock(&kvm->mn_invalidate_lock); /* * There can only be one waiter, since the wait happens under * slots_lock. */ if (wake) rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait); } static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) { trace_kvm_age_hva(start, end); return kvm_handle_hva_range(mn, start, end, kvm_age_gfn, !IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG)); } static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) { trace_kvm_age_hva(start, end); /* * Even though we do not flush TLB, this will still adversely * affect performance on pre-Haswell Intel EPT, where there is * no EPT Access Bit to clear so that we have to tear down EPT * tables instead. If we find this unacceptable, we can always * add a parameter to kvm_age_hva so that it effectively doesn't * do anything on clear_young. * * Also note that currently we never issue secondary TLB flushes * from clear_young, leaving this job up to the regular system * cadence. If we find this inaccurate, we might come up with a * more sophisticated heuristic later. */ return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn); } static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long address) { trace_kvm_test_age_hva(address); return kvm_handle_hva_range_no_flush(mn, address, address + 1, kvm_test_age_gfn); } static void kvm_mmu_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct kvm *kvm = mmu_notifier_to_kvm(mn); int idx; idx = srcu_read_lock(&kvm->srcu); kvm_flush_shadow_all(kvm); srcu_read_unlock(&kvm->srcu, idx); } static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, .clear_young = kvm_mmu_notifier_clear_young, .test_young = kvm_mmu_notifier_test_young, .release = kvm_mmu_notifier_release, }; static int kvm_init_mmu_notifier(struct kvm *kvm) { kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; return mmu_notifier_register(&kvm->mmu_notifier, current->mm); } #else /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */ static int kvm_init_mmu_notifier(struct kvm *kvm) { return 0; } #endif /* CONFIG_KVM_GENERIC_MMU_NOTIFIER */ #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER static int kvm_pm_notifier_call(struct notifier_block *bl, unsigned long state, void *unused) { struct kvm *kvm = container_of(bl, struct kvm, pm_notifier); return kvm_arch_pm_notifier(kvm, state); } static void kvm_init_pm_notifier(struct kvm *kvm) { kvm->pm_notifier.notifier_call = kvm_pm_notifier_call; /* Suspend KVM before we suspend ftrace, RCU, etc. 
*/ kvm->pm_notifier.priority = INT_MAX; register_pm_notifier(&kvm->pm_notifier); } static void kvm_destroy_pm_notifier(struct kvm *kvm) { unregister_pm_notifier(&kvm->pm_notifier); } #else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */ static void kvm_init_pm_notifier(struct kvm *kvm) { } static void kvm_destroy_pm_notifier(struct kvm *kvm) { } #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) { if (!memslot->dirty_bitmap) return; vfree(memslot->dirty_bitmap); memslot->dirty_bitmap = NULL; } /* This does not remove the slot from struct kvm_memslots data structures */ static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { if (slot->flags & KVM_MEM_GUEST_MEMFD) kvm_gmem_unbind(slot); kvm_destroy_dirty_bitmap(slot); kvm_arch_free_memslot(kvm, slot); kfree(slot); } static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots) { struct hlist_node *idnode; struct kvm_memory_slot *memslot; int bkt; /* * The same memslot objects live in both active and inactive sets, * arbitrarily free using index '1' so the second invocation of this * function isn't operating over a structure with dangling pointers * (even though this function isn't actually touching them). */ if (!slots->node_idx) return; hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1]) kvm_free_memslot(kvm, memslot); } static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc) { switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) { case KVM_STATS_TYPE_INSTANT: return 0444; case KVM_STATS_TYPE_CUMULATIVE: case KVM_STATS_TYPE_PEAK: default: return 0644; } } static void kvm_destroy_vm_debugfs(struct kvm *kvm) { int i; int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc + kvm_vcpu_stats_header.num_desc; if (IS_ERR(kvm->debugfs_dentry)) return; debugfs_remove_recursive(kvm->debugfs_dentry); if (kvm->debugfs_stat_data) { for (i = 0; i < kvm_debugfs_num_entries; i++) kfree(kvm->debugfs_stat_data[i]); kfree(kvm->debugfs_stat_data); } } static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname) { static DEFINE_MUTEX(kvm_debugfs_lock); struct dentry *dent; char dir_name[ITOA_MAX_LEN * 2]; struct kvm_stat_data *stat_data; const struct _kvm_stats_desc *pdesc; int i, ret = -ENOMEM; int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc + kvm_vcpu_stats_header.num_desc; if (!debugfs_initialized()) return 0; snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname); mutex_lock(&kvm_debugfs_lock); dent = debugfs_lookup(dir_name, kvm_debugfs_dir); if (dent) { pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name); dput(dent); mutex_unlock(&kvm_debugfs_lock); return 0; } dent = debugfs_create_dir(dir_name, kvm_debugfs_dir); mutex_unlock(&kvm_debugfs_lock); if (IS_ERR(dent)) return 0; kvm->debugfs_dentry = dent; kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries, sizeof(*kvm->debugfs_stat_data), GFP_KERNEL_ACCOUNT); if (!kvm->debugfs_stat_data) goto out_err; for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) { pdesc = &kvm_vm_stats_desc[i]; stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT); if (!stat_data) goto out_err; stat_data->kvm = kvm; stat_data->desc = pdesc; stat_data->kind = KVM_STAT_VM; kvm->debugfs_stat_data[i] = stat_data; debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc), kvm->debugfs_dentry, stat_data, &stat_fops_per_vm); } for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) { pdesc = &kvm_vcpu_stats_desc[i]; stat_data = kzalloc(sizeof(*stat_data), 
GFP_KERNEL_ACCOUNT); if (!stat_data) goto out_err; stat_data->kvm = kvm; stat_data->desc = pdesc; stat_data->kind = KVM_STAT_VCPU; kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data; debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc), kvm->debugfs_dentry, stat_data, &stat_fops_per_vm); } kvm_arch_create_vm_debugfs(kvm); return 0; out_err: kvm_destroy_vm_debugfs(kvm); return ret; } /* * Called just after removing the VM from the vm_list, but before doing any * other destruction. */ void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm) { } /* * Called after per-vm debugfs created. When called kvm->debugfs_dentry should * be setup already, so we can create arch-specific debugfs entries under it. * Cleanup should be automatic done in kvm_destroy_vm_debugfs() recursively, so * a per-arch destroy interface is not needed. */ void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm) { } static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) { struct kvm *kvm = kvm_arch_alloc_vm(); struct kvm_memslots *slots; int r, i, j; if (!kvm) return ERR_PTR(-ENOMEM); KVM_MMU_LOCK_INIT(kvm); mmgrab(current->mm); kvm->mm = current->mm; kvm_eventfd_init(kvm); mutex_init(&kvm->lock); mutex_init(&kvm->irq_lock); mutex_init(&kvm->slots_lock); mutex_init(&kvm->slots_arch_lock); spin_lock_init(&kvm->mn_invalidate_lock); rcuwait_init(&kvm->mn_memslots_update_rcuwait); xa_init(&kvm->vcpu_array); #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES xa_init(&kvm->mem_attr_array); #endif INIT_LIST_HEAD(&kvm->gpc_list); spin_lock_init(&kvm->gpc_lock); INIT_LIST_HEAD(&kvm->devices); kvm->max_vcpus = KVM_MAX_VCPUS; BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); /* * Force subsequent debugfs file creations to fail if the VM directory * is not created (by kvm_create_vm_debugfs()). */ kvm->debugfs_dentry = ERR_PTR(-ENOENT); snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d", task_pid_nr(current)); r = -ENOMEM; if (init_srcu_struct(&kvm->srcu)) goto out_err_no_srcu; if (init_srcu_struct(&kvm->irq_srcu)) goto out_err_no_irq_srcu; r = kvm_init_irq_routing(kvm); if (r) goto out_err_no_irq_routing; refcount_set(&kvm->users_count, 1); for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { for (j = 0; j < 2; j++) { slots = &kvm->__memslots[i][j]; atomic_long_set(&slots->last_used_slot, (unsigned long)NULL); slots->hva_tree = RB_ROOT_CACHED; slots->gfn_tree = RB_ROOT; hash_init(slots->id_hash); slots->node_idx = j; /* Generations must be different for each address space. 
*/ slots->generation = i; } rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]); } r = -ENOMEM; for (i = 0; i < KVM_NR_BUSES; i++) { rcu_assign_pointer(kvm->buses[i], kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT)); if (!kvm->buses[i]) goto out_err_no_arch_destroy_vm; } r = kvm_arch_init_vm(kvm, type); if (r) goto out_err_no_arch_destroy_vm; r = kvm_enable_virtualization(); if (r) goto out_err_no_disable; #ifdef CONFIG_HAVE_KVM_IRQCHIP INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); #endif r = kvm_init_mmu_notifier(kvm); if (r) goto out_err_no_mmu_notifier; r = kvm_coalesced_mmio_init(kvm); if (r < 0) goto out_no_coalesced_mmio; r = kvm_create_vm_debugfs(kvm, fdname); if (r) goto out_err_no_debugfs; mutex_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); mutex_unlock(&kvm_lock); preempt_notifier_inc(); kvm_init_pm_notifier(kvm); return kvm; out_err_no_debugfs: kvm_coalesced_mmio_free(kvm); out_no_coalesced_mmio: #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER if (kvm->mmu_notifier.ops) mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); #endif out_err_no_mmu_notifier: kvm_disable_virtualization(); out_err_no_disable: kvm_arch_destroy_vm(kvm); out_err_no_arch_destroy_vm: WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count)); for (i = 0; i < KVM_NR_BUSES; i++) kfree(kvm_get_bus(kvm, i)); kvm_free_irq_routing(kvm); out_err_no_irq_routing: cleanup_srcu_struct(&kvm->irq_srcu); out_err_no_irq_srcu: cleanup_srcu_struct(&kvm->srcu); out_err_no_srcu: kvm_arch_free_vm(kvm); mmdrop(current->mm); return ERR_PTR(r); } static void kvm_destroy_devices(struct kvm *kvm) { struct kvm_device *dev, *tmp; /* * We do not need to take the kvm->lock here, because nobody else * has a reference to the struct kvm at this point and therefore * cannot access the devices list anyhow. * * The device list is generally managed as an rculist, but list_del() * is used intentionally here. If a bug in KVM introduced a reader that * was not backed by a reference on the kvm struct, the hope is that * it'd consume the poisoned forward pointer instead of suffering a * use-after-free, even though this cannot be guaranteed. */ list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) { list_del(&dev->vm_node); dev->ops->destroy(dev); } } static void kvm_destroy_vm(struct kvm *kvm) { int i; struct mm_struct *mm = kvm->mm; kvm_destroy_pm_notifier(kvm); kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); kvm_destroy_vm_debugfs(kvm); kvm_arch_sync_events(kvm); mutex_lock(&kvm_lock); list_del(&kvm->vm_list); mutex_unlock(&kvm_lock); kvm_arch_pre_destroy_vm(kvm); kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) { struct kvm_io_bus *bus = kvm_get_bus(kvm, i); if (bus) kvm_io_bus_destroy(bus); kvm->buses[i] = NULL; } kvm_coalesced_mmio_free(kvm); #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); /* * At this point, pending calls to invalidate_range_start() * have completed but no more MMU notifiers will run, so * mn_active_invalidate_count may remain unbalanced. * No threads can be waiting in kvm_swap_active_memslots() as the * last reference on KVM has been dropped, but freeing * memslots would deadlock without this manual intervention. * * If the count isn't unbalanced, i.e. KVM did NOT unregister its MMU * notifier between a start() and end(), then there shouldn't be any * in-progress invalidations. 
*/ WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait)); if (kvm->mn_active_invalidate_count) kvm->mn_active_invalidate_count = 0; else WARN_ON(kvm->mmu_invalidate_in_progress); #else kvm_flush_shadow_all(kvm); #endif kvm_arch_destroy_vm(kvm); kvm_destroy_devices(kvm); for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { kvm_free_memslots(kvm, &kvm->__memslots[i][0]); kvm_free_memslots(kvm, &kvm->__memslots[i][1]); } cleanup_srcu_struct(&kvm->irq_srcu); cleanup_srcu_struct(&kvm->srcu); #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES xa_destroy(&kvm->mem_attr_array); #endif kvm_arch_free_vm(kvm); preempt_notifier_dec(); kvm_disable_virtualization(); mmdrop(mm); } void kvm_get_kvm(struct kvm *kvm) { refcount_inc(&kvm->users_count); } EXPORT_SYMBOL_GPL(kvm_get_kvm); /* * Make sure the vm is not during destruction, which is a safe version of * kvm_get_kvm(). Return true if kvm referenced successfully, false otherwise. */ bool kvm_get_kvm_safe(struct kvm *kvm) { return refcount_inc_not_zero(&kvm->users_count); } EXPORT_SYMBOL_GPL(kvm_get_kvm_safe); void kvm_put_kvm(struct kvm *kvm) { if (refcount_dec_and_test(&kvm->users_count)) kvm_destroy_vm(kvm); } EXPORT_SYMBOL_GPL(kvm_put_kvm); /* * Used to put a reference that was taken on behalf of an object associated * with a user-visible file descriptor, e.g. a vcpu or device, if installation * of the new file descriptor fails and the reference cannot be transferred to * its final owner. In such cases, the caller is still actively using @kvm and * will fail miserably if the refcount unexpectedly hits zero. */ void kvm_put_kvm_no_destroy(struct kvm *kvm) { WARN_ON(refcount_dec_and_test(&kvm->users_count)); } EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy); static int kvm_vm_release(struct inode *inode, struct file *filp) { struct kvm *kvm = filp->private_data; kvm_irqfd_release(kvm); kvm_put_kvm(kvm); return 0; } /* * Allocation size is twice as large as the actual dirty bitmap size. * See kvm_vm_ioctl_get_dirty_log() why this is needed. */ static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot) { unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot); memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT); if (!memslot->dirty_bitmap) return -ENOMEM; return 0; } static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id) { struct kvm_memslots *active = __kvm_memslots(kvm, as_id); int node_idx_inactive = active->node_idx ^ 1; return &kvm->__memslots[as_id][node_idx_inactive]; } /* * Helper to get the address space ID when one of memslot pointers may be NULL. * This also serves as a sanity that at least one of the pointers is non-NULL, * and that their address space IDs don't diverge. 
*/ static int kvm_memslots_get_as_id(struct kvm_memory_slot *a, struct kvm_memory_slot *b) { if (WARN_ON_ONCE(!a && !b)) return 0; if (!a) return b->as_id; if (!b) return a->as_id; WARN_ON_ONCE(a->as_id != b->as_id); return a->as_id; } static void kvm_insert_gfn_node(struct kvm_memslots *slots, struct kvm_memory_slot *slot) { struct rb_root *gfn_tree = &slots->gfn_tree; struct rb_node **node, *parent; int idx = slots->node_idx; parent = NULL; for (node = &gfn_tree->rb_node; *node; ) { struct kvm_memory_slot *tmp; tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]); parent = *node; if (slot->base_gfn < tmp->base_gfn) node = &(*node)->rb_left; else if (slot->base_gfn > tmp->base_gfn) node = &(*node)->rb_right; else BUG(); } rb_link_node(&slot->gfn_node[idx], parent, node); rb_insert_color(&slot->gfn_node[idx], gfn_tree); } static void kvm_erase_gfn_node(struct kvm_memslots *slots, struct kvm_memory_slot *slot) { rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree); } static void kvm_replace_gfn_node(struct kvm_memslots *slots, struct kvm_memory_slot *old, struct kvm_memory_slot *new) { int idx = slots->node_idx; WARN_ON_ONCE(old->base_gfn != new->base_gfn); rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx], &slots->gfn_tree); } /* * Replace @old with @new in the inactive memslots. * * With NULL @old this simply adds @new. * With NULL @new this simply removes @old. * * If @new is non-NULL its hva_node[slots_idx] range has to be set * appropriately. */ static void kvm_replace_memslot(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *new) { int as_id = kvm_memslots_get_as_id(old, new); struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id); int idx = slots->node_idx; if (old) { hash_del(&old->id_node[idx]); interval_tree_remove(&old->hva_node[idx], &slots->hva_tree); if ((long)old == atomic_long_read(&slots->last_used_slot)) atomic_long_set(&slots->last_used_slot, (long)new); if (!new) { kvm_erase_gfn_node(slots, old); return; } } /* * Initialize @new's hva range. Do this even when replacing an @old * slot, kvm_copy_memslot() deliberately does not touch node data. */ new->hva_node[idx].start = new->userspace_addr; new->hva_node[idx].last = new->userspace_addr + (new->npages << PAGE_SHIFT) - 1; /* * (Re)Add the new memslot. There is no O(1) interval_tree_replace(), * hva_node needs to be swapped with remove+insert even though hva can't * change when replacing an existing slot. */ hash_add(slots->id_hash, &new->id_node[idx], new->id); interval_tree_insert(&new->hva_node[idx], &slots->hva_tree); /* * If the memslot gfn is unchanged, rb_replace_node() can be used to * switch the node in the gfn tree instead of removing the old and * inserting the new as two separate operations. Replacement is a * single O(1) operation versus two O(log(n)) operations for * remove+insert. */ if (old && old->base_gfn == new->base_gfn) { kvm_replace_gfn_node(slots, old, new); } else { if (old) kvm_erase_gfn_node(slots, old); kvm_insert_gfn_node(slots, new); } } /* * Flags that do not access any of the extra space of struct * kvm_userspace_memory_region2. KVM_SET_USER_MEMORY_REGION_V1_FLAGS * only allows these. 
*/ #define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \ (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY) static int check_memory_region_flags(struct kvm *kvm, const struct kvm_userspace_memory_region2 *mem) { u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; if (kvm_arch_has_private_mem(kvm)) valid_flags |= KVM_MEM_GUEST_MEMFD; /* Dirty logging private memory is not currently supported. */ if (mem->flags & KVM_MEM_GUEST_MEMFD) valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES; /* * GUEST_MEMFD is incompatible with read-only memslots, as writes to * read-only memslots have emulated MMIO, not page fault, semantics, * and KVM doesn't allow emulated MMIO for private memory. */ if (kvm_arch_has_readonly_mem(kvm) && !(mem->flags & KVM_MEM_GUEST_MEMFD)) valid_flags |= KVM_MEM_READONLY; if (mem->flags & ~valid_flags) return -EINVAL; return 0; } static void kvm_swap_active_memslots(struct kvm *kvm, int as_id) { struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id); /* Grab the generation from the activate memslots. */ u64 gen = __kvm_memslots(kvm, as_id)->generation; WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; /* * Do not store the new memslots while there are invalidations in * progress, otherwise the locking in invalidate_range_start and * invalidate_range_end will be unbalanced. */ spin_lock(&kvm->mn_invalidate_lock); prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait); while (kvm->mn_active_invalidate_count) { set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock(&kvm->mn_invalidate_lock); schedule(); spin_lock(&kvm->mn_invalidate_lock); } finish_rcuwait(&kvm->mn_memslots_update_rcuwait); rcu_assign_pointer(kvm->memslots[as_id], slots); spin_unlock(&kvm->mn_invalidate_lock); /* * Acquired in kvm_set_memslot. Must be released before synchronize * SRCU below in order to avoid deadlock with another thread * acquiring the slots_arch_lock in an srcu critical section. */ mutex_unlock(&kvm->slots_arch_lock); synchronize_srcu_expedited(&kvm->srcu); /* * Increment the new memslot generation a second time, dropping the * update in-progress flag and incrementing the generation based on * the number of address spaces. This provides a unique and easily * identifiable generation number while the memslots are in flux. */ gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; /* * Generations must be unique even across address spaces. We do not need * a global counter for that, instead the generation space is evenly split * across address spaces. For example, with two address spaces, address * space 0 will use generations 0, 2, 4, ... while address space 1 will * use generations 1, 3, 5, ... */ gen += kvm_arch_nr_memslot_as_ids(kvm); kvm_arch_memslots_updated(kvm, gen); slots->generation = gen; } static int kvm_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, struct kvm_memory_slot *new, enum kvm_mr_change change) { int r; /* * If dirty logging is disabled, nullify the bitmap; the old bitmap * will be freed on "commit". If logging is enabled in both old and * new, reuse the existing bitmap. If logging is enabled only in the * new and KVM isn't using a ring buffer, allocate and initialize a * new bitmap. 
*/ if (change != KVM_MR_DELETE) { if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) new->dirty_bitmap = NULL; else if (old && old->dirty_bitmap) new->dirty_bitmap = old->dirty_bitmap; else if (kvm_use_dirty_bitmap(kvm)) { r = kvm_alloc_dirty_bitmap(new); if (r) return r; if (kvm_dirty_log_manual_protect_and_init_set(kvm)) bitmap_set(new->dirty_bitmap, 0, new->npages); } } r = kvm_arch_prepare_memory_region(kvm, old, new, change); /* Free the bitmap on failure if it was allocated above. */ if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap)) kvm_destroy_dirty_bitmap(new); return r; } static void kvm_commit_memory_region(struct kvm *kvm, struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change) { int old_flags = old ? old->flags : 0; int new_flags = new ? new->flags : 0; /* * Update the total number of memslot pages before calling the arch * hook so that architectures can consume the result directly. */ if (change == KVM_MR_DELETE) kvm->nr_memslot_pages -= old->npages; else if (change == KVM_MR_CREATE) kvm->nr_memslot_pages += new->npages; if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) { int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1; atomic_set(&kvm->nr_memslots_dirty_logging, atomic_read(&kvm->nr_memslots_dirty_logging) + change); } kvm_arch_commit_memory_region(kvm, old, new, change); switch (change) { case KVM_MR_CREATE: /* Nothing more to do. */ break; case KVM_MR_DELETE: /* Free the old memslot and all its metadata. */ kvm_free_memslot(kvm, old); break; case KVM_MR_MOVE: case KVM_MR_FLAGS_ONLY: /* * Free the dirty bitmap as needed; the below check encompasses * both the flags and whether a ring buffer is being used) */ if (old->dirty_bitmap && !new->dirty_bitmap) kvm_destroy_dirty_bitmap(old); /* * The final quirk. Free the detached, old slot, but only its * memory, not any metadata. Metadata, including arch specific * data, may be reused by @new. */ kfree(old); break; default: BUG(); } } /* * Activate @new, which must be installed in the inactive slots by the caller, * by swapping the active slots and then propagating @new to @old once @old is * unreachable and can be safely modified. * * With NULL @old this simply adds @new to @active (while swapping the sets). * With NULL @new this simply removes @old from @active and frees it * (while also swapping the sets). */ static void kvm_activate_memslot(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *new) { int as_id = kvm_memslots_get_as_id(old, new); kvm_swap_active_memslots(kvm, as_id); /* Propagate the new memslot to the now inactive memslots. */ kvm_replace_memslot(kvm, old, new); } static void kvm_copy_memslot(struct kvm_memory_slot *dest, const struct kvm_memory_slot *src) { dest->base_gfn = src->base_gfn; dest->npages = src->npages; dest->dirty_bitmap = src->dirty_bitmap; dest->arch = src->arch; dest->userspace_addr = src->userspace_addr; dest->flags = src->flags; dest->id = src->id; dest->as_id = src->as_id; } static void kvm_invalidate_memslot(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *invalid_slot) { /* * Mark the current slot INVALID. As with all memslot modifications, * this must be done on an unreachable slot to avoid modifying the * current slot in the active tree. */ kvm_copy_memslot(invalid_slot, old); invalid_slot->flags |= KVM_MEMSLOT_INVALID; kvm_replace_memslot(kvm, old, invalid_slot); /* * Activate the slot that is now marked INVALID, but don't propagate * the slot to the now inactive slots. 
The slot is either going to be * deleted or recreated as a new slot. */ kvm_swap_active_memslots(kvm, old->as_id); /* * From this point no new shadow pages pointing to a deleted, or moved, * memslot will be created. Validation of sp->gfn happens in: * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) * - kvm_is_visible_gfn (mmu_check_root) */ kvm_arch_flush_shadow_memslot(kvm, old); kvm_arch_guest_memory_reclaimed(kvm); /* Was released by kvm_swap_active_memslots(), reacquire. */ mutex_lock(&kvm->slots_arch_lock); /* * Copy the arch-specific field of the newly-installed slot back to the * old slot as the arch data could have changed between releasing * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock * above. Writers are required to retrieve memslots *after* acquiring * slots_arch_lock, thus the active slot's data is guaranteed to be fresh. */ old->arch = invalid_slot->arch; } static void kvm_create_memslot(struct kvm *kvm, struct kvm_memory_slot *new) { /* Add the new memslot to the inactive set and activate. */ kvm_replace_memslot(kvm, NULL, new); kvm_activate_memslot(kvm, NULL, new); } static void kvm_delete_memslot(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *invalid_slot) { /* * Remove the old memslot (in the inactive memslots) by passing NULL as * the "new" slot, and for the invalid version in the active slots. */ kvm_replace_memslot(kvm, old, NULL); kvm_activate_memslot(kvm, invalid_slot, NULL); } static void kvm_move_memslot(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *new, struct kvm_memory_slot *invalid_slot) { /* * Replace the old memslot in the inactive slots, and then swap slots * and replace the current INVALID with the new as well. */ kvm_replace_memslot(kvm, old, new); kvm_activate_memslot(kvm, invalid_slot, new); } static void kvm_update_flags_memslot(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *new) { /* * Similar to the MOVE case, but the slot doesn't need to be zapped as * an intermediate step. Instead, the old memslot is simply replaced * with a new, updated copy in both memslot sets. */ kvm_replace_memslot(kvm, old, new); kvm_activate_memslot(kvm, old, new); } static int kvm_set_memslot(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *new, enum kvm_mr_change change) { struct kvm_memory_slot *invalid_slot; int r; /* * Released in kvm_swap_active_memslots(). * * Must be held from before the current memslots are copied until after * the new memslots are installed with rcu_assign_pointer, then * released before the synchronize srcu in kvm_swap_active_memslots(). * * When modifying memslots outside of the slots_lock, must be held * before reading the pointer to the current memslots until after all * changes to those memslots are complete. * * These rules ensure that installing new memslots does not lose * changes made to the previous memslots. */ mutex_lock(&kvm->slots_arch_lock); /* * Invalidate the old slot if it's being deleted or moved. This is * done prior to actually deleting/moving the memslot to allow vCPUs to * continue running by ensuring there are no mappings or shadow pages * for the memslot when it is deleted/moved. Without pre-invalidation * (and without a lock), a window would exist between effecting the * delete/move and committing the changes in arch code where KVM or a * guest could access a non-existent memslot. * * Modifications are done on a temporary, unreachable slot. 
The old * slot needs to be preserved in case a later step fails and the * invalidation needs to be reverted. */ if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) { invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT); if (!invalid_slot) { mutex_unlock(&kvm->slots_arch_lock); return -ENOMEM; } kvm_invalidate_memslot(kvm, old, invalid_slot); } r = kvm_prepare_memory_region(kvm, old, new, change); if (r) { /* * For DELETE/MOVE, revert the above INVALID change. No * modifications required since the original slot was preserved * in the inactive slots. Changing the active memslots also * release slots_arch_lock. */ if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) { kvm_activate_memslot(kvm, invalid_slot, old); kfree(invalid_slot); } else { mutex_unlock(&kvm->slots_arch_lock); } return r; } /* * For DELETE and MOVE, the working slot is now active as the INVALID * version of the old slot. MOVE is particularly special as it reuses * the old slot and returns a copy of the old slot (in working_slot). * For CREATE, there is no old slot. For DELETE and FLAGS_ONLY, the * old slot is detached but otherwise preserved. */ if (change == KVM_MR_CREATE) kvm_create_memslot(kvm, new); else if (change == KVM_MR_DELETE) kvm_delete_memslot(kvm, old, invalid_slot); else if (change == KVM_MR_MOVE) kvm_move_memslot(kvm, old, new, invalid_slot); else if (change == KVM_MR_FLAGS_ONLY) kvm_update_flags_memslot(kvm, old, new); else BUG(); /* Free the temporary INVALID slot used for DELETE and MOVE. */ if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) kfree(invalid_slot); /* * No need to refresh new->arch, changes after dropping slots_arch_lock * will directly hit the final, active memslot. Architectures are * responsible for knowing that new->arch may be stale. */ kvm_commit_memory_region(kvm, old, new, change); return 0; } static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id, gfn_t start, gfn_t end) { struct kvm_memslot_iter iter; kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) { if (iter.slot->id != id) return true; } return false; } static int kvm_set_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region2 *mem) { struct kvm_memory_slot *old, *new; struct kvm_memslots *slots; enum kvm_mr_change change; unsigned long npages; gfn_t base_gfn; int as_id, id; int r; lockdep_assert_held(&kvm->slots_lock); r = check_memory_region_flags(kvm, mem); if (r) return r; as_id = mem->slot >> 16; id = (u16)mem->slot; /* General sanity checks */ if ((mem->memory_size & (PAGE_SIZE - 1)) || (mem->memory_size != (unsigned long)mem->memory_size)) return -EINVAL; if (mem->guest_phys_addr & (PAGE_SIZE - 1)) return -EINVAL; /* We can read the guest memory with __xxx_user() later on. */ if ((mem->userspace_addr & (PAGE_SIZE - 1)) || (mem->userspace_addr != untagged_addr(mem->userspace_addr)) || !access_ok((void __user *)(unsigned long)mem->userspace_addr, mem->memory_size)) return -EINVAL; if (mem->flags & KVM_MEM_GUEST_MEMFD && (mem->guest_memfd_offset & (PAGE_SIZE - 1) || mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset)) return -EINVAL; if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM) return -EINVAL; if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) return -EINVAL; /* * The size of userspace-defined memory regions is restricted in order * to play nice with dirty bitmap operations, which are indexed with an * "unsigned int". KVM's internal memory regions don't support dirty * logging, and so are exempt. 
*/ if (id < KVM_USER_MEM_SLOTS && (mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES) return -EINVAL; slots = __kvm_memslots(kvm, as_id); /* * Note, the old memslot (and the pointer itself!) may be invalidated * and/or destroyed by kvm_set_memslot(). */ old = id_to_memslot(slots, id); if (!mem->memory_size) { if (!old || !old->npages) return -EINVAL; if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages)) return -EIO; return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE); } base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT); npages = (mem->memory_size >> PAGE_SHIFT); if (!old || !old->npages) { change = KVM_MR_CREATE; /* * To simplify KVM internals, the total number of pages across * all memslots must fit in an unsigned long. */ if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages) return -EINVAL; } else { /* Modify an existing slot. */ /* Private memslots are immutable, they can only be deleted. */ if (mem->flags & KVM_MEM_GUEST_MEMFD) return -EINVAL; if ((mem->userspace_addr != old->userspace_addr) || (npages != old->npages) || ((mem->flags ^ old->flags) & KVM_MEM_READONLY)) return -EINVAL; if (base_gfn != old->base_gfn) change = KVM_MR_MOVE; else if (mem->flags != old->flags) change = KVM_MR_FLAGS_ONLY; else /* Nothing to change. */ return 0; } if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) && kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages)) return -EEXIST; /* Allocate a slot that will persist in the memslot. */ new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT); if (!new) return -ENOMEM; new->as_id = as_id; new->id = id; new->base_gfn = base_gfn; new->npages = npages; new->flags = mem->flags; new->userspace_addr = mem->userspace_addr; if (mem->flags & KVM_MEM_GUEST_MEMFD) { r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset); if (r) goto out; } r = kvm_set_memslot(kvm, old, new, change); if (r) goto out_unbind; return 0; out_unbind: if (mem->flags & KVM_MEM_GUEST_MEMFD) kvm_gmem_unbind(new); out: kfree(new); return r; } int kvm_set_internal_memslot(struct kvm *kvm, const struct kvm_userspace_memory_region2 *mem) { if (WARN_ON_ONCE(mem->slot < KVM_USER_MEM_SLOTS)) return -EINVAL; if (WARN_ON_ONCE(mem->flags)) return -EINVAL; return kvm_set_memory_region(kvm, mem); } EXPORT_SYMBOL_GPL(kvm_set_internal_memslot); static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region2 *mem) { if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) return -EINVAL; guard(mutex)(&kvm->slots_lock); return kvm_set_memory_region(kvm, mem); } #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT /** * kvm_get_dirty_log - get a snapshot of dirty pages * @kvm: pointer to kvm instance * @log: slot id and address to which we copy the log * @is_dirty: set to '1' if any dirty pages were found * @memslot: set to the associated memslot, always valid on success */ int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, int *is_dirty, struct kvm_memory_slot **memslot) { struct kvm_memslots *slots; int i, as_id, id; unsigned long n; unsigned long any = 0; /* Dirty ring tracking may be exclusive to dirty log tracking */ if (!kvm_use_dirty_bitmap(kvm)) return -ENXIO; *memslot = NULL; *is_dirty = 0; as_id = log->slot >> 16; id = (u16)log->slot; if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return -EINVAL; slots = __kvm_memslots(kvm, as_id); *memslot = id_to_memslot(slots, id); if (!(*memslot) || !(*memslot)->dirty_bitmap) return -ENOENT; kvm_arch_sync_dirty_log(kvm, *memslot); n = kvm_dirty_bitmap_bytes(*memslot); for 
(i = 0; !any && i < n/sizeof(long); ++i) any = (*memslot)->dirty_bitmap[i]; if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n)) return -EFAULT; if (any) *is_dirty = 1; return 0; } EXPORT_SYMBOL_GPL(kvm_get_dirty_log); #else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ /** * kvm_get_dirty_log_protect - get a snapshot of dirty pages * and reenable dirty page tracking for the corresponding pages. * @kvm: pointer to kvm instance * @log: slot id and address to which we copy the log * * We need to keep it in mind that VCPU threads can write to the bitmap * concurrently. So, to avoid losing track of dirty pages we keep the * following order: * * 1. Take a snapshot of the bit and clear it if needed. * 2. Write protect the corresponding page. * 3. Copy the snapshot to the userspace. * 4. Upon return caller flushes TLB's if needed. * * Between 2 and 4, the guest may write to the page using the remaining TLB * entry. This is not a problem because the page is reported dirty using * the snapshot taken before and step 4 ensures that writes done after * exiting to userspace will be logged for the next call. * */ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int i, as_id, id; unsigned long n; unsigned long *dirty_bitmap; unsigned long *dirty_bitmap_buffer; bool flush; /* Dirty ring tracking may be exclusive to dirty log tracking */ if (!kvm_use_dirty_bitmap(kvm)) return -ENXIO; as_id = log->slot >> 16; id = (u16)log->slot; if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return -EINVAL; slots = __kvm_memslots(kvm, as_id); memslot = id_to_memslot(slots, id); if (!memslot || !memslot->dirty_bitmap) return -ENOENT; dirty_bitmap = memslot->dirty_bitmap; kvm_arch_sync_dirty_log(kvm, memslot); n = kvm_dirty_bitmap_bytes(memslot); flush = false; if (kvm->manual_dirty_log_protect) { /* * Unlike kvm_get_dirty_log, we always return false in *flush, * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There * is some code duplication between this function and * kvm_get_dirty_log, but hopefully all architecture * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log * can be eliminated. */ dirty_bitmap_buffer = dirty_bitmap; } else { dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); memset(dirty_bitmap_buffer, 0, n); KVM_MMU_LOCK(kvm); for (i = 0; i < n / sizeof(long); i++) { unsigned long mask; gfn_t offset; if (!dirty_bitmap[i]) continue; flush = true; mask = xchg(&dirty_bitmap[i], 0); dirty_bitmap_buffer[i] = mask; offset = i * BITS_PER_LONG; kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); } KVM_MMU_UNLOCK(kvm); } if (flush) kvm_flush_remote_tlbs_memslot(kvm, memslot); if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) return -EFAULT; return 0; } /** * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot * @kvm: kvm instance * @log: slot id and address to which we copy the log * * Steps 1-4 below provide general overview of dirty page logging. See * kvm_get_dirty_log_protect() function description for additional details. * * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we * always flush the TLB (step 4) even if previous step failed and the dirty * bitmap may be corrupt. Regardless of previous outcome the KVM logging API * does not preclude user space subsequent dirty log read. Flushing TLB ensures * writes will be marked dirty for next log read. * * 1. 
Take a snapshot of the bit and clear it if needed. * 2. Write protect the corresponding page. * 3. Copy the snapshot to the userspace. * 4. Flush TLB's if needed. */ static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { int r; mutex_lock(&kvm->slots_lock); r = kvm_get_dirty_log_protect(kvm, log); mutex_unlock(&kvm->slots_lock); return r; } /** * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap * and reenable dirty page tracking for the corresponding pages. * @kvm: pointer to kvm instance * @log: slot id and address from which to fetch the bitmap of dirty pages */ static int kvm_clear_dirty_log_protect(struct kvm *kvm, struct kvm_clear_dirty_log *log) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int as_id, id; gfn_t offset; unsigned long i, n; unsigned long *dirty_bitmap; unsigned long *dirty_bitmap_buffer; bool flush; /* Dirty ring tracking may be exclusive to dirty log tracking */ if (!kvm_use_dirty_bitmap(kvm)) return -ENXIO; as_id = log->slot >> 16; id = (u16)log->slot; if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return -EINVAL; if (log->first_page & 63) return -EINVAL; slots = __kvm_memslots(kvm, as_id); memslot = id_to_memslot(slots, id); if (!memslot || !memslot->dirty_bitmap) return -ENOENT; dirty_bitmap = memslot->dirty_bitmap; n = ALIGN(log->num_pages, BITS_PER_LONG) / 8; if (log->first_page > memslot->npages || log->num_pages > memslot->npages - log->first_page || (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63))) return -EINVAL; kvm_arch_sync_dirty_log(kvm, memslot); flush = false; dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n)) return -EFAULT; KVM_MMU_LOCK(kvm); for (offset = log->first_page, i = offset / BITS_PER_LONG, n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--; i++, offset += BITS_PER_LONG) { unsigned long mask = *dirty_bitmap_buffer++; atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i]; if (!mask) continue; mask &= atomic_long_fetch_andnot(mask, p); /* * mask contains the bits that really have been cleared. This * never includes any bits beyond the length of the memslot (if * the length is not aligned to 64 pages), therefore it is not * a problem if userspace sets them in log->dirty_bitmap. */ if (mask) { flush = true; kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); } } KVM_MMU_UNLOCK(kvm); if (flush) kvm_flush_remote_tlbs_memslot(kvm, memslot); return 0; } static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) { int r; mutex_lock(&kvm->slots_lock); r = kvm_clear_dirty_log_protect(kvm, log); mutex_unlock(&kvm->slots_lock); return r; } #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES static u64 kvm_supported_mem_attributes(struct kvm *kvm) { if (!kvm || kvm_arch_has_private_mem(kvm)) return KVM_MEMORY_ATTRIBUTE_PRIVATE; return 0; } /* * Returns true if _all_ gfns in the range [@start, @end) have attributes * such that the bits in @mask match @attrs. 
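 *
 * A hedged sketch of how the @mask/@attrs pair is meant to be used by a
 * caller, e.g. arch code asking whether a range is uniformly private. The
 * wrapper name is hypothetical; only the call itself matters:
 *
 *   static bool range_is_all_private(struct kvm *kvm, gfn_t start, gfn_t end)
 *   {
 *           // Every bit in @mask must match @attrs for every gfn in [start, end).
 *           return kvm_range_has_memory_attributes(kvm, start, end,
 *                                                  KVM_MEMORY_ATTRIBUTE_PRIVATE,
 *                                                  KVM_MEMORY_ATTRIBUTE_PRIVATE);
 *   }
 *
 * Passing attrs == 0 with the same mask instead asks whether the whole range
 * is shared, i.e. has the PRIVATE bit clear everywhere.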
*/ bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end, unsigned long mask, unsigned long attrs) { XA_STATE(xas, &kvm->mem_attr_array, start); unsigned long index; void *entry; mask &= kvm_supported_mem_attributes(kvm); if (attrs & ~mask) return false; if (end == start + 1) return (kvm_get_memory_attributes(kvm, start) & mask) == attrs; guard(rcu)(); if (!attrs) return !xas_find(&xas, end - 1); for (index = start; index < end; index++) { do { entry = xas_next(&xas); } while (xas_retry(&xas, entry)); if (xas.xa_index != index || (xa_to_value(entry) & mask) != attrs) return false; } return true; } static __always_inline void kvm_handle_gfn_range(struct kvm *kvm, struct kvm_mmu_notifier_range *range) { struct kvm_gfn_range gfn_range; struct kvm_memory_slot *slot; struct kvm_memslots *slots; struct kvm_memslot_iter iter; bool found_memslot = false; bool ret = false; int i; gfn_range.arg = range->arg; gfn_range.may_block = range->may_block; /* * If/when KVM supports more attributes beyond private .vs shared, this * _could_ set KVM_FILTER_{SHARED,PRIVATE} appropriately if the entire target * range already has the desired private vs. shared state (it's unclear * if that is a net win). For now, KVM reaches this point if and only * if the private flag is being toggled, i.e. all mappings are in play. */ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) { slot = iter.slot; gfn_range.slot = slot; gfn_range.start = max(range->start, slot->base_gfn); gfn_range.end = min(range->end, slot->base_gfn + slot->npages); if (gfn_range.start >= gfn_range.end) continue; if (!found_memslot) { found_memslot = true; KVM_MMU_LOCK(kvm); if (!IS_KVM_NULL_FN(range->on_lock)) range->on_lock(kvm); } ret |= range->handler(kvm, &gfn_range); } } if (range->flush_on_ret && ret) kvm_flush_remote_tlbs(kvm); if (found_memslot) KVM_MMU_UNLOCK(kvm); } static bool kvm_pre_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range) { /* * Unconditionally add the range to the invalidation set, regardless of * whether or not the arch callback actually needs to zap SPTEs. E.g. * if KVM supports RWX attributes in the future and the attributes are * going from R=>RW, zapping isn't strictly necessary. Unconditionally * adding the range allows KVM to require that MMU invalidations add at * least one range between begin() and end(), e.g. allows KVM to detect * bugs where the add() is missed. Relaxing the rule *might* be safe, * but it's not obvious that allowing new mappings while the attributes * are in flux is desirable or worth the complexity. */ kvm_mmu_invalidate_range_add(kvm, range->start, range->end); return kvm_arch_pre_set_memory_attributes(kvm, range); } /* Set @attributes for the gfn range [@start, @end). */ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, unsigned long attributes) { struct kvm_mmu_notifier_range pre_set_range = { .start = start, .end = end, .arg.attributes = attributes, .handler = kvm_pre_set_memory_attributes, .on_lock = kvm_mmu_invalidate_begin, .flush_on_ret = true, .may_block = true, }; struct kvm_mmu_notifier_range post_set_range = { .start = start, .end = end, .arg.attributes = attributes, .handler = kvm_arch_post_set_memory_attributes, .on_lock = kvm_mmu_invalidate_end, .may_block = true, }; unsigned long i; void *entry; int r = 0; entry = attributes ? 
xa_mk_value(attributes) : NULL; mutex_lock(&kvm->slots_lock); /* Nothing to do if the entire range already has the desired attributes. */ if (kvm_range_has_memory_attributes(kvm, start, end, ~0, attributes)) goto out_unlock; /* * Reserve memory ahead of time to avoid having to deal with failures * partway through setting the new attributes. */ for (i = start; i < end; i++) { r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT); if (r) goto out_unlock; } kvm_handle_gfn_range(kvm, &pre_set_range); for (i = start; i < end; i++) { r = xa_err(xa_store(&kvm->mem_attr_array, i, entry, GFP_KERNEL_ACCOUNT)); KVM_BUG_ON(r, kvm); } kvm_handle_gfn_range(kvm, &post_set_range); out_unlock: mutex_unlock(&kvm->slots_lock); return r; } static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm, struct kvm_memory_attributes *attrs) { gfn_t start, end; /* flags is currently not used. */ if (attrs->flags) return -EINVAL; if (attrs->attributes & ~kvm_supported_mem_attributes(kvm)) return -EINVAL; if (attrs->size == 0 || attrs->address + attrs->size < attrs->address) return -EINVAL; if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size)) return -EINVAL; start = attrs->address >> PAGE_SHIFT; end = (attrs->address + attrs->size) >> PAGE_SHIFT; /* * xarray tracks data using "unsigned long", and as a result so does * KVM. For simplicity, supports generic attributes only on 64-bit * architectures. */ BUILD_BUG_ON(sizeof(attrs->attributes) != sizeof(unsigned long)); return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes); } #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { return __gfn_to_memslot(kvm_memslots(kvm), gfn); } EXPORT_SYMBOL_GPL(gfn_to_memslot); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); u64 gen = slots->generation; struct kvm_memory_slot *slot; /* * This also protects against using a memslot from a different address space, * since different address spaces have different generation numbers. */ if (unlikely(gen != vcpu->last_used_slot_gen)) { vcpu->last_used_slot = NULL; vcpu->last_used_slot_gen = gen; } slot = try_get_memslot(vcpu->last_used_slot, gfn); if (slot) return slot; /* * Fall back to searching all memslots. We purposely use * search_memslots() instead of __gfn_to_memslot() to avoid * thrashing the VM-wide last_used_slot in kvm_memslots.
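 *
 * The per-vCPU fast path above is conceptually just a bounds check against
 * the cached slot; a minimal sketch of the equivalent logic (the real helper,
 * try_get_memslot(), lives in kvm_host.h and the name below is made up):
 *
 *   static struct kvm_memory_slot *cached_memslot_hit(struct kvm_memory_slot *slot,
 *                                                     gfn_t gfn)
 *   {
 *           // NULL right after a generation bump, or if nothing was cached yet.
 *           if (!slot)
 *                   return NULL;
 *
 *           // Hit only if @gfn falls inside the cached slot's gfn range.
 *           if (gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages)
 *                   return slot;
 *
 *           return NULL;
 *   }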
*/ slot = search_memslots(slots, gfn, false); if (slot) { vcpu->last_used_slot = slot; return slot; } return NULL; } bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); return kvm_is_visible_memslot(memslot); } EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); return kvm_is_visible_memslot(memslot); } EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn); unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn) { struct vm_area_struct *vma; unsigned long addr, size; size = PAGE_SIZE; addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL); if (kvm_is_error_hva(addr)) return PAGE_SIZE; mmap_read_lock(current->mm); vma = find_vma(current->mm, addr); if (!vma) goto out; size = vma_kernel_pagesize(vma); out: mmap_read_unlock(current->mm); return size; } static bool memslot_is_readonly(const struct kvm_memory_slot *slot) { return slot->flags & KVM_MEM_READONLY; } static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn, gfn_t *nr_pages, bool write) { if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return KVM_HVA_ERR_BAD; if (memslot_is_readonly(slot) && write) return KVM_HVA_ERR_RO_BAD; if (nr_pages) *nr_pages = slot->npages - (gfn - slot->base_gfn); return __gfn_to_hva_memslot(slot, gfn); } static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, gfn_t *nr_pages) { return __gfn_to_hva_many(slot, gfn, nr_pages, true); } unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn) { return gfn_to_hva_many(slot, gfn, NULL); } EXPORT_SYMBOL_GPL(gfn_to_hva_memslot); unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) { return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); } EXPORT_SYMBOL_GPL(gfn_to_hva); unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn) { return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL); } EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva); /* * Return the hva of a @gfn and the R/W attribute if possible. * * @slot: the kvm_memory_slot which contains @gfn * @gfn: the gfn to be translated * @writable: used to return the read/write attribute of the @slot if the hva * is valid and @writable is not NULL */ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn, bool *writable) { unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false); if (!kvm_is_error_hva(hva) && writable) *writable = !memslot_is_readonly(slot); return hva; } unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) { struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); return gfn_to_hva_memslot_prot(slot, gfn, writable); } unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable) { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); return gfn_to_hva_memslot_prot(slot, gfn, writable); } static bool kvm_is_ad_tracked_page(struct page *page) { /* * Per page-flags.h, pages tagged PG_reserved "should in general not be * touched (e.g. set dirty) except by its owner". 
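 *
 * Referring back to __gfn_to_hva_many() above: the gfn-to-hva translation for
 * a memslot is plain linear arithmetic. A minimal sketch of the computation
 * (the real __gfn_to_hva_memslot() in kvm_host.h additionally clamps the
 * offset with array_index_nospec(); the function name below is made up):
 *
 *   static unsigned long gfn_to_hva_sketch(const struct kvm_memory_slot *slot,
 *                                          gfn_t gfn)
 *   {
 *           // Offset of @gfn within the slot, scaled to bytes, plus the HVA base.
 *           return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
 *   }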
*/ return !PageReserved(page); } static void kvm_set_page_dirty(struct page *page) { if (kvm_is_ad_tracked_page(page)) SetPageDirty(page); } static void kvm_set_page_accessed(struct page *page) { if (kvm_is_ad_tracked_page(page)) mark_page_accessed(page); } void kvm_release_page_clean(struct page *page) { if (!page) return; kvm_set_page_accessed(page); put_page(page); } EXPORT_SYMBOL_GPL(kvm_release_page_clean); void kvm_release_page_dirty(struct page *page) { if (!page) return; kvm_set_page_dirty(page); kvm_release_page_clean(page); } EXPORT_SYMBOL_GPL(kvm_release_page_dirty); static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, struct follow_pfnmap_args *map, bool writable) { kvm_pfn_t pfn; WARN_ON_ONCE(!!page == !!map); if (kfp->map_writable) *kfp->map_writable = writable; if (map) pfn = map->pfn; else pfn = page_to_pfn(page); *kfp->refcounted_page = page; return pfn; } /* * The fast path to get the writable pfn which will be stored in @pfn, * true indicates success, otherwise false is returned. */ static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) { struct page *page; bool r; /* * Try the fast-only path when the caller wants to pin/get the page for * writing. If the caller only wants to read the page, KVM must go * down the full, slow path in order to avoid racing an operation that * breaks Copy-on-Write (CoW), e.g. so that KVM doesn't end up pointing * at the old, read-only page while mm/ points at a new, writable page. */ if (!((kfp->flags & FOLL_WRITE) || kfp->map_writable)) return false; if (kfp->pin) r = pin_user_pages_fast(kfp->hva, 1, FOLL_WRITE, &page) == 1; else r = get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page); if (r) { *pfn = kvm_resolve_pfn(kfp, page, NULL, true); return true; } return false; } /* * The slow path to get the pfn of the specified host virtual address, * 1 indicates success, -errno is returned if error is detected. */ static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) { /* * When a VCPU accesses a page that is not mapped into the secondary * MMU, we lookup the page using GUP to map it, so the guest VCPU can * make progress. We always want to honor NUMA hinting faults in that * case, because GUP usage corresponds to memory accesses from the VCPU. * Otherwise, we'd not trigger NUMA hinting faults once a page is * mapped into the secondary MMU and gets accessed by a VCPU. * * Note that get_user_page_fast_only() and FOLL_WRITE for now * implicitly honor NUMA hinting faults and don't need this flag. */ unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT | kfp->flags; struct page *page, *wpage; int npages; if (kfp->pin) npages = pin_user_pages_unlocked(kfp->hva, 1, &page, flags); else npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags); if (npages != 1) return npages; /* * Pinning is mutually exclusive with opportunistically mapping a read * fault as writable, as KVM should never pin pages when mapping memory * into the guest (pinning is only for direct accesses from KVM). 
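 *
 * For contrast, the two kvm_follow_pfn configurations used later in this file
 * boil down to the sketch below (variable names are illustrative):
 *
 *   // Guest fault path (__kvm_faultin_pfn): no pin, report writability.
 *   struct kvm_follow_pfn fault_kfp = {
 *           .slot            = slot,
 *           .gfn             = gfn,
 *           .flags           = write_fault ? FOLL_WRITE : 0,
 *           .map_writable    = &writable,
 *           .refcounted_page = &page,
 *   };
 *
 *   // Host-side access (__kvm_vcpu_map) for a writable mapping: pin the page
 *   // and never set ->map_writable, which is the exclusivity enforced here.
 *   struct kvm_follow_pfn map_kfp = {
 *           .slot            = slot,
 *           .gfn             = gfn,
 *           .flags           = FOLL_WRITE,
 *           .pin             = true,
 *           .refcounted_page = &page,
 *   };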
*/ if (WARN_ON_ONCE(kfp->map_writable && kfp->pin)) goto out; /* map read fault as writable if possible */ if (!(flags & FOLL_WRITE) && kfp->map_writable && get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) { put_page(page); page = wpage; flags |= FOLL_WRITE; } out: *pfn = kvm_resolve_pfn(kfp, page, NULL, flags & FOLL_WRITE); return npages; } static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) { if (unlikely(!(vma->vm_flags & VM_READ))) return false; if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE)))) return false; return true; } static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn *kfp, kvm_pfn_t *p_pfn) { struct follow_pfnmap_args args = { .vma = vma, .address = kfp->hva }; bool write_fault = kfp->flags & FOLL_WRITE; int r; /* * Remapped memory cannot be pinned in any meaningful sense. Bail if * the caller wants to pin the page, i.e. access the page outside of * MMU notifier protection, and unsafe umappings are disallowed. */ if (kfp->pin && !allow_unsafe_mappings) return -EINVAL; r = follow_pfnmap_start(&args); if (r) { /* * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does * not call the fault handler, so do it here. */ bool unlocked = false; r = fixup_user_fault(current->mm, kfp->hva, (write_fault ? FAULT_FLAG_WRITE : 0), &unlocked); if (unlocked) return -EAGAIN; if (r) return r; r = follow_pfnmap_start(&args); if (r) return r; } if (write_fault && !args.writable) { *p_pfn = KVM_PFN_ERR_RO_FAULT; goto out; } *p_pfn = kvm_resolve_pfn(kfp, NULL, &args, args.writable); out: follow_pfnmap_end(&args); return r; } kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp) { struct vm_area_struct *vma; kvm_pfn_t pfn; int npages, r; might_sleep(); if (WARN_ON_ONCE(!kfp->refcounted_page)) return KVM_PFN_ERR_FAULT; if (hva_to_pfn_fast(kfp, &pfn)) return pfn; npages = hva_to_pfn_slow(kfp, &pfn); if (npages == 1) return pfn; if (npages == -EINTR || npages == -EAGAIN) return KVM_PFN_ERR_SIGPENDING; if (npages == -EHWPOISON) return KVM_PFN_ERR_HWPOISON; mmap_read_lock(current->mm); retry: vma = vma_lookup(current->mm, kfp->hva); if (vma == NULL) pfn = KVM_PFN_ERR_FAULT; else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { r = hva_to_pfn_remapped(vma, kfp, &pfn); if (r == -EAGAIN) goto retry; if (r < 0) pfn = KVM_PFN_ERR_FAULT; } else { if ((kfp->flags & FOLL_NOWAIT) && vma_is_valid(vma, kfp->flags & FOLL_WRITE)) pfn = KVM_PFN_ERR_NEEDS_IO; else pfn = KVM_PFN_ERR_FAULT; } mmap_read_unlock(current->mm); return pfn; } static kvm_pfn_t kvm_follow_pfn(struct kvm_follow_pfn *kfp) { kfp->hva = __gfn_to_hva_many(kfp->slot, kfp->gfn, NULL, kfp->flags & FOLL_WRITE); if (kfp->hva == KVM_HVA_ERR_RO_BAD) return KVM_PFN_ERR_RO_FAULT; if (kvm_is_error_hva(kfp->hva)) return KVM_PFN_NOSLOT; if (memslot_is_readonly(kfp->slot) && kfp->map_writable) { *kfp->map_writable = false; kfp->map_writable = NULL; } return hva_to_pfn(kfp); } kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn, unsigned int foll, bool *writable, struct page **refcounted_page) { struct kvm_follow_pfn kfp = { .slot = slot, .gfn = gfn, .flags = foll, .map_writable = writable, .refcounted_page = refcounted_page, }; if (WARN_ON_ONCE(!writable || !refcounted_page)) return KVM_PFN_ERR_FAULT; *writable = false; *refcounted_page = NULL; return kvm_follow_pfn(&kfp); } EXPORT_SYMBOL_GPL(__kvm_faultin_pfn); int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, struct page **pages, int nr_pages) { unsigned long addr; gfn_t entry = 0; addr = gfn_to_hva_many(slot, gfn, 
&entry); if (kvm_is_error_hva(addr)) return -1; if (entry < nr_pages) return 0; return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages); } EXPORT_SYMBOL_GPL(kvm_prefetch_pages); /* * Don't use this API unless you are absolutely, positively certain that KVM * needs to get a struct page, e.g. to pin the page for firmware DMA. * * FIXME: Users of this API likely need to FOLL_PIN the page, not just elevate * its refcount. */ struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write) { struct page *refcounted_page = NULL; struct kvm_follow_pfn kfp = { .slot = gfn_to_memslot(kvm, gfn), .gfn = gfn, .flags = write ? FOLL_WRITE : 0, .refcounted_page = &refcounted_page, }; (void)kvm_follow_pfn(&kfp); return refcounted_page; } EXPORT_SYMBOL_GPL(__gfn_to_page); int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, bool writable) { struct kvm_follow_pfn kfp = { .slot = gfn_to_memslot(vcpu->kvm, gfn), .gfn = gfn, .flags = writable ? FOLL_WRITE : 0, .refcounted_page = &map->pinned_page, .pin = true, }; map->pinned_page = NULL; map->page = NULL; map->hva = NULL; map->gfn = gfn; map->writable = writable; map->pfn = kvm_follow_pfn(&kfp); if (is_error_noslot_pfn(map->pfn)) return -EINVAL; if (pfn_valid(map->pfn)) { map->page = pfn_to_page(map->pfn); map->hva = kmap(map->page); #ifdef CONFIG_HAS_IOMEM } else { map->hva = memremap(pfn_to_hpa(map->pfn), PAGE_SIZE, MEMREMAP_WB); #endif } return map->hva ? 0 : -EFAULT; } EXPORT_SYMBOL_GPL(__kvm_vcpu_map); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map) { if (!map->hva) return; if (map->page) kunmap(map->page); #ifdef CONFIG_HAS_IOMEM else memunmap(map->hva); #endif if (map->writable) kvm_vcpu_mark_page_dirty(vcpu, map->gfn); if (map->pinned_page) { if (map->writable) kvm_set_page_dirty(map->pinned_page); kvm_set_page_accessed(map->pinned_page); unpin_user_page(map->pinned_page); } map->hva = NULL; map->page = NULL; map->pinned_page = NULL; } EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); static int next_segment(unsigned long len, int offset) { if (len > PAGE_SIZE - offset) return PAGE_SIZE - offset; else return len; } /* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */ static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, void *data, int offset, int len) { int r; unsigned long addr; if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) return -EFAULT; addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); if (kvm_is_error_hva(addr)) return -EFAULT; r = __copy_from_user(data, (void __user *)addr + offset, len); if (r) return -EFAULT; return 0; } int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len) { struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); return __kvm_read_guest_page(slot, gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_read_guest_page); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, int len) { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); return __kvm_read_guest_page(slot, gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) { gfn_t gfn = gpa >> PAGE_SHIFT; int seg; int offset = offset_in_page(gpa); int ret; while ((seg = next_segment(len, offset)) != 0) { ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); if (ret < 0) return ret; offset = 0; len -= seg; data += seg; ++gfn; } return 0; } EXPORT_SYMBOL_GPL(kvm_read_guest); int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, 
gpa_t gpa, void *data, unsigned long len) { gfn_t gfn = gpa >> PAGE_SHIFT; int seg; int offset = offset_in_page(gpa); int ret; while ((seg = next_segment(len, offset)) != 0) { ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg); if (ret < 0) return ret; offset = 0; len -= seg; data += seg; ++gfn; } return 0; } EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest); static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, void *data, int offset, unsigned long len) { int r; unsigned long addr; if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) return -EFAULT; addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); if (kvm_is_error_hva(addr)) return -EFAULT; pagefault_disable(); r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); pagefault_enable(); if (r) return -EFAULT; return 0; } int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); int offset = offset_in_page(gpa); return __kvm_read_guest_atomic(slot, gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); /* Copy @len bytes from @data into guest memory at '(@gfn * PAGE_SIZE) + @offset' */ static int __kvm_write_guest_page(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t gfn, const void *data, int offset, int len) { int r; unsigned long addr; if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) return -EFAULT; addr = gfn_to_hva_memslot(memslot, gfn); if (kvm_is_error_hva(addr)) return -EFAULT; r = __copy_to_user((void __user *)addr + offset, data, len); if (r) return -EFAULT; mark_page_dirty_in_slot(kvm, memslot, gfn); return 0; } int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, int offset, int len) { struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_write_guest_page); int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, const void *data, int offset, int len) { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len) { gfn_t gfn = gpa >> PAGE_SHIFT; int seg; int offset = offset_in_page(gpa); int ret; while ((seg = next_segment(len, offset)) != 0) { ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); if (ret < 0) return ret; offset = 0; len -= seg; data += seg; ++gfn; } return 0; } EXPORT_SYMBOL_GPL(kvm_write_guest); int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, unsigned long len) { gfn_t gfn = gpa >> PAGE_SHIFT; int seg; int offset = offset_in_page(gpa); int ret; while ((seg = next_segment(len, offset)) != 0) { ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg); if (ret < 0) return ret; offset = 0; len -= seg; data += seg; ++gfn; } return 0; } EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len) { int offset = offset_in_page(gpa); gfn_t start_gfn = gpa >> PAGE_SHIFT; gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; gfn_t nr_pages_needed = end_gfn - start_gfn + 1; gfn_t nr_pages_avail; /* Update ghc->generation before performing any error checks. 
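 *
 * A minimal sketch of how this cache is consumed (the pattern used for
 * periodically updated guest/host shared data; the helper name and the
 * single-u64 payload are assumptions for the example):
 *
 *   static int update_shared_u64(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 *                                gpa_t gpa, u64 val)
 *   {
 *           // One-time (or rare) setup, e.g. when the guest programs the GPA.
 *           int r = kvm_gfn_to_hva_cache_init(kvm, ghc, gpa, sizeof(val));
 *
 *           if (r)
 *                   return r;
 *
 *           // Hot path: no memslot walk unless the memslot generation changed.
 *           return kvm_write_guest_offset_cached(kvm, ghc, &val, 0, sizeof(val));
 *   }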
*/ ghc->generation = slots->generation; if (start_gfn > end_gfn) { ghc->hva = KVM_HVA_ERR_BAD; return -EINVAL; } /* * If the requested region crosses two memslots, we still * verify that the entire region is valid here. */ for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) { ghc->memslot = __gfn_to_memslot(slots, start_gfn); ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, &nr_pages_avail); if (kvm_is_error_hva(ghc->hva)) return -EFAULT; } /* Use the slow path for cross page reads and writes. */ if (nr_pages_needed == 1) ghc->hva += offset; else ghc->memslot = NULL; ghc->gpa = gpa; ghc->len = len; return 0; } int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len) { struct kvm_memslots *slots = kvm_memslots(kvm); return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); } EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned int offset, unsigned long len) { struct kvm_memslots *slots = kvm_memslots(kvm); int r; gpa_t gpa = ghc->gpa + offset; if (WARN_ON_ONCE(len + offset > ghc->len)) return -EINVAL; if (slots->generation != ghc->generation) { if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) return -EFAULT; } if (kvm_is_error_hva(ghc->hva)) return -EFAULT; if (unlikely(!ghc->memslot)) return kvm_write_guest(kvm, gpa, data, len); r = __copy_to_user((void __user *)ghc->hva + offset, data, len); if (r) return -EFAULT; mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT); return 0; } EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len) { return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); } EXPORT_SYMBOL_GPL(kvm_write_guest_cached); int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned int offset, unsigned long len) { struct kvm_memslots *slots = kvm_memslots(kvm); int r; gpa_t gpa = ghc->gpa + offset; if (WARN_ON_ONCE(len + offset > ghc->len)) return -EINVAL; if (slots->generation != ghc->generation) { if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) return -EFAULT; } if (kvm_is_error_hva(ghc->hva)) return -EFAULT; if (unlikely(!ghc->memslot)) return kvm_read_guest(kvm, gpa, data, len); r = __copy_from_user(data, (void __user *)ghc->hva + offset, len); if (r) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached); int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len) { return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len); } EXPORT_SYMBOL_GPL(kvm_read_guest_cached); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) { const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); gfn_t gfn = gpa >> PAGE_SHIFT; int seg; int offset = offset_in_page(gpa); int ret; while ((seg = next_segment(len, offset)) != 0) { ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg); if (ret < 0) return ret; offset = 0; len -= seg; ++gfn; } return 0; } EXPORT_SYMBOL_GPL(kvm_clear_guest); void mark_page_dirty_in_slot(struct kvm *kvm, const struct kvm_memory_slot *memslot, gfn_t gfn) { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); #ifdef CONFIG_HAVE_KVM_DIRTY_RING if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm)) return; WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm)); #endif if (memslot && kvm_slot_dirty_track_enabled(memslot)) { unsigned 
long rel_gfn = gfn - memslot->base_gfn; u32 slot = (memslot->as_id << 16) | memslot->id; if (kvm->dirty_ring_size && vcpu) kvm_dirty_ring_push(vcpu, slot, rel_gfn); else if (memslot->dirty_bitmap) set_bit_le(rel_gfn, memslot->dirty_bitmap); } } EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot); void mark_page_dirty(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *memslot; memslot = gfn_to_memslot(kvm, gfn); mark_page_dirty_in_slot(kvm, memslot, gfn); } EXPORT_SYMBOL_GPL(mark_page_dirty); void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_memory_slot *memslot; memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn); } EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); void kvm_sigset_activate(struct kvm_vcpu *vcpu) { if (!vcpu->sigset_active) return; /* * This does a lockless modification of ->real_blocked, which is fine * because, only current can change ->real_blocked and all readers of * ->real_blocked don't care as long ->real_blocked is always a subset * of ->blocked. */ sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked); } void kvm_sigset_deactivate(struct kvm_vcpu *vcpu) { if (!vcpu->sigset_active) return; sigprocmask(SIG_SETMASK, &current->real_blocked, NULL); sigemptyset(&current->real_blocked); } static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) { unsigned int old, val, grow, grow_start; old = val = vcpu->halt_poll_ns; grow_start = READ_ONCE(halt_poll_ns_grow_start); grow = READ_ONCE(halt_poll_ns_grow); if (!grow) goto out; val *= grow; if (val < grow_start) val = grow_start; vcpu->halt_poll_ns = val; out: trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old); } static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu) { unsigned int old, val, shrink, grow_start; old = val = vcpu->halt_poll_ns; shrink = READ_ONCE(halt_poll_ns_shrink); grow_start = READ_ONCE(halt_poll_ns_grow_start); if (shrink == 0) val = 0; else val /= shrink; if (val < grow_start) val = 0; vcpu->halt_poll_ns = val; trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old); } static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) { int ret = -EINTR; int idx = srcu_read_lock(&vcpu->kvm->srcu); if (kvm_arch_vcpu_runnable(vcpu)) goto out; if (kvm_cpu_has_pending_timer(vcpu)) goto out; if (signal_pending(current)) goto out; if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu)) goto out; ret = 0; out: srcu_read_unlock(&vcpu->kvm->srcu, idx); return ret; } /* * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is * pending. This is mostly used when halting a vCPU, but may also be used * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI. 
*/ bool kvm_vcpu_block(struct kvm_vcpu *vcpu) { struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu); bool waited = false; vcpu->stat.generic.blocking = 1; preempt_disable(); kvm_arch_vcpu_blocking(vcpu); prepare_to_rcuwait(wait); preempt_enable(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (kvm_vcpu_check_block(vcpu) < 0) break; waited = true; schedule(); } preempt_disable(); finish_rcuwait(wait); kvm_arch_vcpu_unblocking(vcpu); preempt_enable(); vcpu->stat.generic.blocking = 0; return waited; } static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start, ktime_t end, bool success) { struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic; u64 poll_ns = ktime_to_ns(ktime_sub(end, start)); ++vcpu->stat.generic.halt_attempted_poll; if (success) { ++vcpu->stat.generic.halt_successful_poll; if (!vcpu_valid_wakeup(vcpu)) ++vcpu->stat.generic.halt_poll_invalid; stats->halt_poll_success_ns += poll_ns; KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns); } else { stats->halt_poll_fail_ns += poll_ns; KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns); } } static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; if (kvm->override_halt_poll_ns) { /* * Ensure kvm->max_halt_poll_ns is not read before * kvm->override_halt_poll_ns. * * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL. */ smp_rmb(); return READ_ONCE(kvm->max_halt_poll_ns); } return READ_ONCE(halt_poll_ns); } /* * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc... If halt * polling is enabled, busy wait for a short time before blocking to avoid the * expensive block+unblock sequence if a wake event arrives soon after the vCPU * is halted. */ void kvm_vcpu_halt(struct kvm_vcpu *vcpu) { unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu); bool halt_poll_allowed = !kvm_arch_no_poll(vcpu); ktime_t start, cur, poll_end; bool waited = false; bool do_halt_poll; u64 halt_ns; if (vcpu->halt_poll_ns > max_halt_poll_ns) vcpu->halt_poll_ns = max_halt_poll_ns; do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns; start = cur = poll_end = ktime_get(); if (do_halt_poll) { ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns); do { if (kvm_vcpu_check_block(vcpu) < 0) goto out; cpu_relax(); poll_end = cur = ktime_get(); } while (kvm_vcpu_can_poll(cur, stop)); } waited = kvm_vcpu_block(vcpu); cur = ktime_get(); if (waited) { vcpu->stat.generic.halt_wait_ns += ktime_to_ns(cur) - ktime_to_ns(poll_end); KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist, ktime_to_ns(cur) - ktime_to_ns(poll_end)); } out: /* The total time the vCPU was "halted", including polling time. */ halt_ns = ktime_to_ns(cur) - ktime_to_ns(start); /* * Note, halt-polling is considered successful so long as the vCPU was * never actually scheduled out, i.e. even if the wake event arrived * after of the halt-polling loop itself, but before the full wait. */ if (do_halt_poll) update_halt_poll_stats(vcpu, start, poll_end, !waited); if (halt_poll_allowed) { /* Recompute the max halt poll time in case it changed. 
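 *
 * A worked example, assuming the module-parameter defaults (halt_poll_ns_grow
 * = 2, halt_poll_ns_grow_start = 10000, halt_poll_ns_shrink = 0) and a
 * max_halt_poll_ns of 200000 (the x86 default):
 *
 *   - a run of short halts grows vcpu->halt_poll_ns as
 *     0 -> 10000 -> 20000 -> 40000 -> 80000 -> 160000 -> 320000, and the
 *     value is clamped back to 200000 at the top of the next kvm_vcpu_halt();
 *
 *   - one halt that blocks longer than max_halt_poll_ns shrinks it, and with
 *     halt_poll_ns_shrink == 0 "shrink" means resetting straight to 0 rather
 *     than dividing.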
*/ max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu); if (!vcpu_valid_wakeup(vcpu)) { shrink_halt_poll_ns(vcpu); } else if (max_halt_poll_ns) { if (halt_ns <= vcpu->halt_poll_ns) ; /* we had a long block, shrink polling */ else if (vcpu->halt_poll_ns && halt_ns > max_halt_poll_ns) shrink_halt_poll_ns(vcpu); /* we had a short halt and our poll time is too small */ else if (vcpu->halt_poll_ns < max_halt_poll_ns && halt_ns < max_halt_poll_ns) grow_halt_poll_ns(vcpu); } else { vcpu->halt_poll_ns = 0; } } trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu)); } EXPORT_SYMBOL_GPL(kvm_vcpu_halt); bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) { if (__kvm_vcpu_wake_up(vcpu)) { WRITE_ONCE(vcpu->ready, true); ++vcpu->stat.generic.halt_wakeup; return true; } return false; } EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); #ifndef CONFIG_S390 /* * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. */ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) { int me, cpu; if (kvm_vcpu_wake_up(vcpu)) return; me = get_cpu(); /* * The only state change done outside the vcpu mutex is IN_GUEST_MODE * to EXITING_GUEST_MODE. Therefore the moderately expensive "should * kick" check does not need atomic operations if kvm_vcpu_kick is used * within the vCPU thread itself. */ if (vcpu == __this_cpu_read(kvm_running_vcpu)) { if (vcpu->mode == IN_GUEST_MODE) WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE); goto out; } /* * Note, the vCPU could get migrated to a different pCPU at any point * after kvm_arch_vcpu_should_kick(), which could result in sending an * IPI to the previous pCPU. But, that's ok because the purpose of the * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the * vCPU also requires it to leave IN_GUEST_MODE. */ if (kvm_arch_vcpu_should_kick(vcpu)) { cpu = READ_ONCE(vcpu->cpu); if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) smp_send_reschedule(cpu); } out: put_cpu(); } EXPORT_SYMBOL_GPL(kvm_vcpu_kick); #endif /* !CONFIG_S390 */ int kvm_vcpu_yield_to(struct kvm_vcpu *target) { struct task_struct *task = NULL; int ret; if (!read_trylock(&target->pid_lock)) return 0; if (target->pid) task = get_pid_task(target->pid, PIDTYPE_PID); read_unlock(&target->pid_lock); if (!task) return 0; ret = yield_to(task, 1); put_task_struct(task); return ret; } EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); /* * Helper that checks whether a VCPU is eligible for directed yield. * Most eligible candidate to yield is decided by following heuristics: * * (a) VCPU which has not done pl-exit or cpu relax intercepted recently * (preempted lock holder), indicated by @in_spin_loop. * Set at the beginning and cleared at the end of interception/PLE handler. * * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get * chance last time (mostly it has become eligible now since we have probably * yielded to lockholder in last iteration. This is done by toggling * @dy_eligible each time a VCPU checked for eligibility.) * * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding * to preempted lock-holder could result in wrong VCPU selection and CPU * burning. Giving priority for a potential lock-holder increases lock * progress. * * Since algorithm is based on heuristics, accessing another VCPU data without * locking does not harm. It may result in trying to yield to same VCPU, fail * and continue with next VCPU and so on. 
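 *
 * A short trace of the toggle for one candidate vCPU that is itself doing
 * PLE/cpu-relax exits (in_spin_loop == true, dy_eligible initially false):
 *
 *   check #1: eligible = !true || false = false, dy_eligible flips to true
 *             (the candidate is skipped this round);
 *   check #2: eligible = !true || true  = true,  dy_eligible flips to false
 *             (the candidate is yielded to);
 *
 * so a spinning candidate is considered on every other pass, while a vCPU
 * that is not spinning (in_spin_loop == false) is always eligible.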
*/ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) { #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT bool eligible; eligible = !vcpu->spin_loop.in_spin_loop || vcpu->spin_loop.dy_eligible; if (vcpu->spin_loop.in_spin_loop) kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); return eligible; #else return true; #endif } /* * Unlike kvm_arch_vcpu_runnable, this function is called outside * a vcpu_load/vcpu_put pair. However, for most architectures * kvm_arch_vcpu_runnable does not require vcpu_load. */ bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) { return kvm_arch_vcpu_runnable(vcpu); } static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu) { if (kvm_arch_dy_runnable(vcpu)) return true; #ifdef CONFIG_KVM_ASYNC_PF if (!list_empty_careful(&vcpu->async_pf.done)) return true; #endif return false; } /* * By default, simply query the target vCPU's current mode when checking if a * vCPU was preempted in kernel mode. All architectures except x86 (or more * specifical, except VMX) allow querying whether or not a vCPU is in kernel * mode even if the vCPU is NOT loaded, i.e. using kvm_arch_vcpu_in_kernel() * directly for cross-vCPU checks is functionally correct and accurate. */ bool __weak kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu) { return kvm_arch_vcpu_in_kernel(vcpu); } bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) { return false; } void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) { int nr_vcpus, start, i, idx, yielded; struct kvm *kvm = me->kvm; struct kvm_vcpu *vcpu; int try = 3; nr_vcpus = atomic_read(&kvm->online_vcpus); if (nr_vcpus < 2) return; /* Pairs with the smp_wmb() in kvm_vm_ioctl_create_vcpu(). */ smp_rmb(); kvm_vcpu_set_in_spin_loop(me, true); /* * The current vCPU ("me") is spinning in kernel mode, i.e. is likely * waiting for a resource to become available. Attempt to yield to a * vCPU that is runnable, but not currently running, e.g. because the * vCPU was preempted by a higher priority task. With luck, the vCPU * that was preempted is holding a lock or some other resource that the * current vCPU is waiting to acquire, and yielding to the other vCPU * will allow it to make forward progress and release the lock (or kick * the spinning vCPU, etc). * * Since KVM has no insight into what exactly the guest is doing, * approximate a round-robin selection by iterating over all vCPUs, * starting at the last boosted vCPU. I.e. if N=kvm->last_boosted_vcpu, * iterate over vCPU[N+1]..vCPU[N-1], wrapping as needed. * * Note, this is inherently racy, e.g. if multiple vCPUs are spinning, * they may all try to yield to the same vCPU(s). But as above, this * is all best effort due to KVM's lack of visibility into the guest. */ start = READ_ONCE(kvm->last_boosted_vcpu) + 1; for (i = 0; i < nr_vcpus; i++) { idx = (start + i) % nr_vcpus; if (idx == me->vcpu_idx) continue; vcpu = xa_load(&kvm->vcpu_array, idx); if (!READ_ONCE(vcpu->ready)) continue; if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu)) continue; /* * Treat the target vCPU as being in-kernel if it has a pending * interrupt, as the vCPU trying to yield may be spinning * waiting on IPI delivery, i.e. the target vCPU is in-kernel * for the purposes of directed yield. 
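 *
 * Referring back to the round-robin selection described above, a concrete
 * example: with nr_vcpus == 4, me->vcpu_idx == 1 and last_boosted_vcpu == 2,
 * start == 3 and the loop visits idx 3, 0, 1 (skipped, it is "me") and 2,
 * i.e. a rotation that begins just past the previously boosted vCPU.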
*/ if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && !kvm_arch_dy_has_pending_interrupt(vcpu) && !kvm_arch_vcpu_preempted_in_kernel(vcpu)) continue; if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) continue; yielded = kvm_vcpu_yield_to(vcpu); if (yielded > 0) { WRITE_ONCE(kvm->last_boosted_vcpu, i); break; } else if (yielded < 0 && !--try) { break; } } kvm_vcpu_set_in_spin_loop(me, false); /* Ensure vcpu is not eligible during next spinloop */ kvm_vcpu_set_dy_eligible(me, false); } EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff) { #ifdef CONFIG_HAVE_KVM_DIRTY_RING return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) && (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET + kvm->dirty_ring_size / PAGE_SIZE); #else return false; #endif } static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf) { struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data; struct page *page; if (vmf->pgoff == 0) page = virt_to_page(vcpu->run); #ifdef CONFIG_X86 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) page = virt_to_page(vcpu->arch.pio_data); #endif #ifdef CONFIG_KVM_MMIO else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); #endif else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff)) page = kvm_dirty_ring_get_page( &vcpu->dirty_ring, vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET); else return kvm_arch_vcpu_fault(vcpu, vmf); get_page(page); vmf->page = page; return 0; } static const struct vm_operations_struct kvm_vcpu_vm_ops = { .fault = kvm_vcpu_fault, }; static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) { struct kvm_vcpu *vcpu = file->private_data; unsigned long pages = vma_pages(vma); if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) || kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) && ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED))) return -EINVAL; vma->vm_ops = &kvm_vcpu_vm_ops; return 0; } static int kvm_vcpu_release(struct inode *inode, struct file *filp) { struct kvm_vcpu *vcpu = filp->private_data; kvm_put_kvm(vcpu->kvm); return 0; } static struct file_operations kvm_vcpu_fops = { .release = kvm_vcpu_release, .unlocked_ioctl = kvm_vcpu_ioctl, .mmap = kvm_vcpu_mmap, .llseek = noop_llseek, KVM_COMPAT(kvm_vcpu_compat_ioctl), }; /* * Allocates an inode for the vcpu. */ static int create_vcpu_fd(struct kvm_vcpu *vcpu) { char name[8 + 1 + ITOA_MAX_LEN + 1]; snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id); return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); } #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS static int vcpu_get_pid(void *data, u64 *val) { struct kvm_vcpu *vcpu = data; read_lock(&vcpu->pid_lock); *val = pid_nr(vcpu->pid); read_unlock(&vcpu->pid_lock); return 0; } DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n"); static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) { struct dentry *debugfs_dentry; char dir_name[ITOA_MAX_LEN * 2]; if (!debugfs_initialized()) return; snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); debugfs_dentry = debugfs_create_dir(dir_name, vcpu->kvm->debugfs_dentry); debugfs_create_file("pid", 0444, debugfs_dentry, vcpu, &vcpu_get_pid_fops); kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry); } #endif /* * Creates some virtual cpus. Good luck creating more than one. 
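 *
 * Illustrative userspace sketch of the sequence that reaches this function
 * and then maps the shared kvm_run area. It assumes kvm_fd comes from
 * open("/dev/kvm") and vm_fd from KVM_CREATE_VM; the helper name is made up:
 *
 *   #include <sys/ioctl.h>
 *   #include <sys/mman.h>
 *   #include <linux/kvm.h>
 *
 *   static struct kvm_run *create_vcpu0(int kvm_fd, int vm_fd, int *vcpu_fd)
 *   {
 *           long mmap_size;
 *
 *           *vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0UL);
 *           if (*vcpu_fd < 0)
 *                   return NULL;
 *
 *           // Queried on the /dev/kvm fd; covers kvm_run plus arch pages.
 *           mmap_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *           if (mmap_size < (long)sizeof(struct kvm_run))
 *                   return NULL;
 *
 *           // Caller must check for MAP_FAILED.
 *           return mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
 *                       *vcpu_fd, 0);
 *   }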
*/ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id) { int r; struct kvm_vcpu *vcpu; struct page *page; /* * KVM tracks vCPU IDs as 'int', be kind to userspace and reject * too-large values instead of silently truncating. * * Ensure KVM_MAX_VCPU_IDS isn't pushed above INT_MAX without first * changing the storage type (at the very least, IDs should be tracked * as unsigned ints). */ BUILD_BUG_ON(KVM_MAX_VCPU_IDS > INT_MAX); if (id >= KVM_MAX_VCPU_IDS) return -EINVAL; mutex_lock(&kvm->lock); if (kvm->created_vcpus >= kvm->max_vcpus) { mutex_unlock(&kvm->lock); return -EINVAL; } r = kvm_arch_vcpu_precreate(kvm, id); if (r) { mutex_unlock(&kvm->lock); return r; } kvm->created_vcpus++; mutex_unlock(&kvm->lock); vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!vcpu) { r = -ENOMEM; goto vcpu_decrement; } BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE); page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!page) { r = -ENOMEM; goto vcpu_free; } vcpu->run = page_address(page); kvm_vcpu_init(vcpu, kvm, id); r = kvm_arch_vcpu_create(vcpu); if (r) goto vcpu_free_run_page; if (kvm->dirty_ring_size) { r = kvm_dirty_ring_alloc(&vcpu->dirty_ring, id, kvm->dirty_ring_size); if (r) goto arch_vcpu_destroy; } mutex_lock(&kvm->lock); if (kvm_get_vcpu_by_id(kvm, id)) { r = -EEXIST; goto unlock_vcpu_destroy; } vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus); r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT); WARN_ON_ONCE(r == -EBUSY); if (r) goto unlock_vcpu_destroy; /* * Now it's all set up, let userspace reach it. Grab the vCPU's mutex * so that userspace can't invoke vCPU ioctl()s until the vCPU is fully * visible (per online_vcpus), e.g. so that KVM doesn't get tricked * into a NULL-pointer dereference because KVM thinks the _current_ * vCPU doesn't exist. As a bonus, taking vcpu->mutex ensures lockdep * knows it's taken *inside* kvm->lock. */ mutex_lock(&vcpu->mutex); kvm_get_kvm(kvm); r = create_vcpu_fd(vcpu); if (r < 0) goto kvm_put_xa_erase; /* * Pairs with smp_rmb() in kvm_get_vcpu. Store the vcpu * pointer before kvm->online_vcpu's incremented value. 
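 *
 * The reader side of this pairing is kvm_get_vcpu() in kvm_host.h; a minimal
 * sketch of its shape (minus the array_index_nospec() clamp), with a made-up
 * name:
 *
 *   static struct kvm_vcpu *get_vcpu_sketch(struct kvm *kvm, int i)
 *   {
 *           if (i < 0 || i >= atomic_read(&kvm->online_vcpus))
 *                   return NULL;
 *
 *           // Pairs with the smp_wmb() below: online_vcpus is read first,
 *           // the vcpu_array entry only afterwards.
 *           smp_rmb();
 *           return xa_load(&kvm->vcpu_array, i);
 *   }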
*/ smp_wmb(); atomic_inc(&kvm->online_vcpus); mutex_unlock(&vcpu->mutex); mutex_unlock(&kvm->lock); kvm_arch_vcpu_postcreate(vcpu); kvm_create_vcpu_debugfs(vcpu); return r; kvm_put_xa_erase: mutex_unlock(&vcpu->mutex); kvm_put_kvm_no_destroy(kvm); xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx); unlock_vcpu_destroy: mutex_unlock(&kvm->lock); kvm_dirty_ring_free(&vcpu->dirty_ring); arch_vcpu_destroy: kvm_arch_vcpu_destroy(vcpu); vcpu_free_run_page: free_page((unsigned long)vcpu->run); vcpu_free: kmem_cache_free(kvm_vcpu_cache, vcpu); vcpu_decrement: mutex_lock(&kvm->lock); kvm->created_vcpus--; mutex_unlock(&kvm->lock); return r; } static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) { if (sigset) { sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); vcpu->sigset_active = 1; vcpu->sigset = *sigset; } else vcpu->sigset_active = 0; return 0; } static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer, size_t size, loff_t *offset) { struct kvm_vcpu *vcpu = file->private_data; return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header, &kvm_vcpu_stats_desc[0], &vcpu->stat, sizeof(vcpu->stat), user_buffer, size, offset); } static int kvm_vcpu_stats_release(struct inode *inode, struct file *file) { struct kvm_vcpu *vcpu = file->private_data; kvm_put_kvm(vcpu->kvm); return 0; } static const struct file_operations kvm_vcpu_stats_fops = { .owner = THIS_MODULE, .read = kvm_vcpu_stats_read, .release = kvm_vcpu_stats_release, .llseek = noop_llseek, }; static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu) { int fd; struct file *file; char name[15 + ITOA_MAX_LEN + 1]; snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id); fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) return fd; file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY); if (IS_ERR(file)) { put_unused_fd(fd); return PTR_ERR(file); } kvm_get_kvm(vcpu->kvm); file->f_mode |= FMODE_PREAD; fd_install(fd, file); return fd; } #ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range) { int idx; long r; u64 full_size; if (range->flags) return -EINVAL; if (!PAGE_ALIGNED(range->gpa) || !PAGE_ALIGNED(range->size) || range->gpa + range->size <= range->gpa) return -EINVAL; vcpu_load(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); full_size = range->size; do { if (signal_pending(current)) { r = -EINTR; break; } r = kvm_arch_vcpu_pre_fault_memory(vcpu, range); if (WARN_ON_ONCE(r == 0 || r == -EIO)) break; if (r < 0) break; range->size -= r; range->gpa += r; cond_resched(); } while (range->size); srcu_read_unlock(&vcpu->kvm->srcu, idx); vcpu_put(vcpu); /* Return success if at least one page was mapped successfully. */ return full_size == range->size ? r : 0; } #endif static int kvm_wait_for_vcpu_online(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; /* * In practice, this happy path will always be taken, as a well-behaved * VMM will never invoke a vCPU ioctl() before KVM_CREATE_VCPU returns. */ if (likely(vcpu->vcpu_idx < atomic_read(&kvm->online_vcpus))) return 0; /* * Acquire and release the vCPU's mutex to wait for vCPU creation to * complete (kvm_vm_ioctl_create_vcpu() holds the mutex until the vCPU * is fully online). 
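 *
 * For context, the vCPU ioctls dispatched below are driven from userspace by
 * a loop of roughly this shape (illustrative sketch; run is the mmap()ed
 * kvm_run area and the helper name is made up):
 *
 *   #include <sys/ioctl.h>
 *   #include <linux/kvm.h>
 *
 *   static int run_vcpu(int vcpu_fd, struct kvm_run *run)
 *   {
 *           for (;;) {
 *                   if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
 *                           return -1;
 *
 *                   switch (run->exit_reason) {
 *                   case KVM_EXIT_HLT:
 *                           return 0;
 *                   case KVM_EXIT_IO:
 *                           // Emulate the port access described by run->io.
 *                           break;
 *                   case KVM_EXIT_MMIO:
 *                           // Emulate the access described by run->mmio.
 *                           break;
 *                   default:
 *                           return -1;
 *                   }
 *           }
 *   }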
*/ if (mutex_lock_killable(&vcpu->mutex)) return -EINTR; mutex_unlock(&vcpu->mutex); if (WARN_ON_ONCE(!kvm_get_vcpu(kvm, vcpu->vcpu_idx))) return -EIO; return 0; } static long kvm_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; int r; struct kvm_fpu *fpu = NULL; struct kvm_sregs *kvm_sregs = NULL; if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead) return -EIO; if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) return -EINVAL; /* * Wait for the vCPU to be online before handling the ioctl(), as KVM * assumes the vCPU is reachable via vcpu_array, i.e. may dereference * a NULL pointer if userspace invokes an ioctl() before KVM is ready. */ r = kvm_wait_for_vcpu_online(vcpu); if (r) return r; /* * Some architectures have vcpu ioctls that are asynchronous to vcpu * execution; mutex_lock() would break them. */ r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg); if (r != -ENOIOCTLCMD) return r; if (mutex_lock_killable(&vcpu->mutex)) return -EINTR; switch (ioctl) { case KVM_RUN: { struct pid *oldpid; r = -EINVAL; if (arg) goto out; /* * Note, vcpu->pid is primarily protected by vcpu->mutex. The * dedicated r/w lock allows other tasks, e.g. other vCPUs, to * read vcpu->pid while this vCPU is in KVM_RUN, e.g. to yield * directly to this vCPU */ oldpid = vcpu->pid; if (unlikely(oldpid != task_pid(current))) { /* The thread running this VCPU changed. */ struct pid *newpid; r = kvm_arch_vcpu_run_pid_change(vcpu); if (r) break; newpid = get_task_pid(current, PIDTYPE_PID); write_lock(&vcpu->pid_lock); vcpu->pid = newpid; write_unlock(&vcpu->pid_lock); put_pid(oldpid); } vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe); r = kvm_arch_vcpu_ioctl_run(vcpu); vcpu->wants_to_run = false; trace_kvm_userspace_exit(vcpu->run->exit_reason, r); break; } case KVM_GET_REGS: { struct kvm_regs *kvm_regs; r = -ENOMEM; kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); if (!kvm_regs) goto out; r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); if (r) goto out_free1; r = -EFAULT; if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) goto out_free1; r = 0; out_free1: kfree(kvm_regs); break; } case KVM_SET_REGS: { struct kvm_regs *kvm_regs; kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); if (IS_ERR(kvm_regs)) { r = PTR_ERR(kvm_regs); goto out; } r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); kfree(kvm_regs); break; } case KVM_GET_SREGS: { kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); r = -ENOMEM; if (!kvm_sregs) goto out; r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); if (r) goto out; r = -EFAULT; if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) goto out; r = 0; break; } case KVM_SET_SREGS: { kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); if (IS_ERR(kvm_sregs)) { r = PTR_ERR(kvm_sregs); kvm_sregs = NULL; goto out; } r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); break; } case KVM_GET_MP_STATE: { struct kvm_mp_state mp_state; r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); if (r) goto out; r = -EFAULT; if (copy_to_user(argp, &mp_state, sizeof(mp_state))) goto out; r = 0; break; } case KVM_SET_MP_STATE: { struct kvm_mp_state mp_state; r = -EFAULT; if (copy_from_user(&mp_state, argp, sizeof(mp_state))) goto out; r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); break; } case KVM_TRANSLATE: { struct kvm_translation tr; r = -EFAULT; if (copy_from_user(&tr, argp, sizeof(tr))) goto out; r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); if (r) goto out; r = 
-EFAULT; if (copy_to_user(argp, &tr, sizeof(tr))) goto out; r = 0; break; } case KVM_SET_GUEST_DEBUG: { struct kvm_guest_debug dbg; r = -EFAULT; if (copy_from_user(&dbg, argp, sizeof(dbg))) goto out; r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); break; } case KVM_SET_SIGNAL_MASK: { struct kvm_signal_mask __user *sigmask_arg = argp; struct kvm_signal_mask kvm_sigmask; sigset_t sigset, *p; p = NULL; if (argp) { r = -EFAULT; if (copy_from_user(&kvm_sigmask, argp, sizeof(kvm_sigmask))) goto out; r = -EINVAL; if (kvm_sigmask.len != sizeof(sigset)) goto out; r = -EFAULT; if (copy_from_user(&sigset, sigmask_arg->sigset, sizeof(sigset))) goto out; p = &sigset; } r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); break; } case KVM_GET_FPU: { fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); r = -ENOMEM; if (!fpu) goto out; r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); if (r) goto out; r = -EFAULT; if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) goto out; r = 0; break; } case KVM_SET_FPU: { fpu = memdup_user(argp, sizeof(*fpu)); if (IS_ERR(fpu)) { r = PTR_ERR(fpu); fpu = NULL; goto out; } r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); break; } case KVM_GET_STATS_FD: { r = kvm_vcpu_ioctl_get_stats_fd(vcpu); break; } #ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY case KVM_PRE_FAULT_MEMORY: { struct kvm_pre_fault_memory range; r = -EFAULT; if (copy_from_user(&range, argp, sizeof(range))) break; r = kvm_vcpu_pre_fault_memory(vcpu, &range); /* Pass back leftover range. */ if (copy_to_user(argp, &range, sizeof(range))) r = -EFAULT; break; } #endif default: r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); } out: mutex_unlock(&vcpu->mutex); kfree(fpu); kfree(kvm_sregs); return r; } #ifdef CONFIG_KVM_COMPAT static long kvm_vcpu_compat_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = compat_ptr(arg); int r; if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead) return -EIO; switch (ioctl) { case KVM_SET_SIGNAL_MASK: { struct kvm_signal_mask __user *sigmask_arg = argp; struct kvm_signal_mask kvm_sigmask; sigset_t sigset; if (argp) { r = -EFAULT; if (copy_from_user(&kvm_sigmask, argp, sizeof(kvm_sigmask))) goto out; r = -EINVAL; if (kvm_sigmask.len != sizeof(compat_sigset_t)) goto out; r = -EFAULT; if (get_compat_sigset(&sigset, (compat_sigset_t __user *)sigmask_arg->sigset)) goto out; r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); } else r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL); break; } default: r = kvm_vcpu_ioctl(filp, ioctl, arg); } out: return r; } #endif static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma) { struct kvm_device *dev = filp->private_data; if (dev->ops->mmap) return dev->ops->mmap(dev, vma); return -ENODEV; } static int kvm_device_ioctl_attr(struct kvm_device *dev, int (*accessor)(struct kvm_device *dev, struct kvm_device_attr *attr), unsigned long arg) { struct kvm_device_attr attr; if (!accessor) return -EPERM; if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) return -EFAULT; return accessor(dev, &attr); } static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm_device *dev = filp->private_data; if (dev->kvm->mm != current->mm || dev->kvm->vm_dead) return -EIO; switch (ioctl) { case KVM_SET_DEVICE_ATTR: return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg); case KVM_GET_DEVICE_ATTR: return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg); case KVM_HAS_DEVICE_ATTR: return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg); default: if 
(dev->ops->ioctl) return dev->ops->ioctl(dev, ioctl, arg); return -ENOTTY; } } static int kvm_device_release(struct inode *inode, struct file *filp) { struct kvm_device *dev = filp->private_data; struct kvm *kvm = dev->kvm; if (dev->ops->release) { mutex_lock(&kvm->lock); list_del_rcu(&dev->vm_node); synchronize_rcu(); dev->ops->release(dev); mutex_unlock(&kvm->lock); } kvm_put_kvm(kvm); return 0; } static struct file_operations kvm_device_fops = { .unlocked_ioctl = kvm_device_ioctl, .release = kvm_device_release, KVM_COMPAT(kvm_device_ioctl), .mmap = kvm_device_mmap, }; struct kvm_device *kvm_device_from_filp(struct file *filp) { if (filp->f_op != &kvm_device_fops) return NULL; return filp->private_data; } static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { #ifdef CONFIG_KVM_MPIC [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, #endif }; int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type) { if (type >= ARRAY_SIZE(kvm_device_ops_table)) return -ENOSPC; if (kvm_device_ops_table[type] != NULL) return -EEXIST; kvm_device_ops_table[type] = ops; return 0; } void kvm_unregister_device_ops(u32 type) { if (kvm_device_ops_table[type] != NULL) kvm_device_ops_table[type] = NULL; } static int kvm_ioctl_create_device(struct kvm *kvm, struct kvm_create_device *cd) { const struct kvm_device_ops *ops; struct kvm_device *dev; bool test = cd->flags & KVM_CREATE_DEVICE_TEST; int type; int ret; if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) return -ENODEV; type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table)); ops = kvm_device_ops_table[type]; if (ops == NULL) return -ENODEV; if (test) return 0; dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT); if (!dev) return -ENOMEM; dev->ops = ops; dev->kvm = kvm; mutex_lock(&kvm->lock); ret = ops->create(dev, type); if (ret < 0) { mutex_unlock(&kvm->lock); kfree(dev); return ret; } list_add_rcu(&dev->vm_node, &kvm->devices); mutex_unlock(&kvm->lock); if (ops->init) ops->init(dev); kvm_get_kvm(kvm); ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); if (ret < 0) { kvm_put_kvm_no_destroy(kvm); mutex_lock(&kvm->lock); list_del_rcu(&dev->vm_node); synchronize_rcu(); if (ops->release) ops->release(dev); mutex_unlock(&kvm->lock); if (ops->destroy) ops->destroy(dev); return ret; } cd->fd = ret; return 0; } static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) { switch (arg) { case KVM_CAP_USER_MEMORY: case KVM_CAP_USER_MEMORY2: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: case KVM_CAP_INTERNAL_ERROR_DATA: #ifdef CONFIG_HAVE_KVM_MSI case KVM_CAP_SIGNAL_MSI: #endif #ifdef CONFIG_HAVE_KVM_IRQCHIP case KVM_CAP_IRQFD: #endif case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_HALT_POLL: return 1; #ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: return KVM_COALESCED_MMIO_PAGE_OFFSET; case KVM_CAP_COALESCED_PIO: return 1; #endif #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: return KVM_DIRTY_LOG_MANUAL_CAPS; #endif #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING case KVM_CAP_IRQ_ROUTING: return KVM_MAX_IRQ_ROUTES; #endif #if KVM_MAX_NR_ADDRESS_SPACES > 1 case KVM_CAP_MULTI_ADDRESS_SPACE: if (kvm) return kvm_arch_nr_memslot_as_ids(kvm); return KVM_MAX_NR_ADDRESS_SPACES; #endif case KVM_CAP_NR_MEMSLOTS: return KVM_USER_MEM_SLOTS; case KVM_CAP_DIRTY_LOG_RING: #ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO return 
KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn); #else return 0; #endif case KVM_CAP_DIRTY_LOG_RING_ACQ_REL: #ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn); #else return 0; #endif #ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: #endif case KVM_CAP_BINARY_STATS_FD: case KVM_CAP_SYSTEM_EVENT_DATA: case KVM_CAP_DEVICE_CTRL: return 1; #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES case KVM_CAP_MEMORY_ATTRIBUTES: return kvm_supported_mem_attributes(kvm); #endif #ifdef CONFIG_KVM_PRIVATE_MEM case KVM_CAP_GUEST_MEMFD: return !kvm || kvm_arch_has_private_mem(kvm); #endif default: break; } return kvm_vm_ioctl_check_extension(kvm, arg); } static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size) { int r; if (!KVM_DIRTY_LOG_PAGE_OFFSET) return -EINVAL; /* the size should be power of 2 */ if (!size || (size & (size - 1))) return -EINVAL; /* Should be bigger to keep the reserved entries, or a page */ if (size < kvm_dirty_ring_get_rsvd_entries() * sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE) return -EINVAL; if (size > KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn)) return -E2BIG; /* We only allow it to set once */ if (kvm->dirty_ring_size) return -EINVAL; mutex_lock(&kvm->lock); if (kvm->created_vcpus) { /* We don't allow to change this value after vcpu created */ r = -EINVAL; } else { kvm->dirty_ring_size = size; r = 0; } mutex_unlock(&kvm->lock); return r; } static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm) { unsigned long i; struct kvm_vcpu *vcpu; int cleared = 0; if (!kvm->dirty_ring_size) return -EINVAL; mutex_lock(&kvm->slots_lock); kvm_for_each_vcpu(i, vcpu, kvm) cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring); mutex_unlock(&kvm->slots_lock); if (cleared) kvm_flush_remote_tlbs(kvm); return cleared; } int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { return -EINVAL; } bool kvm_are_all_memslots_empty(struct kvm *kvm) { int i; lockdep_assert_held(&kvm->slots_lock); for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { if (!kvm_memslots_empty(__kvm_memslots(kvm, i))) return false; } return true; } EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty); static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, struct kvm_enable_cap *cap) { switch (cap->cap) { #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: { u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE; if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE) allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS; if (cap->flags || (cap->args[0] & ~allowed_options)) return -EINVAL; kvm->manual_dirty_log_protect = cap->args[0]; return 0; } #endif case KVM_CAP_HALT_POLL: { if (cap->flags || cap->args[0] != (unsigned int)cap->args[0]) return -EINVAL; kvm->max_halt_poll_ns = cap->args[0]; /* * Ensure kvm->override_halt_poll_ns does not become visible * before kvm->max_halt_poll_ns. * * Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns(). 
*/ smp_wmb(); kvm->override_halt_poll_ns = true; return 0; } case KVM_CAP_DIRTY_LOG_RING: case KVM_CAP_DIRTY_LOG_RING_ACQ_REL: if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap)) return -EINVAL; return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]); case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: { int r = -EINVAL; if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) || !kvm->dirty_ring_size || cap->flags) return r; mutex_lock(&kvm->slots_lock); /* * For simplicity, allow enabling ring+bitmap if and only if * there are no memslots, e.g. to ensure all memslots allocate * a bitmap after the capability is enabled. */ if (kvm_are_all_memslots_empty(kvm)) { kvm->dirty_ring_with_bitmap = true; r = 0; } mutex_unlock(&kvm->slots_lock); return r; } default: return kvm_vm_ioctl_enable_cap(kvm, cap); } } static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer, size_t size, loff_t *offset) { struct kvm *kvm = file->private_data; return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header, &kvm_vm_stats_desc[0], &kvm->stat, sizeof(kvm->stat), user_buffer, size, offset); } static int kvm_vm_stats_release(struct inode *inode, struct file *file) { struct kvm *kvm = file->private_data; kvm_put_kvm(kvm); return 0; } static const struct file_operations kvm_vm_stats_fops = { .owner = THIS_MODULE, .read = kvm_vm_stats_read, .release = kvm_vm_stats_release, .llseek = noop_llseek, }; static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm) { int fd; struct file *file; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) return fd; file = anon_inode_getfile("kvm-vm-stats", &kvm_vm_stats_fops, kvm, O_RDONLY); if (IS_ERR(file)) { put_unused_fd(fd); return PTR_ERR(file); } kvm_get_kvm(kvm); file->f_mode |= FMODE_PREAD; fd_install(fd, file); return fd; } #define SANITY_CHECK_MEM_REGION_FIELD(field) \ do { \ BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) != \ offsetof(struct kvm_userspace_memory_region2, field)); \ BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) != \ sizeof_field(struct kvm_userspace_memory_region2, field)); \ } while (0) static long kvm_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; int r; if (kvm->mm != current->mm || kvm->vm_dead) return -EIO; switch (ioctl) { case KVM_CREATE_VCPU: r = kvm_vm_ioctl_create_vcpu(kvm, arg); break; case KVM_ENABLE_CAP: { struct kvm_enable_cap cap; r = -EFAULT; if (copy_from_user(&cap, argp, sizeof(cap))) goto out; r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap); break; } case KVM_SET_USER_MEMORY_REGION2: case KVM_SET_USER_MEMORY_REGION: { struct kvm_userspace_memory_region2 mem; unsigned long size; if (ioctl == KVM_SET_USER_MEMORY_REGION) { /* * Fields beyond struct kvm_userspace_memory_region shouldn't be * accessed, but avoid leaking kernel memory in case of a bug. */ memset(&mem, 0, sizeof(mem)); size = sizeof(struct kvm_userspace_memory_region); } else { size = sizeof(struct kvm_userspace_memory_region2); } /* Ensure the common parts of the two structs are identical. 
*/ SANITY_CHECK_MEM_REGION_FIELD(slot); SANITY_CHECK_MEM_REGION_FIELD(flags); SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr); SANITY_CHECK_MEM_REGION_FIELD(memory_size); SANITY_CHECK_MEM_REGION_FIELD(userspace_addr); r = -EFAULT; if (copy_from_user(&mem, argp, size)) goto out; r = -EINVAL; if (ioctl == KVM_SET_USER_MEMORY_REGION && (mem.flags & ~KVM_SET_USER_MEMORY_REGION_V1_FLAGS)) goto out; r = kvm_vm_ioctl_set_memory_region(kvm, &mem); break; } case KVM_GET_DIRTY_LOG: { struct kvm_dirty_log log; r = -EFAULT; if (copy_from_user(&log, argp, sizeof(log))) goto out; r = kvm_vm_ioctl_get_dirty_log(kvm, &log); break; } #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT case KVM_CLEAR_DIRTY_LOG: { struct kvm_clear_dirty_log log; r = -EFAULT; if (copy_from_user(&log, argp, sizeof(log))) goto out; r = kvm_vm_ioctl_clear_dirty_log(kvm, &log); break; } #endif #ifdef CONFIG_KVM_MMIO case KVM_REGISTER_COALESCED_MMIO: { struct kvm_coalesced_mmio_zone zone; r = -EFAULT; if (copy_from_user(&zone, argp, sizeof(zone))) goto out; r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); break; } case KVM_UNREGISTER_COALESCED_MMIO: { struct kvm_coalesced_mmio_zone zone; r = -EFAULT; if (copy_from_user(&zone, argp, sizeof(zone))) goto out; r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); break; } #endif case KVM_IRQFD: { struct kvm_irqfd data; r = -EFAULT; if (copy_from_user(&data, argp, sizeof(data))) goto out; r = kvm_irqfd(kvm, &data); break; } case KVM_IOEVENTFD: { struct kvm_ioeventfd data; r = -EFAULT; if (copy_from_user(&data, argp, sizeof(data))) goto out; r = kvm_ioeventfd(kvm, &data); break; } #ifdef CONFIG_HAVE_KVM_MSI case KVM_SIGNAL_MSI: { struct kvm_msi msi; r = -EFAULT; if (copy_from_user(&msi, argp, sizeof(msi))) goto out; r = kvm_send_userspace_msi(kvm, &msi); break; } #endif #ifdef __KVM_HAVE_IRQ_LINE case KVM_IRQ_LINE_STATUS: case KVM_IRQ_LINE: { struct kvm_irq_level irq_event; r = -EFAULT; if (copy_from_user(&irq_event, argp, sizeof(irq_event))) goto out; r = kvm_vm_ioctl_irq_line(kvm, &irq_event, ioctl == KVM_IRQ_LINE_STATUS); if (r) goto out; r = -EFAULT; if (ioctl == KVM_IRQ_LINE_STATUS) { if (copy_to_user(argp, &irq_event, sizeof(irq_event))) goto out; } r = 0; break; } #endif #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING case KVM_SET_GSI_ROUTING: { struct kvm_irq_routing routing; struct kvm_irq_routing __user *urouting; struct kvm_irq_routing_entry *entries = NULL; r = -EFAULT; if (copy_from_user(&routing, argp, sizeof(routing))) goto out; r = -EINVAL; if (!kvm_arch_can_set_irq_routing(kvm)) goto out; if (routing.nr > KVM_MAX_IRQ_ROUTES) goto out; if (routing.flags) goto out; if (routing.nr) { urouting = argp; entries = vmemdup_array_user(urouting->entries, routing.nr, sizeof(*entries)); if (IS_ERR(entries)) { r = PTR_ERR(entries); goto out; } } r = kvm_set_irq_routing(kvm, entries, routing.nr, routing.flags); kvfree(entries); break; } #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES case KVM_SET_MEMORY_ATTRIBUTES: { struct kvm_memory_attributes attrs; r = -EFAULT; if (copy_from_user(&attrs, argp, sizeof(attrs))) goto out; r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs); break; } #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ case KVM_CREATE_DEVICE: { struct kvm_create_device cd; r = -EFAULT; if (copy_from_user(&cd, argp, sizeof(cd))) goto out; r = kvm_ioctl_create_device(kvm, &cd); if (r) goto out; r = -EFAULT; if (copy_to_user(argp, &cd, sizeof(cd))) goto out; r = 0; break; } case KVM_CHECK_EXTENSION: r = kvm_vm_ioctl_check_extension_generic(kvm, arg); 
break; case KVM_RESET_DIRTY_RINGS: r = kvm_vm_ioctl_reset_dirty_pages(kvm); break; case KVM_GET_STATS_FD: r = kvm_vm_ioctl_get_stats_fd(kvm); break; #ifdef CONFIG_KVM_PRIVATE_MEM case KVM_CREATE_GUEST_MEMFD: { struct kvm_create_guest_memfd guest_memfd; r = -EFAULT; if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd))) goto out; r = kvm_gmem_create(kvm, &guest_memfd); break; } #endif default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); } out: return r; } #ifdef CONFIG_KVM_COMPAT struct compat_kvm_dirty_log { __u32 slot; __u32 padding1; union { compat_uptr_t dirty_bitmap; /* one bit per page */ __u64 padding2; }; }; struct compat_kvm_clear_dirty_log { __u32 slot; __u32 num_pages; __u64 first_page; union { compat_uptr_t dirty_bitmap; /* one bit per page */ __u64 padding2; }; }; long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { return -ENOTTY; } static long kvm_vm_compat_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; int r; if (kvm->mm != current->mm || kvm->vm_dead) return -EIO; r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg); if (r != -ENOTTY) return r; switch (ioctl) { #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT case KVM_CLEAR_DIRTY_LOG: { struct compat_kvm_clear_dirty_log compat_log; struct kvm_clear_dirty_log log; if (copy_from_user(&compat_log, (void __user *)arg, sizeof(compat_log))) return -EFAULT; log.slot = compat_log.slot; log.num_pages = compat_log.num_pages; log.first_page = compat_log.first_page; log.padding2 = compat_log.padding2; log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); r = kvm_vm_ioctl_clear_dirty_log(kvm, &log); break; } #endif case KVM_GET_DIRTY_LOG: { struct compat_kvm_dirty_log compat_log; struct kvm_dirty_log log; if (copy_from_user(&compat_log, (void __user *)arg, sizeof(compat_log))) return -EFAULT; log.slot = compat_log.slot; log.padding1 = compat_log.padding1; log.padding2 = compat_log.padding2; log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); r = kvm_vm_ioctl_get_dirty_log(kvm, &log); break; } default: r = kvm_vm_ioctl(filp, ioctl, arg); } return r; } #endif static struct file_operations kvm_vm_fops = { .release = kvm_vm_release, .unlocked_ioctl = kvm_vm_ioctl, .llseek = noop_llseek, KVM_COMPAT(kvm_vm_compat_ioctl), }; bool file_is_kvm(struct file *file) { return file && file->f_op == &kvm_vm_fops; } EXPORT_SYMBOL_GPL(file_is_kvm); static int kvm_dev_ioctl_create_vm(unsigned long type) { char fdname[ITOA_MAX_LEN + 1]; int r, fd; struct kvm *kvm; struct file *file; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) return fd; snprintf(fdname, sizeof(fdname), "%d", fd); kvm = kvm_create_vm(type, fdname); if (IS_ERR(kvm)) { r = PTR_ERR(kvm); goto put_fd; } file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); if (IS_ERR(file)) { r = PTR_ERR(file); goto put_kvm; } /* * Don't call kvm_put_kvm anymore at this point; file->f_op is * already set, with ->release() being kvm_vm_release(). In error * cases it will be called by the final fput(file) and will take * care of doing kvm_put_kvm(kvm). 
*/ kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm); fd_install(fd, file); return fd; put_kvm: kvm_put_kvm(kvm); put_fd: put_unused_fd(fd); return r; } static long kvm_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { int r = -EINVAL; switch (ioctl) { case KVM_GET_API_VERSION: if (arg) goto out; r = KVM_API_VERSION; break; case KVM_CREATE_VM: r = kvm_dev_ioctl_create_vm(arg); break; case KVM_CHECK_EXTENSION: r = kvm_vm_ioctl_check_extension_generic(NULL, arg); break; case KVM_GET_VCPU_MMAP_SIZE: if (arg) goto out; r = PAGE_SIZE; /* struct kvm_run */ #ifdef CONFIG_X86 r += PAGE_SIZE; /* pio data page */ #endif #ifdef CONFIG_KVM_MMIO r += PAGE_SIZE; /* coalesced mmio ring page */ #endif break; default: return kvm_arch_dev_ioctl(filp, ioctl, arg); } out: return r; } static struct file_operations kvm_chardev_ops = { .unlocked_ioctl = kvm_dev_ioctl, .llseek = noop_llseek, KVM_COMPAT(kvm_dev_ioctl), }; static struct miscdevice kvm_dev = { KVM_MINOR, "kvm", &kvm_chardev_ops, }; #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING static bool enable_virt_at_load = true; module_param(enable_virt_at_load, bool, 0444); __visible bool kvm_rebooting; EXPORT_SYMBOL_GPL(kvm_rebooting); static DEFINE_PER_CPU(bool, virtualization_enabled); static DEFINE_MUTEX(kvm_usage_lock); static int kvm_usage_count; __weak void kvm_arch_enable_virtualization(void) { } __weak void kvm_arch_disable_virtualization(void) { } static int kvm_enable_virtualization_cpu(void) { if (__this_cpu_read(virtualization_enabled)) return 0; if (kvm_arch_enable_virtualization_cpu()) { pr_info("kvm: enabling virtualization on CPU%d failed\n", raw_smp_processor_id()); return -EIO; } __this_cpu_write(virtualization_enabled, true); return 0; } static int kvm_online_cpu(unsigned int cpu) { /* * Abort the CPU online process if hardware virtualization cannot * be enabled. Otherwise running VMs would encounter unrecoverable * errors when scheduled to this CPU. */ return kvm_enable_virtualization_cpu(); } static void kvm_disable_virtualization_cpu(void *ign) { if (!__this_cpu_read(virtualization_enabled)) return; kvm_arch_disable_virtualization_cpu(); __this_cpu_write(virtualization_enabled, false); } static int kvm_offline_cpu(unsigned int cpu) { kvm_disable_virtualization_cpu(NULL); return 0; } static void kvm_shutdown(void) { /* * Disable hardware virtualization and set kvm_rebooting to indicate * that KVM has asynchronously disabled hardware virtualization, i.e. * that relevant errors and exceptions aren't entirely unexpected. * Some flavors of hardware virtualization need to be disabled before * transferring control to firmware (to perform shutdown/reboot), e.g. * on x86, virtualization can block INIT interrupts, which are used by * firmware to pull APs back under firmware control. Note, this path * is used for both shutdown and reboot scenarios, i.e. neither name is * 100% comprehensive. */ pr_info("kvm: exiting hardware virtualization\n"); kvm_rebooting = true; on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1); } static int kvm_suspend(void) { /* * Secondary CPUs and CPU hotplug are disabled across the suspend/resume * callbacks, i.e. no need to acquire kvm_usage_lock to ensure the usage * count is stable. Assert that kvm_usage_lock is not held to ensure * the system isn't suspended while KVM is enabling hardware. Hardware * enabling can be preempted, but the task cannot be frozen until it has * dropped all locks (userspace tasks are frozen via a fake signal). 
*/ lockdep_assert_not_held(&kvm_usage_lock); lockdep_assert_irqs_disabled(); kvm_disable_virtualization_cpu(NULL); return 0; } static void kvm_resume(void) { lockdep_assert_not_held(&kvm_usage_lock); lockdep_assert_irqs_disabled(); WARN_ON_ONCE(kvm_enable_virtualization_cpu()); } static struct syscore_ops kvm_syscore_ops = { .suspend = kvm_suspend, .resume = kvm_resume, .shutdown = kvm_shutdown, }; static int kvm_enable_virtualization(void) { int r; guard(mutex)(&kvm_usage_lock); if (kvm_usage_count++) return 0; kvm_arch_enable_virtualization(); r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", kvm_online_cpu, kvm_offline_cpu); if (r) goto err_cpuhp; register_syscore_ops(&kvm_syscore_ops); /* * Undo virtualization enabling and bail if the system is going down. * If userspace initiated a forced reboot, e.g. reboot -f, then it's * possible for an in-flight operation to enable virtualization after * syscore_shutdown() is called, i.e. without kvm_shutdown() being * invoked. Note, this relies on system_state being set _before_ * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked * or this CPU observes the impending shutdown. Which is why KVM uses * a syscore ops hook instead of registering a dedicated reboot * notifier (the latter runs before system_state is updated). */ if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || system_state == SYSTEM_RESTART) { r = -EBUSY; goto err_rebooting; } return 0; err_rebooting: unregister_syscore_ops(&kvm_syscore_ops); cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); err_cpuhp: kvm_arch_disable_virtualization(); --kvm_usage_count; return r; } static void kvm_disable_virtualization(void) { guard(mutex)(&kvm_usage_lock); if (--kvm_usage_count) return; unregister_syscore_ops(&kvm_syscore_ops); cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); kvm_arch_disable_virtualization(); } static int kvm_init_virtualization(void) { if (enable_virt_at_load) return kvm_enable_virtualization(); return 0; } static void kvm_uninit_virtualization(void) { if (enable_virt_at_load) kvm_disable_virtualization(); } #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ static int kvm_enable_virtualization(void) { return 0; } static int kvm_init_virtualization(void) { return 0; } static void kvm_disable_virtualization(void) { } static void kvm_uninit_virtualization(void) { } #endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ static void kvm_iodevice_destructor(struct kvm_io_device *dev) { if (dev->ops->destructor) dev->ops->destructor(dev); } static void kvm_io_bus_destroy(struct kvm_io_bus *bus) { int i; for (i = 0; i < bus->dev_count; i++) { struct kvm_io_device *pos = bus->range[i].dev; kvm_iodevice_destructor(pos); } kfree(bus); } static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1, const struct kvm_io_range *r2) { gpa_t addr1 = r1->addr; gpa_t addr2 = r2->addr; if (addr1 < addr2) return -1; /* If r2->len == 0, match the exact address. If r2->len != 0, * accept any overlapping write. Any order is acceptable for * overlapping ranges, because kvm_io_bus_get_first_dev ensures * we process all of them. 
*/ if (r2->len) { addr1 += r1->len; addr2 += r2->len; } if (addr1 > addr2) return 1; return 0; } static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) { return kvm_io_bus_cmp(p1, p2); } static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, gpa_t addr, int len) { struct kvm_io_range *range, key; int off; key = (struct kvm_io_range) { .addr = addr, .len = len, }; range = bsearch(&key, bus->range, bus->dev_count, sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp); if (range == NULL) return -ENOENT; off = range - bus->range; while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0) off--; return off; } static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, struct kvm_io_range *range, const void *val) { int idx; idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); if (idx < 0) return -EOPNOTSUPP; while (idx < bus->dev_count && kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr, range->len, val)) return idx; idx++; } return -EOPNOTSUPP; } /* kvm_io_bus_write - called under kvm->slots_lock */ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val) { struct kvm_io_bus *bus; struct kvm_io_range range; int r; range = (struct kvm_io_range) { .addr = addr, .len = len, }; bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); if (!bus) return -ENOMEM; r = __kvm_io_bus_write(vcpu, bus, &range, val); return r < 0 ? r : 0; } EXPORT_SYMBOL_GPL(kvm_io_bus_write); /* kvm_io_bus_write_cookie - called under kvm->slots_lock */ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val, long cookie) { struct kvm_io_bus *bus; struct kvm_io_range range; range = (struct kvm_io_range) { .addr = addr, .len = len, }; bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); if (!bus) return -ENOMEM; /* First try the device referenced by cookie. */ if ((cookie >= 0) && (cookie < bus->dev_count) && (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0)) if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len, val)) return cookie; /* * cookie contained garbage; fall back to search and return the * correct cookie value. */ return __kvm_io_bus_write(vcpu, bus, &range, val); } static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, struct kvm_io_range *range, void *val) { int idx; idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); if (idx < 0) return -EOPNOTSUPP; while (idx < bus->dev_count && kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr, range->len, val)) return idx; idx++; } return -EOPNOTSUPP; } /* kvm_io_bus_read - called under kvm->slots_lock */ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, void *val) { struct kvm_io_bus *bus; struct kvm_io_range range; int r; range = (struct kvm_io_range) { .addr = addr, .len = len, }; bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); if (!bus) return -ENOMEM; r = __kvm_io_bus_read(vcpu, bus, &range, val); return r < 0 ? 
r : 0; } int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev) { int i; struct kvm_io_bus *new_bus, *bus; struct kvm_io_range range; lockdep_assert_held(&kvm->slots_lock); bus = kvm_get_bus(kvm, bus_idx); if (!bus) return -ENOMEM; /* exclude ioeventfd which is limited by maximum fd */ if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) return -ENOSPC; new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1), GFP_KERNEL_ACCOUNT); if (!new_bus) return -ENOMEM; range = (struct kvm_io_range) { .addr = addr, .len = len, .dev = dev, }; for (i = 0; i < bus->dev_count; i++) if (kvm_io_bus_cmp(&bus->range[i], &range) > 0) break; memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); new_bus->dev_count++; new_bus->range[i] = range; memcpy(new_bus->range + i + 1, bus->range + i, (bus->dev_count - i) * sizeof(struct kvm_io_range)); rcu_assign_pointer(kvm->buses[bus_idx], new_bus); synchronize_srcu_expedited(&kvm->srcu); kfree(bus); return 0; } int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_io_device *dev) { int i; struct kvm_io_bus *new_bus, *bus; lockdep_assert_held(&kvm->slots_lock); bus = kvm_get_bus(kvm, bus_idx); if (!bus) return 0; for (i = 0; i < bus->dev_count; i++) { if (bus->range[i].dev == dev) { break; } } if (i == bus->dev_count) return 0; new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1), GFP_KERNEL_ACCOUNT); if (new_bus) { memcpy(new_bus, bus, struct_size(bus, range, i)); new_bus->dev_count--; memcpy(new_bus->range + i, bus->range + i + 1, flex_array_size(new_bus, range, new_bus->dev_count - i)); } rcu_assign_pointer(kvm->buses[bus_idx], new_bus); synchronize_srcu_expedited(&kvm->srcu); /* * If NULL bus is installed, destroy the old bus, including all the * attached devices. Otherwise, destroy the caller's device only. */ if (!new_bus) { pr_err("kvm: failed to shrink bus, removing it completely\n"); kvm_io_bus_destroy(bus); return -ENOMEM; } kvm_iodevice_destructor(dev); kfree(bus); return 0; } struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr) { struct kvm_io_bus *bus; int dev_idx, srcu_idx; struct kvm_io_device *iodev = NULL; srcu_idx = srcu_read_lock(&kvm->srcu); bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); if (!bus) goto out_unlock; dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1); if (dev_idx < 0) goto out_unlock; iodev = bus->range[dev_idx].dev; out_unlock: srcu_read_unlock(&kvm->srcu, srcu_idx); return iodev; } EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev); static int kvm_debugfs_open(struct inode *inode, struct file *file, int (*get)(void *, u64 *), int (*set)(void *, u64), const char *fmt) { int ret; struct kvm_stat_data *stat_data = inode->i_private; /* * The debugfs files are a reference to the kvm struct which * is still valid when kvm_destroy_vm is called. kvm_get_kvm_safe * avoids the race between open and the removal of the debugfs directory. */ if (!kvm_get_kvm_safe(stat_data->kvm)) return -ENOENT; ret = simple_attr_open(inode, file, get, kvm_stats_debugfs_mode(stat_data->desc) & 0222 ? 
set : NULL, fmt); if (ret) kvm_put_kvm(stat_data->kvm); return ret; } static int kvm_debugfs_release(struct inode *inode, struct file *file) { struct kvm_stat_data *stat_data = inode->i_private; simple_attr_release(inode, file); kvm_put_kvm(stat_data->kvm); return 0; } static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val) { *val = *(u64 *)((void *)(&kvm->stat) + offset); return 0; } static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset) { *(u64 *)((void *)(&kvm->stat) + offset) = 0; return 0; } static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val) { unsigned long i; struct kvm_vcpu *vcpu; *val = 0; kvm_for_each_vcpu(i, vcpu, kvm) *val += *(u64 *)((void *)(&vcpu->stat) + offset); return 0; } static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset) { unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) *(u64 *)((void *)(&vcpu->stat) + offset) = 0; return 0; } static int kvm_stat_data_get(void *data, u64 *val) { int r = -EFAULT; struct kvm_stat_data *stat_data = data; switch (stat_data->kind) { case KVM_STAT_VM: r = kvm_get_stat_per_vm(stat_data->kvm, stat_data->desc->desc.offset, val); break; case KVM_STAT_VCPU: r = kvm_get_stat_per_vcpu(stat_data->kvm, stat_data->desc->desc.offset, val); break; } return r; } static int kvm_stat_data_clear(void *data, u64 val) { int r = -EFAULT; struct kvm_stat_data *stat_data = data; if (val) return -EINVAL; switch (stat_data->kind) { case KVM_STAT_VM: r = kvm_clear_stat_per_vm(stat_data->kvm, stat_data->desc->desc.offset); break; case KVM_STAT_VCPU: r = kvm_clear_stat_per_vcpu(stat_data->kvm, stat_data->desc->desc.offset); break; } return r; } static int kvm_stat_data_open(struct inode *inode, struct file *file) { __simple_attr_check_format("%llu\n", 0ull); return kvm_debugfs_open(inode, file, kvm_stat_data_get, kvm_stat_data_clear, "%llu\n"); } static const struct file_operations stat_fops_per_vm = { .owner = THIS_MODULE, .open = kvm_stat_data_open, .release = kvm_debugfs_release, .read = simple_attr_read, .write = simple_attr_write, }; static int vm_stat_get(void *_offset, u64 *val) { unsigned offset = (long)_offset; struct kvm *kvm; u64 tmp_val; *val = 0; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_get_stat_per_vm(kvm, offset, &tmp_val); *val += tmp_val; } mutex_unlock(&kvm_lock); return 0; } static int vm_stat_clear(void *_offset, u64 val) { unsigned offset = (long)_offset; struct kvm *kvm; if (val) return -EINVAL; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_clear_stat_per_vm(kvm, offset); } mutex_unlock(&kvm_lock); return 0; } DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n"); DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n"); static int vcpu_stat_get(void *_offset, u64 *val) { unsigned offset = (long)_offset; struct kvm *kvm; u64 tmp_val; *val = 0; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_get_stat_per_vcpu(kvm, offset, &tmp_val); *val += tmp_val; } mutex_unlock(&kvm_lock); return 0; } static int vcpu_stat_clear(void *_offset, u64 val) { unsigned offset = (long)_offset; struct kvm *kvm; if (val) return -EINVAL; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_clear_stat_per_vcpu(kvm, offset); } mutex_unlock(&kvm_lock); return 0; } DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear, "%llu\n"); DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n"); static void 
kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) { struct kobj_uevent_env *env; unsigned long long created, active; if (!kvm_dev.this_device || !kvm) return; mutex_lock(&kvm_lock); if (type == KVM_EVENT_CREATE_VM) { kvm_createvm_count++; kvm_active_vms++; } else if (type == KVM_EVENT_DESTROY_VM) { kvm_active_vms--; } created = kvm_createvm_count; active = kvm_active_vms; mutex_unlock(&kvm_lock); env = kzalloc(sizeof(*env), GFP_KERNEL); if (!env) return; add_uevent_var(env, "CREATED=%llu", created); add_uevent_var(env, "COUNT=%llu", active); if (type == KVM_EVENT_CREATE_VM) { add_uevent_var(env, "EVENT=create"); kvm->userspace_pid = task_pid_nr(current); } else if (type == KVM_EVENT_DESTROY_VM) { add_uevent_var(env, "EVENT=destroy"); } add_uevent_var(env, "PID=%d", kvm->userspace_pid); if (!IS_ERR(kvm->debugfs_dentry)) { char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL); if (p) { tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX); if (!IS_ERR(tmp)) add_uevent_var(env, "STATS_PATH=%s", tmp); kfree(p); } } /* no need for checks, since we are adding at most only 5 keys */ env->envp[env->envp_idx++] = NULL; kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp); kfree(env); } static void kvm_init_debug(void) { const struct file_operations *fops; const struct _kvm_stats_desc *pdesc; int i; kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) { pdesc = &kvm_vm_stats_desc[i]; if (kvm_stats_debugfs_mode(pdesc) & 0222) fops = &vm_stat_fops; else fops = &vm_stat_readonly_fops; debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc), kvm_debugfs_dir, (void *)(long)pdesc->desc.offset, fops); } for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) { pdesc = &kvm_vcpu_stats_desc[i]; if (kvm_stats_debugfs_mode(pdesc) & 0222) fops = &vcpu_stat_fops; else fops = &vcpu_stat_readonly_fops; debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc), kvm_debugfs_dir, (void *)(long)pdesc->desc.offset, fops); } } static inline struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) { return container_of(pn, struct kvm_vcpu, preempt_notifier); } static void kvm_sched_in(struct preempt_notifier *pn, int cpu) { struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); WRITE_ONCE(vcpu->preempted, false); WRITE_ONCE(vcpu->ready, false); __this_cpu_write(kvm_running_vcpu, vcpu); kvm_arch_vcpu_load(vcpu, cpu); WRITE_ONCE(vcpu->scheduled_out, false); } static void kvm_sched_out(struct preempt_notifier *pn, struct task_struct *next) { struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); WRITE_ONCE(vcpu->scheduled_out, true); if (task_is_runnable(current) && vcpu->wants_to_run) { WRITE_ONCE(vcpu->preempted, true); WRITE_ONCE(vcpu->ready, true); } kvm_arch_vcpu_put(vcpu); __this_cpu_write(kvm_running_vcpu, NULL); } /** * kvm_get_running_vcpu - get the vcpu running on the current CPU. * * We can disable preemption locally around accessing the per-CPU variable, * and use the resolved vcpu pointer after enabling preemption again, * because even if the current thread is migrated to another CPU, reading * the per-CPU value later will give us the same value as we update the * per-CPU variable in the preempt notifier handlers. */ struct kvm_vcpu *kvm_get_running_vcpu(void) { struct kvm_vcpu *vcpu; preempt_disable(); vcpu = __this_cpu_read(kvm_running_vcpu); preempt_enable(); return vcpu; } EXPORT_SYMBOL_GPL(kvm_get_running_vcpu); /** * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus. 
*/ struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void) { return &kvm_running_vcpu; } #ifdef CONFIG_GUEST_PERF_EVENTS static unsigned int kvm_guest_state(void) { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); unsigned int state; if (!kvm_arch_pmi_in_guest(vcpu)) return 0; state = PERF_GUEST_ACTIVE; if (!kvm_arch_vcpu_in_kernel(vcpu)) state |= PERF_GUEST_USER; return state; } static unsigned long kvm_guest_get_ip(void) { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */ if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu))) return 0; return kvm_arch_vcpu_get_ip(vcpu); } static struct perf_guest_info_callbacks kvm_guest_cbs = { .state = kvm_guest_state, .get_ip = kvm_guest_get_ip, .handle_intel_pt_intr = NULL, }; void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)) { kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler; perf_register_guest_info_callbacks(&kvm_guest_cbs); } void kvm_unregister_perf_callbacks(void) { perf_unregister_guest_info_callbacks(&kvm_guest_cbs); } #endif int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) { int r; int cpu; /* A kmem cache lets us meet the alignment requirements of fx_save. */ if (!vcpu_align) vcpu_align = __alignof__(struct kvm_vcpu); kvm_vcpu_cache = kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align, SLAB_ACCOUNT, offsetof(struct kvm_vcpu, arch), offsetofend(struct kvm_vcpu, stats_id) - offsetof(struct kvm_vcpu, arch), NULL); if (!kvm_vcpu_cache) return -ENOMEM; for_each_possible_cpu(cpu) { if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu), GFP_KERNEL, cpu_to_node(cpu))) { r = -ENOMEM; goto err_cpu_kick_mask; } } r = kvm_irqfd_init(); if (r) goto err_irqfd; r = kvm_async_pf_init(); if (r) goto err_async_pf; kvm_chardev_ops.owner = module; kvm_vm_fops.owner = module; kvm_vcpu_fops.owner = module; kvm_device_fops.owner = module; kvm_preempt_ops.sched_in = kvm_sched_in; kvm_preempt_ops.sched_out = kvm_sched_out; kvm_init_debug(); r = kvm_vfio_ops_init(); if (WARN_ON_ONCE(r)) goto err_vfio; kvm_gmem_init(module); r = kvm_init_virtualization(); if (r) goto err_virt; /* * Registration _must_ be the very last thing done, as this exposes * /dev/kvm to userspace, i.e. all infrastructure must be setup! */ r = misc_register(&kvm_dev); if (r) { pr_err("kvm: misc device register failed\n"); goto err_register; } return 0; err_register: kvm_uninit_virtualization(); err_virt: kvm_vfio_ops_exit(); err_vfio: kvm_async_pf_deinit(); err_async_pf: kvm_irqfd_exit(); err_irqfd: err_cpu_kick_mask: for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); return r; } EXPORT_SYMBOL_GPL(kvm_init); void kvm_exit(void) { int cpu; /* * Note, unregistering /dev/kvm doesn't strictly need to come first, * fops_get(), a.k.a. try_module_get(), prevents acquiring references * to KVM while the module is being stopped. */ misc_deregister(&kvm_dev); kvm_uninit_virtualization(); debugfs_remove_recursive(kvm_debugfs_dir); for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); kvm_vfio_ops_exit(); kvm_async_pf_deinit(); kvm_irqfd_exit(); } EXPORT_SYMBOL_GPL(kvm_exit);
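/*
 * Illustrative userspace sketch (not part of kvm_main.c above): a minimal,
 * hedged example of the ioctl flow that the VM and vCPU handlers above
 * implement -- KVM_CREATE_VM on /dev/kvm, KVM_SET_USER_MEMORY_REGION,
 * KVM_CREATE_VCPU, KVM_GET_VCPU_MMAP_SIZE and KVM_RUN.  Error handling is
 * abbreviated and no guest code or register state is set up, so KVM_RUN is
 * expected to return quickly with an architecture-dependent exit or error.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/kvm.h>

int main(void)
{
        int kvm_fd, vm_fd, vcpu_fd, run_size;
        struct kvm_run *run;
        void *guest_mem;

        kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
        if (kvm_fd < 0 || ioctl(kvm_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION) {
                perror("/dev/kvm");
                return 1;
        }

        /* KVM_CREATE_VM returns a new VM file descriptor (kvm_dev_ioctl_create_vm()). */
        vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
        if (vm_fd < 0)
                return 1;

        /* Back 2 MiB of guest physical memory at GPA 0 with anonymous host memory. */
        guest_mem = mmap(NULL, 2 << 20, PROT_READ | PROT_WRITE,
                         MAP_SHARED | MAP_ANONYMOUS, -1, 0);
        struct kvm_userspace_memory_region region = {
                .slot = 0,
                .guest_phys_addr = 0,
                .memory_size = 2 << 20,
                .userspace_addr = (unsigned long)guest_mem,
        };
        ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);

        /* Create vCPU 0; kvm_vm_ioctl_create_vcpu() above validates the id. */
        vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
        if (vcpu_fd < 0)
                return 1;

        /* The shared kvm_run structure is mmap()ed from the vCPU fd. */
        run_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
        run = mmap(NULL, run_size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu_fd, 0);

        /* One KVM_RUN round trip; a real VMM loops on run->exit_reason. */
        if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
                perror("KVM_RUN");
        else
                printf("exit_reason = %u\n", run->exit_reason);

        munmap(run, run_size);
        munmap(guest_mem, 2 << 20);
        close(vcpu_fd);
        close(vm_fd);
        close(kvm_fd);
        return 0;
}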
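/*
 * Illustrative userspace sketch (not part of kvm_main.c above): probing a
 * capability with KVM_CHECK_EXTENSION and enabling it with KVM_ENABLE_CAP,
 * mirroring kvm_vm_ioctl_check_extension_generic() and
 * kvm_vm_ioctl_enable_cap_generic() above.  KVM_CAP_HALT_POLL is used as the
 * example: args[0] carries the new maximum halt-polling time in nanoseconds
 * and flags must be zero, as the handler above requires.
 */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_halt_poll_ns(int vm_fd, unsigned int ns)
{
        struct kvm_enable_cap cap;

        /* The VM fd accepts KVM_CHECK_EXTENSION (see KVM_CAP_CHECK_EXTENSION_VM). */
        if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_HALT_POLL) <= 0)
                return -1;

        memset(&cap, 0, sizeof(cap));
        cap.cap = KVM_CAP_HALT_POLL;
        cap.args[0] = ns;
        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}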
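/*
 * Illustrative userspace sketch (not part of the kernel sources): reading a
 * few of the auxiliary-vector entries that create_elf_tables() in the
 * binfmt_elf.c code below pushes onto a new program's stack.  glibc's
 * getauxval() (<sys/auxv.h>) walks the same AT_* records.
 */
#include <elf.h>
#include <stdio.h>
#include <sys/auxv.h>

int main(void)
{
        const unsigned char *rnd = (const unsigned char *)getauxval(AT_RANDOM);

        printf("AT_PAGESZ = %lu\n", getauxval(AT_PAGESZ));   /* ELF_EXEC_PAGESIZE */
        printf("AT_PHDR   = %#lx\n", getauxval(AT_PHDR));    /* phdr_addr argument */
        printf("AT_ENTRY  = %#lx\n", getauxval(AT_ENTRY));   /* e_entry argument */
        printf("AT_SECURE = %lu\n", getauxval(AT_SECURE));   /* bprm->secureexec */
        if (rnd)                                             /* 16 bytes from k_rand_bytes[] */
                printf("AT_RANDOM[0] = %#x\n", rnd[0]);
        return 0;
}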
// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/binfmt_elf.c * * These are the functions used to load ELF format executables as used * on SVr4 machines. Information on the format may be found in the book * "UNIX SYSTEM V RELEASE 4 Programmers Guide: Ansi C and Programming Support * Tools". * * Copyright 1993, 1994: Eric Youngdale (ericy@cais.com). 
*/ #include <linux/module.h> #include <linux/kernel.h> #include <linux/fs.h> #include <linux/log2.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/errno.h> #include <linux/signal.h> #include <linux/binfmts.h> #include <linux/string.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/personality.h> #include <linux/elfcore.h> #include <linux/init.h> #include <linux/highuid.h> #include <linux/compiler.h> #include <linux/highmem.h> #include <linux/hugetlb.h> #include <linux/pagemap.h> #include <linux/vmalloc.h> #include <linux/security.h> #include <linux/random.h> #include <linux/elf.h> #include <linux/elf-randomize.h> #include <linux/utsname.h> #include <linux/coredump.h> #include <linux/sched.h> #include <linux/sched/coredump.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> #include <linux/sizes.h> #include <linux/types.h> #include <linux/cred.h> #include <linux/dax.h> #include <linux/uaccess.h> #include <linux/rseq.h> #include <asm/param.h> #include <asm/page.h> #ifndef ELF_COMPAT #define ELF_COMPAT 0 #endif #ifndef user_long_t #define user_long_t long #endif #ifndef user_siginfo_t #define user_siginfo_t siginfo_t #endif /* That's for binfmt_elf_fdpic to deal with */ #ifndef elf_check_fdpic #define elf_check_fdpic(ex) false #endif static int load_elf_binary(struct linux_binprm *bprm); #ifdef CONFIG_USELIB static int load_elf_library(struct file *); #else #define load_elf_library NULL #endif /* * If we don't support core dumping, then supply a NULL so we * don't even try. */ #ifdef CONFIG_ELF_CORE static int elf_core_dump(struct coredump_params *cprm); #else #define elf_core_dump NULL #endif #if ELF_EXEC_PAGESIZE > PAGE_SIZE #define ELF_MIN_ALIGN ELF_EXEC_PAGESIZE #else #define ELF_MIN_ALIGN PAGE_SIZE #endif #ifndef ELF_CORE_EFLAGS #define ELF_CORE_EFLAGS 0 #endif #define ELF_PAGESTART(_v) ((_v) & ~(int)(ELF_MIN_ALIGN-1)) #define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1)) #define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1)) static struct linux_binfmt elf_format = { .module = THIS_MODULE, .load_binary = load_elf_binary, .load_shlib = load_elf_library, #ifdef CONFIG_COREDUMP .core_dump = elf_core_dump, .min_coredump = ELF_EXEC_PAGESIZE, #endif }; #define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE)) /* * We need to explicitly zero any trailing portion of the page that follows * p_filesz when it ends before the page ends (e.g. bss), otherwise this * memory will contain the junk from the file that should not be present. */ static int padzero(unsigned long address) { unsigned long nbyte; nbyte = ELF_PAGEOFFSET(address); if (nbyte) { nbyte = ELF_MIN_ALIGN - nbyte; if (clear_user((void __user *)address, nbyte)) return -EFAULT; } return 0; } /* Let's use some macros to make this stack manipulation a little clearer */ #ifdef CONFIG_STACK_GROWSUP #define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) + (items)) #define STACK_ROUND(sp, items) \ ((15 + (unsigned long) ((sp) + (items))) &~ 15UL) #define STACK_ALLOC(sp, len) ({ \ elf_addr_t __user *old_sp = (elf_addr_t __user *)sp; sp += len; \ old_sp; }) #else #define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) - (items)) #define STACK_ROUND(sp, items) \ (((unsigned long) (sp - items)) &~ 15UL) #define STACK_ALLOC(sp, len) (sp -= len) #endif #ifndef ELF_BASE_PLATFORM /* * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture. 
* If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value * will be copied to the user stack in the same manner as AT_PLATFORM. */ #define ELF_BASE_PLATFORM NULL #endif static int create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, unsigned long interp_load_addr, unsigned long e_entry, unsigned long phdr_addr) { struct mm_struct *mm = current->mm; unsigned long p = bprm->p; int argc = bprm->argc; int envc = bprm->envc; elf_addr_t __user *sp; elf_addr_t __user *u_platform; elf_addr_t __user *u_base_platform; elf_addr_t __user *u_rand_bytes; const char *k_platform = ELF_PLATFORM; const char *k_base_platform = ELF_BASE_PLATFORM; unsigned char k_rand_bytes[16]; int items; elf_addr_t *elf_info; elf_addr_t flags = 0; int ei_index; const struct cred *cred = current_cred(); struct vm_area_struct *vma; /* * In some cases (e.g. Hyper-Threading), we want to avoid L1 * evictions by the processes running on the same package. One * thing we can do is to shuffle the initial stack for them. */ p = arch_align_stack(p); /* * If this architecture has a platform capability string, copy it * to userspace. In some cases (Sparc), this info is impossible * for userspace to get any other way, in others (i386) it is * merely difficult. */ u_platform = NULL; if (k_platform) { size_t len = strlen(k_platform) + 1; u_platform = (elf_addr_t __user *)STACK_ALLOC(p, len); if (copy_to_user(u_platform, k_platform, len)) return -EFAULT; } /* * If this architecture has a "base" platform capability * string, copy it to userspace. */ u_base_platform = NULL; if (k_base_platform) { size_t len = strlen(k_base_platform) + 1; u_base_platform = (elf_addr_t __user *)STACK_ALLOC(p, len); if (copy_to_user(u_base_platform, k_base_platform, len)) return -EFAULT; } /* * Generate 16 random bytes for userspace PRNG seeding. */ get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes)); u_rand_bytes = (elf_addr_t __user *) STACK_ALLOC(p, sizeof(k_rand_bytes)); if (copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes))) return -EFAULT; /* Create the ELF interpreter info */ elf_info = (elf_addr_t *)mm->saved_auxv; /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ #define NEW_AUX_ENT(id, val) \ do { \ *elf_info++ = id; \ *elf_info++ = val; \ } while (0) #ifdef ARCH_DLINFO /* * ARCH_DLINFO must come first so PPC can do its special alignment of * AUXV. 
* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT() in * ARCH_DLINFO changes */ ARCH_DLINFO; #endif NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP); NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE); NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); NEW_AUX_ENT(AT_PHDR, phdr_addr); NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr)); NEW_AUX_ENT(AT_PHNUM, exec->e_phnum); NEW_AUX_ENT(AT_BASE, interp_load_addr); if (bprm->interp_flags & BINPRM_FLAGS_PRESERVE_ARGV0) flags |= AT_FLAGS_PRESERVE_ARGV0; NEW_AUX_ENT(AT_FLAGS, flags); NEW_AUX_ENT(AT_ENTRY, e_entry); NEW_AUX_ENT(AT_UID, from_kuid_munged(cred->user_ns, cred->uid)); NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid)); NEW_AUX_ENT(AT_GID, from_kgid_munged(cred->user_ns, cred->gid)); NEW_AUX_ENT(AT_EGID, from_kgid_munged(cred->user_ns, cred->egid)); NEW_AUX_ENT(AT_SECURE, bprm->secureexec); NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes); #ifdef ELF_HWCAP2 NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); #endif #ifdef ELF_HWCAP3 NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3); #endif #ifdef ELF_HWCAP4 NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4); #endif NEW_AUX_ENT(AT_EXECFN, bprm->exec); if (k_platform) { NEW_AUX_ENT(AT_PLATFORM, (elf_addr_t)(unsigned long)u_platform); } if (k_base_platform) { NEW_AUX_ENT(AT_BASE_PLATFORM, (elf_addr_t)(unsigned long)u_base_platform); } if (bprm->have_execfd) { NEW_AUX_ENT(AT_EXECFD, bprm->execfd); } #ifdef CONFIG_RSEQ NEW_AUX_ENT(AT_RSEQ_FEATURE_SIZE, offsetof(struct rseq, end)); NEW_AUX_ENT(AT_RSEQ_ALIGN, __alignof__(struct rseq)); #endif #undef NEW_AUX_ENT /* AT_NULL is zero; clear the rest too */ memset(elf_info, 0, (char *)mm->saved_auxv + sizeof(mm->saved_auxv) - (char *)elf_info); /* And advance past the AT_NULL entry. */ elf_info += 2; ei_index = elf_info - (elf_addr_t *)mm->saved_auxv; sp = STACK_ADD(p, ei_index); items = (argc + 1) + (envc + 1) + 1; bprm->p = STACK_ROUND(sp, items); /* Point sp at the lowest address on the stack */ #ifdef CONFIG_STACK_GROWSUP sp = (elf_addr_t __user *)bprm->p - items - ei_index; bprm->exec = (unsigned long)sp; /* XXX: PARISC HACK */ #else sp = (elf_addr_t __user *)bprm->p; #endif /* * Grow the stack manually; some architectures have a limit on how * far ahead a user-space access may be in order to grow the stack. */ if (mmap_write_lock_killable(mm)) return -EINTR; vma = find_extend_vma_locked(mm, bprm->p); mmap_write_unlock(mm); if (!vma) return -EFAULT; /* Now, let's put argc (and argv, envp if appropriate) on the stack */ if (put_user(argc, sp++)) return -EFAULT; /* Populate list of argv pointers back to argv strings. */ p = mm->arg_end = mm->arg_start; while (argc-- > 0) { size_t len; if (put_user((elf_addr_t)p, sp++)) return -EFAULT; len = strnlen_user((void __user *)p, MAX_ARG_STRLEN); if (!len || len > MAX_ARG_STRLEN) return -EINVAL; p += len; } if (put_user(0, sp++)) return -EFAULT; mm->arg_end = p; /* Populate list of envp pointers back to envp strings. */ mm->env_end = mm->env_start = p; while (envc-- > 0) { size_t len; if (put_user((elf_addr_t)p, sp++)) return -EFAULT; len = strnlen_user((void __user *)p, MAX_ARG_STRLEN); if (!len || len > MAX_ARG_STRLEN) return -EINVAL; p += len; } if (put_user(0, sp++)) return -EFAULT; mm->env_end = p; /* Put the elf_info on the stack in the right place. */ if (copy_to_user(sp, mm->saved_auxv, ei_index * sizeof(elf_addr_t))) return -EFAULT; return 0; } /* * Map "eppnt->p_filesz" bytes from "filep" offset "eppnt->p_offset" * into memory at "addr". (Note that p_filesz is rounded up to the * next page, so any extra bytes from the file must be wiped.) 
*/ static unsigned long elf_map(struct file *filep, unsigned long addr, const struct elf_phdr *eppnt, int prot, int type, unsigned long total_size) { unsigned long map_addr; unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr); unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr); addr = ELF_PAGESTART(addr); size = ELF_PAGEALIGN(size); /* mmap() will return -EINVAL if given a zero size, but a * segment with zero filesize is perfectly valid */ if (!size) return addr; /* * total_size is the size of the ELF (interpreter) image. * The _first_ mmap needs to know the full size, otherwise * randomization might put this image into an overlapping * position with the ELF binary image. (since size < total_size) * So we first map the 'big' image - and unmap the remainder at * the end. (which unmap is needed for ELF images with holes.) */ if (total_size) { total_size = ELF_PAGEALIGN(total_size); map_addr = vm_mmap(filep, addr, total_size, prot, type, off); if (!BAD_ADDR(map_addr)) vm_munmap(map_addr+size, total_size-size); } else map_addr = vm_mmap(filep, addr, size, prot, type, off); if ((type & MAP_FIXED_NOREPLACE) && PTR_ERR((void *)map_addr) == -EEXIST) pr_info("%d (%s): Uhuuh, elf segment at %px requested but the memory is mapped already\n", task_pid_nr(current), current->comm, (void *)addr); return(map_addr); } /* * Map "eppnt->p_filesz" bytes from "filep" offset "eppnt->p_offset" * into memory at "addr". Memory from "p_filesz" through "p_memsz" * rounded up to the next page is zeroed. */ static unsigned long elf_load(struct file *filep, unsigned long addr, const struct elf_phdr *eppnt, int prot, int type, unsigned long total_size) { unsigned long zero_start, zero_end; unsigned long map_addr; if (eppnt->p_filesz) { map_addr = elf_map(filep, addr, eppnt, prot, type, total_size); if (BAD_ADDR(map_addr)) return map_addr; if (eppnt->p_memsz > eppnt->p_filesz) { zero_start = map_addr + ELF_PAGEOFFSET(eppnt->p_vaddr) + eppnt->p_filesz; zero_end = map_addr + ELF_PAGEOFFSET(eppnt->p_vaddr) + eppnt->p_memsz; /* * Zero the end of the last mapped page but ignore * any errors if the segment isn't writable. */ if (padzero(zero_start) && (prot & PROT_WRITE)) return -EFAULT; } } else { map_addr = zero_start = ELF_PAGESTART(addr); zero_end = zero_start + ELF_PAGEOFFSET(eppnt->p_vaddr) + eppnt->p_memsz; } if (eppnt->p_memsz > eppnt->p_filesz) { /* * Map the last of the segment. * If the header is requesting these pages to be * executable, honour that (ppc32 needs this). */ int error; zero_start = ELF_PAGEALIGN(zero_start); zero_end = ELF_PAGEALIGN(zero_end); error = vm_brk_flags(zero_start, zero_end - zero_start, prot & PROT_EXEC ? VM_EXEC : 0); if (error) map_addr = error; } return map_addr; } static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr) { elf_addr_t min_addr = -1; elf_addr_t max_addr = 0; bool pt_load = false; int i; for (i = 0; i < nr; i++) { if (phdr[i].p_type == PT_LOAD) { min_addr = min(min_addr, ELF_PAGESTART(phdr[i].p_vaddr)); max_addr = max(max_addr, phdr[i].p_vaddr + phdr[i].p_memsz); pt_load = true; } } return pt_load ? (max_addr - min_addr) : 0; } static int elf_read(struct file *file, void *buf, size_t len, loff_t pos) { ssize_t rv; rv = kernel_read(file, buf, len, &pos); if (unlikely(rv != len)) { return (rv < 0) ? 
rv : -EIO; } return 0; } static unsigned long maximum_alignment(struct elf_phdr *cmds, int nr) { unsigned long alignment = 0; int i; for (i = 0; i < nr; i++) { if (cmds[i].p_type == PT_LOAD) { unsigned long p_align = cmds[i].p_align; /* skip non-power of two alignments as invalid */ if (!is_power_of_2(p_align)) continue; alignment = max(alignment, p_align); } } /* ensure we align to at least one page */ return ELF_PAGEALIGN(alignment); } /** * load_elf_phdrs() - load ELF program headers * @elf_ex: ELF header of the binary whose program headers should be loaded * @elf_file: the opened ELF binary file * * Loads ELF program headers from the binary file elf_file, which has the ELF * header pointed to by elf_ex, into a newly allocated array. The caller is * responsible for freeing the allocated data. Returns NULL upon failure. */ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex, struct file *elf_file) { struct elf_phdr *elf_phdata = NULL; int retval = -1; unsigned int size; /* * If the size of this structure has changed, then punt, since * we will be doing the wrong thing. */ if (elf_ex->e_phentsize != sizeof(struct elf_phdr)) goto out; /* Sanity check the number of program headers... */ /* ...and their total size. */ size = sizeof(struct elf_phdr) * elf_ex->e_phnum; if (size == 0 || size > 65536 || size > ELF_MIN_ALIGN) goto out; elf_phdata = kmalloc(size, GFP_KERNEL); if (!elf_phdata) goto out; /* Read in the program headers */ retval = elf_read(elf_file, elf_phdata, size, elf_ex->e_phoff); out: if (retval) { kfree(elf_phdata); elf_phdata = NULL; } return elf_phdata; } #ifndef CONFIG_ARCH_BINFMT_ELF_STATE /** * struct arch_elf_state - arch-specific ELF loading state * * This structure is used to preserve architecture specific data during * the loading of an ELF file, throughout the checking of architecture * specific ELF headers & through to the point where the ELF load is * known to be proceeding (ie. SET_PERSONALITY). * * This implementation is a dummy for architectures which require no * specific state. */ struct arch_elf_state { }; #define INIT_ARCH_ELF_STATE {} /** * arch_elf_pt_proc() - check a PT_LOPROC..PT_HIPROC ELF program header * @ehdr: The main ELF header * @phdr: The program header to check * @elf: The open ELF file * @is_interp: True if the phdr is from the interpreter of the ELF being * loaded, else false. * @state: Architecture-specific state preserved throughout the process * of loading the ELF. * * Inspects the program header phdr to validate its correctness and/or * suitability for the system. Called once per ELF program header in the * range PT_LOPROC to PT_HIPROC, for both the ELF being loaded and its * interpreter. * * Return: Zero to proceed with the ELF load, non-zero to fail the ELF load * with that return code. */ static inline int arch_elf_pt_proc(struct elfhdr *ehdr, struct elf_phdr *phdr, struct file *elf, bool is_interp, struct arch_elf_state *state) { /* Dummy implementation, always proceed */ return 0; } /** * arch_check_elf() - check an ELF executable * @ehdr: The main ELF header * @has_interp: True if the ELF has an interpreter, else false. * @interp_ehdr: The interpreter's ELF header * @state: Architecture-specific state preserved throughout the process * of loading the ELF. * * Provides a final opportunity for architecture code to reject the loading * of the ELF & cause an exec syscall to return an error. This is called after * all program headers to be checked by arch_elf_pt_proc have been. 
* * Return: Zero to proceed with the ELF load, non-zero to fail the ELF load * with that return code. */ static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp, struct elfhdr *interp_ehdr, struct arch_elf_state *state) { /* Dummy implementation, always proceed */ return 0; } #endif /* !CONFIG_ARCH_BINFMT_ELF_STATE */ static inline int make_prot(u32 p_flags, struct arch_elf_state *arch_state, bool has_interp, bool is_interp) { int prot = 0; if (p_flags & PF_R) prot |= PROT_READ; if (p_flags & PF_W) prot |= PROT_WRITE; if (p_flags & PF_X) prot |= PROT_EXEC; return arch_elf_adjust_prot(prot, arch_state, has_interp, is_interp); } /* This is much more generalized than the library routine read function, so we keep this separate. Technically the library read function is only provided so that we can read a.out libraries that have an ELF header */ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, struct file *interpreter, unsigned long no_base, struct elf_phdr *interp_elf_phdata, struct arch_elf_state *arch_state) { struct elf_phdr *eppnt; unsigned long load_addr = 0; int load_addr_set = 0; unsigned long error = ~0UL; unsigned long total_size; int i; /* First of all, some simple consistency checks */ if (interp_elf_ex->e_type != ET_EXEC && interp_elf_ex->e_type != ET_DYN) goto out; if (!elf_check_arch(interp_elf_ex) || elf_check_fdpic(interp_elf_ex)) goto out; if (!interpreter->f_op->mmap) goto out; total_size = total_mapping_size(interp_elf_phdata, interp_elf_ex->e_phnum); if (!total_size) { error = -EINVAL; goto out; } eppnt = interp_elf_phdata; for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { if (eppnt->p_type == PT_LOAD) { int elf_type = MAP_PRIVATE; int elf_prot = make_prot(eppnt->p_flags, arch_state, true, true); unsigned long vaddr = 0; unsigned long k, map_addr; vaddr = eppnt->p_vaddr; if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) elf_type |= MAP_FIXED; else if (no_base && interp_elf_ex->e_type == ET_DYN) load_addr = -vaddr; map_addr = elf_load(interpreter, load_addr + vaddr, eppnt, elf_prot, elf_type, total_size); total_size = 0; error = map_addr; if (BAD_ADDR(map_addr)) goto out; if (!load_addr_set && interp_elf_ex->e_type == ET_DYN) { load_addr = map_addr - ELF_PAGESTART(vaddr); load_addr_set = 1; } /* * Check to see if the section's size will overflow the * allowed task size. Note that p_filesz must always be * <= p_memsize so it's only necessary to check p_memsz. */ k = load_addr + eppnt->p_vaddr; if (BAD_ADDR(k) || eppnt->p_filesz > eppnt->p_memsz || eppnt->p_memsz > TASK_SIZE || TASK_SIZE - eppnt->p_memsz < k) { error = -ENOMEM; goto out; } } } error = load_addr; out: return error; } /* * These are the functions used to load ELF style executables and shared * libraries. There is no binary dependent code anywhere else. 
*/ static int parse_elf_property(const char *data, size_t *off, size_t datasz, struct arch_elf_state *arch, bool have_prev_type, u32 *prev_type) { size_t o, step; const struct gnu_property *pr; int ret; if (*off == datasz) return -ENOENT; if (WARN_ON_ONCE(*off > datasz || *off % ELF_GNU_PROPERTY_ALIGN)) return -EIO; o = *off; datasz -= *off; if (datasz < sizeof(*pr)) return -ENOEXEC; pr = (const struct gnu_property *)(data + o); o += sizeof(*pr); datasz -= sizeof(*pr); if (pr->pr_datasz > datasz) return -ENOEXEC; WARN_ON_ONCE(o % ELF_GNU_PROPERTY_ALIGN); step = round_up(pr->pr_datasz, ELF_GNU_PROPERTY_ALIGN); if (step > datasz) return -ENOEXEC; /* Properties are supposed to be unique and sorted on pr_type: */ if (have_prev_type && pr->pr_type <= *prev_type) return -ENOEXEC; *prev_type = pr->pr_type; ret = arch_parse_elf_property(pr->pr_type, data + o, pr->pr_datasz, ELF_COMPAT, arch); if (ret) return ret; *off = o + step; return 0; } #define NOTE_DATA_SZ SZ_1K #define GNU_PROPERTY_TYPE_0_NAME "GNU" #define NOTE_NAME_SZ (sizeof(GNU_PROPERTY_TYPE_0_NAME)) static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, struct arch_elf_state *arch) { union { struct elf_note nhdr; char data[NOTE_DATA_SZ]; } note; loff_t pos; ssize_t n; size_t off, datasz; int ret; bool have_prev_type; u32 prev_type; if (!IS_ENABLED(CONFIG_ARCH_USE_GNU_PROPERTY) || !phdr) return 0; /* load_elf_binary() shouldn't call us unless this is true... */ if (WARN_ON_ONCE(phdr->p_type != PT_GNU_PROPERTY)) return -ENOEXEC; /* If the properties are crazy large, that's too bad (for now): */ if (phdr->p_filesz > sizeof(note)) return -ENOEXEC; pos = phdr->p_offset; n = kernel_read(f, &note, phdr->p_filesz, &pos); BUILD_BUG_ON(sizeof(note) < sizeof(note.nhdr) + NOTE_NAME_SZ); if (n < 0 || n < sizeof(note.nhdr) + NOTE_NAME_SZ) return -EIO; if (note.nhdr.n_type != NT_GNU_PROPERTY_TYPE_0 || note.nhdr.n_namesz != NOTE_NAME_SZ || strncmp(note.data + sizeof(note.nhdr), GNU_PROPERTY_TYPE_0_NAME, n - sizeof(note.nhdr))) return -ENOEXEC; off = round_up(sizeof(note.nhdr) + NOTE_NAME_SZ, ELF_GNU_PROPERTY_ALIGN); if (off > n) return -ENOEXEC; if (note.nhdr.n_descsz > n - off) return -ENOEXEC; datasz = off + note.nhdr.n_descsz; have_prev_type = false; do { ret = parse_elf_property(note.data, &off, datasz, arch, have_prev_type, &prev_type); have_prev_type = true; } while (!ret); return ret == -ENOENT ? 
0 : ret; } static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ unsigned long load_bias = 0, phdr_addr = 0; int first_pt_load = 1; unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; struct elf_phdr *elf_property_phdata = NULL; unsigned long elf_brk; int retval, i; unsigned long elf_entry; unsigned long e_entry; unsigned long interp_load_addr = 0; unsigned long start_code, end_code, start_data, end_data; unsigned long reloc_func_desc __maybe_unused = 0; int executable_stack = EXSTACK_DEFAULT; struct elfhdr *elf_ex = (struct elfhdr *)bprm->buf; struct elfhdr *interp_elf_ex = NULL; struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE; struct mm_struct *mm; struct pt_regs *regs; retval = -ENOEXEC; /* First of all, some simple consistency checks */ if (memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0) goto out; if (elf_ex->e_type != ET_EXEC && elf_ex->e_type != ET_DYN) goto out; if (!elf_check_arch(elf_ex)) goto out; if (elf_check_fdpic(elf_ex)) goto out; if (!bprm->file->f_op->mmap) goto out; elf_phdata = load_elf_phdrs(elf_ex, bprm->file); if (!elf_phdata) goto out; elf_ppnt = elf_phdata; for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) { char *elf_interpreter; if (elf_ppnt->p_type == PT_GNU_PROPERTY) { elf_property_phdata = elf_ppnt; continue; } if (elf_ppnt->p_type != PT_INTERP) continue; /* * This is the program interpreter used for shared libraries - * for now assume that this is an a.out format binary. */ retval = -ENOEXEC; if (elf_ppnt->p_filesz > PATH_MAX || elf_ppnt->p_filesz < 2) goto out_free_ph; retval = -ENOMEM; elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL); if (!elf_interpreter) goto out_free_ph; retval = elf_read(bprm->file, elf_interpreter, elf_ppnt->p_filesz, elf_ppnt->p_offset); if (retval < 0) goto out_free_interp; /* make sure path is NULL terminated */ retval = -ENOEXEC; if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0') goto out_free_interp; interpreter = open_exec(elf_interpreter); kfree(elf_interpreter); retval = PTR_ERR(interpreter); if (IS_ERR(interpreter)) goto out_free_ph; /* * If the binary is not readable then enforce mm->dumpable = 0 * regardless of the interpreter's permissions. */ would_dump(bprm, interpreter); interp_elf_ex = kmalloc(sizeof(*interp_elf_ex), GFP_KERNEL); if (!interp_elf_ex) { retval = -ENOMEM; goto out_free_file; } /* Get the exec headers */ retval = elf_read(interpreter, interp_elf_ex, sizeof(*interp_elf_ex), 0); if (retval < 0) goto out_free_dentry; break; out_free_interp: kfree(elf_interpreter); goto out_free_ph; } elf_ppnt = elf_phdata; for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) switch (elf_ppnt->p_type) { case PT_GNU_STACK: if (elf_ppnt->p_flags & PF_X) executable_stack = EXSTACK_ENABLE_X; else executable_stack = EXSTACK_DISABLE_X; break; case PT_LOPROC ... 
PT_HIPROC: retval = arch_elf_pt_proc(elf_ex, elf_ppnt, bprm->file, false, &arch_state); if (retval) goto out_free_dentry; break; } /* Some simple consistency checks for the interpreter */ if (interpreter) { retval = -ELIBBAD; /* Not an ELF interpreter */ if (memcmp(interp_elf_ex->e_ident, ELFMAG, SELFMAG) != 0) goto out_free_dentry; /* Verify the interpreter has a valid arch */ if (!elf_check_arch(interp_elf_ex) || elf_check_fdpic(interp_elf_ex)) goto out_free_dentry; /* Load the interpreter program headers */ interp_elf_phdata = load_elf_phdrs(interp_elf_ex, interpreter); if (!interp_elf_phdata) goto out_free_dentry; /* Pass PT_LOPROC..PT_HIPROC headers to arch code */ elf_property_phdata = NULL; elf_ppnt = interp_elf_phdata; for (i = 0; i < interp_elf_ex->e_phnum; i++, elf_ppnt++) switch (elf_ppnt->p_type) { case PT_GNU_PROPERTY: elf_property_phdata = elf_ppnt; break; case PT_LOPROC ... PT_HIPROC: retval = arch_elf_pt_proc(interp_elf_ex, elf_ppnt, interpreter, true, &arch_state); if (retval) goto out_free_dentry; break; } } retval = parse_elf_properties(interpreter ?: bprm->file, elf_property_phdata, &arch_state); if (retval) goto out_free_dentry; /* * Allow arch code to reject the ELF at this point, whilst it's * still possible to return an error to the code that invoked * the exec syscall. */ retval = arch_check_elf(elf_ex, !!interpreter, interp_elf_ex, &arch_state); if (retval) goto out_free_dentry; /* Flush all traces of the currently running executable */ retval = begin_new_exec(bprm); if (retval) goto out_free_dentry; /* Do this immediately, since STACK_TOP as used in setup_arg_pages may depend on the personality. */ SET_PERSONALITY2(*elf_ex, &arch_state); if (elf_read_implies_exec(*elf_ex, executable_stack)) current->personality |= READ_IMPLIES_EXEC; const int snapshot_randomize_va_space = READ_ONCE(randomize_va_space); if (!(current->personality & ADDR_NO_RANDOMIZE) && snapshot_randomize_va_space) current->flags |= PF_RANDOMIZE; setup_new_exec(bprm); /* Do this so that we can load the interpreter, if need be. We will change some of these later */ retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), executable_stack); if (retval < 0) goto out_free_dentry; elf_brk = 0; start_code = ~0UL; end_code = 0; start_data = 0; end_data = 0; /* Now we do a little grungy work by mmapping the ELF image into the correct location in memory. */ for(i = 0, elf_ppnt = elf_phdata; i < elf_ex->e_phnum; i++, elf_ppnt++) { int elf_prot, elf_flags; unsigned long k, vaddr; unsigned long total_size = 0; unsigned long alignment; if (elf_ppnt->p_type != PT_LOAD) continue; elf_prot = make_prot(elf_ppnt->p_flags, &arch_state, !!interpreter, false); elf_flags = MAP_PRIVATE; vaddr = elf_ppnt->p_vaddr; /* * The first time through the loop, first_pt_load is true: * layout will be calculated. Once set, use MAP_FIXED since * we know we've already safely mapped the entire region with * MAP_FIXED_NOREPLACE in the once-per-binary logic following. */ if (!first_pt_load) { elf_flags |= MAP_FIXED; } else if (elf_ex->e_type == ET_EXEC) { /* * This logic is run once for the first LOAD Program * Header for ET_EXEC binaries. No special handling * is needed. */ elf_flags |= MAP_FIXED_NOREPLACE; } else if (elf_ex->e_type == ET_DYN) { /* * This logic is run once for the first LOAD Program * Header for ET_DYN binaries to calculate the * randomization (load_bias) for all the LOAD * Program Headers. 
*/ /* * Calculate the entire size of the ELF mapping * (total_size), used for the initial mapping, * due to load_addr_set which is set to true later * once the initial mapping is performed. * * Note that this is only sensible when the LOAD * segments are contiguous (or overlapping). If * used for LOADs that are far apart, this would * cause the holes between LOADs to be mapped, * running the risk of having the mapping fail, * as it would be larger than the ELF file itself. * * As a result, only ET_DYN does this, since * some ET_EXEC (e.g. ia64) may have large virtual * memory holes between LOADs. * */ total_size = total_mapping_size(elf_phdata, elf_ex->e_phnum); if (!total_size) { retval = -EINVAL; goto out_free_dentry; } /* Calculate any requested alignment. */ alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); /* * There are effectively two types of ET_DYN * binaries: programs (i.e. PIE: ET_DYN with PT_INTERP) * and loaders (ET_DYN without PT_INTERP, since they * _are_ the ELF interpreter). The loaders must * be loaded away from programs since the program * may otherwise collide with the loader (especially * for ET_EXEC which does not have a randomized * position). For example to handle invocations of * "./ld.so someprog" to test out a new version of * the loader, the subsequent program that the * loader loads must avoid the loader itself, so * they cannot share the same load range. Sufficient * room for the brk must be allocated with the * loader as well, since brk must be available with * the loader. * * Therefore, programs are loaded offset from * ELF_ET_DYN_BASE and loaders are loaded into the * independently randomized mmap region (0 load_bias * without MAP_FIXED nor MAP_FIXED_NOREPLACE). */ if (interpreter) { /* On ET_DYN with PT_INTERP, we do the ASLR. */ load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); /* Adjust alignment as requested. */ if (alignment) load_bias &= ~(alignment - 1); elf_flags |= MAP_FIXED_NOREPLACE; } else { /* * For ET_DYN without PT_INTERP, we rely on * the architectures's (potentially ASLR) mmap * base address (via a load_bias of 0). * * When a large alignment is requested, we * must do the allocation at address "0" right * now to discover where things will load so * that we can adjust the resulting alignment. * In this case (load_bias != 0), we can use * MAP_FIXED_NOREPLACE to make sure the mapping * doesn't collide with anything. */ if (alignment > ELF_MIN_ALIGN) { load_bias = elf_load(bprm->file, 0, elf_ppnt, elf_prot, elf_flags, total_size); if (BAD_ADDR(load_bias)) { retval = IS_ERR_VALUE(load_bias) ? PTR_ERR((void*)load_bias) : -EINVAL; goto out_free_dentry; } vm_munmap(load_bias, total_size); /* Adjust alignment as requested. */ if (alignment) load_bias &= ~(alignment - 1); elf_flags |= MAP_FIXED_NOREPLACE; } else load_bias = 0; } /* * Since load_bias is used for all subsequent loading * calculations, we must lower it by the first vaddr * so that the remaining calculations based on the * ELF vaddrs will be correctly offset. The result * is then page aligned. */ load_bias = ELF_PAGESTART(load_bias - vaddr); } error = elf_load(bprm->file, load_bias + vaddr, elf_ppnt, elf_prot, elf_flags, total_size); if (BAD_ADDR(error)) { retval = IS_ERR_VALUE(error) ? 
PTR_ERR((void*)error) : -EINVAL; goto out_free_dentry; } if (first_pt_load) { first_pt_load = 0; if (elf_ex->e_type == ET_DYN) { load_bias += error - ELF_PAGESTART(load_bias + vaddr); reloc_func_desc = load_bias; } } /* * Figure out which segment in the file contains the Program * Header table, and map to the associated memory address. */ if (elf_ppnt->p_offset <= elf_ex->e_phoff && elf_ex->e_phoff < elf_ppnt->p_offset + elf_ppnt->p_filesz) { phdr_addr = elf_ex->e_phoff - elf_ppnt->p_offset + elf_ppnt->p_vaddr; } k = elf_ppnt->p_vaddr; if ((elf_ppnt->p_flags & PF_X) && k < start_code) start_code = k; if (start_data < k) start_data = k; /* * Check to see if the section's size will overflow the * allowed task size. Note that p_filesz must always be * <= p_memsz so it is only necessary to check p_memsz. */ if (BAD_ADDR(k) || elf_ppnt->p_filesz > elf_ppnt->p_memsz || elf_ppnt->p_memsz > TASK_SIZE || TASK_SIZE - elf_ppnt->p_memsz < k) { /* set_brk can never work. Avoid overflows. */ retval = -EINVAL; goto out_free_dentry; } k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz; if ((elf_ppnt->p_flags & PF_X) && end_code < k) end_code = k; if (end_data < k) end_data = k; k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz; if (k > elf_brk) elf_brk = k; } e_entry = elf_ex->e_entry + load_bias; phdr_addr += load_bias; elf_brk += load_bias; start_code += load_bias; end_code += load_bias; start_data += load_bias; end_data += load_bias; current->mm->start_brk = current->mm->brk = ELF_PAGEALIGN(elf_brk); if (interpreter) { elf_entry = load_elf_interp(interp_elf_ex, interpreter, load_bias, interp_elf_phdata, &arch_state); if (!IS_ERR_VALUE(elf_entry)) { /* * load_elf_interp() returns relocation * adjustment */ interp_load_addr = elf_entry; elf_entry += interp_elf_ex->e_entry; } if (BAD_ADDR(elf_entry)) { retval = IS_ERR_VALUE(elf_entry) ? (int)elf_entry : -EINVAL; goto out_free_dentry; } reloc_func_desc = interp_load_addr; exe_file_allow_write_access(interpreter); fput(interpreter); kfree(interp_elf_ex); kfree(interp_elf_phdata); } else { elf_entry = e_entry; if (BAD_ADDR(elf_entry)) { retval = -EINVAL; goto out_free_dentry; } } kfree(elf_phdata); set_binfmt(&elf_format); #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES retval = ARCH_SETUP_ADDITIONAL_PAGES(bprm, elf_ex, !!interpreter); if (retval < 0) goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ retval = create_elf_tables(bprm, elf_ex, interp_load_addr, e_entry, phdr_addr); if (retval < 0) goto out; mm = current->mm; mm->end_code = end_code; mm->start_code = start_code; mm->start_data = start_data; mm->end_data = end_data; mm->start_stack = bprm->p; if ((current->flags & PF_RANDOMIZE) && (snapshot_randomize_va_space > 1)) { /* * For architectures with ELF randomization, when executing * a loader directly (i.e. no interpreter listed in ELF * headers), move the brk area out of the mmap region * (since it grows up, and may collide early with the stack * growing down), and into the unused ELF_ET_DYN_BASE region. */ if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && elf_ex->e_type == ET_DYN && !interpreter) { mm->brk = mm->start_brk = ELF_ET_DYN_BASE; } else { /* Otherwise leave a gap between .bss and brk. */ mm->brk = mm->start_brk = mm->brk + PAGE_SIZE; } mm->brk = mm->start_brk = arch_randomize_brk(mm); #ifdef compat_brk_randomized current->brk_randomized = 1; #endif } if (current->personality & MMAP_PAGE_ZERO) { /* Why this, you ask??? Well SVr4 maps page 0 as read-only, and some applications "depend" upon this behavior. 
Since we do not have the power to recompile these, we emulate the SVr4 behavior. Sigh. */ error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE, 0); retval = do_mseal(0, PAGE_SIZE, 0); if (retval) pr_warn_ratelimited("pid=%d, couldn't seal address 0, ret=%d.\n", task_pid_nr(current), retval); } regs = current_pt_regs(); #ifdef ELF_PLAT_INIT /* * The ABI may specify that certain registers be set up in special * ways (on i386 %edx is the address of a DT_FINI function, for * example. In addition, it may also specify (eg, PowerPC64 ELF) * that the e_entry field is the address of the function descriptor * for the startup routine, rather than the address of the startup * routine itself. This macro performs whatever initialization to * the regs structure is required as well as any relocations to the * function descriptor entries when executing dynamically links apps. */ ELF_PLAT_INIT(regs, reloc_func_desc); #endif finalize_exec(bprm); START_THREAD(elf_ex, regs, elf_entry, bprm->p); retval = 0; out: return retval; /* error cleanup */ out_free_dentry: kfree(interp_elf_ex); kfree(interp_elf_phdata); out_free_file: exe_file_allow_write_access(interpreter); if (interpreter) fput(interpreter); out_free_ph: kfree(elf_phdata); goto out; } #ifdef CONFIG_USELIB /* This is really simpleminded and specialized - we are loading an a.out library that is given an ELF header. */ static int load_elf_library(struct file *file) { struct elf_phdr *elf_phdata; struct elf_phdr *eppnt; int retval, error, i, j; struct elfhdr elf_ex; error = -ENOEXEC; retval = elf_read(file, &elf_ex, sizeof(elf_ex), 0); if (retval < 0) goto out; if (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0) goto out; /* First of all, some simple consistency checks */ if (elf_ex.e_type != ET_EXEC || elf_ex.e_phnum > 2 || !elf_check_arch(&elf_ex) || !file->f_op->mmap) goto out; if (elf_check_fdpic(&elf_ex)) goto out; /* Now read in all of the header information */ j = sizeof(struct elf_phdr) * elf_ex.e_phnum; /* j < ELF_MIN_ALIGN because elf_ex.e_phnum <= 2 */ error = -ENOMEM; elf_phdata = kmalloc(j, GFP_KERNEL); if (!elf_phdata) goto out; eppnt = elf_phdata; error = -ENOEXEC; retval = elf_read(file, eppnt, j, elf_ex.e_phoff); if (retval < 0) goto out_free_ph; for (j = 0, i = 0; i<elf_ex.e_phnum; i++) if ((eppnt + i)->p_type == PT_LOAD) j++; if (j != 1) goto out_free_ph; while (eppnt->p_type != PT_LOAD) eppnt++; /* Now use mmap to map the library into memory. 
*/ error = elf_load(file, ELF_PAGESTART(eppnt->p_vaddr), eppnt, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FIXED_NOREPLACE | MAP_PRIVATE, 0); if (error != ELF_PAGESTART(eppnt->p_vaddr)) goto out_free_ph; error = 0; out_free_ph: kfree(elf_phdata); out: return error; } #endif /* #ifdef CONFIG_USELIB */ #ifdef CONFIG_ELF_CORE /* * ELF core dumper * * Modelled on fs/exec.c:aout_core_dump() * Jeremy Fitzhardinge <jeremy@sw.oz.au> */ /* An ELF note in memory */ struct memelfnote { const char *name; int type; unsigned int datasz; void *data; }; static int notesize(struct memelfnote *en) { int sz; sz = sizeof(struct elf_note); sz += roundup(strlen(en->name) + 1, 4); sz += roundup(en->datasz, 4); return sz; } static int writenote(struct memelfnote *men, struct coredump_params *cprm) { struct elf_note en; en.n_namesz = strlen(men->name) + 1; en.n_descsz = men->datasz; en.n_type = men->type; return dump_emit(cprm, &en, sizeof(en)) && dump_emit(cprm, men->name, en.n_namesz) && dump_align(cprm, 4) && dump_emit(cprm, men->data, men->datasz) && dump_align(cprm, 4); } static void fill_elf_header(struct elfhdr *elf, int segs, u16 machine, u32 flags) { memset(elf, 0, sizeof(*elf)); memcpy(elf->e_ident, ELFMAG, SELFMAG); elf->e_ident[EI_CLASS] = ELF_CLASS; elf->e_ident[EI_DATA] = ELF_DATA; elf->e_ident[EI_VERSION] = EV_CURRENT; elf->e_ident[EI_OSABI] = ELF_OSABI; elf->e_type = ET_CORE; elf->e_machine = machine; elf->e_version = EV_CURRENT; elf->e_phoff = sizeof(struct elfhdr); elf->e_flags = flags; elf->e_ehsize = sizeof(struct elfhdr); elf->e_phentsize = sizeof(struct elf_phdr); elf->e_phnum = segs; } static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) { phdr->p_type = PT_NOTE; phdr->p_offset = offset; phdr->p_vaddr = 0; phdr->p_paddr = 0; phdr->p_filesz = sz; phdr->p_memsz = 0; phdr->p_flags = 0; phdr->p_align = 4; } static void fill_note(struct memelfnote *note, const char *name, int type, unsigned int sz, void *data) { note->name = name; note->type = type; note->datasz = sz; note->data = data; } /* * fill up all the fields in prstatus from the given task struct, except * registers which need to be filled up separately. */ static void fill_prstatus(struct elf_prstatus_common *prstatus, struct task_struct *p, long signr) { prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; prstatus->pr_sigpend = p->pending.signal.sig[0]; prstatus->pr_sighold = p->blocked.sig[0]; rcu_read_lock(); prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent)); rcu_read_unlock(); prstatus->pr_pid = task_pid_vnr(p); prstatus->pr_pgrp = task_pgrp_vnr(p); prstatus->pr_sid = task_session_vnr(p); if (thread_group_leader(p)) { struct task_cputime cputime; /* * This is the record for the group leader. It shows the * group-wide total, not its individual thread total. 
*/ thread_group_cputime(p, &cputime); prstatus->pr_utime = ns_to_kernel_old_timeval(cputime.utime); prstatus->pr_stime = ns_to_kernel_old_timeval(cputime.stime); } else { u64 utime, stime; task_cputime(p, &utime, &stime); prstatus->pr_utime = ns_to_kernel_old_timeval(utime); prstatus->pr_stime = ns_to_kernel_old_timeval(stime); } prstatus->pr_cutime = ns_to_kernel_old_timeval(p->signal->cutime); prstatus->pr_cstime = ns_to_kernel_old_timeval(p->signal->cstime); } static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, struct mm_struct *mm) { const struct cred *cred; unsigned int i, len; unsigned int state; /* first copy the parameters from user space */ memset(psinfo, 0, sizeof(struct elf_prpsinfo)); len = mm->arg_end - mm->arg_start; if (len >= ELF_PRARGSZ) len = ELF_PRARGSZ-1; if (copy_from_user(&psinfo->pr_psargs, (const char __user *)mm->arg_start, len)) return -EFAULT; for(i = 0; i < len; i++) if (psinfo->pr_psargs[i] == 0) psinfo->pr_psargs[i] = ' '; psinfo->pr_psargs[len] = 0; rcu_read_lock(); psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent)); rcu_read_unlock(); psinfo->pr_pid = task_pid_vnr(p); psinfo->pr_pgrp = task_pgrp_vnr(p); psinfo->pr_sid = task_session_vnr(p); state = READ_ONCE(p->__state); i = state ? ffz(~state) + 1 : 0; psinfo->pr_state = i; psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i]; psinfo->pr_zomb = psinfo->pr_sname == 'Z'; psinfo->pr_nice = task_nice(p); psinfo->pr_flag = p->flags; rcu_read_lock(); cred = __task_cred(p); SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid)); SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid)); rcu_read_unlock(); get_task_comm(psinfo->pr_fname, p); return 0; } static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm) { elf_addr_t *auxv = (elf_addr_t *) mm->saved_auxv; int i = 0; do i += 2; while (auxv[i - 2] != AT_NULL); fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv); } static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, const kernel_siginfo_t *siginfo) { copy_siginfo_to_external(csigdata, siginfo); fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata); } /* * Format of NT_FILE note: * * long count -- how many files are mapped * long page_size -- units for file_ofs * array of [COUNT] elements of * long start * long end * long file_ofs * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL... */ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm) { unsigned count, size, names_ofs, remaining, n; user_long_t *data; user_long_t *start_end_ofs; char *name_base, *name_curpos; int i; /* *Estimated* file count and total data size needed */ count = cprm->vma_count; if (count > UINT_MAX / 64) return -EINVAL; size = count * 64; names_ofs = (2 + 3 * count) * sizeof(data[0]); alloc: /* paranoia check */ if (size >= core_file_note_size_limit) { pr_warn_once("coredump Note size too large: %u (does kernel.core_file_note_size_limit sysctl need adjustment?\n", size); return -EINVAL; } size = round_up(size, PAGE_SIZE); /* * "size" can be 0 here legitimately. * Let it ENOMEM and omit NT_FILE section which will be empty anyway. 
*/ data = kvmalloc(size, GFP_KERNEL); if (ZERO_OR_NULL_PTR(data)) return -ENOMEM; start_end_ofs = data + 2; name_base = name_curpos = ((char *)data) + names_ofs; remaining = size - names_ofs; count = 0; for (i = 0; i < cprm->vma_count; i++) { struct core_vma_metadata *m = &cprm->vma_meta[i]; struct file *file; const char *filename; file = m->file; if (!file) continue; filename = file_path(file, name_curpos, remaining); if (IS_ERR(filename)) { if (PTR_ERR(filename) == -ENAMETOOLONG) { kvfree(data); size = size * 5 / 4; goto alloc; } continue; } /* file_path() fills at the end, move name down */ /* n = strlen(filename) + 1: */ n = (name_curpos + remaining) - filename; remaining = filename - name_curpos; memmove(name_curpos, filename, n); name_curpos += n; *start_end_ofs++ = m->start; *start_end_ofs++ = m->end; *start_end_ofs++ = m->pgoff; count++; } /* Now we know exact count of files, can store it */ data[0] = count; data[1] = PAGE_SIZE; /* * Count usually is less than mm->map_count, * we need to move filenames down. */ n = cprm->vma_count - count; if (n != 0) { unsigned shift_bytes = n * 3 * sizeof(data[0]); memmove(name_base - shift_bytes, name_base, name_curpos - name_base); name_curpos -= shift_bytes; } size = name_curpos - (char *)data; fill_note(note, "CORE", NT_FILE, size, data); return 0; } #include <linux/regset.h> struct elf_thread_core_info { struct elf_thread_core_info *next; struct task_struct *task; struct elf_prstatus prstatus; struct memelfnote notes[]; }; struct elf_note_info { struct elf_thread_core_info *thread; struct memelfnote psinfo; struct memelfnote signote; struct memelfnote auxv; struct memelfnote files; user_siginfo_t csigdata; size_t size; int thread_notes; }; #ifdef CORE_DUMP_USE_REGSET /* * When a regset has a writeback hook, we call it on each thread before * dumping user memory. On register window machines, this makes sure the * user memory backing the register data is up to date before we read it. */ static void do_thread_regset_writeback(struct task_struct *task, const struct user_regset *regset) { if (regset->writeback) regset->writeback(task, regset, 1); } #ifndef PRSTATUS_SIZE #define PRSTATUS_SIZE sizeof(struct elf_prstatus) #endif #ifndef SET_PR_FPVALID #define SET_PR_FPVALID(S) ((S)->pr_fpvalid = 1) #endif static int fill_thread_core_info(struct elf_thread_core_info *t, const struct user_regset_view *view, long signr, struct elf_note_info *info) { unsigned int note_iter, view_iter; /* * NT_PRSTATUS is the one special case, because the regset data * goes into the pr_reg field inside the note contents, rather * than being the whole note contents. We fill the regset in here. * We assume that regset 0 is NT_PRSTATUS. */ fill_prstatus(&t->prstatus.common, t->task, signr); regset_get(t->task, &view->regsets[0], sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); fill_note(&t->notes[0], "CORE", NT_PRSTATUS, PRSTATUS_SIZE, &t->prstatus); info->size += notesize(&t->notes[0]); do_thread_regset_writeback(t->task, &view->regsets[0]); /* * Each other regset might generate a note too. For each regset * that has no core_note_type or is inactive, skip it. 
*/ note_iter = 1; for (view_iter = 1; view_iter < view->n; ++view_iter) { const struct user_regset *regset = &view->regsets[view_iter]; int note_type = regset->core_note_type; bool is_fpreg = note_type == NT_PRFPREG; void *data; int ret; do_thread_regset_writeback(t->task, regset); if (!note_type) // not for coredumps continue; if (regset->active && regset->active(t->task, regset) <= 0) continue; ret = regset_get_alloc(t->task, regset, ~0U, &data); if (ret < 0) continue; if (WARN_ON_ONCE(note_iter >= info->thread_notes)) break; if (is_fpreg) SET_PR_FPVALID(&t->prstatus); fill_note(&t->notes[note_iter], is_fpreg ? "CORE" : "LINUX", note_type, ret, data); info->size += notesize(&t->notes[note_iter]); note_iter++; } return 1; } #else static int fill_thread_core_info(struct elf_thread_core_info *t, const struct user_regset_view *view, long signr, struct elf_note_info *info) { struct task_struct *p = t->task; elf_fpregset_t *fpu; fill_prstatus(&t->prstatus.common, p, signr); elf_core_copy_task_regs(p, &t->prstatus.pr_reg); fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), &(t->prstatus)); info->size += notesize(&t->notes[0]); fpu = kzalloc(sizeof(elf_fpregset_t), GFP_KERNEL); if (!fpu || !elf_core_copy_task_fpregs(p, fpu)) { kfree(fpu); return 1; } t->prstatus.pr_fpvalid = 1; fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(*fpu), fpu); info->size += notesize(&t->notes[1]); return 1; } #endif static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, struct coredump_params *cprm) { struct task_struct *dump_task = current; const struct user_regset_view *view; struct elf_thread_core_info *t; struct elf_prpsinfo *psinfo; struct core_thread *ct; psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); if (!psinfo) return 0; fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); #ifdef CORE_DUMP_USE_REGSET view = task_user_regset_view(dump_task); /* * Figure out how many notes we're going to need for each thread. */ info->thread_notes = 0; for (int i = 0; i < view->n; ++i) if (view->regsets[i].core_note_type != 0) ++info->thread_notes; /* * Sanity check. We rely on regset 0 being in NT_PRSTATUS, * since it is our one special case. */ if (unlikely(info->thread_notes == 0) || unlikely(view->regsets[0].core_note_type != NT_PRSTATUS)) { WARN_ON(1); return 0; } /* * Initialize the ELF file header. */ fill_elf_header(elf, phdrs, view->e_machine, view->e_flags); #else view = NULL; info->thread_notes = 2; fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); #endif /* * Allocate a structure for each thread. */ info->thread = kzalloc(offsetof(struct elf_thread_core_info, notes[info->thread_notes]), GFP_KERNEL); if (unlikely(!info->thread)) return 0; info->thread->task = dump_task; for (ct = dump_task->signal->core_state->dumper.next; ct; ct = ct->next) { t = kzalloc(offsetof(struct elf_thread_core_info, notes[info->thread_notes]), GFP_KERNEL); if (unlikely(!t)) return 0; t->task = ct->task; t->next = info->thread->next; info->thread->next = t; } /* * Now fill in each thread's information. */ for (t = info->thread; t != NULL; t = t->next) if (!fill_thread_core_info(t, view, cprm->siginfo->si_signo, info)) return 0; /* * Fill in the two process-wide notes. 
*/ fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm); info->size += notesize(&info->psinfo); fill_siginfo_note(&info->signote, &info->csigdata, cprm->siginfo); info->size += notesize(&info->signote); fill_auxv_note(&info->auxv, current->mm); info->size += notesize(&info->auxv); if (fill_files_note(&info->files, cprm) == 0) info->size += notesize(&info->files); return 1; } /* * Write all the notes for each thread. When writing the first thread, the * process-wide notes are interleaved after the first thread-specific note. */ static int write_note_info(struct elf_note_info *info, struct coredump_params *cprm) { bool first = true; struct elf_thread_core_info *t = info->thread; do { int i; if (!writenote(&t->notes[0], cprm)) return 0; if (first && !writenote(&info->psinfo, cprm)) return 0; if (first && !writenote(&info->signote, cprm)) return 0; if (first && !writenote(&info->auxv, cprm)) return 0; if (first && info->files.data && !writenote(&info->files, cprm)) return 0; for (i = 1; i < info->thread_notes; ++i) if (t->notes[i].data && !writenote(&t->notes[i], cprm)) return 0; first = false; t = t->next; } while (t); return 1; } static void free_note_info(struct elf_note_info *info) { struct elf_thread_core_info *threads = info->thread; while (threads) { unsigned int i; struct elf_thread_core_info *t = threads; threads = t->next; WARN_ON(t->notes[0].data && t->notes[0].data != &t->prstatus); for (i = 1; i < info->thread_notes; ++i) kvfree(t->notes[i].data); kfree(t); } kfree(info->psinfo.data); kvfree(info->files.data); } static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, elf_addr_t e_shoff, int segs) { elf->e_shoff = e_shoff; elf->e_shentsize = sizeof(*shdr4extnum); elf->e_shnum = 1; elf->e_shstrndx = SHN_UNDEF; memset(shdr4extnum, 0, sizeof(*shdr4extnum)); shdr4extnum->sh_type = SHT_NULL; shdr4extnum->sh_size = elf->e_shnum; shdr4extnum->sh_link = elf->e_shstrndx; shdr4extnum->sh_info = segs; } /* * Actual dumper * * This is a two-pass process; first we find the offsets of the bits, * and then they are actually written out. If we run out of core limit * we just truncate. */ static int elf_core_dump(struct coredump_params *cprm) { int has_dumped = 0; int segs, i; struct elfhdr elf; loff_t offset = 0, dataoff; struct elf_note_info info = { }; struct elf_phdr *phdr4note = NULL; struct elf_shdr *shdr4extnum = NULL; Elf_Half e_phnum; elf_addr_t e_shoff; /* * The number of segs are recored into ELF header as 16bit value. * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. */ segs = cprm->vma_count + elf_core_extra_phdrs(cprm); /* for notes section */ segs++; /* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid * this, kernel supports extended numbering. Have a look at * include/linux/elf.h for further information. */ e_phnum = segs > PN_XNUM ? PN_XNUM : segs; /* * Collect all the non-memory information about the process for the * notes. This also sets up the file header. 
*/ if (!fill_note_info(&elf, e_phnum, &info, cprm)) goto end_coredump; has_dumped = 1; offset += sizeof(elf); /* ELF header */ offset += segs * sizeof(struct elf_phdr); /* Program headers */ /* Write notes phdr entry */ { size_t sz = info.size; /* For cell spufs and x86 xstate */ sz += elf_coredump_extra_notes_size(); phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL); if (!phdr4note) goto end_coredump; fill_elf_note_phdr(phdr4note, sz, offset); offset += sz; } dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); offset += cprm->vma_data_size; offset += elf_core_extra_data_size(cprm); e_shoff = offset; if (e_phnum == PN_XNUM) { shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL); if (!shdr4extnum) goto end_coredump; fill_extnum_info(&elf, shdr4extnum, e_shoff, segs); } offset = dataoff; if (!dump_emit(cprm, &elf, sizeof(elf))) goto end_coredump; if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note))) goto end_coredump; /* Write program headers for segments dump */ for (i = 0; i < cprm->vma_count; i++) { struct core_vma_metadata *meta = cprm->vma_meta + i; struct elf_phdr phdr; phdr.p_type = PT_LOAD; phdr.p_offset = offset; phdr.p_vaddr = meta->start; phdr.p_paddr = 0; phdr.p_filesz = meta->dump_size; phdr.p_memsz = meta->end - meta->start; offset += phdr.p_filesz; phdr.p_flags = 0; if (meta->flags & VM_READ) phdr.p_flags |= PF_R; if (meta->flags & VM_WRITE) phdr.p_flags |= PF_W; if (meta->flags & VM_EXEC) phdr.p_flags |= PF_X; phdr.p_align = ELF_EXEC_PAGESIZE; if (!dump_emit(cprm, &phdr, sizeof(phdr))) goto end_coredump; } if (!elf_core_write_extra_phdrs(cprm, offset)) goto end_coredump; /* write out the notes section */ if (!write_note_info(&info, cprm)) goto end_coredump; /* For cell spufs and x86 xstate */ if (elf_coredump_extra_notes_write(cprm)) goto end_coredump; /* Align to page */ dump_skip_to(cprm, dataoff); for (i = 0; i < cprm->vma_count; i++) { struct core_vma_metadata *meta = cprm->vma_meta + i; if (!dump_user_range(cprm, meta->start, meta->dump_size)) goto end_coredump; } if (!elf_core_write_extra_data(cprm)) goto end_coredump; if (e_phnum == PN_XNUM) { if (!dump_emit(cprm, shdr4extnum, sizeof(*shdr4extnum))) goto end_coredump; } end_coredump: free_note_info(&info); kfree(shdr4extnum); kfree(phdr4note); return has_dumped; } #endif /* CONFIG_ELF_CORE */ static int __init init_elf_binfmt(void) { register_binfmt(&elf_format); return 0; } static void __exit exit_elf_binfmt(void) { /* Remove the COFF and ELF loaders. */ unregister_binfmt(&elf_format); } core_initcall(init_elf_binfmt); module_exit(exit_elf_binfmt); #ifdef CONFIG_BINFMT_ELF_KUNIT_TEST #include "tests/binfmt_elf_kunit.c" #endif
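/*
 * A minimal userspace sketch (not part of the kernel sources above, and
 * assuming a glibc >= 2.16 toolchain): the auxiliary vector entries that
 * create_elf_tables() copies onto the new process stack - AT_PHDR, AT_ENTRY,
 * AT_RANDOM, AT_EXECFN and friends - can be read back after exec with
 * getauxval(3). Build and run it to see the values the ELF loader placed
 * for that process; the program itself is purely illustrative.
 */
#include <elf.h>
#include <stdio.h>
#include <sys/auxv.h>

int main(void)
{
	/* Scalars written by the NEW_AUX_ENT() sequence in create_elf_tables(). */
	printf("AT_PAGESZ = %lu\n", getauxval(AT_PAGESZ));
	printf("AT_PHDR   = %#lx\n", getauxval(AT_PHDR));
	printf("AT_PHNUM  = %lu\n", getauxval(AT_PHNUM));
	printf("AT_ENTRY  = %#lx\n", getauxval(AT_ENTRY));
	printf("AT_BASE   = %#lx\n", getauxval(AT_BASE));
	printf("AT_SECURE = %lu\n", getauxval(AT_SECURE));

	/* AT_EXECFN points at the pathname used to execute this binary. */
	const char *execfn = (const char *)getauxval(AT_EXECFN);
	if (execfn)
		printf("AT_EXECFN = %s\n", execfn);

	/* AT_RANDOM points at the 16 random bytes copied from k_rand_bytes[]. */
	const unsigned char *rnd = (const unsigned char *)getauxval(AT_RANDOM);
	if (rnd)
		printf("AT_RANDOM = %02x%02x%02x%02x...\n",
		       rnd[0], rnd[1], rnd[2], rnd[3]);
	return 0;
}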
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Virtio PCI driver - common functionality for all device versions
 *
 * This module allows virtio devices to be used over a virtual PCI device.
* This can be used with QEMU based VMMs like KVM or Xen. * * Copyright IBM Corp. 2007 * Copyright Red Hat, Inc. 2014 * * Authors: * Anthony Liguori <aliguori@us.ibm.com> * Rusty Russell <rusty@rustcorp.com.au> * Michael S. Tsirkin <mst@redhat.com> */ #include "virtio_pci_common.h" static bool force_legacy = false; #if IS_ENABLED(CONFIG_VIRTIO_PCI_LEGACY) module_param(force_legacy, bool, 0444); MODULE_PARM_DESC(force_legacy, "Force legacy mode for transitional virtio 1 devices"); #endif bool vp_is_avq(struct virtio_device *vdev, unsigned int index) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) return false; return index == vp_dev->admin_vq.vq_index; } /* wait for pending irq handlers */ void vp_synchronize_vectors(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); int i; if (vp_dev->intx_enabled) synchronize_irq(vp_dev->pci_dev->irq); for (i = 0; i < vp_dev->msix_vectors; ++i) synchronize_irq(pci_irq_vector(vp_dev->pci_dev, i)); } /* the notify function used when creating a virt queue */ bool vp_notify(struct virtqueue *vq) { /* we write the queue's selector into the notification register to * signal the other end */ iowrite16(vq->index, (void __iomem *)vq->priv); return true; } /* Notify all slow path virtqueues on an interrupt. */ static void vp_vring_slow_path_interrupt(int irq, struct virtio_pci_device *vp_dev) { struct virtio_pci_vq_info *info; unsigned long flags; spin_lock_irqsave(&vp_dev->lock, flags); list_for_each_entry(info, &vp_dev->slow_virtqueues, node) vring_interrupt(irq, info->vq); spin_unlock_irqrestore(&vp_dev->lock, flags); } /* Handle a configuration change: Tell driver if it wants to know. */ static irqreturn_t vp_config_changed(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; virtio_config_changed(&vp_dev->vdev); vp_vring_slow_path_interrupt(irq, vp_dev); return IRQ_HANDLED; } /* Notify all virtqueues on an interrupt. */ static irqreturn_t vp_vring_interrupt(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; struct virtio_pci_vq_info *info; irqreturn_t ret = IRQ_NONE; unsigned long flags; spin_lock_irqsave(&vp_dev->lock, flags); list_for_each_entry(info, &vp_dev->virtqueues, node) { if (vring_interrupt(irq, info->vq) == IRQ_HANDLED) ret = IRQ_HANDLED; } spin_unlock_irqrestore(&vp_dev->lock, flags); return ret; } /* A small wrapper to also acknowledge the interrupt when it's handled. * I really need an EIO hook for the vring so I can ack the interrupt once we * know that we'll be handling the IRQ but before we invoke the callback since * the callback may notify the host which results in the host attempting to * raise an interrupt that we would then mask once we acknowledged the * interrupt. */ static irqreturn_t vp_interrupt(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; u8 isr; /* reading the ISR has the effect of also clearing it so it's very * important to save off the value. */ isr = ioread8(vp_dev->isr); /* It's definitely not us if the ISR was not high */ if (!isr) return IRQ_NONE; /* Configuration change? Tell driver if it wants to know. 
*/ if (isr & VIRTIO_PCI_ISR_CONFIG) vp_config_changed(irq, opaque); return vp_vring_interrupt(irq, opaque); } static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, bool per_vq_vectors, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); const char *name = dev_name(&vp_dev->vdev.dev); unsigned int flags = PCI_IRQ_MSIX; unsigned int i, v; int err = -ENOMEM; vp_dev->msix_vectors = nvectors; vp_dev->msix_names = kmalloc_array(nvectors, sizeof(*vp_dev->msix_names), GFP_KERNEL); if (!vp_dev->msix_names) goto error; vp_dev->msix_affinity_masks = kcalloc(nvectors, sizeof(*vp_dev->msix_affinity_masks), GFP_KERNEL); if (!vp_dev->msix_affinity_masks) goto error; for (i = 0; i < nvectors; ++i) if (!alloc_cpumask_var(&vp_dev->msix_affinity_masks[i], GFP_KERNEL)) goto error; if (!per_vq_vectors) desc = NULL; if (desc) { flags |= PCI_IRQ_AFFINITY; desc->pre_vectors++; /* virtio config vector */ } err = pci_alloc_irq_vectors_affinity(vp_dev->pci_dev, nvectors, nvectors, flags, desc); if (err < 0) goto error; vp_dev->msix_enabled = 1; /* Set the vector used for configuration */ v = vp_dev->msix_used_vectors; snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, "%s-config", name); err = request_irq(pci_irq_vector(vp_dev->pci_dev, v), vp_config_changed, 0, vp_dev->msix_names[v], vp_dev); if (err) goto error; ++vp_dev->msix_used_vectors; v = vp_dev->config_vector(vp_dev, v); /* Verify we had enough resources to assign the vector */ if (v == VIRTIO_MSI_NO_VECTOR) { err = -EBUSY; goto error; } if (!per_vq_vectors) { /* Shared vector for all VQs */ v = vp_dev->msix_used_vectors; snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, "%s-virtqueues", name); err = request_irq(pci_irq_vector(vp_dev->pci_dev, v), vp_vring_interrupt, 0, vp_dev->msix_names[v], vp_dev); if (err) goto error; ++vp_dev->msix_used_vectors; } return 0; error: return err; } static bool vp_is_slow_path_vector(u16 msix_vec) { return msix_vec == VP_MSIX_CONFIG_VECTOR; } static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned int index, void (*callback)(struct virtqueue *vq), const char *name, bool ctx, u16 msix_vec, struct virtio_pci_vq_info **p_info) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info = kmalloc(sizeof *info, GFP_KERNEL); struct virtqueue *vq; unsigned long flags; /* fill out our structure that represents an active queue */ if (!info) return ERR_PTR(-ENOMEM); vq = vp_dev->setup_vq(vp_dev, info, index, callback, name, ctx, msix_vec); if (IS_ERR(vq)) goto out_info; info->vq = vq; if (callback) { spin_lock_irqsave(&vp_dev->lock, flags); if (!vp_is_slow_path_vector(msix_vec)) list_add(&info->node, &vp_dev->virtqueues); else list_add(&info->node, &vp_dev->slow_virtqueues); spin_unlock_irqrestore(&vp_dev->lock, flags); } else { INIT_LIST_HEAD(&info->node); } *p_info = info; return vq; out_info: kfree(info); return vq; } static void vp_del_vq(struct virtqueue *vq, struct virtio_pci_vq_info *info) { struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); unsigned long flags; /* * If it fails during re-enable reset vq. This way we won't rejoin * info->node to the queue. Prevent unexpected irqs. 
*/ if (!vq->reset) { spin_lock_irqsave(&vp_dev->lock, flags); list_del(&info->node); spin_unlock_irqrestore(&vp_dev->lock, flags); } vp_dev->del_vq(info); kfree(info); } /* the config->del_vqs() implementation */ void vp_del_vqs(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info; struct virtqueue *vq, *n; int i; list_for_each_entry_safe(vq, n, &vdev->vqs, list) { info = vp_is_avq(vdev, vq->index) ? vp_dev->admin_vq.info : vp_dev->vqs[vq->index]; if (vp_dev->per_vq_vectors) { int v = info->msix_vector; if (v != VIRTIO_MSI_NO_VECTOR && !vp_is_slow_path_vector(v)) { int irq = pci_irq_vector(vp_dev->pci_dev, v); irq_update_affinity_hint(irq, NULL); free_irq(irq, vq); } } vp_del_vq(vq, info); } vp_dev->per_vq_vectors = false; if (vp_dev->intx_enabled) { free_irq(vp_dev->pci_dev->irq, vp_dev); vp_dev->intx_enabled = 0; } for (i = 0; i < vp_dev->msix_used_vectors; ++i) free_irq(pci_irq_vector(vp_dev->pci_dev, i), vp_dev); if (vp_dev->msix_affinity_masks) { for (i = 0; i < vp_dev->msix_vectors; i++) free_cpumask_var(vp_dev->msix_affinity_masks[i]); } if (vp_dev->msix_enabled) { /* Disable the vector used for configuration */ vp_dev->config_vector(vp_dev, VIRTIO_MSI_NO_VECTOR); pci_free_irq_vectors(vp_dev->pci_dev); vp_dev->msix_enabled = 0; } vp_dev->msix_vectors = 0; vp_dev->msix_used_vectors = 0; kfree(vp_dev->msix_names); vp_dev->msix_names = NULL; kfree(vp_dev->msix_affinity_masks); vp_dev->msix_affinity_masks = NULL; kfree(vp_dev->vqs); vp_dev->vqs = NULL; } enum vp_vq_vector_policy { VP_VQ_VECTOR_POLICY_EACH, VP_VQ_VECTOR_POLICY_SHARED_SLOW, VP_VQ_VECTOR_POLICY_SHARED, }; static struct virtqueue * vp_find_one_vq_msix(struct virtio_device *vdev, int queue_idx, vq_callback_t *callback, const char *name, bool ctx, bool slow_path, int *allocated_vectors, enum vp_vq_vector_policy vector_policy, struct virtio_pci_vq_info **p_info) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue *vq; u16 msix_vec; int err; if (!callback) msix_vec = VIRTIO_MSI_NO_VECTOR; else if (vector_policy == VP_VQ_VECTOR_POLICY_EACH || (vector_policy == VP_VQ_VECTOR_POLICY_SHARED_SLOW && !slow_path)) msix_vec = (*allocated_vectors)++; else if (vector_policy != VP_VQ_VECTOR_POLICY_EACH && slow_path) msix_vec = VP_MSIX_CONFIG_VECTOR; else msix_vec = VP_MSIX_VQ_VECTOR; vq = vp_setup_vq(vdev, queue_idx, callback, name, ctx, msix_vec, p_info); if (IS_ERR(vq)) return vq; if (vector_policy == VP_VQ_VECTOR_POLICY_SHARED || msix_vec == VIRTIO_MSI_NO_VECTOR || vp_is_slow_path_vector(msix_vec)) return vq; /* allocate per-vq irq if available and necessary */ snprintf(vp_dev->msix_names[msix_vec], sizeof(*vp_dev->msix_names), "%s-%s", dev_name(&vp_dev->vdev.dev), name); err = request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec), vring_interrupt, 0, vp_dev->msix_names[msix_vec], vq); if (err) { vp_del_vq(vq, *p_info); return ERR_PTR(err); } return vq; } static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[], enum vp_vq_vector_policy vector_policy, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_admin_vq *avq = &vp_dev->admin_vq; struct virtqueue_info *vqi; int i, err, nvectors, allocated_vectors, queue_idx = 0; struct virtqueue *vq; bool per_vq_vectors; u16 avq_num = 0; vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); if (!vp_dev->vqs) return -ENOMEM; if (vp_dev->avq_index) { err = vp_dev->avq_index(vdev, 
&avq->vq_index, &avq_num); if (err) goto error_find; } per_vq_vectors = vector_policy != VP_VQ_VECTOR_POLICY_SHARED; if (per_vq_vectors) { /* Best option: one for change interrupt, one per vq. */ nvectors = 1; for (i = 0; i < nvqs; ++i) { vqi = &vqs_info[i]; if (vqi->name && vqi->callback) ++nvectors; } if (avq_num && vector_policy == VP_VQ_VECTOR_POLICY_EACH) ++nvectors; } else { /* Second best: one for change, shared for all vqs. */ nvectors = 2; } err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors, desc); if (err) goto error_find; vp_dev->per_vq_vectors = per_vq_vectors; allocated_vectors = vp_dev->msix_used_vectors; for (i = 0; i < nvqs; ++i) { vqi = &vqs_info[i]; if (!vqi->name) { vqs[i] = NULL; continue; } vqs[i] = vp_find_one_vq_msix(vdev, queue_idx++, vqi->callback, vqi->name, vqi->ctx, false, &allocated_vectors, vector_policy, &vp_dev->vqs[i]); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto error_find; } } if (!avq_num) return 0; sprintf(avq->name, "avq.%u", avq->vq_index); vq = vp_find_one_vq_msix(vdev, avq->vq_index, vp_modern_avq_done, avq->name, false, true, &allocated_vectors, vector_policy, &vp_dev->admin_vq.info); if (IS_ERR(vq)) { err = PTR_ERR(vq); goto error_find; } return 0; error_find: vp_del_vqs(vdev); return err; } static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[]) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_admin_vq *avq = &vp_dev->admin_vq; int i, err, queue_idx = 0; struct virtqueue *vq; u16 avq_num = 0; vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); if (!vp_dev->vqs) return -ENOMEM; if (vp_dev->avq_index) { err = vp_dev->avq_index(vdev, &avq->vq_index, &avq_num); if (err) goto out_del_vqs; } err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED, dev_name(&vdev->dev), vp_dev); if (err) goto out_del_vqs; vp_dev->intx_enabled = 1; vp_dev->per_vq_vectors = false; for (i = 0; i < nvqs; ++i) { struct virtqueue_info *vqi = &vqs_info[i]; if (!vqi->name) { vqs[i] = NULL; continue; } vqs[i] = vp_setup_vq(vdev, queue_idx++, vqi->callback, vqi->name, vqi->ctx, VIRTIO_MSI_NO_VECTOR, &vp_dev->vqs[i]); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto out_del_vqs; } } if (!avq_num) return 0; sprintf(avq->name, "avq.%u", avq->vq_index); vq = vp_setup_vq(vdev, queue_idx++, vp_modern_avq_done, avq->name, false, VIRTIO_MSI_NO_VECTOR, &vp_dev->admin_vq.info); if (IS_ERR(vq)) { err = PTR_ERR(vq); goto out_del_vqs; } return 0; out_del_vqs: vp_del_vqs(vdev); return err; } /* the config->find_vqs() implementation */ int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[], struct irq_affinity *desc) { int err; /* Try MSI-X with one vector per queue. */ err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, VP_VQ_VECTOR_POLICY_EACH, desc); if (!err) return 0; /* Fallback: MSI-X with one shared vector for config and * slow path queues, one vector per queue for the rest. */ err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, VP_VQ_VECTOR_POLICY_SHARED_SLOW, desc); if (!err) return 0; /* Fallback: MSI-X with one vector for config, one shared for queues. */ err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, VP_VQ_VECTOR_POLICY_SHARED, desc); if (!err) return 0; /* Is there an interrupt? If not give up. */ if (!(to_vp_device(vdev)->pci_dev->irq)) return err; /* Finally fall back to regular interrupts. 
*/ return vp_find_vqs_intx(vdev, nvqs, vqs, vqs_info); } const char *vp_bus_name(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); return pci_name(vp_dev->pci_dev); } /* Setup the affinity for a virtqueue: * - force the affinity for per vq vector * - OR over all affinities for shared MSI * - ignore the affinity request if we're using INTX */ int vp_set_vq_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask) { struct virtio_device *vdev = vq->vdev; struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info = vp_dev->vqs[vq->index]; struct cpumask *mask; unsigned int irq; if (!vq->callback) return -EINVAL; if (vp_dev->msix_enabled) { mask = vp_dev->msix_affinity_masks[info->msix_vector]; irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector); if (!cpu_mask) irq_update_affinity_hint(irq, NULL); else { cpumask_copy(mask, cpu_mask); irq_set_affinity_and_hint(irq, mask); } } return 0; } const struct cpumask *vp_get_vq_affinity(struct virtio_device *vdev, int index) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); if (!vp_dev->per_vq_vectors || vp_dev->vqs[index]->msix_vector == VIRTIO_MSI_NO_VECTOR || vp_is_slow_path_vector(vp_dev->vqs[index]->msix_vector)) return NULL; return pci_irq_get_affinity(vp_dev->pci_dev, vp_dev->vqs[index]->msix_vector); } #ifdef CONFIG_PM_SLEEP static int virtio_pci_freeze(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret; ret = virtio_device_freeze(&vp_dev->vdev); if (!ret) pci_disable_device(pci_dev); return ret; } static int virtio_pci_restore(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret; ret = pci_enable_device(pci_dev); if (ret) return ret; pci_set_master(pci_dev); return virtio_device_restore(&vp_dev->vdev); } static bool vp_supports_pm_no_reset(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); u16 pmcsr; if (!pci_dev->pm_cap) return false; pci_read_config_word(pci_dev, pci_dev->pm_cap + PCI_PM_CTRL, &pmcsr); if (PCI_POSSIBLE_ERROR(pmcsr)) { dev_err(dev, "Unable to query pmcsr"); return false; } return pmcsr & PCI_PM_CTRL_NO_SOFT_RESET; } static int virtio_pci_suspend(struct device *dev) { return vp_supports_pm_no_reset(dev) ? 0 : virtio_pci_freeze(dev); } static int virtio_pci_resume(struct device *dev) { return vp_supports_pm_no_reset(dev) ? 0 : virtio_pci_restore(dev); } static const struct dev_pm_ops virtio_pci_pm_ops = { .suspend = virtio_pci_suspend, .resume = virtio_pci_resume, .freeze = virtio_pci_freeze, .thaw = virtio_pci_restore, .poweroff = virtio_pci_freeze, .restore = virtio_pci_restore, }; #endif /* Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF. */ static const struct pci_device_id virtio_pci_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_REDHAT_QUMRANET, PCI_ANY_ID) }, { 0 } }; MODULE_DEVICE_TABLE(pci, virtio_pci_id_table); static void virtio_pci_release_dev(struct device *_d) { struct virtio_device *vdev = dev_to_virtio(_d); struct virtio_pci_device *vp_dev = to_vp_device(vdev); /* As struct device is a kobject, it's not safe to * free the memory (including the reference counter itself) * until it's release callback. 
*/ kfree(vp_dev); } static int virtio_pci_probe(struct pci_dev *pci_dev, const struct pci_device_id *id) { struct virtio_pci_device *vp_dev, *reg_dev = NULL; int rc; /* allocate our structure and fill it out */ vp_dev = kzalloc(sizeof(struct virtio_pci_device), GFP_KERNEL); if (!vp_dev) return -ENOMEM; pci_set_drvdata(pci_dev, vp_dev); vp_dev->vdev.dev.parent = &pci_dev->dev; vp_dev->vdev.dev.release = virtio_pci_release_dev; vp_dev->pci_dev = pci_dev; INIT_LIST_HEAD(&vp_dev->virtqueues); INIT_LIST_HEAD(&vp_dev->slow_virtqueues); spin_lock_init(&vp_dev->lock); /* enable the device */ rc = pci_enable_device(pci_dev); if (rc) goto err_enable_device; if (force_legacy) { rc = virtio_pci_legacy_probe(vp_dev); /* Also try modern mode if we can't map BAR0 (no IO space). */ if (rc == -ENODEV || rc == -ENOMEM) rc = virtio_pci_modern_probe(vp_dev); if (rc) goto err_probe; } else { rc = virtio_pci_modern_probe(vp_dev); if (rc == -ENODEV) rc = virtio_pci_legacy_probe(vp_dev); if (rc) goto err_probe; } pci_set_master(pci_dev); rc = register_virtio_device(&vp_dev->vdev); reg_dev = vp_dev; if (rc) goto err_register; return 0; err_register: if (vp_dev->is_legacy) virtio_pci_legacy_remove(vp_dev); else virtio_pci_modern_remove(vp_dev); err_probe: pci_disable_device(pci_dev); err_enable_device: if (reg_dev) put_device(&vp_dev->vdev.dev); else kfree(vp_dev); return rc; } static void virtio_pci_remove(struct pci_dev *pci_dev) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); struct device *dev = get_device(&vp_dev->vdev.dev); /* * Device is marked broken on surprise removal so that virtio upper * layers can abort any ongoing operation. */ if (!pci_device_is_present(pci_dev)) virtio_break_device(&vp_dev->vdev); pci_disable_sriov(pci_dev); unregister_virtio_device(&vp_dev->vdev); if (vp_dev->is_legacy) virtio_pci_legacy_remove(vp_dev); else virtio_pci_modern_remove(vp_dev); pci_disable_device(pci_dev); put_device(dev); } static int virtio_pci_sriov_configure(struct pci_dev *pci_dev, int num_vfs) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); struct virtio_device *vdev = &vp_dev->vdev; int ret; if (!(vdev->config->get_status(vdev) & VIRTIO_CONFIG_S_DRIVER_OK)) return -EBUSY; if (!__virtio_test_bit(vdev, VIRTIO_F_SR_IOV)) return -EINVAL; if (pci_vfs_assigned(pci_dev)) return -EPERM; if (num_vfs == 0) { pci_disable_sriov(pci_dev); return 0; } ret = pci_enable_sriov(pci_dev, num_vfs); if (ret < 0) return ret; return num_vfs; } static void virtio_pci_reset_prepare(struct pci_dev *pci_dev) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret = 0; ret = virtio_device_reset_prepare(&vp_dev->vdev); if (ret) { if (ret != -EOPNOTSUPP) dev_warn(&pci_dev->dev, "Reset prepare failure: %d", ret); return; } if (pci_is_enabled(pci_dev)) pci_disable_device(pci_dev); } static void virtio_pci_reset_done(struct pci_dev *pci_dev) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret; if (pci_is_enabled(pci_dev)) return; ret = pci_enable_device(pci_dev); if (!ret) { pci_set_master(pci_dev); ret = virtio_device_reset_done(&vp_dev->vdev); } if (ret && ret != -EOPNOTSUPP) dev_warn(&pci_dev->dev, "Reset done failure: %d", ret); } static const struct pci_error_handlers virtio_pci_err_handler = { .reset_prepare = virtio_pci_reset_prepare, .reset_done = virtio_pci_reset_done, }; static struct pci_driver virtio_pci_driver = { .name = "virtio-pci", .id_table = virtio_pci_id_table, .probe = virtio_pci_probe, .remove = virtio_pci_remove, #ifdef CONFIG_PM_SLEEP .driver.pm = 
&virtio_pci_pm_ops, #endif .sriov_configure = virtio_pci_sriov_configure, .err_handler = &virtio_pci_err_handler, }; struct virtio_device *virtio_pci_vf_get_pf_dev(struct pci_dev *pdev) { struct virtio_pci_device *pf_vp_dev; pf_vp_dev = pci_iov_get_pf_drvdata(pdev, &virtio_pci_driver); if (IS_ERR(pf_vp_dev)) return NULL; return &pf_vp_dev->vdev; } module_pci_driver(virtio_pci_driver); MODULE_AUTHOR("Anthony Liguori <aliguori@us.ibm.com>"); MODULE_DESCRIPTION("virtio-pci"); MODULE_LICENSE("GPL"); MODULE_VERSION("1");
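The config->find_vqs() path above degrades gracefully: vp_find_vqs() first requests one MSI-X vector per virtqueue, then retries with the slow-path queues sharing the config vector, then with a single vector shared by all virtqueues, and finally falls back to INTx. Below is a minimal userspace sketch of the per-queue vector choice made in vp_find_one_vq_msix(); the numeric values assumed for VIRTIO_MSI_NO_VECTOR, VP_MSIX_CONFIG_VECTOR and VP_MSIX_VQ_VECTOR are taken from the virtio headers as commonly defined, and the function is illustrative rather than the driver's actual code.

/* Minimal userspace sketch (assumed constants) of the MSI-X vector
 * selection that vp_find_one_vq_msix() performs for each virtqueue. */
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define VIRTIO_MSI_NO_VECTOR 0xffff                        /* assumed value */
enum { VP_MSIX_CONFIG_VECTOR = 0, VP_MSIX_VQ_VECTOR = 1 }; /* assumed values */

enum vq_vector_policy { POLICY_EACH, POLICY_SHARED_SLOW, POLICY_SHARED };

static uint16_t choose_vector(enum vq_vector_policy policy, bool has_callback,
			      bool slow_path, unsigned int *allocated)
{
	if (!has_callback)
		return VIRTIO_MSI_NO_VECTOR;     /* no interrupt needed at all */
	if (policy == POLICY_EACH ||
	    (policy == POLICY_SHARED_SLOW && !slow_path))
		return (*allocated)++;           /* dedicated per-queue vector */
	if (slow_path)
		return VP_MSIX_CONFIG_VECTOR;    /* ride the config-change vector */
	return VP_MSIX_VQ_VECTOR;                /* one vector shared by all VQs */
}

int main(void)
{
	unsigned int next = 1;  /* vector 0 already taken by config changes */

	printf("rx  -> %u\n", (unsigned)choose_vector(POLICY_SHARED_SLOW, true, false, &next));
	printf("tx  -> %u\n", (unsigned)choose_vector(POLICY_SHARED_SLOW, true, false, &next));
	printf("avq -> %u\n", (unsigned)choose_vector(POLICY_SHARED_SLOW, true, true, &next));
	return 0;
}

With the SHARED_SLOW policy the admin queue lands on vector 0 next to config changes while rx/tx keep dedicated vectors, which is the compromise the second fallback in vp_find_vqs() aims for.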
// SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API. * * Single-block cipher operations. * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/algapi.h> #include <crypto/internal/cipher.h> #include <linux/kernel.h> #include <linux/crypto.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/string.h> #include "internal.h" static int setkey_unaligned(struct crypto_cipher *tfm, const u8 *key, unsigned int keylen) { struct cipher_alg *cia = crypto_cipher_alg(tfm); unsigned long alignmask = crypto_cipher_alignmask(tfm); int ret; u8 *buffer, *alignbuffer; unsigned long absize; absize = keylen + alignmask; buffer = kmalloc(absize, GFP_ATOMIC); if (!buffer) return -ENOMEM; alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); ret = cia->cia_setkey(crypto_cipher_tfm(tfm), alignbuffer, keylen); kfree_sensitive(buffer); return ret; } int crypto_cipher_setkey(struct crypto_cipher *tfm, const u8 *key, unsigned int keylen) { struct cipher_alg *cia = crypto_cipher_alg(tfm); unsigned long alignmask = crypto_cipher_alignmask(tfm); if (keylen < cia->cia_min_keysize || keylen > cia->cia_max_keysize) return -EINVAL; if ((unsigned long)key & alignmask) return setkey_unaligned(tfm, key, keylen); return cia->cia_setkey(crypto_cipher_tfm(tfm), key, keylen); } EXPORT_SYMBOL_NS_GPL(crypto_cipher_setkey, "CRYPTO_INTERNAL"); static inline void cipher_crypt_one(struct crypto_cipher *tfm, u8 *dst, const u8 *src, bool enc) { unsigned long alignmask = crypto_cipher_alignmask(tfm); struct cipher_alg *cia = crypto_cipher_alg(tfm); void (*fn)(struct crypto_tfm *, u8 *, const u8 *) = enc ? cia->cia_encrypt : cia->cia_decrypt; if (unlikely(((unsigned long)dst | (unsigned long)src) & alignmask)) { unsigned int bs = crypto_cipher_blocksize(tfm); u8 buffer[MAX_CIPHER_BLOCKSIZE + MAX_CIPHER_ALIGNMASK]; u8 *tmp = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(tmp, src, bs); fn(crypto_cipher_tfm(tfm), tmp, tmp); memcpy(dst, tmp, bs); } else { fn(crypto_cipher_tfm(tfm), dst, src); } } void crypto_cipher_encrypt_one(struct crypto_cipher *tfm, u8 *dst, const u8 *src) { cipher_crypt_one(tfm, dst, src, true); } EXPORT_SYMBOL_NS_GPL(crypto_cipher_encrypt_one, "CRYPTO_INTERNAL"); void crypto_cipher_decrypt_one(struct crypto_cipher *tfm, u8 *dst, const u8 *src) { cipher_crypt_one(tfm, dst, src, false); } EXPORT_SYMBOL_NS_GPL(crypto_cipher_decrypt_one, "CRYPTO_INTERNAL"); struct crypto_cipher *crypto_clone_cipher(struct crypto_cipher *cipher) { struct crypto_tfm *tfm = crypto_cipher_tfm(cipher); struct crypto_alg *alg = tfm->__crt_alg; struct crypto_cipher *ncipher; struct crypto_tfm *ntfm; if (alg->cra_init) return ERR_PTR(-ENOSYS); if (unlikely(!crypto_mod_get(alg))) return ERR_PTR(-ESTALE); ntfm = __crypto_alloc_tfmgfp(alg, CRYPTO_ALG_TYPE_CIPHER, CRYPTO_ALG_TYPE_MASK, GFP_ATOMIC); if (IS_ERR(ntfm)) { crypto_mod_put(alg); return ERR_CAST(ntfm); } ntfm->crt_flags = tfm->crt_flags; ncipher = __crypto_cipher_cast(ntfm); return ncipher; } EXPORT_SYMBOL_GPL(crypto_clone_cipher);
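setkey_unaligned() and cipher_crypt_one() above both cope with callers whose buffers violate the transform's alignment mask by bouncing the data through a temporary that is over-allocated by alignmask bytes and then rounded up to the next boundary. The fragment below is a small userspace sketch of that rounding, assuming the usual ALIGN(x, a) = (x + a - 1) & ~(a - 1) definition; the buffer sizes and names are illustrative only.

/* Userspace sketch of the bounce-buffer alignment used by setkey_unaligned(). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((uintptr_t)(a) - 1))

int main(void)
{
	const unsigned char key[] = "0123456789abcdef";
	size_t keylen = sizeof(key) - 1;
	unsigned long alignmask = 15;	/* e.g. cipher wants 16-byte alignment */

	/* Over-allocate by alignmask so an aligned slot always fits. */
	unsigned char *buffer = malloc(keylen + alignmask);
	if (!buffer)
		return 1;

	unsigned char *aligned =
		(unsigned char *)ALIGN_UP((uintptr_t)buffer, alignmask + 1);
	memcpy(aligned, key, keylen);

	printf("raw %p -> aligned %p (mod %lu = %lu)\n",
	       (void *)buffer, (void *)aligned, alignmask + 1,
	       (unsigned long)((uintptr_t)aligned % (alignmask + 1)));

	/* A real caller would now hand 'aligned' to the setkey routine and
	 * wipe the temporary afterwards (kfree_sensitive in the kernel). */
	free(buffer);
	return 0;
}

The same idea is behind the on-stack MAX_CIPHER_BLOCKSIZE + MAX_CIPHER_ALIGNMASK buffer in cipher_crypt_one(): over-allocate by the worst-case mask so an aligned block always fits.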
// SPDX-License-Identifier: GPL-2.0-only /* * linux/drivers/char/ttyprintk.c * * Copyright (C) 2010 Samo Pogacnik */ /* * This pseudo device allows user to make printk messages. It is possible * to store "console" messages inline with kernel messages for better analyses * of the boot process, for example. */ #include <linux/console.h> #include <linux/device.h> #include <linux/serial.h> #include <linux/tty.h> #include <linux/module.h> #include <linux/spinlock.h> struct ttyprintk_port { struct tty_port port; spinlock_t spinlock; }; static struct ttyprintk_port tpk_port; /* * Our simple preformatting supports transparent output of (time-stamped) * printk messages (also suitable for logging service): * - any cr is replaced by nl * - adds a ttyprintk source tag in front of each line * - too long message is fragmented, with '\'nl between fragments * - TPK_STR_SIZE isn't really the write_room limiting factor, because * it is emptied on the fly during preformatting. */ #define TPK_STR_SIZE 508 /* should be bigger then max expected line length */ #define TPK_MAX_ROOM 4096 /* we could assume 4K for instance */ #define TPK_PREFIX KERN_SOH __stringify(CONFIG_TTY_PRINTK_LEVEL) static int tpk_curr; static u8 tpk_buffer[TPK_STR_SIZE + 4]; static void tpk_flush(void) { if (tpk_curr > 0) { tpk_buffer[tpk_curr] = '\0'; printk(TPK_PREFIX "[U] %s\n", tpk_buffer); tpk_curr = 0; } } static int tpk_printk(const u8 *buf, size_t count) { size_t i; for (i = 0; i < count; i++) { if (tpk_curr >= TPK_STR_SIZE) { /* end of tmp buffer reached: cut the message in two */ tpk_buffer[tpk_curr++] = '\\'; tpk_flush(); } switch (buf[i]) { case '\r': tpk_flush(); if ((i + 1) < count && buf[i + 1] == '\n') i++; break; case '\n': tpk_flush(); break; default: tpk_buffer[tpk_curr++] = buf[i]; break; } } return count; } /* * TTY operations open function. */ static int tpk_open(struct tty_struct *tty, struct file *filp) { tty->driver_data = &tpk_port; return tty_port_open(&tpk_port.port, tty, filp); } /* * TTY operations close function. */ static void tpk_close(struct tty_struct *tty, struct file *filp) { struct ttyprintk_port *tpkp = tty->driver_data; tty_port_close(&tpkp->port, tty, filp); } /* * TTY operations write function. */ static ssize_t tpk_write(struct tty_struct *tty, const u8 *buf, size_t count) { struct ttyprintk_port *tpkp = tty->driver_data; unsigned long flags; int ret; /* exclusive use of tpk_printk within this tty */ spin_lock_irqsave(&tpkp->spinlock, flags); ret = tpk_printk(buf, count); spin_unlock_irqrestore(&tpkp->spinlock, flags); return ret; } /* * TTY operations write_room function.
*/ static unsigned int tpk_write_room(struct tty_struct *tty) { return TPK_MAX_ROOM; } /* * TTY operations hangup function. */ static void tpk_hangup(struct tty_struct *tty) { struct ttyprintk_port *tpkp = tty->driver_data; tty_port_hangup(&tpkp->port); } /* * TTY port operations shutdown function. */ static void tpk_port_shutdown(struct tty_port *tport) { struct ttyprintk_port *tpkp = container_of(tport, struct ttyprintk_port, port); unsigned long flags; spin_lock_irqsave(&tpkp->spinlock, flags); tpk_flush(); spin_unlock_irqrestore(&tpkp->spinlock, flags); } static const struct tty_operations ttyprintk_ops = { .open = tpk_open, .close = tpk_close, .write = tpk_write, .write_room = tpk_write_room, .hangup = tpk_hangup, }; static const struct tty_port_operations tpk_port_ops = { .shutdown = tpk_port_shutdown, }; static struct tty_driver *ttyprintk_driver; static struct tty_driver *ttyprintk_console_device(struct console *c, int *index) { *index = 0; return ttyprintk_driver; } static struct console ttyprintk_console = { .name = "ttyprintk", .device = ttyprintk_console_device, }; static int __init ttyprintk_init(void) { int ret; spin_lock_init(&tpk_port.spinlock); ttyprintk_driver = tty_alloc_driver(1, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_UNNUMBERED_NODE); if (IS_ERR(ttyprintk_driver)) return PTR_ERR(ttyprintk_driver); tty_port_init(&tpk_port.port); tpk_port.port.ops = &tpk_port_ops; ttyprintk_driver->driver_name = "ttyprintk"; ttyprintk_driver->name = "ttyprintk"; ttyprintk_driver->major = TTYAUX_MAJOR; ttyprintk_driver->minor_start = 3; ttyprintk_driver->type = TTY_DRIVER_TYPE_CONSOLE; ttyprintk_driver->init_termios = tty_std_termios; ttyprintk_driver->init_termios.c_oflag = OPOST | OCRNL | ONOCR | ONLRET; tty_set_operations(ttyprintk_driver, &ttyprintk_ops); tty_port_link_device(&tpk_port.port, ttyprintk_driver, 0); ret = tty_register_driver(ttyprintk_driver); if (ret < 0) { printk(KERN_ERR "Couldn't register ttyprintk driver\n"); goto error; } register_console(&ttyprintk_console); return 0; error: tty_driver_kref_put(ttyprintk_driver); tty_port_destroy(&tpk_port.port); return ret; } static void __exit ttyprintk_exit(void) { unregister_console(&ttyprintk_console); tty_unregister_driver(ttyprintk_driver); tty_driver_kref_put(ttyprintk_driver); tty_port_destroy(&tpk_port.port); } device_initcall(ttyprintk_init); module_exit(ttyprintk_exit); MODULE_DESCRIPTION("TTY driver to output user messages via printk"); MODULE_LICENSE("GPL");
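tpk_printk() above turns arbitrary user writes into discrete printk() lines: a CR or LF flushes the accumulated buffer, a CR immediately followed by LF collapses into a single flush, and anything longer than TPK_STR_SIZE is cut with a trailing backslash. The userspace sketch below reproduces just that loop with printf() standing in for printk(); the buffer size is deliberately tiny so the fragmentation path is visible, and all names are illustrative.

/* Userspace sketch of the ttyprintk line-assembly rules (illustrative only). */
#include <stdio.h>
#include <string.h>

#define STR_SIZE 8			/* tiny so fragmentation is visible */

static char buf[STR_SIZE + 2];
static int curr;

static void flush(void)
{
	if (curr > 0) {
		buf[curr] = '\0';
		printf("[U] %s\n", buf);	/* printk(TPK_PREFIX ...) in the driver */
		curr = 0;
	}
}

static void feed(const char *s, size_t count)
{
	for (size_t i = 0; i < count; i++) {
		if (curr >= STR_SIZE) {		/* too long: fragment with '\' */
			buf[curr++] = '\\';
			flush();
		}
		switch (s[i]) {
		case '\r':
			flush();
			if (i + 1 < count && s[i + 1] == '\n')
				i++;		/* swallow the LF of a CRLF pair */
			break;
		case '\n':
			flush();
			break;
		default:
			buf[curr++] = s[i];
		}
	}
}

int main(void)
{
	const char *msg = "hello\r\nworld, this line is long\n";

	feed(msg, strlen(msg));
	flush();
	return 0;
}

Running it prints "hello" once (the CRLF pair collapses into a single flush) and splits the longer second line into backslash-terminated fragments, mirroring the fragments the real driver emits to the kernel log.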
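The iommufd io_pagetable code reproduced next tracks areas, allowed ranges and reserved ranges in interval trees. One detail worth noting ahead of time, in iopt_alloc_iova() below: when the kernel picks the IOVA it aligns it to the smaller of roundup_pow_of_two(length) and the lowest set bit of the user address, capped at the huge-page size, so the IOVA inherits whatever alignment the VA already had and THP-backed memory can still be mapped with large IOMMU pages. The following userspace model covers only that calculation; HPAGE_SIZE and the helper names are assumptions for illustration.

/* Userspace model of the IOVA alignment pick in iopt_alloc_iova() (sketch). */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define HPAGE_SIZE	(2UL * 1024 * 1024)	/* assumed x86-64 THP size */

static unsigned long roundup_pow_of_two(unsigned long x)
{
	unsigned long r = 1;

	while (r < x)
		r <<= 1;
	return r;
}

/* lowest set bit of addr, as 1UL << __ffs64(addr) computes in the kernel */
static unsigned long lowest_bit(unsigned long addr)
{
	return addr & -addr;
}

static unsigned long pick_alignment(unsigned long addr, unsigned long length)
{
	unsigned long len_align = roundup_pow_of_two(length);
	unsigned long align;

	if (!addr)
		align = len_align;
	else
		align = len_align < lowest_bit(addr) ? len_align : lowest_bit(addr);

	if (align >= HPAGE_SIZE)	/* cap, also protects ALIGN() from overflow */
		align = HPAGE_SIZE;
	return align;
}

int main(void)
{
	/* 2 MiB-aligned VA, short mapping: alignment driven by the length. */
	printf("%lu\n", pick_alignment(0x7f1234600000UL, 6 * PAGE_SIZE));
	/* 4 KiB-aligned VA, large mapping: alignment driven by the address. */
	printf("%lu\n", pick_alignment(0x7f1234561000UL, 64 * PAGE_SIZE));
	return 0;
}

A 2 MiB-aligned VA only yields a 2 MiB-aligned IOVA when the mapping is at least that large; shorter mappings are aligned to roundup_pow_of_two(length) instead, as the first example shows.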
// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. * * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The * PFNs can be placed into an iommu_domain, or returned to the caller as a page * list for access by an in-kernel user.
* * The datastructure uses the iopt_pages to optimize the storage of the PFNs * between the domains and xarray. */ #include <linux/err.h> #include <linux/errno.h> #include <linux/iommu.h> #include <linux/iommufd.h> #include <linux/lockdep.h> #include <linux/sched/mm.h> #include <linux/slab.h> #include <uapi/linux/iommufd.h> #include "double_span.h" #include "io_pagetable.h" struct iopt_pages_list { struct iopt_pages *pages; struct iopt_area *area; struct list_head next; unsigned long start_byte; unsigned long length; }; struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter, struct io_pagetable *iopt, unsigned long iova, unsigned long last_iova) { lockdep_assert_held(&iopt->iova_rwsem); iter->cur_iova = iova; iter->last_iova = last_iova; iter->area = iopt_area_iter_first(iopt, iova, iova); if (!iter->area) return NULL; if (!iter->area->pages) { iter->area = NULL; return NULL; } return iter->area; } struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter) { unsigned long last_iova; if (!iter->area) return NULL; last_iova = iopt_area_last_iova(iter->area); if (iter->last_iova <= last_iova) return NULL; iter->cur_iova = last_iova + 1; iter->area = iopt_area_iter_next(iter->area, iter->cur_iova, iter->last_iova); if (!iter->area) return NULL; if (iter->cur_iova != iopt_area_iova(iter->area) || !iter->area->pages) { iter->area = NULL; return NULL; } return iter->area; } static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span, unsigned long length, unsigned long iova_alignment, unsigned long page_offset) { if (span->is_used || span->last_hole - span->start_hole < length - 1) return false; span->start_hole = ALIGN(span->start_hole, iova_alignment) | page_offset; if (span->start_hole > span->last_hole || span->last_hole - span->start_hole < length - 1) return false; return true; } static bool __alloc_iova_check_used(struct interval_tree_span_iter *span, unsigned long length, unsigned long iova_alignment, unsigned long page_offset) { if (span->is_hole || span->last_used - span->start_used < length - 1) return false; span->start_used = ALIGN(span->start_used, iova_alignment) | page_offset; if (span->start_used > span->last_used || span->last_used - span->start_used < length - 1) return false; return true; } /* * Automatically find a block of IOVA that is not being used and not reserved. * Does not return a 0 IOVA even if it is valid. */ static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova, unsigned long addr, unsigned long length) { unsigned long page_offset = addr % PAGE_SIZE; struct interval_tree_double_span_iter used_span; struct interval_tree_span_iter allowed_span; unsigned long max_alignment = PAGE_SIZE; unsigned long iova_alignment; lockdep_assert_held(&iopt->iova_rwsem); /* Protect roundup_pow-of_two() from overflow */ if (length == 0 || length >= ULONG_MAX / 2) return -EOVERFLOW; /* * Keep alignment present in addr when building the IOVA, which * increases the chance we can map a THP. 
*/ if (!addr) iova_alignment = roundup_pow_of_two(length); else iova_alignment = min_t(unsigned long, roundup_pow_of_two(length), 1UL << __ffs64(addr)); #ifdef CONFIG_TRANSPARENT_HUGEPAGE max_alignment = HPAGE_SIZE; #endif /* Protect against ALIGN() overflow */ if (iova_alignment >= max_alignment) iova_alignment = max_alignment; if (iova_alignment < iopt->iova_alignment) return -EINVAL; interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree, PAGE_SIZE, ULONG_MAX - PAGE_SIZE) { if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) { allowed_span.start_used = PAGE_SIZE; allowed_span.last_used = ULONG_MAX - PAGE_SIZE; allowed_span.is_hole = false; } if (!__alloc_iova_check_used(&allowed_span, length, iova_alignment, page_offset)) continue; interval_tree_for_each_double_span( &used_span, &iopt->reserved_itree, &iopt->area_itree, allowed_span.start_used, allowed_span.last_used) { if (!__alloc_iova_check_hole(&used_span, length, iova_alignment, page_offset)) continue; *iova = used_span.start_hole; return 0; } } return -ENOSPC; } static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova, unsigned long length) { unsigned long last; lockdep_assert_held(&iopt->iova_rwsem); if ((iova & (iopt->iova_alignment - 1))) return -EINVAL; if (check_add_overflow(iova, length - 1, &last)) return -EOVERFLOW; /* No reserved IOVA intersects the range */ if (iopt_reserved_iter_first(iopt, iova, last)) return -EINVAL; /* Check that there is not already a mapping in the range */ if (iopt_area_iter_first(iopt, iova, last)) return -EEXIST; return 0; } /* * The area takes a slice of the pages from start_bytes to start_byte + length */ static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area, struct iopt_pages *pages, unsigned long iova, unsigned long start_byte, unsigned long length, int iommu_prot) { lockdep_assert_held_write(&iopt->iova_rwsem); if ((iommu_prot & IOMMU_WRITE) && !pages->writable) return -EPERM; area->iommu_prot = iommu_prot; area->page_offset = start_byte % PAGE_SIZE; if (area->page_offset & (iopt->iova_alignment - 1)) return -EINVAL; area->node.start = iova; if (check_add_overflow(iova, length - 1, &area->node.last)) return -EOVERFLOW; area->pages_node.start = start_byte / PAGE_SIZE; if (check_add_overflow(start_byte, length - 1, &area->pages_node.last)) return -EOVERFLOW; area->pages_node.last = area->pages_node.last / PAGE_SIZE; if (WARN_ON(area->pages_node.last >= pages->npages)) return -EOVERFLOW; /* * The area is inserted with a NULL pages indicating it is not fully * initialized yet. 
*/ area->iopt = iopt; interval_tree_insert(&area->node, &iopt->area_itree); return 0; } static struct iopt_area *iopt_area_alloc(void) { struct iopt_area *area; area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT); if (!area) return NULL; RB_CLEAR_NODE(&area->node.rb); RB_CLEAR_NODE(&area->pages_node.rb); return area; } static int iopt_alloc_area_pages(struct io_pagetable *iopt, struct list_head *pages_list, unsigned long length, unsigned long *dst_iova, int iommu_prot, unsigned int flags) { struct iopt_pages_list *elm; unsigned long start; unsigned long iova; int rc = 0; list_for_each_entry(elm, pages_list, next) { elm->area = iopt_area_alloc(); if (!elm->area) return -ENOMEM; } down_write(&iopt->iova_rwsem); if ((length & (iopt->iova_alignment - 1)) || !length) { rc = -EINVAL; goto out_unlock; } if (flags & IOPT_ALLOC_IOVA) { /* Use the first entry to guess the ideal IOVA alignment */ elm = list_first_entry(pages_list, struct iopt_pages_list, next); switch (elm->pages->type) { case IOPT_ADDRESS_USER: start = elm->start_byte + (uintptr_t)elm->pages->uptr; break; case IOPT_ADDRESS_FILE: start = elm->start_byte + elm->pages->start; break; } rc = iopt_alloc_iova(iopt, dst_iova, start, length); if (rc) goto out_unlock; if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) { rc = -EINVAL; goto out_unlock; } } else { rc = iopt_check_iova(iopt, *dst_iova, length); if (rc) goto out_unlock; } /* * Areas are created with a NULL pages so that the IOVA space is * reserved and we can unlock the iova_rwsem. */ iova = *dst_iova; list_for_each_entry(elm, pages_list, next) { rc = iopt_insert_area(iopt, elm->area, elm->pages, iova, elm->start_byte, elm->length, iommu_prot); if (rc) goto out_unlock; iova += elm->length; } out_unlock: up_write(&iopt->iova_rwsem); return rc; } static void iopt_abort_area(struct iopt_area *area) { if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) WARN_ON(area->pages); if (area->iopt) { down_write(&area->iopt->iova_rwsem); interval_tree_remove(&area->node, &area->iopt->area_itree); up_write(&area->iopt->iova_rwsem); } kfree(area); } void iopt_free_pages_list(struct list_head *pages_list) { struct iopt_pages_list *elm; while ((elm = list_first_entry_or_null(pages_list, struct iopt_pages_list, next))) { if (elm->area) iopt_abort_area(elm->area); if (elm->pages) iopt_put_pages(elm->pages); list_del(&elm->next); kfree(elm); } } static int iopt_fill_domains_pages(struct list_head *pages_list) { struct iopt_pages_list *undo_elm; struct iopt_pages_list *elm; int rc; list_for_each_entry(elm, pages_list, next) { rc = iopt_area_fill_domains(elm->area, elm->pages); if (rc) goto err_undo; } return 0; err_undo: list_for_each_entry(undo_elm, pages_list, next) { if (undo_elm == elm) break; iopt_area_unfill_domains(undo_elm->area, undo_elm->pages); } return rc; } int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, unsigned long length, unsigned long *dst_iova, int iommu_prot, unsigned int flags) { struct iopt_pages_list *elm; int rc; rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova, iommu_prot, flags); if (rc) return rc; down_read(&iopt->domains_rwsem); rc = iopt_fill_domains_pages(pages_list); if (rc) goto out_unlock_domains; down_write(&iopt->iova_rwsem); list_for_each_entry(elm, pages_list, next) { /* * area->pages must be set inside the domains_rwsem to ensure * any newly added domains will get filled. Moves the reference * in from the list. 
*/ elm->area->pages = elm->pages; elm->pages = NULL; elm->area = NULL; } up_write(&iopt->iova_rwsem); out_unlock_domains: up_read(&iopt->domains_rwsem); return rc; } static int iopt_map_common(struct iommufd_ctx *ictx, struct io_pagetable *iopt, struct iopt_pages *pages, unsigned long *iova, unsigned long length, unsigned long start_byte, int iommu_prot, unsigned int flags) { struct iopt_pages_list elm = {}; LIST_HEAD(pages_list); int rc; elm.pages = pages; elm.start_byte = start_byte; if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM && elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER) elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM; elm.length = length; list_add(&elm.next, &pages_list); rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags); if (rc) { if (elm.area) iopt_abort_area(elm.area); if (elm.pages) iopt_put_pages(elm.pages); return rc; } return 0; } /** * iopt_map_user_pages() - Map a user VA to an iova in the io page table * @ictx: iommufd_ctx the iopt is part of * @iopt: io_pagetable to act on * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains * the chosen iova on output. Otherwise is the iova to map to on input * @uptr: User VA to map * @length: Number of bytes to map * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping * @flags: IOPT_ALLOC_IOVA or zero * * iova, uptr, and length must be aligned to iova_alignment. For domain backed * page tables this will pin the pages and load them into the domain at iova. * For non-domain page tables this will only setup a lazy reference and the * caller must use iopt_access_pages() to touch them. * * iopt_unmap_iova() must be called to undo this before the io_pagetable can be * destroyed. */ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, unsigned long *iova, void __user *uptr, unsigned long length, int iommu_prot, unsigned int flags) { struct iopt_pages *pages; pages = iopt_alloc_user_pages(uptr, length, iommu_prot & IOMMU_WRITE); if (IS_ERR(pages)) return PTR_ERR(pages); return iopt_map_common(ictx, iopt, pages, iova, length, uptr - pages->uptr, iommu_prot, flags); } /** * iopt_map_file_pages() - Like iopt_map_user_pages, but map a file. * @ictx: iommufd_ctx the iopt is part of * @iopt: io_pagetable to act on * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains * the chosen iova on output. 
Otherwise is the iova to map to on input * @file: file to map * @start: map file starting at this byte offset * @length: Number of bytes to map * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping * @flags: IOPT_ALLOC_IOVA or zero */ int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, unsigned long *iova, struct file *file, unsigned long start, unsigned long length, int iommu_prot, unsigned int flags) { struct iopt_pages *pages; pages = iopt_alloc_file_pages(file, start, length, iommu_prot & IOMMU_WRITE); if (IS_ERR(pages)) return PTR_ERR(pages); return iopt_map_common(ictx, iopt, pages, iova, length, start - pages->start, iommu_prot, flags); } struct iova_bitmap_fn_arg { unsigned long flags; struct io_pagetable *iopt; struct iommu_domain *domain; struct iommu_dirty_bitmap *dirty; }; static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap, unsigned long iova, size_t length, void *opaque) { struct iopt_area *area; struct iopt_area_contig_iter iter; struct iova_bitmap_fn_arg *arg = opaque; struct iommu_domain *domain = arg->domain; struct iommu_dirty_bitmap *dirty = arg->dirty; const struct iommu_dirty_ops *ops = domain->dirty_ops; unsigned long last_iova = iova + length - 1; unsigned long flags = arg->flags; int ret; iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) { unsigned long last = min(last_iova, iopt_area_last_iova(area)); ret = ops->read_and_clear_dirty(domain, iter.cur_iova, last - iter.cur_iova + 1, flags, dirty); if (ret) return ret; } if (!iopt_area_contig_done(&iter)) return -EINVAL; return 0; } static int iommu_read_and_clear_dirty(struct iommu_domain *domain, struct io_pagetable *iopt, unsigned long flags, struct iommu_hwpt_get_dirty_bitmap *bitmap) { const struct iommu_dirty_ops *ops = domain->dirty_ops; struct iommu_iotlb_gather gather; struct iommu_dirty_bitmap dirty; struct iova_bitmap_fn_arg arg; struct iova_bitmap *iter; int ret = 0; if (!ops || !ops->read_and_clear_dirty) return -EOPNOTSUPP; iter = iova_bitmap_alloc(bitmap->iova, bitmap->length, bitmap->page_size, u64_to_user_ptr(bitmap->data)); if (IS_ERR(iter)) return -ENOMEM; iommu_dirty_bitmap_init(&dirty, iter, &gather); arg.flags = flags; arg.iopt = iopt; arg.domain = domain; arg.dirty = &dirty; iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty); if (!(flags & IOMMU_DIRTY_NO_CLEAR)) iommu_iotlb_sync(domain, &gather); iova_bitmap_free(iter); return ret; } int iommufd_check_iova_range(struct io_pagetable *iopt, struct iommu_hwpt_get_dirty_bitmap *bitmap) { size_t iommu_pgsize = iopt->iova_alignment; u64 last_iova; if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova)) return -EOVERFLOW; if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX) return -EOVERFLOW; if ((bitmap->iova & (iommu_pgsize - 1)) || ((last_iova + 1) & (iommu_pgsize - 1))) return -EINVAL; if (!bitmap->page_size) return -EINVAL; if ((bitmap->iova & (bitmap->page_size - 1)) || ((last_iova + 1) & (bitmap->page_size - 1))) return -EINVAL; return 0; } int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt, struct iommu_domain *domain, unsigned long flags, struct iommu_hwpt_get_dirty_bitmap *bitmap) { int ret; ret = iommufd_check_iova_range(iopt, bitmap); if (ret) return ret; down_read(&iopt->iova_rwsem); ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap); up_read(&iopt->iova_rwsem); return ret; } static int iopt_clear_dirty_data(struct io_pagetable *iopt, struct iommu_domain *domain) { const struct iommu_dirty_ops *ops = 
domain->dirty_ops; struct iommu_iotlb_gather gather; struct iommu_dirty_bitmap dirty; struct iopt_area *area; int ret = 0; lockdep_assert_held_read(&iopt->iova_rwsem); iommu_dirty_bitmap_init(&dirty, NULL, &gather); for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; area = iopt_area_iter_next(area, 0, ULONG_MAX)) { if (!area->pages) continue; ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area), iopt_area_length(area), 0, &dirty); if (ret) break; } iommu_iotlb_sync(domain, &gather); return ret; } int iopt_set_dirty_tracking(struct io_pagetable *iopt, struct iommu_domain *domain, bool enable) { const struct iommu_dirty_ops *ops = domain->dirty_ops; int ret = 0; if (!ops) return -EOPNOTSUPP; down_read(&iopt->iova_rwsem); /* Clear dirty bits from PTEs to ensure a clean snapshot */ if (enable) { ret = iopt_clear_dirty_data(iopt, domain); if (ret) goto out_unlock; } ret = ops->set_dirty_tracking(domain, enable); out_unlock: up_read(&iopt->iova_rwsem); return ret; } int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova, unsigned long length, struct list_head *pages_list) { struct iopt_area_contig_iter iter; unsigned long last_iova; struct iopt_area *area; int rc; if (!length) return -EINVAL; if (check_add_overflow(iova, length - 1, &last_iova)) return -EOVERFLOW; down_read(&iopt->iova_rwsem); iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) { struct iopt_pages_list *elm; unsigned long last = min(last_iova, iopt_area_last_iova(area)); elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT); if (!elm) { rc = -ENOMEM; goto err_free; } elm->start_byte = iopt_area_start_byte(area, iter.cur_iova); elm->pages = area->pages; elm->length = (last - iter.cur_iova) + 1; kref_get(&elm->pages->kref); list_add_tail(&elm->next, pages_list); } if (!iopt_area_contig_done(&iter)) { rc = -ENOENT; goto err_free; } up_read(&iopt->iova_rwsem); return 0; err_free: up_read(&iopt->iova_rwsem); iopt_free_pages_list(pages_list); return rc; } static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start, unsigned long last, unsigned long *unmapped) { struct iopt_area *area; unsigned long unmapped_bytes = 0; unsigned int tries = 0; int rc = -ENOENT; /* * The domains_rwsem must be held in read mode any time any area->pages * is NULL. This prevents domain attach/detatch from running * concurrently with cleaning up the area. */ again: down_read(&iopt->domains_rwsem); down_write(&iopt->iova_rwsem); while ((area = iopt_area_iter_first(iopt, start, last))) { unsigned long area_last = iopt_area_last_iova(area); unsigned long area_first = iopt_area_iova(area); struct iopt_pages *pages; /* Userspace should not race map/unmap's of the same area */ if (!area->pages) { rc = -EBUSY; goto out_unlock_iova; } if (area_first < start || area_last > last) { rc = -ENOENT; goto out_unlock_iova; } if (area_first != start) tries = 0; /* * num_accesses writers must hold the iova_rwsem too, so we can * safely read it under the write side of the iovam_rwsem * without the pages->mutex. */ if (area->num_accesses) { size_t length = iopt_area_length(area); start = area_first; area->prevent_access = true; up_write(&iopt->iova_rwsem); up_read(&iopt->domains_rwsem); iommufd_access_notify_unmap(iopt, area_first, length); /* Something is not responding to unmap requests. 
*/ tries++; if (WARN_ON(tries > 100)) return -EDEADLOCK; goto again; } pages = area->pages; area->pages = NULL; up_write(&iopt->iova_rwsem); iopt_area_unfill_domains(area, pages); iopt_abort_area(area); iopt_put_pages(pages); unmapped_bytes += area_last - area_first + 1; down_write(&iopt->iova_rwsem); } if (unmapped_bytes) rc = 0; out_unlock_iova: up_write(&iopt->iova_rwsem); up_read(&iopt->domains_rwsem); if (unmapped) *unmapped = unmapped_bytes; return rc; } /** * iopt_unmap_iova() - Remove a range of iova * @iopt: io_pagetable to act on * @iova: Starting iova to unmap * @length: Number of bytes to unmap * @unmapped: Return number of bytes unmapped * * The requested range must be a superset of existing ranges. * Splitting/truncating IOVA mappings is not allowed. */ int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, unsigned long length, unsigned long *unmapped) { unsigned long iova_last; if (!length) return -EINVAL; if (check_add_overflow(iova, length - 1, &iova_last)) return -EOVERFLOW; return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped); } int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped) { int rc; rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped); /* If the IOVAs are empty then unmap all succeeds */ if (rc == -ENOENT) return 0; return rc; } /* The caller must always free all the nodes in the allowed_iova rb_root. */ int iopt_set_allow_iova(struct io_pagetable *iopt, struct rb_root_cached *allowed_iova) { struct iopt_allowed *allowed; down_write(&iopt->iova_rwsem); swap(*allowed_iova, iopt->allowed_itree); for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed; allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) { if (iopt_reserved_iter_first(iopt, allowed->node.start, allowed->node.last)) { swap(*allowed_iova, iopt->allowed_itree); up_write(&iopt->iova_rwsem); return -EADDRINUSE; } } up_write(&iopt->iova_rwsem); return 0; } int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start, unsigned long last, void *owner) { struct iopt_reserved *reserved; lockdep_assert_held_write(&iopt->iova_rwsem); if (iopt_area_iter_first(iopt, start, last) || iopt_allowed_iter_first(iopt, start, last)) return -EADDRINUSE; reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT); if (!reserved) return -ENOMEM; reserved->node.start = start; reserved->node.last = last; reserved->owner = owner; interval_tree_insert(&reserved->node, &iopt->reserved_itree); return 0; } static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner) { struct iopt_reserved *reserved, *next; lockdep_assert_held_write(&iopt->iova_rwsem); for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved; reserved = next) { next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX); if (reserved->owner == owner) { interval_tree_remove(&reserved->node, &iopt->reserved_itree); kfree(reserved); } } } void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner) { down_write(&iopt->iova_rwsem); __iopt_remove_reserved_iova(iopt, owner); up_write(&iopt->iova_rwsem); } void iopt_init_table(struct io_pagetable *iopt) { init_rwsem(&iopt->iova_rwsem); init_rwsem(&iopt->domains_rwsem); iopt->area_itree = RB_ROOT_CACHED; iopt->allowed_itree = RB_ROOT_CACHED; iopt->reserved_itree = RB_ROOT_CACHED; xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT); xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC); /* * iopt's start as SW tables that can use the entire size_t IOVA space * due to the use of size_t in the APIs. 
They have no alignment * restriction. */ iopt->iova_alignment = 1; } void iopt_destroy_table(struct io_pagetable *iopt) { struct interval_tree_node *node; if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) iopt_remove_reserved_iova(iopt, NULL); while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0, ULONG_MAX))) { interval_tree_remove(node, &iopt->allowed_itree); kfree(container_of(node, struct iopt_allowed, node)); } WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root)); WARN_ON(!xa_empty(&iopt->domains)); WARN_ON(!xa_empty(&iopt->access_list)); WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root)); } /** * iopt_unfill_domain() - Unfill a domain with PFNs * @iopt: io_pagetable to act on * @domain: domain to unfill * * This is used when removing a domain from the iopt. Every area in the iopt * will be unmapped from the domain. The domain must already be removed from the * domains xarray. */ static void iopt_unfill_domain(struct io_pagetable *iopt, struct iommu_domain *domain) { struct iopt_area *area; lockdep_assert_held(&iopt->iova_rwsem); lockdep_assert_held_write(&iopt->domains_rwsem); /* * Some other domain is holding all the pfns still, rapidly unmap this * domain. */ if (iopt->next_domain_id != 0) { /* Pick an arbitrary remaining domain to act as storage */ struct iommu_domain *storage_domain = xa_load(&iopt->domains, 0); for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; area = iopt_area_iter_next(area, 0, ULONG_MAX)) { struct iopt_pages *pages = area->pages; if (!pages) continue; mutex_lock(&pages->mutex); if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) WARN_ON(!area->storage_domain); if (area->storage_domain == domain) area->storage_domain = storage_domain; mutex_unlock(&pages->mutex); iopt_area_unmap_domain(area, domain); } return; } for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; area = iopt_area_iter_next(area, 0, ULONG_MAX)) { struct iopt_pages *pages = area->pages; if (!pages) continue; mutex_lock(&pages->mutex); interval_tree_remove(&area->pages_node, &pages->domains_itree); WARN_ON(area->storage_domain != domain); area->storage_domain = NULL; iopt_area_unfill_domain(area, pages, domain); mutex_unlock(&pages->mutex); } } /** * iopt_fill_domain() - Fill a domain with PFNs * @iopt: io_pagetable to act on * @domain: domain to fill * * Fill the domain with PFNs from every area in the iopt. On failure the domain * is left unchanged. 
*/ static int iopt_fill_domain(struct io_pagetable *iopt, struct iommu_domain *domain) { struct iopt_area *end_area; struct iopt_area *area; int rc; lockdep_assert_held(&iopt->iova_rwsem); lockdep_assert_held_write(&iopt->domains_rwsem); for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; area = iopt_area_iter_next(area, 0, ULONG_MAX)) { struct iopt_pages *pages = area->pages; if (!pages) continue; mutex_lock(&pages->mutex); rc = iopt_area_fill_domain(area, domain); if (rc) { mutex_unlock(&pages->mutex); goto out_unfill; } if (!area->storage_domain) { WARN_ON(iopt->next_domain_id != 0); area->storage_domain = domain; interval_tree_insert(&area->pages_node, &pages->domains_itree); } mutex_unlock(&pages->mutex); } return 0; out_unfill: end_area = area; for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; area = iopt_area_iter_next(area, 0, ULONG_MAX)) { struct iopt_pages *pages = area->pages; if (area == end_area) break; if (!pages) continue; mutex_lock(&pages->mutex); if (iopt->next_domain_id == 0) { interval_tree_remove(&area->pages_node, &pages->domains_itree); area->storage_domain = NULL; } iopt_area_unfill_domain(area, pages, domain); mutex_unlock(&pages->mutex); } return rc; } /* All existing area's conform to an increased page size */ static int iopt_check_iova_alignment(struct io_pagetable *iopt, unsigned long new_iova_alignment) { unsigned long align_mask = new_iova_alignment - 1; struct iopt_area *area; lockdep_assert_held(&iopt->iova_rwsem); lockdep_assert_held(&iopt->domains_rwsem); for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; area = iopt_area_iter_next(area, 0, ULONG_MAX)) if ((iopt_area_iova(area) & align_mask) || (iopt_area_length(area) & align_mask) || (area->page_offset & align_mask)) return -EADDRINUSE; if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) { struct iommufd_access *access; unsigned long index; xa_for_each(&iopt->access_list, index, access) if (WARN_ON(access->iova_alignment > new_iova_alignment)) return -EADDRINUSE; } return 0; } int iopt_table_add_domain(struct io_pagetable *iopt, struct iommu_domain *domain) { const struct iommu_domain_geometry *geometry = &domain->geometry; struct iommu_domain *iter_domain; unsigned int new_iova_alignment; unsigned long index; int rc; down_write(&iopt->domains_rwsem); down_write(&iopt->iova_rwsem); xa_for_each(&iopt->domains, index, iter_domain) { if (WARN_ON(iter_domain == domain)) { rc = -EEXIST; goto out_unlock; } } /* * The io page size drives the iova_alignment. Internally the iopt_pages * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE * objects into the iommu_domain. * * A iommu_domain must always be able to accept PAGE_SIZE to be * compatible as we can't guarantee higher contiguity. 
*/ new_iova_alignment = max_t(unsigned long, 1UL << __ffs(domain->pgsize_bitmap), iopt->iova_alignment); if (new_iova_alignment > PAGE_SIZE) { rc = -EINVAL; goto out_unlock; } if (new_iova_alignment != iopt->iova_alignment) { rc = iopt_check_iova_alignment(iopt, new_iova_alignment); if (rc) goto out_unlock; } /* No area exists that is outside the allowed domain aperture */ if (geometry->aperture_start != 0) { rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1, domain); if (rc) goto out_reserved; } if (geometry->aperture_end != ULONG_MAX) { rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1, ULONG_MAX, domain); if (rc) goto out_reserved; } rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL); if (rc) goto out_reserved; rc = iopt_fill_domain(iopt, domain); if (rc) goto out_release; iopt->iova_alignment = new_iova_alignment; xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL); iopt->next_domain_id++; up_write(&iopt->iova_rwsem); up_write(&iopt->domains_rwsem); return 0; out_release: xa_release(&iopt->domains, iopt->next_domain_id); out_reserved: __iopt_remove_reserved_iova(iopt, domain); out_unlock: up_write(&iopt->iova_rwsem); up_write(&iopt->domains_rwsem); return rc; } static int iopt_calculate_iova_alignment(struct io_pagetable *iopt) { unsigned long new_iova_alignment; struct iommufd_access *access; struct iommu_domain *domain; unsigned long index; lockdep_assert_held_write(&iopt->iova_rwsem); lockdep_assert_held(&iopt->domains_rwsem); /* See batch_iommu_map_small() */ if (iopt->disable_large_pages) new_iova_alignment = PAGE_SIZE; else new_iova_alignment = 1; xa_for_each(&iopt->domains, index, domain) new_iova_alignment = max_t(unsigned long, 1UL << __ffs(domain->pgsize_bitmap), new_iova_alignment); xa_for_each(&iopt->access_list, index, access) new_iova_alignment = max_t(unsigned long, access->iova_alignment, new_iova_alignment); if (new_iova_alignment > iopt->iova_alignment) { int rc; rc = iopt_check_iova_alignment(iopt, new_iova_alignment); if (rc) return rc; } iopt->iova_alignment = new_iova_alignment; return 0; } void iopt_table_remove_domain(struct io_pagetable *iopt, struct iommu_domain *domain) { struct iommu_domain *iter_domain = NULL; unsigned long index; down_write(&iopt->domains_rwsem); down_write(&iopt->iova_rwsem); xa_for_each(&iopt->domains, index, iter_domain) if (iter_domain == domain) break; if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id) goto out_unlock; /* * Compress the xarray to keep it linear by swapping the entry to erase * with the tail entry and shrinking the tail. */ iopt->next_domain_id--; iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id); if (index != iopt->next_domain_id) xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL); iopt_unfill_domain(iopt, domain); __iopt_remove_reserved_iova(iopt, domain); WARN_ON(iopt_calculate_iova_alignment(iopt)); out_unlock: up_write(&iopt->iova_rwsem); up_write(&iopt->domains_rwsem); } /** * iopt_area_split - Split an area into two parts at iova * @area: The area to split * @iova: Becomes the last of a new area * * This splits an area into two. It is part of the VFIO compatibility to allow * poking a hole in the mapping. The two areas continue to point at the same * iopt_pages, just with different starting bytes. 
*/ static int iopt_area_split(struct iopt_area *area, unsigned long iova) { unsigned long alignment = area->iopt->iova_alignment; unsigned long last_iova = iopt_area_last_iova(area); unsigned long start_iova = iopt_area_iova(area); unsigned long new_start = iova + 1; struct io_pagetable *iopt = area->iopt; struct iopt_pages *pages = area->pages; struct iopt_area *lhs; struct iopt_area *rhs; int rc; lockdep_assert_held_write(&iopt->iova_rwsem); if (iova == start_iova || iova == last_iova) return 0; if (!pages || area->prevent_access) return -EBUSY; if (new_start & (alignment - 1) || iopt_area_start_byte(area, new_start) & (alignment - 1)) return -EINVAL; lhs = iopt_area_alloc(); if (!lhs) return -ENOMEM; rhs = iopt_area_alloc(); if (!rhs) { rc = -ENOMEM; goto err_free_lhs; } mutex_lock(&pages->mutex); /* * Splitting is not permitted if an access exists, we don't track enough * information to split existing accesses. */ if (area->num_accesses) { rc = -EINVAL; goto err_unlock; } /* * Splitting is not permitted if a domain could have been mapped with * huge pages. */ if (area->storage_domain && !iopt->disable_large_pages) { rc = -EINVAL; goto err_unlock; } interval_tree_remove(&area->node, &iopt->area_itree); rc = iopt_insert_area(iopt, lhs, area->pages, start_iova, iopt_area_start_byte(area, start_iova), (new_start - 1) - start_iova + 1, area->iommu_prot); if (WARN_ON(rc)) goto err_insert; rc = iopt_insert_area(iopt, rhs, area->pages, new_start, iopt_area_start_byte(area, new_start), last_iova - new_start + 1, area->iommu_prot); if (WARN_ON(rc)) goto err_remove_lhs; /* * If the original area has filled a domain, domains_itree has to be * updated. */ if (area->storage_domain) { interval_tree_remove(&area->pages_node, &pages->domains_itree); interval_tree_insert(&lhs->pages_node, &pages->domains_itree); interval_tree_insert(&rhs->pages_node, &pages->domains_itree); } lhs->storage_domain = area->storage_domain; lhs->pages = area->pages; rhs->storage_domain = area->storage_domain; rhs->pages = area->pages; kref_get(&rhs->pages->kref); kfree(area); mutex_unlock(&pages->mutex); /* * No change to domains or accesses because the pages hasn't been * changed */ return 0; err_remove_lhs: interval_tree_remove(&lhs->node, &iopt->area_itree); err_insert: interval_tree_insert(&area->node, &iopt->area_itree); err_unlock: mutex_unlock(&pages->mutex); kfree(rhs); err_free_lhs: kfree(lhs); return rc; } int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas, size_t num_iovas) { int rc = 0; int i; down_write(&iopt->iova_rwsem); for (i = 0; i < num_iovas; i++) { struct iopt_area *area; area = iopt_area_iter_first(iopt, iovas[i], iovas[i]); if (!area) continue; rc = iopt_area_split(area, iovas[i]); if (rc) break; } up_write(&iopt->iova_rwsem); return rc; } void iopt_enable_large_pages(struct io_pagetable *iopt) { int rc; down_write(&iopt->domains_rwsem); down_write(&iopt->iova_rwsem); WRITE_ONCE(iopt->disable_large_pages, false); rc = iopt_calculate_iova_alignment(iopt); WARN_ON(rc); up_write(&iopt->iova_rwsem); up_write(&iopt->domains_rwsem); } int iopt_disable_large_pages(struct io_pagetable *iopt) { int rc = 0; down_write(&iopt->domains_rwsem); down_write(&iopt->iova_rwsem); if (iopt->disable_large_pages) goto out_unlock; /* Won't do it if domains already have pages mapped in them */ if (!xa_empty(&iopt->domains) && !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) { rc = -EINVAL; goto out_unlock; } WRITE_ONCE(iopt->disable_large_pages, true); rc = iopt_calculate_iova_alignment(iopt); if (rc) 
WRITE_ONCE(iopt->disable_large_pages, false); out_unlock: up_write(&iopt->iova_rwsem); up_write(&iopt->domains_rwsem); return rc; } int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access) { u32 new_id; int rc; down_write(&iopt->domains_rwsem); down_write(&iopt->iova_rwsem); rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b, GFP_KERNEL_ACCOUNT); if (rc) goto out_unlock; rc = iopt_calculate_iova_alignment(iopt); if (rc) { xa_erase(&iopt->access_list, new_id); goto out_unlock; } access->iopt_access_list_id = new_id; out_unlock: up_write(&iopt->iova_rwsem); up_write(&iopt->domains_rwsem); return rc; } void iopt_remove_access(struct io_pagetable *iopt, struct iommufd_access *access, u32 iopt_access_list_id) { down_write(&iopt->domains_rwsem); down_write(&iopt->iova_rwsem); WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access); WARN_ON(iopt_calculate_iova_alignment(iopt)); up_write(&iopt->iova_rwsem); up_write(&iopt->domains_rwsem); } /* Narrow the valid_iova_itree to include reserved ranges from a device. */ int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt, struct device *dev, phys_addr_t *sw_msi_start) { struct iommu_resv_region *resv; LIST_HEAD(resv_regions); unsigned int num_hw_msi = 0; unsigned int num_sw_msi = 0; int rc; if (iommufd_should_fail()) return -EINVAL; down_write(&iopt->iova_rwsem); /* FIXME: drivers allocate memory but there is no failure propogated */ iommu_get_resv_regions(dev, &resv_regions); list_for_each_entry(resv, &resv_regions, list) { if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE) continue; if (sw_msi_start && resv->type == IOMMU_RESV_MSI) num_hw_msi++; if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) { *sw_msi_start = resv->start; num_sw_msi++; } rc = iopt_reserve_iova(iopt, resv->start, resv->length - 1 + resv->start, dev); if (rc) goto out_reserved; } /* Drivers must offer sane combinations of regions */ if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) { rc = -EINVAL; goto out_reserved; } rc = 0; goto out_free_resv; out_reserved: __iopt_remove_reserved_iova(iopt, dev); out_free_resv: iommu_put_resv_regions(dev, &resv_regions); up_write(&iopt->iova_rwsem); return rc; }
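The alignment handling above (iopt_table_add_domain() and iopt_calculate_iova_alignment()) derives the required IOVA granule from the lowest set bit of each iommu_domain's pgsize_bitmap and then rejects any existing area whose iova, length or page_offset is not a multiple of it. Below is a minimal user-space sketch of that calculation, with a made-up pgsize_bitmap value standing in for a real domain; it is illustrative only, not the kernel API.

#include <stdio.h>

/*
 * The lowest set bit of a domain's pgsize_bitmap is the smallest IO page
 * size it supports; iopt_calculate_iova_alignment() folds that into
 * iova_alignment with max_t(..., 1UL << __ffs(domain->pgsize_bitmap), ...).
 */
static unsigned long smallest_pgsize(unsigned long pgsize_bitmap)
{
	return pgsize_bitmap & -pgsize_bitmap;	/* isolate the lowest set bit */
}

/*
 * Same shape as the per-area test in iopt_check_iova_alignment(): iova,
 * length and the offset into the first page must all be multiples of the
 * (possibly increased) alignment.
 */
static int mapping_is_aligned(unsigned long iova, unsigned long length,
			      unsigned long page_offset,
			      unsigned long alignment)
{
	unsigned long mask = alignment - 1;

	return !((iova & mask) || (length & mask) || (page_offset & mask));
}

int main(void)
{
	/* Hypothetical bitmap advertising 4K, 2M and 1G page sizes. */
	unsigned long pgsize_bitmap = 0x40201000UL;
	unsigned long align = smallest_pgsize(pgsize_bitmap);

	printf("alignment %#lx, aligned=%d\n", align,
	       mapping_is_aligned(0x100000, 0x3000, 0, align));
	return 0;
}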
// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/fs/pnode.c
 *
 * (C) Copyright IBM Corporation 2005.
* Author : Ram Pai (linuxram@us.ibm.com) */ #include <linux/mnt_namespace.h> #include <linux/mount.h> #include <linux/fs.h> #include <linux/nsproxy.h> #include <uapi/linux/mount.h> #include "internal.h" #include "pnode.h" /* return the next shared peer mount of @p */ static inline struct mount *next_peer(struct mount *p) { return list_entry(p->mnt_share.next, struct mount, mnt_share); } static inline struct mount *first_slave(struct mount *p) { return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave); } static inline struct mount *last_slave(struct mount *p) { return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave); } static inline struct mount *next_slave(struct mount *p) { return list_entry(p->mnt_slave.next, struct mount, mnt_slave); } static struct mount *get_peer_under_root(struct mount *mnt, struct mnt_namespace *ns, const struct path *root) { struct mount *m = mnt; do { /* Check the namespace first for optimization */ if (m->mnt_ns == ns && is_path_reachable(m, m->mnt.mnt_root, root)) return m; m = next_peer(m); } while (m != mnt); return NULL; } /* * Get ID of closest dominating peer group having a representative * under the given root. * * Caller must hold namespace_sem */ int get_dominating_id(struct mount *mnt, const struct path *root) { struct mount *m; for (m = mnt->mnt_master; m != NULL; m = m->mnt_master) { struct mount *d = get_peer_under_root(m, mnt->mnt_ns, root); if (d) return d->mnt_group_id; } return 0; } static int do_make_slave(struct mount *mnt) { struct mount *master, *slave_mnt; if (list_empty(&mnt->mnt_share)) { if (IS_MNT_SHARED(mnt)) { mnt_release_group_id(mnt); CLEAR_MNT_SHARED(mnt); } master = mnt->mnt_master; if (!master) { struct list_head *p = &mnt->mnt_slave_list; while (!list_empty(p)) { slave_mnt = list_first_entry(p, struct mount, mnt_slave); list_del_init(&slave_mnt->mnt_slave); slave_mnt->mnt_master = NULL; } return 0; } } else { struct mount *m; /* * slave 'mnt' to a peer mount that has the * same root dentry. If none is available then * slave it to anything that is available. */ for (m = master = next_peer(mnt); m != mnt; m = next_peer(m)) { if (m->mnt.mnt_root == mnt->mnt.mnt_root) { master = m; break; } } list_del_init(&mnt->mnt_share); mnt->mnt_group_id = 0; CLEAR_MNT_SHARED(mnt); } list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave) slave_mnt->mnt_master = master; list_move(&mnt->mnt_slave, &master->mnt_slave_list); list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev); INIT_LIST_HEAD(&mnt->mnt_slave_list); mnt->mnt_master = master; return 0; } /* * vfsmount lock must be held for write */ void change_mnt_propagation(struct mount *mnt, int type) { if (type == MS_SHARED) { set_mnt_shared(mnt); return; } do_make_slave(mnt); if (type != MS_SLAVE) { list_del_init(&mnt->mnt_slave); mnt->mnt_master = NULL; if (type == MS_UNBINDABLE) mnt->mnt.mnt_flags |= MNT_UNBINDABLE; else mnt->mnt.mnt_flags &= ~MNT_UNBINDABLE; } } /* * get the next mount in the propagation tree. * @m: the mount seen last * @origin: the original mount from where the tree walk initiated * * Note that peer groups form contiguous segments of slave lists. * We rely on that in get_source() to be able to find out if * vfsmount found while iterating with propagation_next() is * a peer of one we'd found earlier. */ static struct mount *propagation_next(struct mount *m, struct mount *origin) { /* are there any slaves of this mount? 
*/ if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) return first_slave(m); while (1) { struct mount *master = m->mnt_master; if (master == origin->mnt_master) { struct mount *next = next_peer(m); return (next == origin) ? NULL : next; } else if (m->mnt_slave.next != &master->mnt_slave_list) return next_slave(m); /* back at master */ m = master; } } static struct mount *skip_propagation_subtree(struct mount *m, struct mount *origin) { /* * Advance m such that propagation_next will not return * the slaves of m. */ if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) m = last_slave(m); return m; } static struct mount *next_group(struct mount *m, struct mount *origin) { while (1) { while (1) { struct mount *next; if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) return first_slave(m); next = next_peer(m); if (m->mnt_group_id == origin->mnt_group_id) { if (next == origin) return NULL; } else if (m->mnt_slave.next != &next->mnt_slave) break; m = next; } /* m is the last peer */ while (1) { struct mount *master = m->mnt_master; if (m->mnt_slave.next != &master->mnt_slave_list) return next_slave(m); m = next_peer(master); if (master->mnt_group_id == origin->mnt_group_id) break; if (master->mnt_slave.next == &m->mnt_slave) break; m = master; } if (m == origin) return NULL; } } /* all accesses are serialized by namespace_sem */ static struct mount *last_dest, *first_source, *last_source, *dest_master; static struct hlist_head *list; static inline bool peers(const struct mount *m1, const struct mount *m2) { return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id; } static int propagate_one(struct mount *m, struct mountpoint *dest_mp) { struct mount *child; int type; /* skip ones added by this propagate_mnt() */ if (IS_MNT_NEW(m)) return 0; /* skip if mountpoint isn't covered by it */ if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) return 0; if (peers(m, last_dest)) { type = CL_MAKE_SHARED; } else { struct mount *n, *p; bool done; for (n = m; ; n = p) { p = n->mnt_master; if (p == dest_master || IS_MNT_MARKED(p)) break; } do { struct mount *parent = last_source->mnt_parent; if (peers(last_source, first_source)) break; done = parent->mnt_master == p; if (done && peers(n, parent)) break; last_source = last_source->mnt_master; } while (!done); type = CL_SLAVE; /* beginning of peer group among the slaves? */ if (IS_MNT_SHARED(m)) type |= CL_MAKE_SHARED; } child = copy_tree(last_source, last_source->mnt.mnt_root, type); if (IS_ERR(child)) return PTR_ERR(child); read_seqlock_excl(&mount_lock); mnt_set_mountpoint(m, dest_mp, child); if (m->mnt_master != dest_master) SET_MNT_MARK(m->mnt_master); read_sequnlock_excl(&mount_lock); last_dest = m; last_source = child; hlist_add_head(&child->mnt_hash, list); return count_mounts(m->mnt_ns, child); } /* * mount 'source_mnt' under the destination 'dest_mnt' at * dentry 'dest_dentry'. And propagate that mount to * all the peer and slave mounts of 'dest_mnt'. * Link all the new mounts into a propagation tree headed at * source_mnt. Also link all the new mounts using ->mnt_list * headed at source_mnt's ->mnt_list * * @dest_mnt: destination mount. * @dest_dentry: destination dentry. * @source_mnt: source mount. * @tree_list : list of heads of trees to be attached. 
*/ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, struct mount *source_mnt, struct hlist_head *tree_list) { struct mount *m, *n; int ret = 0; /* * we don't want to bother passing tons of arguments to * propagate_one(); everything is serialized by namespace_sem, * so globals will do just fine. */ last_dest = dest_mnt; first_source = source_mnt; last_source = source_mnt; list = tree_list; dest_master = dest_mnt->mnt_master; /* all peers of dest_mnt, except dest_mnt itself */ for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) { ret = propagate_one(n, dest_mp); if (ret) goto out; } /* all slave groups */ for (m = next_group(dest_mnt, dest_mnt); m; m = next_group(m, dest_mnt)) { /* everything in that slave group */ n = m; do { ret = propagate_one(n, dest_mp); if (ret) goto out; n = next_peer(n); } while (n != m); } out: read_seqlock_excl(&mount_lock); hlist_for_each_entry(n, tree_list, mnt_hash) { m = n->mnt_parent; if (m->mnt_master != dest_mnt->mnt_master) CLEAR_MNT_MARK(m->mnt_master); } read_sequnlock_excl(&mount_lock); return ret; } static struct mount *find_topper(struct mount *mnt) { /* If there is exactly one mount covering mnt completely return it. */ struct mount *child; if (!list_is_singular(&mnt->mnt_mounts)) return NULL; child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child); if (child->mnt_mountpoint != mnt->mnt.mnt_root) return NULL; return child; } /* * return true if the refcount is greater than count */ static inline int do_refcount_check(struct mount *mnt, int count) { return mnt_get_count(mnt) > count; } /** * propagation_would_overmount - check whether propagation from @from * would overmount @to * @from: shared mount * @to: mount to check * @mp: future mountpoint of @to on @from * * If @from propagates mounts to @to, @from and @to must either be peers * or one of the masters in the hierarchy of masters of @to must be a * peer of @from. * * If the root of the @to mount is equal to the future mountpoint @mp of * the @to mount on @from then @to will be overmounted by whatever is * propagated to it. * * Context: This function expects namespace_lock() to be held and that * @mp is stable. * Return: If @from overmounts @to, true is returned, false if not. */ bool propagation_would_overmount(const struct mount *from, const struct mount *to, const struct mountpoint *mp) { if (!IS_MNT_SHARED(from)) return false; if (IS_MNT_NEW(to)) return false; if (to->mnt.mnt_root != mp->m_dentry) return false; for (const struct mount *m = to; m; m = m->mnt_master) { if (peers(from, m)) return true; } return false; } /* * check if the mount 'mnt' can be unmounted successfully. * @mnt: the mount to be checked for unmount * NOTE: unmounting 'mnt' would naturally propagate to all * other mounts its parent propagates to. * Check if any of these mounts that **do not have submounts** * have more references than 'refcnt'. If so return busy. * * vfsmount lock must be held for write */ int propagate_mount_busy(struct mount *mnt, int refcnt) { struct mount *m, *child, *topper; struct mount *parent = mnt->mnt_parent; if (mnt == parent) return do_refcount_check(mnt, refcnt); /* * quickly check if the current mount can be unmounted. 
* If not, we don't have to go checking for all other * mounts */ if (!list_empty(&mnt->mnt_mounts) || do_refcount_check(mnt, refcnt)) return 1; for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { int count = 1; child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); if (!child) continue; /* Is there exactly one mount on the child that covers * it completely whose reference should be ignored? */ topper = find_topper(child); if (topper) count += 1; else if (!list_empty(&child->mnt_mounts)) continue; if (do_refcount_check(child, count)) return 1; } return 0; } /* * Clear MNT_LOCKED when it can be shown to be safe. * * mount_lock lock must be held for write */ void propagate_mount_unlock(struct mount *mnt) { struct mount *parent = mnt->mnt_parent; struct mount *m, *child; BUG_ON(parent == mnt); for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); if (child) child->mnt.mnt_flags &= ~MNT_LOCKED; } } static void umount_one(struct mount *mnt, struct list_head *to_umount) { CLEAR_MNT_MARK(mnt); mnt->mnt.mnt_flags |= MNT_UMOUNT; list_del_init(&mnt->mnt_child); list_del_init(&mnt->mnt_umounting); move_from_ns(mnt, to_umount); } /* * NOTE: unmounting 'mnt' naturally propagates to all other mounts its * parent propagates to. */ static bool __propagate_umount(struct mount *mnt, struct list_head *to_umount, struct list_head *to_restore) { bool progress = false; struct mount *child; /* * The state of the parent won't change if this mount is * already unmounted or marked as without children. */ if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED)) goto out; /* Verify topper is the only grandchild that has not been * speculatively unmounted. */ list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { if (child->mnt_mountpoint == mnt->mnt.mnt_root) continue; if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child)) continue; /* Found a mounted child */ goto children; } /* Mark mounts that can be unmounted if not locked */ SET_MNT_MARK(mnt); progress = true; /* If a mount is without children and not locked umount it. */ if (!IS_MNT_LOCKED(mnt)) { umount_one(mnt, to_umount); } else { children: list_move_tail(&mnt->mnt_umounting, to_restore); } out: return progress; } static void umount_list(struct list_head *to_umount, struct list_head *to_restore) { struct mount *mnt, *child, *tmp; list_for_each_entry(mnt, to_umount, mnt_list) { list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) { /* topper? */ if (child->mnt_mountpoint == mnt->mnt.mnt_root) list_move_tail(&child->mnt_umounting, to_restore); else umount_one(child, to_umount); } } } static void restore_mounts(struct list_head *to_restore) { /* Restore mounts to a clean working state */ while (!list_empty(to_restore)) { struct mount *mnt, *parent; struct mountpoint *mp; mnt = list_first_entry(to_restore, struct mount, mnt_umounting); CLEAR_MNT_MARK(mnt); list_del_init(&mnt->mnt_umounting); /* Should this mount be reparented? 
*/ mp = mnt->mnt_mp; parent = mnt->mnt_parent; while (parent->mnt.mnt_flags & MNT_UMOUNT) { mp = parent->mnt_mp; parent = parent->mnt_parent; } if (parent != mnt->mnt_parent) mnt_change_mountpoint(parent, mp, mnt); } } static void cleanup_umount_visitations(struct list_head *visited) { while (!list_empty(visited)) { struct mount *mnt = list_first_entry(visited, struct mount, mnt_umounting); list_del_init(&mnt->mnt_umounting); } } /* * collect all mounts that receive propagation from the mount in @list, * and return these additional mounts in the same list. * @list: the list of mounts to be unmounted. * * vfsmount lock must be held for write */ int propagate_umount(struct list_head *list) { struct mount *mnt; LIST_HEAD(to_restore); LIST_HEAD(to_umount); LIST_HEAD(visited); /* Find candidates for unmounting */ list_for_each_entry_reverse(mnt, list, mnt_list) { struct mount *parent = mnt->mnt_parent; struct mount *m; /* * If this mount has already been visited it is known that it's * entire peer group and all of their slaves in the propagation * tree for the mountpoint has already been visited and there is * no need to visit them again. */ if (!list_empty(&mnt->mnt_umounting)) continue; list_add_tail(&mnt->mnt_umounting, &visited); for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { struct mount *child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); if (!child) continue; if (!list_empty(&child->mnt_umounting)) { /* * If the child has already been visited it is * know that it's entire peer group and all of * their slaves in the propgation tree for the * mountpoint has already been visited and there * is no need to visit this subtree again. */ m = skip_propagation_subtree(m, parent); continue; } else if (child->mnt.mnt_flags & MNT_UMOUNT) { /* * We have come across a partially unmounted * mount in a list that has not been visited * yet. Remember it has been visited and * continue about our merry way. */ list_add_tail(&child->mnt_umounting, &visited); continue; } /* Check the child and parents while progress is made */ while (__propagate_umount(child, &to_umount, &to_restore)) { /* Is the parent a umount candidate? */ child = child->mnt_parent; if (list_empty(&child->mnt_umounting)) break; } } } umount_list(&to_umount, &to_restore); restore_mounts(&to_restore); cleanup_umount_visitations(&visited); list_splice_tail(&to_umount, list); return 0; }
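In pnode.c above, peers are mounts that sit on the same circular mnt_share list and carry the same non-zero mnt_group_id, which is exactly what peers() tests and what the propagate_mnt() loops walk. The sketch below models only that idea with a hypothetical struct mnt (a single next pointer instead of the kernel's list_head); it is not the kernel's struct mount.

#include <stdio.h>

/*
 * Hypothetical, simplified stand-in for struct mount: peers form a circular
 * ring (mnt_share in the kernel) and share a non-zero group id.
 */
struct mnt {
	int group_id;
	struct mnt *next_share;		/* circular ring of peers */
};

/* Same test as peers() in pnode.c: equal, non-zero group ids. */
static int peers(const struct mnt *a, const struct mnt *b)
{
	return a->group_id == b->group_id && a->group_id;
}

/*
 * Visit every peer of @origin except @origin itself, the same loop shape as
 * "for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n))" in
 * propagate_mnt().
 */
static void for_each_other_peer(const struct mnt *origin)
{
	for (const struct mnt *m = origin->next_share; m != origin;
	     m = m->next_share)
		printf("peer in group %d\n", m->group_id);
}

int main(void)
{
	struct mnt a = { .group_id = 7 };
	struct mnt b = { .group_id = 7 };
	struct mnt c = { .group_id = 7 };

	a.next_share = &b;
	b.next_share = &c;
	c.next_share = &a;

	printf("a and b are peers: %d\n", peers(&a, &b));
	for_each_other_peer(&a);
	return 0;
}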
// SPDX-License-Identifier: GPL-2.0
/*
 * Functions to sequence PREFLUSH and FUA writes.
 *
 * Copyright (C) 2011 Max Planck Institute for Gravitational Physics
 * Copyright (C) 2011 Tejun Heo <tj@kernel.org>
 *
 * REQ_{PREFLUSH|FUA} requests are decomposed into sequences consisting of
 * three optional steps - PREFLUSH, DATA and POSTFLUSH - according to the
 * request properties and hardware capability.
 *
 * If a request doesn't have data, only REQ_PREFLUSH makes sense, which
 * indicates a simple flush request. If there is data, REQ_PREFLUSH indicates
 * that the device cache should be flushed before the data is executed, and
 * REQ_FUA means that the data must be on non-volatile media on request
 * completion.
 *
 * If the device doesn't have a writeback cache, PREFLUSH and FUA don't make
 * any difference. The requests are either completed immediately if there's
 * no data or executed as normal requests otherwise.
 *
 * If the device has a writeback cache and supports FUA, REQ_PREFLUSH is
 * translated to PREFLUSH but REQ_FUA is passed down directly with DATA.
 *
 * If the device has a writeback cache and doesn't support FUA, REQ_PREFLUSH
 * is translated to PREFLUSH and REQ_FUA to POSTFLUSH.
 *
 * The actual execution of flush is double buffered. Whenever a request
 * needs to execute PRE or POSTFLUSH, it queues at
 * fq->flush_queue[fq->flush_pending_idx]. Once certain criteria are met, a
 * REQ_OP_FLUSH is issued and the pending_idx is toggled.
When the flush * completes, all the requests which were pending are proceeded to the next * step. This allows arbitrary merging of different types of PREFLUSH/FUA * requests. * * Currently, the following conditions are used to determine when to issue * flush. * * C1. At any given time, only one flush shall be in progress. This makes * double buffering sufficient. * * C2. Flush is deferred if any request is executing DATA of its sequence. * This avoids issuing separate POSTFLUSHes for requests which shared * PREFLUSH. * * C3. The second condition is ignored if there is a request which has * waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid * starvation in the unlikely case where there are continuous stream of * FUA (without PREFLUSH) requests. * * For devices which support FUA, it isn't clear whether C2 (and thus C3) * is beneficial. * * Note that a sequenced PREFLUSH/FUA request with DATA is completed twice. * Once while executing DATA and again after the whole sequence is * complete. The first completion updates the contained bio but doesn't * finish it so that the bio submitter is notified only after the whole * sequence is complete. This is implemented by testing RQF_FLUSH_SEQ in * req_bio_endio(). * * The above peculiarity requires that each PREFLUSH/FUA request has only one * bio attached to it, which is guaranteed as they aren't allowed to be * merged in the usual way. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/gfp.h> #include <linux/part_stat.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-sched.h" /* PREFLUSH/FUA sequences */ enum { REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */ REQ_FSEQ_DATA = (1 << 1), /* data write in progress */ REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */ REQ_FSEQ_DONE = (1 << 3), REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH, /* * If flush has been pending longer than the following timeout, * it's issued even if flush_data requests are still in flight. */ FLUSH_PENDING_TIMEOUT = 5 * HZ, }; static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, blk_opf_t flags); static inline struct blk_flush_queue * blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) { return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq; } static unsigned int blk_flush_cur_seq(struct request *rq) { return 1 << ffz(rq->flush.seq); } static void blk_flush_restore_request(struct request *rq) { /* * After flush data completion, @rq->bio is %NULL but we need to * complete the bio again. @rq->biotail is guaranteed to equal the * original @rq->bio. Restore it. */ rq->bio = rq->biotail; if (rq->bio) rq->__sector = rq->bio->bi_iter.bi_sector; /* make @rq a normal request */ rq->rq_flags &= ~RQF_FLUSH_SEQ; rq->end_io = rq->flush.saved_end_io; } static void blk_account_io_flush(struct request *rq) { struct block_device *part = rq->q->disk->part0; part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); part_stat_add(part, nsecs[STAT_FLUSH], blk_time_get_ns() - rq->start_time_ns); part_stat_unlock(); } /** * blk_flush_complete_seq - complete flush sequence * @rq: PREFLUSH/FUA request being sequenced * @fq: flush queue * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero) * @error: whether an error occurred * * @rq just completed @seq part of its flush sequence, record the * completion and trigger the next step. 
* * CONTEXT: * spin_lock_irq(fq->mq_flush_lock) */ static void blk_flush_complete_seq(struct request *rq, struct blk_flush_queue *fq, unsigned int seq, blk_status_t error) { struct request_queue *q = rq->q; struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; blk_opf_t cmd_flags; BUG_ON(rq->flush.seq & seq); rq->flush.seq |= seq; cmd_flags = rq->cmd_flags; if (likely(!error)) seq = blk_flush_cur_seq(rq); else seq = REQ_FSEQ_DONE; switch (seq) { case REQ_FSEQ_PREFLUSH: case REQ_FSEQ_POSTFLUSH: /* queue for flush */ if (list_empty(pending)) fq->flush_pending_since = jiffies; list_add_tail(&rq->queuelist, pending); break; case REQ_FSEQ_DATA: fq->flush_data_in_flight++; spin_lock(&q->requeue_lock); list_move(&rq->queuelist, &q->requeue_list); spin_unlock(&q->requeue_lock); blk_mq_kick_requeue_list(q); break; case REQ_FSEQ_DONE: /* * @rq was previously adjusted by blk_insert_flush() for * flush sequencing and may already have gone through the * flush data request completion path. Restore @rq for * normal completion and end it. */ list_del_init(&rq->queuelist); blk_flush_restore_request(rq); blk_mq_end_request(rq, error); break; default: BUG(); } blk_kick_flush(q, fq, cmd_flags); } static enum rq_end_io_ret flush_end_io(struct request *flush_rq, blk_status_t error) { struct request_queue *q = flush_rq->q; struct list_head *running; struct request *rq, *n; unsigned long flags = 0; struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); if (!req_ref_put_and_test(flush_rq)) { fq->rq_status = error; spin_unlock_irqrestore(&fq->mq_flush_lock, flags); return RQ_END_IO_NONE; } blk_account_io_flush(flush_rq); /* * Flush request has to be marked as IDLE when it is really ended * because its .end_io() is called from timeout code path too for * avoiding use-after-free. */ WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE); if (fq->rq_status != BLK_STS_OK) { error = fq->rq_status; fq->rq_status = BLK_STS_OK; } if (!q->elevator) { flush_rq->tag = BLK_MQ_NO_TAG; } else { blk_mq_put_driver_tag(flush_rq); flush_rq->internal_tag = BLK_MQ_NO_TAG; } running = &fq->flush_queue[fq->flush_running_idx]; BUG_ON(fq->flush_pending_idx == fq->flush_running_idx); /* account completion of the flush request */ fq->flush_running_idx ^= 1; /* and push the waiting requests to the next stage */ list_for_each_entry_safe(rq, n, running, queuelist) { unsigned int seq = blk_flush_cur_seq(rq); BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); list_del_init(&rq->queuelist); blk_flush_complete_seq(rq, fq, seq, error); } spin_unlock_irqrestore(&fq->mq_flush_lock, flags); return RQ_END_IO_NONE; } bool is_flush_rq(struct request *rq) { return rq->end_io == flush_end_io; } /** * blk_kick_flush - consider issuing flush request * @q: request_queue being kicked * @fq: flush queue * @flags: cmd_flags of the original request * * Flush related states of @q have changed, consider issuing flush request. * Please read the comment at the top of this file for more info. 
* * CONTEXT: * spin_lock_irq(fq->mq_flush_lock) * */ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, blk_opf_t flags) { struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; struct request *first_rq = list_first_entry(pending, struct request, queuelist); struct request *flush_rq = fq->flush_rq; /* C1 described at the top of this file */ if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending)) return; /* C2 and C3 */ if (fq->flush_data_in_flight && time_before(jiffies, fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) return; /* * Issue flush and toggle pending_idx. This makes pending_idx * different from running_idx, which means flush is in flight. */ fq->flush_pending_idx ^= 1; blk_rq_init(q, flush_rq); /* * In case of none scheduler, borrow tag from the first request * since they can't be in flight at the same time. And acquire * the tag's ownership for flush req. * * In case of IO scheduler, flush rq need to borrow scheduler tag * just for cheating put/get driver tag. */ flush_rq->mq_ctx = first_rq->mq_ctx; flush_rq->mq_hctx = first_rq->mq_hctx; if (!q->elevator) flush_rq->tag = first_rq->tag; else flush_rq->internal_tag = first_rq->internal_tag; flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK); flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->end_io = flush_end_io; /* * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one * implied in refcount_inc_not_zero() called from * blk_mq_find_and_get_req(), which orders WRITE/READ flush_rq->ref * and READ flush_rq->end_io */ smp_wmb(); req_ref_set(flush_rq, 1); spin_lock(&q->requeue_lock); list_add_tail(&flush_rq->queuelist, &q->flush_list); spin_unlock(&q->requeue_lock); blk_mq_kick_requeue_list(q); } static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, blk_status_t error) { struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct blk_mq_ctx *ctx = rq->mq_ctx; unsigned long flags; struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx); if (q->elevator) { WARN_ON(rq->tag < 0); blk_mq_put_driver_tag(rq); } /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). */ spin_lock_irqsave(&fq->mq_flush_lock, flags); fq->flush_data_in_flight--; /* * May have been corrupted by rq->rq_next reuse, we need to * re-initialize rq->queuelist before reusing it here. */ INIT_LIST_HEAD(&rq->queuelist); blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error); spin_unlock_irqrestore(&fq->mq_flush_lock, flags); blk_mq_sched_restart(hctx); return RQ_END_IO_NONE; } static void blk_rq_init_flush(struct request *rq) { rq->flush.seq = 0; rq->rq_flags |= RQF_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ rq->end_io = mq_flush_data_end_io; } /* * Insert a PREFLUSH/FUA request into the flush state machine. * Returns true if the request has been consumed by the flush state machine, * or false if the caller should continue to process it. */ bool blk_insert_flush(struct request *rq) { struct request_queue *q = rq->q; struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); bool supports_fua = q->limits.features & BLK_FEAT_FUA; unsigned int policy = 0; /* FLUSH/FUA request must never be merged */ WARN_ON_ONCE(rq->bio != rq->biotail); if (blk_rq_sectors(rq)) policy |= REQ_FSEQ_DATA; /* * Check which flushes we need to sequence for this operation. 
*/ if (blk_queue_write_cache(q)) { if (rq->cmd_flags & REQ_PREFLUSH) policy |= REQ_FSEQ_PREFLUSH; if ((rq->cmd_flags & REQ_FUA) && !supports_fua) policy |= REQ_FSEQ_POSTFLUSH; } /* * @policy now records what operations need to be done. Adjust * REQ_PREFLUSH and FUA for the driver. */ rq->cmd_flags &= ~REQ_PREFLUSH; if (!supports_fua) rq->cmd_flags &= ~REQ_FUA; /* * REQ_PREFLUSH|REQ_FUA implies REQ_SYNC, so if we clear any * of those flags, we have to set REQ_SYNC to avoid skewing * the request accounting. */ rq->cmd_flags |= REQ_SYNC; switch (policy) { case 0: /* * An empty flush handed down from a stacking driver may * translate into nothing if the underlying device does not * advertise a write-back cache. In this case, simply * complete the request. */ blk_mq_end_request(rq, 0); return true; case REQ_FSEQ_DATA: /* * If there's data, but no flush is necessary, the request can * be processed directly without going through flush machinery. * Queue for normal execution. */ return false; case REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH: /* * Initialize the flush fields and completion handler to trigger * the post flush, and then just pass the command on. */ blk_rq_init_flush(rq); rq->flush.seq |= REQ_FSEQ_PREFLUSH; spin_lock_irq(&fq->mq_flush_lock); fq->flush_data_in_flight++; spin_unlock_irq(&fq->mq_flush_lock); return false; default: /* * Mark the request as part of a flush sequence and submit it * for further processing to the flush state machine. */ blk_rq_init_flush(rq); spin_lock_irq(&fq->mq_flush_lock); blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); spin_unlock_irq(&fq->mq_flush_lock); return true; } } /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for * * Description: * Issue a flush for the block device in question. */ int blkdev_issue_flush(struct block_device *bdev) { struct bio bio; bio_init(&bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH); return submit_bio_wait(&bio); } EXPORT_SYMBOL(blkdev_issue_flush); struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, gfp_t flags) { struct blk_flush_queue *fq; int rq_sz = sizeof(struct request); fq = kzalloc_node(sizeof(*fq), flags, node); if (!fq) goto fail; spin_lock_init(&fq->mq_flush_lock); rq_sz = round_up(rq_sz + cmd_size, cache_line_size()); fq->flush_rq = kzalloc_node(rq_sz, flags, node); if (!fq->flush_rq) goto fail_rq; INIT_LIST_HEAD(&fq->flush_queue[0]); INIT_LIST_HEAD(&fq->flush_queue[1]); return fq; fail_rq: kfree(fq); fail: return NULL; } void blk_free_flush_queue(struct blk_flush_queue *fq) { /* bio based request queue hasn't flush queue */ if (!fq) return; kfree(fq->flush_rq); kfree(fq); } /* * Allow driver to set its own lock class to fq->mq_flush_lock for * avoiding lockdep complaint. * * flush_end_io() may be called recursively from some driver, such as * nvme-loop, so lockdep may complain 'possible recursive locking' because * all 'struct blk_flush_queue' instance share same mq_flush_lock lock class * key. We need to assign different lock class for these driver's * fq->mq_flush_lock for avoiding the lockdep warning. * * Use dynamically allocated lock class key for each 'blk_flush_queue' * instance is over-kill, and more worse it introduces horrible boot delay * issue because synchronize_rcu() is implied in lockdep_unregister_key which * is called for each hctx release. SCSI probing may synchronously create and * destroy lots of MQ request_queues for non-existent devices, and some robot * test kernel always enable lockdep option. 
It is observed that more than half * an hour is taken during SCSI MQ probe with per-fq lock class. */ void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, struct lock_class_key *key) { lockdep_set_class(&hctx->fq->mq_flush_lock, key); } EXPORT_SYMBOL_GPL(blk_mq_hctx_set_fq_lock_class);
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Force feedback support for SmartJoy PLUS PS2->USB adapter
 *
 * Copyright (c) 2009 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on hid-pl.c and hid-gaff.c
 * Copyright (c) 2007, 2009 Anssi Hannula <anssi.hannula@gmail.com>
 * Copyright (c) 2008 Lukasz Lubojanski <lukasz@lubojanski.info>
 */

/* #define DEBUG */

#include <linux/input.h>
#include <linux/slab.h>
#include <linux/hid.h>
#include <linux/module.h>
#include "hid-ids.h"

#ifdef CONFIG_SMARTJOYPLUS_FF

struct sjoyff_device {
	struct hid_report *report;
};

static int hid_sjoyff_play(struct input_dev *dev, void *data,
			   struct ff_effect *effect)
{
	struct hid_device *hid = input_get_drvdata(dev);
	struct sjoyff_device *sjoyff = data;
	u32 left, right;

	left = effect->u.rumble.strong_magnitude;
	right = effect->u.rumble.weak_magnitude;
	dev_dbg(&dev->dev, "called with 0x%08x 0x%08x\n", left, right);

	left = left * 0xff / 0xffff;
	right = (right != 0); /* on/off only */

	sjoyff->report->field[0]->value[1] = right;
	sjoyff->report->field[0]->value[2] = left;
	dev_dbg(&dev->dev, "running with 0x%02x 0x%02x\n", left, right);
	hid_hw_request(hid, sjoyff->report, HID_REQ_SET_REPORT);

	return 0;
}

static int sjoyff_init(struct hid_device *hid)
{
	struct sjoyff_device *sjoyff;
	struct hid_report *report;
	struct hid_input *hidinput;
	struct list_head *report_list =
			&hid->report_enum[HID_OUTPUT_REPORT].report_list;
	struct list_head *report_ptr = report_list;
	struct input_dev *dev;
	int error;

	if (list_empty(report_list)) {
		hid_err(hid, "no output reports found\n");
		return -ENODEV;
	}

	list_for_each_entry(hidinput, &hid->inputs, list) {
		report_ptr = report_ptr->next;

		if (report_ptr == report_list) {
			hid_err(hid, "required output report is missing\n");
			return -ENODEV;
		}

		report = list_entry(report_ptr, struct hid_report, list);
		if (report->maxfield < 1) {
			hid_err(hid, "no fields in the report\n");
			return -ENODEV;
		}

		if (report->field[0]->report_count < 3) {
			hid_err(hid, "not enough values in the field\n");
			return -ENODEV;
		}

		sjoyff = kzalloc(sizeof(struct sjoyff_device), GFP_KERNEL);
		if (!sjoyff)
			return -ENOMEM;

		dev = hidinput->input;

		set_bit(FF_RUMBLE, dev->ffbit);

		error = input_ff_create_memless(dev, sjoyff, hid_sjoyff_play);
		if (error) {
			kfree(sjoyff);
			return error;
		}

		sjoyff->report = report;
		sjoyff->report->field[0]->value[0] = 0x01;
		sjoyff->report->field[0]->value[1] = 0x00;
		sjoyff->report->field[0]->value[2] = 0x00;
		hid_hw_request(hid, sjoyff->report, HID_REQ_SET_REPORT);
	}

	hid_info(hid, "Force feedback for SmartJoy PLUS PS2/USB adapter\n");

	return 0;
}
#else
static inline int sjoyff_init(struct hid_device *hid)
{
	return 0;
}
#endif

static int sjoy_probe(struct hid_device *hdev, const struct hid_device_id *id)
{
	int ret;

	hdev->quirks |= id->driver_data;

	ret = hid_parse(hdev);
	if (ret) {
		hid_err(hdev, "parse failed\n");
		goto err;
	}

	ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT & ~HID_CONNECT_FF);
	if (ret) {
		hid_err(hdev, "hw
start failed\n"); goto err; } sjoyff_init(hdev); return 0; err: return ret; } static const struct hid_device_id sjoy_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP_LTD, USB_DEVICE_ID_SUPER_JOY_BOX_3_PRO), .driver_data = HID_QUIRK_NOGET }, { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP_LTD, USB_DEVICE_ID_SUPER_DUAL_BOX_PRO), .driver_data = HID_QUIRK_MULTI_INPUT | HID_QUIRK_NOGET | HID_QUIRK_SKIP_OUTPUT_REPORTS }, { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP_LTD, USB_DEVICE_ID_SUPER_JOY_BOX_5_PRO), .driver_data = HID_QUIRK_MULTI_INPUT | HID_QUIRK_NOGET | HID_QUIRK_SKIP_OUTPUT_REPORTS }, { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP, USB_DEVICE_ID_SMARTJOY_PLUS) }, { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP, USB_DEVICE_ID_SUPER_JOY_BOX_3) }, { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP, USB_DEVICE_ID_DUAL_USB_JOYPAD), .driver_data = HID_QUIRK_MULTI_INPUT | HID_QUIRK_SKIP_OUTPUT_REPORTS }, { HID_USB_DEVICE(USB_VENDOR_ID_PLAYDOTCOM, USB_DEVICE_ID_PLAYDOTCOM_EMS_USBII), .driver_data = HID_QUIRK_MULTI_INPUT | HID_QUIRK_SKIP_OUTPUT_REPORTS }, { } }; MODULE_DEVICE_TABLE(hid, sjoy_devices); static struct hid_driver sjoy_driver = { .name = "smartjoyplus", .id_table = sjoy_devices, .probe = sjoy_probe, }; module_hid_driver(sjoy_driver); MODULE_DESCRIPTION("Force feedback support for SmartJoy PLUS PS2->USB adapter"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jussi Kivilinna");
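hid_sjoyff_play() above compresses the 16-bit memless force-feedback magnitudes into what the adapter's output report expects: the strong motor scaled down to 8 bits and the weak motor reduced to on/off. A tiny self-contained reproduction of just that scaling step; sjoy_scale() is a hypothetical helper, not part of the driver.

#include <stdio.h>

/*
 * Reproduce only the magnitude conversion from hid_sjoyff_play(): the strong
 * motor is scaled from 0..0xffff down to 0..0xff, the weak motor becomes a
 * simple on/off flag.
 */
static void sjoy_scale(unsigned int strong, unsigned int weak,
		       unsigned int *left, unsigned int *right)
{
	*left = strong * 0xff / 0xffff;
	*right = (weak != 0);
}

int main(void)
{
	unsigned int left, right;

	sjoy_scale(0x8000, 0x0001, &left, &right);
	printf("left=%#x right=%u\n", left, right);
	return 0;
}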
866 867 // SPDX-License-Identifier: GPL-2.0-or-later /* AF_RXRPC sendmsg() implementation. * * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/net.h> #include <linux/gfp.h> #include <linux/skbuff.h> #include <linux/export.h> #include <linux/sched/signal.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "ar-internal.h" /* * Propose an abort to be made in the I/O thread. */ bool rxrpc_propose_abort(struct rxrpc_call *call, s32 abort_code, int error, enum rxrpc_abort_reason why) { _enter("{%d},%d,%d,%u", call->debug_id, abort_code, error, why); if (!call->send_abort && !rxrpc_call_is_complete(call)) { call->send_abort_why = why; call->send_abort_err = error; call->send_abort_seq = 0; trace_rxrpc_abort_call(call, abort_code); /* Request abort locklessly vs rxrpc_input_call_event(). */ smp_store_release(&call->send_abort, abort_code); rxrpc_poke_call(call, rxrpc_call_poke_abort); return true; } return false; } /* * Wait for a call to become connected. Interruption here doesn't cause the * call to be aborted. */ static int rxrpc_wait_to_be_connected(struct rxrpc_call *call, long *timeo) { DECLARE_WAITQUEUE(myself, current); int ret = 0; _enter("%d", call->debug_id); if (rxrpc_call_state(call) != RXRPC_CALL_CLIENT_AWAIT_CONN) goto no_wait; add_wait_queue_exclusive(&call->waitq, &myself); for (;;) { switch (call->interruptibility) { case RXRPC_INTERRUPTIBLE: case RXRPC_PREINTERRUPTIBLE: set_current_state(TASK_INTERRUPTIBLE); break; case RXRPC_UNINTERRUPTIBLE: default: set_current_state(TASK_UNINTERRUPTIBLE); break; } if (rxrpc_call_state(call) != RXRPC_CALL_CLIENT_AWAIT_CONN) break; if ((call->interruptibility == RXRPC_INTERRUPTIBLE || call->interruptibility == RXRPC_PREINTERRUPTIBLE) && signal_pending(current)) { ret = sock_intr_errno(*timeo); break; } *timeo = schedule_timeout(*timeo); } remove_wait_queue(&call->waitq, &myself); __set_current_state(TASK_RUNNING); no_wait: if (ret == 0 && rxrpc_call_is_complete(call)) ret = call->error; _leave(" = %d", ret); return ret; } /* * Return true if there's sufficient Tx queue space. */ static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win) { rxrpc_seq_t tx_bottom = READ_ONCE(call->tx_bottom); if (_tx_win) *_tx_win = tx_bottom; return call->send_top - tx_bottom < 256; } /* * Wait for space to appear in the Tx queue or a signal to occur. */ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx, struct rxrpc_call *call, long *timeo) { for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (rxrpc_check_tx_space(call, NULL)) return 0; if (rxrpc_call_is_complete(call)) return call->error; if (signal_pending(current)) return sock_intr_errno(*timeo); trace_rxrpc_txqueue(call, rxrpc_txqueue_wait); *timeo = schedule_timeout(*timeo); } } /* * Wait for space to appear in the Tx queue uninterruptibly, but with * a timeout of 2*RTT if no progress was made and a signal occurred. 
*/ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, struct rxrpc_call *call) { rxrpc_seq_t tx_start, tx_win; signed long rtt, timeout; rtt = READ_ONCE(call->srtt_us) >> 3; rtt = usecs_to_jiffies(rtt) * 2; if (rtt < 2) rtt = 2; timeout = rtt; tx_start = READ_ONCE(call->tx_bottom); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (rxrpc_check_tx_space(call, &tx_win)) return 0; if (rxrpc_call_is_complete(call)) return call->error; if (timeout == 0 && tx_win == tx_start && signal_pending(current)) return -EINTR; if (tx_win != tx_start) { timeout = rtt; tx_start = tx_win; } trace_rxrpc_txqueue(call, rxrpc_txqueue_wait); timeout = schedule_timeout(timeout); } } /* * Wait for space to appear in the Tx queue uninterruptibly. */ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, struct rxrpc_call *call, long *timeo) { for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (rxrpc_check_tx_space(call, NULL)) return 0; if (rxrpc_call_is_complete(call)) return call->error; trace_rxrpc_txqueue(call, rxrpc_txqueue_wait); *timeo = schedule_timeout(*timeo); } } /* * wait for space to appear in the transmit/ACK window * - caller holds the socket locked */ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, struct rxrpc_call *call, long *timeo, bool waitall) { DECLARE_WAITQUEUE(myself, current); int ret; _enter(",{%u,%u,%u}", call->tx_bottom, call->tx_top, call->tx_winsize); add_wait_queue(&call->waitq, &myself); switch (call->interruptibility) { case RXRPC_INTERRUPTIBLE: if (waitall) ret = rxrpc_wait_for_tx_window_waitall(rx, call); else ret = rxrpc_wait_for_tx_window_intr(rx, call, timeo); break; case RXRPC_PREINTERRUPTIBLE: case RXRPC_UNINTERRUPTIBLE: default: ret = rxrpc_wait_for_tx_window_nonintr(rx, call, timeo); break; } remove_wait_queue(&call->waitq, &myself); set_current_state(TASK_RUNNING); _leave(" = %d", ret); return ret; } /* * Notify the owner of the call that the transmit phase is ended and the last * packet has been queued. */ static void rxrpc_notify_end_tx(struct rxrpc_sock *rx, struct rxrpc_call *call, rxrpc_notify_end_tx_t notify_end_tx) { if (notify_end_tx) notify_end_tx(&rx->sk, call, call->user_call_ID); } /* * Queue a DATA packet for transmission, set the resend timeout and send * the packet immediately. Returns the error from rxrpc_send_data_packet() * in case the caller wants to do something with it. */ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, struct rxrpc_txbuf *txb, rxrpc_notify_end_tx_t notify_end_tx) { struct rxrpc_txqueue *sq = call->send_queue; rxrpc_seq_t seq = txb->seq; bool poke, last = txb->flags & RXRPC_LAST_PACKET; int ix = seq & RXRPC_TXQ_MASK; rxrpc_inc_stat(call->rxnet, stat_tx_data); ASSERTCMP(txb->seq, ==, call->send_top + 1); if (last) trace_rxrpc_txqueue(call, rxrpc_txqueue_queue_last); else trace_rxrpc_txqueue(call, rxrpc_txqueue_queue); if (WARN_ON_ONCE(sq->bufs[ix])) trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue_dup); else trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue); /* Add the packet to the call's output buffer */ poke = (READ_ONCE(call->tx_bottom) == call->send_top); sq->bufs[ix] = txb; /* Order send_top after the queue->next pointer and txb content. */ smp_store_release(&call->send_top, seq); if (last) { set_bit(RXRPC_CALL_TX_NO_MORE, &call->flags); rxrpc_notify_end_tx(rx, call, notify_end_tx); call->send_queue = NULL; } if (poke) rxrpc_poke_call(call, rxrpc_call_poke_start); } /* * Allocate a new txqueue unit and add it to the transmission queue. 
*/ static int rxrpc_alloc_txqueue(struct sock *sk, struct rxrpc_call *call) { struct rxrpc_txqueue *tq; tq = kzalloc(sizeof(*tq), sk->sk_allocation); if (!tq) return -ENOMEM; tq->xmit_ts_base = KTIME_MIN; for (int i = 0; i < RXRPC_NR_TXQUEUE; i++) tq->segment_xmit_ts[i] = UINT_MAX; if (call->send_queue) { tq->qbase = call->send_top + 1; call->send_queue->next = tq; call->send_queue = tq; } else if (WARN_ON(call->tx_queue)) { kfree(tq); return -ENOMEM; } else { /* We start at seq 1, so pretend seq 0 is hard-acked. */ tq->nr_reported_acks = 1; tq->segment_acked = 1UL; tq->qbase = 0; call->tx_qbase = 0; call->send_queue = tq; call->tx_qtail = tq; call->tx_queue = tq; } trace_rxrpc_tq(call, tq, call->send_top, rxrpc_tq_alloc); return 0; } /* * send data through a socket * - must be called in process context * - The caller holds the call user access mutex, but not the socket lock. */ static int rxrpc_send_data(struct rxrpc_sock *rx, struct rxrpc_call *call, struct msghdr *msg, size_t len, rxrpc_notify_end_tx_t notify_end_tx, bool *_dropped_lock) { struct rxrpc_txbuf *txb; struct sock *sk = &rx->sk; enum rxrpc_call_state state; long timeo; bool more = msg->msg_flags & MSG_MORE; int ret, copied = 0; if (test_bit(RXRPC_CALL_TX_NO_MORE, &call->flags)) { trace_rxrpc_abort(call->debug_id, rxrpc_sendmsg_late_send, call->cid, call->call_id, call->rx_consumed, 0, -EPROTO); return -EPROTO; } timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); ret = rxrpc_wait_to_be_connected(call, &timeo); if (ret < 0) return ret; if (call->conn->state == RXRPC_CONN_CLIENT_UNSECURED) { ret = rxrpc_init_client_conn_security(call->conn); if (ret < 0) return ret; } /* this should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); reload: txb = call->tx_pending; call->tx_pending = NULL; if (txb) rxrpc_see_txbuf(txb, rxrpc_txbuf_see_send_more); ret = -EPIPE; if (sk->sk_shutdown & SEND_SHUTDOWN) goto maybe_error; state = rxrpc_call_state(call); ret = -ESHUTDOWN; if (state >= RXRPC_CALL_COMPLETE) goto maybe_error; ret = -EPROTO; if (state != RXRPC_CALL_CLIENT_SEND_REQUEST && state != RXRPC_CALL_SERVER_ACK_REQUEST && state != RXRPC_CALL_SERVER_SEND_REPLY) { /* Request phase complete for this client call */ trace_rxrpc_abort(call->debug_id, rxrpc_sendmsg_late_send, call->cid, call->call_id, call->rx_consumed, 0, -EPROTO); goto maybe_error; } ret = -EMSGSIZE; if (call->tx_total_len != -1) { if (len - copied > call->tx_total_len) goto maybe_error; if (!more && len - copied != call->tx_total_len) goto maybe_error; } do { if (!txb) { size_t remain; _debug("alloc"); if (!rxrpc_check_tx_space(call, NULL)) goto wait_for_space; /* See if we need to begin/extend the Tx queue. */ if (!call->send_queue || !((call->send_top + 1) & RXRPC_TXQ_MASK)) { ret = rxrpc_alloc_txqueue(sk, call); if (ret < 0) goto maybe_error; } /* Work out the maximum size of a packet. Assume that * the security header is going to be in the padded * region (enc blocksize), but the trailer is not. */ remain = more ? 
INT_MAX : msg_data_left(msg); txb = call->conn->security->alloc_txbuf(call, remain, sk->sk_allocation); if (!txb) { ret = -ENOMEM; goto maybe_error; } } _debug("append"); /* append next segment of data to the current buffer */ if (msg_data_left(msg) > 0) { size_t copy = umin(txb->space, msg_data_left(msg)); _debug("add %zu", copy); if (!copy_from_iter_full(txb->data + txb->offset, copy, &msg->msg_iter)) goto efault; _debug("added"); txb->space -= copy; txb->len += copy; txb->offset += copy; copied += copy; if (call->tx_total_len != -1) call->tx_total_len -= copy; } /* check for the far side aborting the call or a network error * occurring */ if (rxrpc_call_is_complete(call)) goto call_terminated; /* add the packet to the send queue if it's now full */ if (!txb->space || (msg_data_left(msg) == 0 && !more)) { if (msg_data_left(msg) == 0 && !more) txb->flags |= RXRPC_LAST_PACKET; ret = call->security->secure_packet(call, txb); if (ret < 0) goto out; rxrpc_queue_packet(rx, call, txb, notify_end_tx); txb = NULL; } } while (msg_data_left(msg) > 0); success: ret = copied; if (rxrpc_call_is_complete(call) && call->error < 0) ret = call->error; out: call->tx_pending = txb; _leave(" = %d", ret); return ret; call_terminated: rxrpc_put_txbuf(txb, rxrpc_txbuf_put_send_aborted); _leave(" = %d", call->error); return call->error; maybe_error: if (copied) goto success; goto out; efault: ret = -EFAULT; goto out; wait_for_space: ret = -EAGAIN; if (msg->msg_flags & MSG_DONTWAIT) goto maybe_error; mutex_unlock(&call->user_mutex); *_dropped_lock = true; ret = rxrpc_wait_for_tx_window(rx, call, &timeo, msg->msg_flags & MSG_WAITALL); if (ret < 0) goto maybe_error; if (call->interruptibility == RXRPC_INTERRUPTIBLE) { if (mutex_lock_interruptible(&call->user_mutex) < 0) { ret = sock_intr_errno(timeo); goto maybe_error; } } else { mutex_lock(&call->user_mutex); } *_dropped_lock = false; goto reload; } /* * extract control messages from the sendmsg() control buffer */ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p) { struct cmsghdr *cmsg; bool got_user_ID = false; int len; if (msg->msg_controllen == 0) return -EINVAL; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; len = cmsg->cmsg_len - sizeof(struct cmsghdr); _debug("CMSG %d, %d, %d", cmsg->cmsg_level, cmsg->cmsg_type, len); if (cmsg->cmsg_level != SOL_RXRPC) continue; switch (cmsg->cmsg_type) { case RXRPC_USER_CALL_ID: if (msg->msg_flags & MSG_CMSG_COMPAT) { if (len != sizeof(u32)) return -EINVAL; p->call.user_call_ID = *(u32 *)CMSG_DATA(cmsg); } else { if (len != sizeof(unsigned long)) return -EINVAL; p->call.user_call_ID = *(unsigned long *) CMSG_DATA(cmsg); } got_user_ID = true; break; case RXRPC_ABORT: if (p->command != RXRPC_CMD_SEND_DATA) return -EINVAL; p->command = RXRPC_CMD_SEND_ABORT; if (len != sizeof(p->abort_code)) return -EINVAL; p->abort_code = *(unsigned int *)CMSG_DATA(cmsg); if (p->abort_code == 0) return -EINVAL; break; case RXRPC_CHARGE_ACCEPT: if (p->command != RXRPC_CMD_SEND_DATA) return -EINVAL; p->command = RXRPC_CMD_CHARGE_ACCEPT; if (len != 0) return -EINVAL; break; case RXRPC_EXCLUSIVE_CALL: p->exclusive = true; if (len != 0) return -EINVAL; break; case RXRPC_UPGRADE_SERVICE: p->upgrade = true; if (len != 0) return -EINVAL; break; case RXRPC_TX_LENGTH: if (p->call.tx_total_len != -1 || len != sizeof(__s64)) return -EINVAL; p->call.tx_total_len = *(__s64 *)CMSG_DATA(cmsg); if (p->call.tx_total_len < 0) return -EINVAL; break; case RXRPC_SET_CALL_TIMEOUT: if (len & 3 || len < 4 || len > 
12) return -EINVAL; memcpy(&p->call.timeouts, CMSG_DATA(cmsg), len); p->call.nr_timeouts = len / 4; if (p->call.timeouts.hard > INT_MAX / HZ) return -ERANGE; if (p->call.nr_timeouts >= 2 && p->call.timeouts.idle > 60 * 60 * 1000) return -ERANGE; if (p->call.nr_timeouts >= 3 && p->call.timeouts.normal > 60 * 60 * 1000) return -ERANGE; break; default: return -EINVAL; } } if (!got_user_ID) return -EINVAL; if (p->call.tx_total_len != -1 && p->command != RXRPC_CMD_SEND_DATA) return -EINVAL; _leave(" = 0"); return 0; } /* * Create a new client call for sendmsg(). * - Called with the socket lock held, which it must release. * - If it returns a call, the call's lock will need releasing by the caller. */ static struct rxrpc_call * rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, struct rxrpc_send_params *p) __releases(&rx->sk.sk_lock.slock) __acquires(&call->user_mutex) { struct rxrpc_conn_parameters cp; struct rxrpc_peer *peer; struct rxrpc_call *call; struct key *key; DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx, msg->msg_name); _enter(""); if (!msg->msg_name) { release_sock(&rx->sk); return ERR_PTR(-EDESTADDRREQ); } peer = rxrpc_lookup_peer(rx->local, srx, GFP_KERNEL); if (!peer) { release_sock(&rx->sk); return ERR_PTR(-ENOMEM); } key = rx->key; if (key && !rx->key->payload.data[0]) key = NULL; memset(&cp, 0, sizeof(cp)); cp.local = rx->local; cp.peer = peer; cp.key = rx->key; cp.security_level = rx->min_sec_level; cp.exclusive = rx->exclusive | p->exclusive; cp.upgrade = p->upgrade; cp.service_id = srx->srx_service; call = rxrpc_new_client_call(rx, &cp, &p->call, GFP_KERNEL, atomic_inc_return(&rxrpc_debug_id)); /* The socket is now unlocked */ rxrpc_put_peer(peer, rxrpc_peer_put_application); _leave(" = %p\n", call); return call; } /* * send a message forming part of a client call through an RxRPC socket * - caller holds the socket locked * - the socket may be either a client socket or a server socket */ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) __releases(&rx->sk.sk_lock.slock) { struct rxrpc_call *call; bool dropped_lock = false; int ret; struct rxrpc_send_params p = { .call.tx_total_len = -1, .call.user_call_ID = 0, .call.nr_timeouts = 0, .call.interruptibility = RXRPC_INTERRUPTIBLE, .abort_code = 0, .command = RXRPC_CMD_SEND_DATA, .exclusive = false, .upgrade = false, }; _enter(""); ret = rxrpc_sendmsg_cmsg(msg, &p); if (ret < 0) goto error_release_sock; if (p.command == RXRPC_CMD_CHARGE_ACCEPT) { ret = -EINVAL; if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) goto error_release_sock; ret = rxrpc_user_charge_accept(rx, p.call.user_call_ID); goto error_release_sock; } call = rxrpc_find_call_by_user_ID(rx, p.call.user_call_ID); if (!call) { ret = -EBADSLT; if (p.command != RXRPC_CMD_SEND_DATA) goto error_release_sock; call = rxrpc_new_client_call_for_sendmsg(rx, msg, &p); /* The socket is now unlocked... */ if (IS_ERR(call)) return PTR_ERR(call); /* ... and we have the call lock. 
*/ p.call.nr_timeouts = 0; ret = 0; if (rxrpc_call_is_complete(call)) goto out_put_unlock; } else { switch (rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_AWAIT_CONN: case RXRPC_CALL_SERVER_RECV_REQUEST: if (p.command == RXRPC_CMD_SEND_ABORT) break; fallthrough; case RXRPC_CALL_UNINITIALISED: case RXRPC_CALL_SERVER_PREALLOC: rxrpc_put_call(call, rxrpc_call_put_sendmsg); ret = -EBUSY; goto error_release_sock; default: break; } ret = mutex_lock_interruptible(&call->user_mutex); release_sock(&rx->sk); if (ret < 0) { ret = -ERESTARTSYS; goto error_put; } if (p.call.tx_total_len != -1) { ret = -EINVAL; if (call->tx_total_len != -1 || call->tx_pending || call->tx_top != 0) goto out_put_unlock; call->tx_total_len = p.call.tx_total_len; } } switch (p.call.nr_timeouts) { case 3: WRITE_ONCE(call->next_rx_timo, p.call.timeouts.normal); fallthrough; case 2: WRITE_ONCE(call->next_req_timo, p.call.timeouts.idle); fallthrough; case 1: if (p.call.timeouts.hard > 0) { ktime_t delay = ms_to_ktime(p.call.timeouts.hard * MSEC_PER_SEC); WRITE_ONCE(call->expect_term_by, ktime_add(p.call.timeouts.hard, ktime_get_real())); trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_hard); rxrpc_poke_call(call, rxrpc_call_poke_set_timeout); } break; } if (rxrpc_call_is_complete(call)) { /* it's too late for this call */ ret = -ESHUTDOWN; } else if (p.command == RXRPC_CMD_SEND_ABORT) { rxrpc_propose_abort(call, p.abort_code, -ECONNABORTED, rxrpc_abort_call_sendmsg); ret = 0; } else if (p.command != RXRPC_CMD_SEND_DATA) { ret = -EINVAL; } else { ret = rxrpc_send_data(rx, call, msg, len, NULL, &dropped_lock); } out_put_unlock: if (!dropped_lock) mutex_unlock(&call->user_mutex); error_put: rxrpc_put_call(call, rxrpc_call_put_sendmsg); _leave(" = %d", ret); return ret; error_release_sock: release_sock(&rx->sk); return ret; } /** * rxrpc_kernel_send_data - Allow a kernel service to send data on a call * @sock: The socket the call is on * @call: The call to send data through * @msg: The data to send * @len: The amount of data to send * @notify_end_tx: Notification that the last packet is queued. * * Allow a kernel service to send data on a call. The call must be in an state * appropriate to sending data. No control data should be supplied in @msg, * nor should an address be supplied. MSG_MORE should be flagged if there's * more data to come, otherwise this data will end the transmission phase. */ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, struct msghdr *msg, size_t len, rxrpc_notify_end_tx_t notify_end_tx) { bool dropped_lock = false; int ret; _enter("{%d},", call->debug_id); ASSERTCMP(msg->msg_name, ==, NULL); ASSERTCMP(msg->msg_control, ==, NULL); mutex_lock(&call->user_mutex); ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len, notify_end_tx, &dropped_lock); if (ret == -ESHUTDOWN) ret = call->error; if (!dropped_lock) mutex_unlock(&call->user_mutex); _leave(" = %d", ret); return ret; } EXPORT_SYMBOL(rxrpc_kernel_send_data); /** * rxrpc_kernel_abort_call - Allow a kernel service to abort a call * @sock: The socket the call is on * @call: The call to be aborted * @abort_code: The abort code to stick into the ABORT packet * @error: Local error value * @why: Indication as to why. * * Allow a kernel service to abort a call, if it's still in an abortable state * and return true if the call was aborted, false if it was already complete. 
*/ bool rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, u32 abort_code, int error, enum rxrpc_abort_reason why) { bool aborted; _enter("{%d},%d,%d,%u", call->debug_id, abort_code, error, why); mutex_lock(&call->user_mutex); aborted = rxrpc_propose_abort(call, abort_code, error, why); mutex_unlock(&call->user_mutex); return aborted; } EXPORT_SYMBOL(rxrpc_kernel_abort_call); /** * rxrpc_kernel_set_tx_length - Set the total Tx length on a call * @sock: The socket the call is on * @call: The call to be informed * @tx_total_len: The amount of data to be transmitted for this call * * Allow a kernel service to set the total transmit length on a call. This * allows buffer-to-packet encrypt-and-copy to be performed. * * This function is primarily for use for setting the reply length since the * request length can be set when beginning the call. */ void rxrpc_kernel_set_tx_length(struct socket *sock, struct rxrpc_call *call, s64 tx_total_len) { WARN_ON(call->tx_total_len != -1); call->tx_total_len = tx_total_len; } EXPORT_SYMBOL(rxrpc_kernel_set_tx_length);
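A minimal usage sketch (not from the file above) of the exported sendmsg-side API, assuming a kernel service that has already set up an rxrpc socket and begun a client call (e.g. via rxrpc_kernel_begin_call()); the helper name my_send_request() is hypothetical and the iov_iter direction constant is the one used by current kernels.

#include <linux/uio.h>
#include <linux/printk.h>
#include <net/af_rxrpc.h>

/* Hypothetical helper: queue one buffer as the entire request body. */
static int my_send_request(struct socket *sock, struct rxrpc_call *call,
			   void *buf, size_t len)
{
	struct kvec iv = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = {};
	int ret;

	/*
	 * No MSG_MORE in msg_flags: queuing this buffer marks the last
	 * packet and ends the transmit phase of the call.
	 */
	iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iv, 1, len);

	ret = rxrpc_kernel_send_data(sock, call, &msg, len, NULL);
	if (ret < 0)
		pr_debug("rxrpc send failed: %d\n", ret);
	return ret;
}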
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Driver for the Diolan DLN-2 USB adapter
 *
 * Copyright (c) 2014 Intel Corporation
 *
 * Derived from:
 *  i2c-diolan-u2c.c
 *
Copyright (c) 2010-2011 Ericsson AB */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/usb.h> #include <linux/mutex.h> #include <linux/platform_device.h> #include <linux/mfd/core.h> #include <linux/mfd/dln2.h> #include <linux/rculist.h> struct dln2_header { __le16 size; __le16 id; __le16 echo; __le16 handle; }; struct dln2_response { struct dln2_header hdr; __le16 result; }; #define DLN2_GENERIC_MODULE_ID 0x00 #define DLN2_GENERIC_CMD(cmd) DLN2_CMD(cmd, DLN2_GENERIC_MODULE_ID) #define CMD_GET_DEVICE_VER DLN2_GENERIC_CMD(0x30) #define CMD_GET_DEVICE_SN DLN2_GENERIC_CMD(0x31) #define DLN2_HW_ID 0x200 #define DLN2_USB_TIMEOUT 200 /* in ms */ #define DLN2_MAX_RX_SLOTS 16 #define DLN2_MAX_URBS 16 #define DLN2_RX_BUF_SIZE 512 enum dln2_handle { DLN2_HANDLE_EVENT = 0, /* don't change, hardware defined */ DLN2_HANDLE_CTRL, DLN2_HANDLE_GPIO, DLN2_HANDLE_I2C, DLN2_HANDLE_SPI, DLN2_HANDLE_ADC, DLN2_HANDLES }; /* * Receive context used between the receive demultiplexer and the transfer * routine. While sending a request the transfer routine will look for a free * receive context and use it to wait for a response and to receive the URB and * thus the response data. */ struct dln2_rx_context { /* completion used to wait for a response */ struct completion done; /* if non-NULL the URB contains the response */ struct urb *urb; /* if true then this context is used to wait for a response */ bool in_use; }; /* * Receive contexts for a particular DLN2 module (i2c, gpio, etc.). We use the * handle header field to identify the module in dln2_dev.mod_rx_slots and then * the echo header field to index the slots field and find the receive context * for a particular request. */ struct dln2_mod_rx_slots { /* RX slots bitmap */ DECLARE_BITMAP(bmap, DLN2_MAX_RX_SLOTS); /* used to wait for a free RX slot */ wait_queue_head_t wq; /* used to wait for an RX operation to complete */ struct dln2_rx_context slots[DLN2_MAX_RX_SLOTS]; /* avoid races between alloc/free_rx_slot and dln2_rx_transfer */ spinlock_t lock; }; struct dln2_dev { struct usb_device *usb_dev; struct usb_interface *interface; u8 ep_in; u8 ep_out; struct urb *rx_urb[DLN2_MAX_URBS]; void *rx_buf[DLN2_MAX_URBS]; struct dln2_mod_rx_slots mod_rx_slots[DLN2_HANDLES]; struct list_head event_cb_list; spinlock_t event_cb_lock; bool disconnect; int active_transfers; wait_queue_head_t disconnect_wq; spinlock_t disconnect_lock; }; struct dln2_event_cb_entry { struct list_head list; u16 id; struct platform_device *pdev; dln2_event_cb_t callback; }; int dln2_register_event_cb(struct platform_device *pdev, u16 id, dln2_event_cb_t event_cb) { struct dln2_dev *dln2 = dev_get_drvdata(pdev->dev.parent); struct dln2_event_cb_entry *i, *entry; unsigned long flags; int ret = 0; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; entry->id = id; entry->callback = event_cb; entry->pdev = pdev; spin_lock_irqsave(&dln2->event_cb_lock, flags); list_for_each_entry(i, &dln2->event_cb_list, list) { if (i->id == id) { ret = -EBUSY; break; } } if (!ret) list_add_rcu(&entry->list, &dln2->event_cb_list); spin_unlock_irqrestore(&dln2->event_cb_lock, flags); if (ret) kfree(entry); return ret; } EXPORT_SYMBOL(dln2_register_event_cb); void dln2_unregister_event_cb(struct platform_device *pdev, u16 id) { struct dln2_dev *dln2 = dev_get_drvdata(pdev->dev.parent); struct dln2_event_cb_entry *i; unsigned long flags; bool found = false; spin_lock_irqsave(&dln2->event_cb_lock, flags); list_for_each_entry(i, 
&dln2->event_cb_list, list) { if (i->id == id) { list_del_rcu(&i->list); found = true; break; } } spin_unlock_irqrestore(&dln2->event_cb_lock, flags); if (found) { synchronize_rcu(); kfree(i); } } EXPORT_SYMBOL(dln2_unregister_event_cb); /* * Returns true if a valid transfer slot is found. In this case the URB must not * be resubmitted immediately in dln2_rx as we need the data when dln2_transfer * is woke up. It will be resubmitted there. */ static bool dln2_transfer_complete(struct dln2_dev *dln2, struct urb *urb, u16 handle, u16 rx_slot) { struct device *dev = &dln2->interface->dev; struct dln2_mod_rx_slots *rxs = &dln2->mod_rx_slots[handle]; struct dln2_rx_context *rxc; unsigned long flags; bool valid_slot = false; if (rx_slot >= DLN2_MAX_RX_SLOTS) goto out; rxc = &rxs->slots[rx_slot]; spin_lock_irqsave(&rxs->lock, flags); if (rxc->in_use && !rxc->urb) { rxc->urb = urb; complete(&rxc->done); valid_slot = true; } spin_unlock_irqrestore(&rxs->lock, flags); out: if (!valid_slot) dev_warn(dev, "bad/late response %d/%d\n", handle, rx_slot); return valid_slot; } static void dln2_run_event_callbacks(struct dln2_dev *dln2, u16 id, u16 echo, void *data, int len) { struct dln2_event_cb_entry *i; rcu_read_lock(); list_for_each_entry_rcu(i, &dln2->event_cb_list, list) { if (i->id == id) { i->callback(i->pdev, echo, data, len); break; } } rcu_read_unlock(); } static void dln2_rx(struct urb *urb) { struct dln2_dev *dln2 = urb->context; struct dln2_header *hdr = urb->transfer_buffer; struct device *dev = &dln2->interface->dev; u16 id, echo, handle, size; u8 *data; int len; int err; switch (urb->status) { case 0: /* success */ break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: case -EPIPE: /* this urb is terminated, clean up */ dev_dbg(dev, "urb shutting down with status %d\n", urb->status); return; default: dev_dbg(dev, "nonzero urb status received %d\n", urb->status); goto out; } if (urb->actual_length < sizeof(struct dln2_header)) { dev_err(dev, "short response: %d\n", urb->actual_length); goto out; } handle = le16_to_cpu(hdr->handle); id = le16_to_cpu(hdr->id); echo = le16_to_cpu(hdr->echo); size = le16_to_cpu(hdr->size); if (size != urb->actual_length) { dev_err(dev, "size mismatch: handle %x cmd %x echo %x size %d actual %d\n", handle, id, echo, size, urb->actual_length); goto out; } if (handle >= DLN2_HANDLES) { dev_warn(dev, "invalid handle %d\n", handle); goto out; } data = urb->transfer_buffer + sizeof(struct dln2_header); len = urb->actual_length - sizeof(struct dln2_header); if (handle == DLN2_HANDLE_EVENT) { unsigned long flags; spin_lock_irqsave(&dln2->event_cb_lock, flags); dln2_run_event_callbacks(dln2, id, echo, data, len); spin_unlock_irqrestore(&dln2->event_cb_lock, flags); } else { /* URB will be re-submitted in _dln2_transfer (free_rx_slot) */ if (dln2_transfer_complete(dln2, urb, handle, echo)) return; } out: err = usb_submit_urb(urb, GFP_ATOMIC); if (err < 0) dev_err(dev, "failed to resubmit RX URB: %d\n", err); } static void *dln2_prep_buf(u16 handle, u16 cmd, u16 echo, const void *obuf, int *obuf_len, gfp_t gfp) { int len; void *buf; struct dln2_header *hdr; len = *obuf_len + sizeof(*hdr); buf = kmalloc(len, gfp); if (!buf) return NULL; hdr = (struct dln2_header *)buf; hdr->id = cpu_to_le16(cmd); hdr->size = cpu_to_le16(len); hdr->echo = cpu_to_le16(echo); hdr->handle = cpu_to_le16(handle); memcpy(buf + sizeof(*hdr), obuf, *obuf_len); *obuf_len = len; return buf; } static int dln2_send_wait(struct dln2_dev *dln2, u16 handle, u16 cmd, u16 echo, const void *obuf, int 
obuf_len) { int ret = 0; int len = obuf_len; void *buf; int actual; buf = dln2_prep_buf(handle, cmd, echo, obuf, &len, GFP_KERNEL); if (!buf) return -ENOMEM; ret = usb_bulk_msg(dln2->usb_dev, usb_sndbulkpipe(dln2->usb_dev, dln2->ep_out), buf, len, &actual, DLN2_USB_TIMEOUT); kfree(buf); return ret; } static bool find_free_slot(struct dln2_dev *dln2, u16 handle, int *slot) { struct dln2_mod_rx_slots *rxs; unsigned long flags; if (dln2->disconnect) { *slot = -ENODEV; return true; } rxs = &dln2->mod_rx_slots[handle]; spin_lock_irqsave(&rxs->lock, flags); *slot = find_first_zero_bit(rxs->bmap, DLN2_MAX_RX_SLOTS); if (*slot < DLN2_MAX_RX_SLOTS) { struct dln2_rx_context *rxc = &rxs->slots[*slot]; set_bit(*slot, rxs->bmap); rxc->in_use = true; } spin_unlock_irqrestore(&rxs->lock, flags); return *slot < DLN2_MAX_RX_SLOTS; } static int alloc_rx_slot(struct dln2_dev *dln2, u16 handle) { int ret; int slot; /* * No need to timeout here, the wait is bounded by the timeout in * _dln2_transfer. */ ret = wait_event_interruptible(dln2->mod_rx_slots[handle].wq, find_free_slot(dln2, handle, &slot)); if (ret < 0) return ret; return slot; } static void free_rx_slot(struct dln2_dev *dln2, u16 handle, int slot) { struct dln2_mod_rx_slots *rxs; struct urb *urb = NULL; unsigned long flags; struct dln2_rx_context *rxc; rxs = &dln2->mod_rx_slots[handle]; spin_lock_irqsave(&rxs->lock, flags); clear_bit(slot, rxs->bmap); rxc = &rxs->slots[slot]; rxc->in_use = false; urb = rxc->urb; rxc->urb = NULL; reinit_completion(&rxc->done); spin_unlock_irqrestore(&rxs->lock, flags); if (urb) { int err; struct device *dev = &dln2->interface->dev; err = usb_submit_urb(urb, GFP_KERNEL); if (err < 0) dev_err(dev, "failed to resubmit RX URB: %d\n", err); } wake_up_interruptible(&rxs->wq); } static int _dln2_transfer(struct dln2_dev *dln2, u16 handle, u16 cmd, const void *obuf, unsigned obuf_len, void *ibuf, unsigned *ibuf_len) { int ret = 0; int rx_slot; struct dln2_response *rsp; struct dln2_rx_context *rxc; struct device *dev = &dln2->interface->dev; const unsigned long timeout = msecs_to_jiffies(DLN2_USB_TIMEOUT); struct dln2_mod_rx_slots *rxs = &dln2->mod_rx_slots[handle]; int size; spin_lock(&dln2->disconnect_lock); if (!dln2->disconnect) dln2->active_transfers++; else ret = -ENODEV; spin_unlock(&dln2->disconnect_lock); if (ret) return ret; rx_slot = alloc_rx_slot(dln2, handle); if (rx_slot < 0) { ret = rx_slot; goto out_decr; } ret = dln2_send_wait(dln2, handle, cmd, rx_slot, obuf, obuf_len); if (ret < 0) { dev_err(dev, "USB write failed: %d\n", ret); goto out_free_rx_slot; } rxc = &rxs->slots[rx_slot]; ret = wait_for_completion_interruptible_timeout(&rxc->done, timeout); if (ret <= 0) { if (!ret) ret = -ETIMEDOUT; goto out_free_rx_slot; } else { ret = 0; } if (dln2->disconnect) { ret = -ENODEV; goto out_free_rx_slot; } /* if we got here we know that the response header has been checked */ rsp = rxc->urb->transfer_buffer; size = le16_to_cpu(rsp->hdr.size); if (size < sizeof(*rsp)) { ret = -EPROTO; goto out_free_rx_slot; } if (le16_to_cpu(rsp->result) > 0x80) { dev_dbg(dev, "%d received response with error %d\n", handle, le16_to_cpu(rsp->result)); ret = -EREMOTEIO; goto out_free_rx_slot; } if (!ibuf) goto out_free_rx_slot; if (*ibuf_len > size - sizeof(*rsp)) *ibuf_len = size - sizeof(*rsp); memcpy(ibuf, rsp + 1, *ibuf_len); out_free_rx_slot: free_rx_slot(dln2, handle, rx_slot); out_decr: spin_lock(&dln2->disconnect_lock); dln2->active_transfers--; spin_unlock(&dln2->disconnect_lock); if (dln2->disconnect) 
wake_up(&dln2->disconnect_wq); return ret; } int dln2_transfer(struct platform_device *pdev, u16 cmd, const void *obuf, unsigned obuf_len, void *ibuf, unsigned *ibuf_len) { struct dln2_platform_data *dln2_pdata; struct dln2_dev *dln2; u16 handle; dln2 = dev_get_drvdata(pdev->dev.parent); dln2_pdata = dev_get_platdata(&pdev->dev); handle = dln2_pdata->handle; return _dln2_transfer(dln2, handle, cmd, obuf, obuf_len, ibuf, ibuf_len); } EXPORT_SYMBOL(dln2_transfer); static int dln2_check_hw(struct dln2_dev *dln2) { int ret; __le32 hw_type; int len = sizeof(hw_type); ret = _dln2_transfer(dln2, DLN2_HANDLE_CTRL, CMD_GET_DEVICE_VER, NULL, 0, &hw_type, &len); if (ret < 0) return ret; if (len < sizeof(hw_type)) return -EREMOTEIO; if (le32_to_cpu(hw_type) != DLN2_HW_ID) { dev_err(&dln2->interface->dev, "Device ID 0x%x not supported\n", le32_to_cpu(hw_type)); return -ENODEV; } return 0; } static int dln2_print_serialno(struct dln2_dev *dln2) { int ret; __le32 serial_no; int len = sizeof(serial_no); struct device *dev = &dln2->interface->dev; ret = _dln2_transfer(dln2, DLN2_HANDLE_CTRL, CMD_GET_DEVICE_SN, NULL, 0, &serial_no, &len); if (ret < 0) return ret; if (len < sizeof(serial_no)) return -EREMOTEIO; dev_info(dev, "Diolan DLN2 serial %u\n", le32_to_cpu(serial_no)); return 0; } static int dln2_hw_init(struct dln2_dev *dln2) { int ret; ret = dln2_check_hw(dln2); if (ret < 0) return ret; return dln2_print_serialno(dln2); } static void dln2_free_rx_urbs(struct dln2_dev *dln2) { int i; for (i = 0; i < DLN2_MAX_URBS; i++) { usb_free_urb(dln2->rx_urb[i]); kfree(dln2->rx_buf[i]); } } static void dln2_stop_rx_urbs(struct dln2_dev *dln2) { int i; for (i = 0; i < DLN2_MAX_URBS; i++) usb_kill_urb(dln2->rx_urb[i]); } static void dln2_free(struct dln2_dev *dln2) { dln2_free_rx_urbs(dln2); usb_put_dev(dln2->usb_dev); kfree(dln2); } static int dln2_setup_rx_urbs(struct dln2_dev *dln2, struct usb_host_interface *hostif) { int i; const int rx_max_size = DLN2_RX_BUF_SIZE; for (i = 0; i < DLN2_MAX_URBS; i++) { dln2->rx_buf[i] = kmalloc(rx_max_size, GFP_KERNEL); if (!dln2->rx_buf[i]) return -ENOMEM; dln2->rx_urb[i] = usb_alloc_urb(0, GFP_KERNEL); if (!dln2->rx_urb[i]) return -ENOMEM; usb_fill_bulk_urb(dln2->rx_urb[i], dln2->usb_dev, usb_rcvbulkpipe(dln2->usb_dev, dln2->ep_in), dln2->rx_buf[i], rx_max_size, dln2_rx, dln2); } return 0; } static int dln2_start_rx_urbs(struct dln2_dev *dln2, gfp_t gfp) { struct device *dev = &dln2->interface->dev; int ret; int i; for (i = 0; i < DLN2_MAX_URBS; i++) { ret = usb_submit_urb(dln2->rx_urb[i], gfp); if (ret < 0) { dev_err(dev, "failed to submit RX URB: %d\n", ret); return ret; } } return 0; } enum { DLN2_ACPI_MATCH_GPIO = 0, DLN2_ACPI_MATCH_I2C = 1, DLN2_ACPI_MATCH_SPI = 2, DLN2_ACPI_MATCH_ADC = 3, }; static struct dln2_platform_data dln2_pdata_gpio = { .handle = DLN2_HANDLE_GPIO, }; static struct mfd_cell_acpi_match dln2_acpi_match_gpio = { .adr = DLN2_ACPI_MATCH_GPIO, }; /* Only one I2C port seems to be supported on current hardware */ static struct dln2_platform_data dln2_pdata_i2c = { .handle = DLN2_HANDLE_I2C, .port = 0, }; static struct mfd_cell_acpi_match dln2_acpi_match_i2c = { .adr = DLN2_ACPI_MATCH_I2C, }; /* Only one SPI port supported */ static struct dln2_platform_data dln2_pdata_spi = { .handle = DLN2_HANDLE_SPI, .port = 0, }; static struct mfd_cell_acpi_match dln2_acpi_match_spi = { .adr = DLN2_ACPI_MATCH_SPI, }; /* Only one ADC port supported */ static struct dln2_platform_data dln2_pdata_adc = { .handle = DLN2_HANDLE_ADC, .port = 0, }; static struct 
mfd_cell_acpi_match dln2_acpi_match_adc = { .adr = DLN2_ACPI_MATCH_ADC, }; static const struct mfd_cell dln2_devs[] = { { .name = "dln2-gpio", .acpi_match = &dln2_acpi_match_gpio, .platform_data = &dln2_pdata_gpio, .pdata_size = sizeof(struct dln2_platform_data), }, { .name = "dln2-i2c", .acpi_match = &dln2_acpi_match_i2c, .platform_data = &dln2_pdata_i2c, .pdata_size = sizeof(struct dln2_platform_data), }, { .name = "dln2-spi", .acpi_match = &dln2_acpi_match_spi, .platform_data = &dln2_pdata_spi, .pdata_size = sizeof(struct dln2_platform_data), }, { .name = "dln2-adc", .acpi_match = &dln2_acpi_match_adc, .platform_data = &dln2_pdata_adc, .pdata_size = sizeof(struct dln2_platform_data), }, }; static void dln2_stop(struct dln2_dev *dln2) { int i, j; /* don't allow starting new transfers */ spin_lock(&dln2->disconnect_lock); dln2->disconnect = true; spin_unlock(&dln2->disconnect_lock); /* cancel in progress transfers */ for (i = 0; i < DLN2_HANDLES; i++) { struct dln2_mod_rx_slots *rxs = &dln2->mod_rx_slots[i]; unsigned long flags; spin_lock_irqsave(&rxs->lock, flags); /* cancel all response waiters */ for (j = 0; j < DLN2_MAX_RX_SLOTS; j++) { struct dln2_rx_context *rxc = &rxs->slots[j]; if (rxc->in_use) complete(&rxc->done); } spin_unlock_irqrestore(&rxs->lock, flags); } /* wait for transfers to end */ wait_event(dln2->disconnect_wq, !dln2->active_transfers); dln2_stop_rx_urbs(dln2); } static void dln2_disconnect(struct usb_interface *interface) { struct dln2_dev *dln2 = usb_get_intfdata(interface); dln2_stop(dln2); mfd_remove_devices(&interface->dev); dln2_free(dln2); } static int dln2_probe(struct usb_interface *interface, const struct usb_device_id *usb_id) { struct usb_host_interface *hostif = interface->cur_altsetting; struct usb_endpoint_descriptor *epin; struct usb_endpoint_descriptor *epout; struct device *dev = &interface->dev; struct dln2_dev *dln2; int ret; int i, j; if (hostif->desc.bInterfaceNumber != 0) return -ENODEV; ret = usb_find_common_endpoints(hostif, &epin, &epout, NULL, NULL); if (ret) return ret; dln2 = kzalloc(sizeof(*dln2), GFP_KERNEL); if (!dln2) return -ENOMEM; dln2->ep_out = epout->bEndpointAddress; dln2->ep_in = epin->bEndpointAddress; dln2->usb_dev = usb_get_dev(interface_to_usbdev(interface)); dln2->interface = interface; usb_set_intfdata(interface, dln2); init_waitqueue_head(&dln2->disconnect_wq); for (i = 0; i < DLN2_HANDLES; i++) { init_waitqueue_head(&dln2->mod_rx_slots[i].wq); spin_lock_init(&dln2->mod_rx_slots[i].lock); for (j = 0; j < DLN2_MAX_RX_SLOTS; j++) init_completion(&dln2->mod_rx_slots[i].slots[j].done); } spin_lock_init(&dln2->event_cb_lock); spin_lock_init(&dln2->disconnect_lock); INIT_LIST_HEAD(&dln2->event_cb_list); ret = dln2_setup_rx_urbs(dln2, hostif); if (ret) goto out_free; ret = dln2_start_rx_urbs(dln2, GFP_KERNEL); if (ret) goto out_stop_rx; ret = dln2_hw_init(dln2); if (ret < 0) { dev_err(dev, "failed to initialize hardware\n"); goto out_stop_rx; } ret = mfd_add_hotplug_devices(dev, dln2_devs, ARRAY_SIZE(dln2_devs)); if (ret != 0) { dev_err(dev, "failed to add mfd devices to core\n"); goto out_stop_rx; } return 0; out_stop_rx: dln2_stop_rx_urbs(dln2); out_free: dln2_free(dln2); return ret; } static int dln2_suspend(struct usb_interface *iface, pm_message_t message) { struct dln2_dev *dln2 = usb_get_intfdata(iface); dln2_stop(dln2); return 0; } static int dln2_resume(struct usb_interface *iface) { struct dln2_dev *dln2 = usb_get_intfdata(iface); dln2->disconnect = false; return dln2_start_rx_urbs(dln2, GFP_NOIO); } static const 
struct usb_device_id dln2_table[] = { { USB_DEVICE(0xa257, 0x2013) }, { } }; MODULE_DEVICE_TABLE(usb, dln2_table); static struct usb_driver dln2_driver = { .name = "dln2", .probe = dln2_probe, .disconnect = dln2_disconnect, .id_table = dln2_table, .suspend = dln2_suspend, .resume = dln2_resume, }; module_usb_driver(dln2_driver); MODULE_AUTHOR("Octavian Purdila <octavian.purdila@intel.com>"); MODULE_DESCRIPTION("Core driver for the Diolan DLN2 interface adapter"); MODULE_LICENSE("GPL v2");
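A short sketch (not part of the driver) of how one of the MFD cells registered above would use the exported helpers; the command and event IDs and the my_*() names are hypothetical, and the callback signature follows the invocation shown in dln2_run_event_callbacks().

#include <linux/platform_device.h>
#include <linux/mfd/dln2.h>

#define MY_CMD_GET_VAL	0x1234	/* hypothetical module-specific command */
#define MY_EVENT_ID	0x1235	/* hypothetical event ID */

/* Called from dln2_run_event_callbacks() in URB completion context. */
static void my_event(struct platform_device *pdev, u16 echo,
		     const void *data, int len)
{
	dev_dbg(&pdev->dev, "event echo=%u len=%d\n", echo, len);
}

/* One request/response round trip; blocks for up to DLN2_USB_TIMEOUT. */
static int my_get_value(struct platform_device *pdev, u8 port, __le16 *val)
{
	unsigned int ilen = sizeof(*val);
	int ret;

	ret = dln2_transfer(pdev, MY_CMD_GET_VAL, &port, sizeof(port),
			    val, &ilen);
	if (ret < 0)
		return ret;
	return ilen == sizeof(*val) ? 0 : -EPROTO;
}

static int my_probe(struct platform_device *pdev)
{
	return dln2_register_event_cb(pdev, MY_EVENT_ID, my_event);
}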
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Supervisor Mode Access Prevention support
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: H. Peter Anvin <hpa@linux.intel.com>
 */

#ifndef _ASM_X86_SMAP_H
#define _ASM_X86_SMAP_H

#include <asm/nops.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

#ifdef __ASSEMBLY__

#define ASM_CLAC \
	ALTERNATIVE "", "clac", X86_FEATURE_SMAP
#define ASM_STAC \
	ALTERNATIVE "", "stac", X86_FEATURE_SMAP

#else /* __ASSEMBLY__ */

static __always_inline void clac(void)
{
	/* Note: a barrier is implicit in alternative() */
	alternative("", "clac", X86_FEATURE_SMAP);
}

static __always_inline void stac(void)
{
	/* Note: a barrier is implicit in alternative() */
	alternative("", "stac", X86_FEATURE_SMAP);
}

static __always_inline unsigned long smap_save(void)
{
	unsigned long flags;

	asm volatile ("# smap_save\n\t"
		      ALTERNATIVE("", "pushf; pop %0; " "clac" "\n\t",
				  X86_FEATURE_SMAP)
		      : "=rm" (flags) : : "memory", "cc");

	return flags;
}

static __always_inline void smap_restore(unsigned long flags)
{
	asm volatile ("# smap_restore\n\t"
		      ALTERNATIVE("", "push %0; popf\n\t",
				  X86_FEATURE_SMAP)
		      : : "g" (flags) : "memory", "cc");
}

/* These macros can be used in asm() statements */
#define ASM_CLAC \
	ALTERNATIVE("", "clac", X86_FEATURE_SMAP)
#define ASM_STAC \
	ALTERNATIVE("", "stac", X86_FEATURE_SMAP)

#endif /* __ASSEMBLY__ */

#endif /* _ASM_X86_SMAP_H */
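A small illustrative sketch (not part of the header): on x86 the generic user_access_begin()/user_access_end() pair reduces to the stac()/clac() helpers above after an access_ok() check, with unsafe_get_user() doing a raw access inside the open window; read_flag() is a hypothetical helper.

#include <linux/uaccess.h>

/*
 * Hypothetical helper: fetch one int from userspace inside an explicit
 * STAC/CLAC window rather than via plain get_user().
 */
static int read_flag(const int __user *uptr, int *flag)
{
	if (!user_access_begin(uptr, sizeof(*uptr)))	/* access_ok() + stac() */
		return -EFAULT;
	unsafe_get_user(*flag, uptr, efault);		/* raw load, no extra clac/stac */
	user_access_end();				/* clac() */
	return 0;

efault:
	user_access_end();
	return -EFAULT;
}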
888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 // SPDX-License-Identifier: GPL-2.0 /* * usb-serial driver for Quatech USB 2 devices * * Copyright (C) 2012 Bill Pemberton (wfp5p@virginia.edu) * * These devices all have only 1 bulk in and 1 bulk out that is shared * for all serial ports. * */ #include <linux/unaligned.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/serial.h> #include <linux/usb.h> #include <linux/usb/serial.h> #include <linux/serial_reg.h> #include <linux/uaccess.h> /* default urb timeout for usb operations */ #define QT2_USB_TIMEOUT USB_CTRL_SET_TIMEOUT #define QT_OPEN_CLOSE_CHANNEL 0xca #define QT_SET_GET_DEVICE 0xc2 #define QT_SET_GET_REGISTER 0xc0 #define QT_GET_SET_PREBUF_TRIG_LVL 0xcc #define QT_SET_ATF 0xcd #define QT_TRANSFER_IN 0xc0 #define QT_HW_FLOW_CONTROL_MASK 0xc5 #define QT_SW_FLOW_CONTROL_MASK 0xc6 #define QT2_BREAK_CONTROL 0xc8 #define QT2_GET_SET_UART 0xc1 #define QT2_FLUSH_DEVICE 0xc4 #define QT2_GET_SET_QMCR 0xe1 #define QT2_QMCR_RS232 0x40 #define QT2_QMCR_RS422 0x10 #define SERIAL_CRTSCTS ((UART_MCR_RTS << 8) | UART_MSR_CTS) #define SERIAL_EVEN_PARITY (UART_LCR_PARITY | UART_LCR_EPAR) /* status bytes for the device */ #define QT2_CONTROL_BYTE 0x1b #define QT2_LINE_STATUS 0x00 /* following 1 byte is line status */ #define QT2_MODEM_STATUS 0x01 /* following 1 byte is modem status */ #define QT2_XMIT_HOLD 0x02 /* following 2 bytes are ?? */ #define QT2_CHANGE_PORT 0x03 /* following 1 byte is port to change to */ #define QT2_REC_FLUSH 0x04 /* no following info */ #define QT2_XMIT_FLUSH 0x05 /* no following info */ #define QT2_CONTROL_ESCAPE 0xff /* pass through previous 2 control bytes */ #define MAX_BAUD_RATE 921600 #define DEFAULT_BAUD_RATE 9600 #define QT2_READ_BUFFER_SIZE 512 /* size of read buffer */ #define QT2_WRITE_BUFFER_SIZE 512 /* size of write buffer */ #define QT2_WRITE_CONTROL_SIZE 5 /* control bytes used for a write */ #define DRIVER_DESC "Quatech 2nd gen USB to Serial Driver" #define USB_VENDOR_ID_QUATECH 0x061d #define QUATECH_SSU2_100 0xC120 /* RS232 single port */ #define QUATECH_DSU2_100 0xC140 /* RS232 dual port */ #define QUATECH_DSU2_400 0xC150 /* RS232/422/485 dual port */ #define QUATECH_QSU2_100 0xC160 /* RS232 four port */ #define QUATECH_QSU2_400 0xC170 /* RS232/422/485 four port */ #define QUATECH_ESU2_100 0xC1A0 /* RS232 eight port */ #define QUATECH_ESU2_400 0xC180 /* RS232/422/485 eight port */ struct qt2_device_detail { int product_id; int num_ports; }; #define QT_DETAILS(prod, ports) \ .product_id = (prod), \ .num_ports = (ports) static const struct qt2_device_detail qt2_device_details[] = { {QT_DETAILS(QUATECH_SSU2_100, 1)}, {QT_DETAILS(QUATECH_DSU2_400, 2)}, {QT_DETAILS(QUATECH_DSU2_100, 2)}, {QT_DETAILS(QUATECH_QSU2_400, 4)}, {QT_DETAILS(QUATECH_QSU2_100, 4)}, {QT_DETAILS(QUATECH_ESU2_400, 8)}, {QT_DETAILS(QUATECH_ESU2_100, 8)}, {QT_DETAILS(0, 0)} /* Terminating entry */ }; static const struct usb_device_id id_table[] = { {USB_DEVICE(USB_VENDOR_ID_QUATECH, QUATECH_SSU2_100)}, {USB_DEVICE(USB_VENDOR_ID_QUATECH, QUATECH_DSU2_100)}, {USB_DEVICE(USB_VENDOR_ID_QUATECH, QUATECH_DSU2_400)}, {USB_DEVICE(USB_VENDOR_ID_QUATECH, QUATECH_QSU2_100)}, 
{USB_DEVICE(USB_VENDOR_ID_QUATECH, QUATECH_QSU2_400)}, {USB_DEVICE(USB_VENDOR_ID_QUATECH, QUATECH_ESU2_100)}, {USB_DEVICE(USB_VENDOR_ID_QUATECH, QUATECH_ESU2_400)}, {} /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, id_table); struct qt2_serial_private { unsigned char current_port; /* current port for incoming data */ struct urb *read_urb; /* shared among all ports */ char *read_buffer; }; struct qt2_port_private { u8 device_port; spinlock_t urb_lock; bool urb_in_use; struct urb *write_urb; char *write_buffer; spinlock_t lock; u8 shadowLSR; u8 shadowMSR; struct usb_serial_port *port; }; static void qt2_update_lsr(struct usb_serial_port *port, unsigned char *ch); static void qt2_update_msr(struct usb_serial_port *port, unsigned char *ch); static void qt2_write_bulk_callback(struct urb *urb); static void qt2_read_bulk_callback(struct urb *urb); static void qt2_release(struct usb_serial *serial) { struct qt2_serial_private *serial_priv; serial_priv = usb_get_serial_data(serial); usb_kill_urb(serial_priv->read_urb); usb_free_urb(serial_priv->read_urb); kfree(serial_priv->read_buffer); kfree(serial_priv); } static inline int calc_baud_divisor(int baudrate) { int divisor, rem; divisor = MAX_BAUD_RATE / baudrate; rem = MAX_BAUD_RATE % baudrate; /* Round to nearest divisor */ if (((rem * 2) >= baudrate) && (baudrate != 110)) divisor++; return divisor; } static inline int qt2_set_port_config(struct usb_device *dev, unsigned char port_number, u16 baudrate, u16 lcr) { int divisor = calc_baud_divisor(baudrate); u16 index = ((u16) (lcr << 8) | (u16) (port_number)); return usb_control_msg(dev, usb_sndctrlpipe(dev, 0), QT2_GET_SET_UART, 0x40, divisor, index, NULL, 0, QT2_USB_TIMEOUT); } static inline int qt2_control_msg(struct usb_device *dev, u8 request, u16 data, u16 index) { return usb_control_msg(dev, usb_sndctrlpipe(dev, 0), request, 0x40, data, index, NULL, 0, QT2_USB_TIMEOUT); } static inline int qt2_getregister(struct usb_device *dev, u8 uart, u8 reg, u8 *data) { int ret; ret = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), QT_SET_GET_REGISTER, 0xc0, reg, uart, data, sizeof(*data), QT2_USB_TIMEOUT); if (ret < (int)sizeof(*data)) { if (ret >= 0) ret = -EIO; } return ret; } static inline int qt2_setregister(struct usb_device *dev, u8 uart, u8 reg, u16 data) { u16 value = (data << 8) | reg; return usb_control_msg(dev, usb_sndctrlpipe(dev, 0), QT_SET_GET_REGISTER, 0x40, value, uart, NULL, 0, QT2_USB_TIMEOUT); } static inline int update_mctrl(struct qt2_port_private *port_priv, unsigned int set, unsigned int clear) { struct usb_serial_port *port = port_priv->port; struct usb_device *dev = port->serial->dev; unsigned urb_value; int status; if (((set | clear) & (TIOCM_DTR | TIOCM_RTS)) == 0) { dev_dbg(&port->dev, "update_mctrl - DTR|RTS not being set|cleared\n"); return 0; /* no change */ } clear &= ~set; /* 'set' takes precedence over 'clear' */ urb_value = 0; if (set & TIOCM_DTR) urb_value |= UART_MCR_DTR; if (set & TIOCM_RTS) urb_value |= UART_MCR_RTS; status = qt2_setregister(dev, port_priv->device_port, UART_MCR, urb_value); if (status < 0) dev_err(&port->dev, "update_mctrl - Error from MODEM_CTRL urb: %i\n", status); return status; } static int qt2_calc_num_ports(struct usb_serial *serial, struct usb_serial_endpoints *epds) { struct qt2_device_detail d; int i; for (i = 0; d = qt2_device_details[i], d.product_id != 0; i++) { if (d.product_id == le16_to_cpu(serial->dev->descriptor.idProduct)) return d.num_ports; } /* we didn't recognize the device */ dev_err(&serial->dev->dev, "don't know the 
number of ports, assuming 1\n"); return 1; } static void qt2_set_termios(struct tty_struct *tty, struct usb_serial_port *port, const struct ktermios *old_termios) { struct usb_device *dev = port->serial->dev; struct qt2_port_private *port_priv; struct ktermios *termios = &tty->termios; u16 baud; unsigned int cflag = termios->c_cflag; u16 new_lcr = 0; int status; port_priv = usb_get_serial_port_data(port); if (cflag & PARENB) { if (cflag & PARODD) new_lcr |= UART_LCR_PARITY; else new_lcr |= SERIAL_EVEN_PARITY; } new_lcr |= UART_LCR_WLEN(tty_get_char_size(cflag)); baud = tty_get_baud_rate(tty); if (!baud) baud = 9600; status = qt2_set_port_config(dev, port_priv->device_port, baud, new_lcr); if (status < 0) dev_err(&port->dev, "%s - qt2_set_port_config failed: %i\n", __func__, status); if (cflag & CRTSCTS) status = qt2_control_msg(dev, QT_HW_FLOW_CONTROL_MASK, SERIAL_CRTSCTS, port_priv->device_port); else status = qt2_control_msg(dev, QT_HW_FLOW_CONTROL_MASK, 0, port_priv->device_port); if (status < 0) dev_err(&port->dev, "%s - set HW flow control failed: %i\n", __func__, status); if (I_IXOFF(tty) || I_IXON(tty)) { u16 x = ((u16) (START_CHAR(tty) << 8) | (u16) (STOP_CHAR(tty))); status = qt2_control_msg(dev, QT_SW_FLOW_CONTROL_MASK, x, port_priv->device_port); } else status = qt2_control_msg(dev, QT_SW_FLOW_CONTROL_MASK, 0, port_priv->device_port); if (status < 0) dev_err(&port->dev, "%s - set SW flow control failed: %i\n", __func__, status); } static int qt2_open(struct tty_struct *tty, struct usb_serial_port *port) { struct usb_serial *serial; struct qt2_port_private *port_priv; u8 *data; u16 device_port; int status; unsigned long flags; device_port = port->port_number; serial = port->serial; port_priv = usb_get_serial_port_data(port); /* set the port to RS232 mode */ status = qt2_control_msg(serial->dev, QT2_GET_SET_QMCR, QT2_QMCR_RS232, device_port); if (status < 0) { dev_err(&port->dev, "%s failed to set RS232 mode for port %i error %i\n", __func__, device_port, status); return status; } data = kzalloc(2, GFP_KERNEL); if (!data) return -ENOMEM; /* open the port */ status = usb_control_msg(serial->dev, usb_rcvctrlpipe(serial->dev, 0), QT_OPEN_CLOSE_CHANNEL, 0xc0, 0, device_port, data, 2, QT2_USB_TIMEOUT); if (status < 2) { dev_err(&port->dev, "%s - open port failed %i\n", __func__, status); if (status >= 0) status = -EIO; kfree(data); return status; } spin_lock_irqsave(&port_priv->lock, flags); port_priv->shadowLSR = data[0]; port_priv->shadowMSR = data[1]; spin_unlock_irqrestore(&port_priv->lock, flags); kfree(data); /* set to default speed and 8bit word size */ status = qt2_set_port_config(serial->dev, device_port, DEFAULT_BAUD_RATE, UART_LCR_WLEN8); if (status < 0) { dev_err(&port->dev, "%s - initial setup failed (%i)\n", __func__, device_port); return status; } port_priv->device_port = (u8) device_port; if (tty) qt2_set_termios(tty, port, &tty->termios); return 0; } static void qt2_close(struct usb_serial_port *port) { struct usb_serial *serial; struct qt2_port_private *port_priv; int i; serial = port->serial; port_priv = usb_get_serial_port_data(port); usb_kill_urb(port_priv->write_urb); /* flush the port transmit buffer */ i = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0), QT2_FLUSH_DEVICE, 0x40, 1, port_priv->device_port, NULL, 0, QT2_USB_TIMEOUT); if (i < 0) dev_err(&port->dev, "%s - transmit buffer flush failed: %i\n", __func__, i); /* flush the port receive buffer */ i = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0), QT2_FLUSH_DEVICE, 0x40, 0, 
port_priv->device_port, NULL, 0, QT2_USB_TIMEOUT); if (i < 0) dev_err(&port->dev, "%s - receive buffer flush failed: %i\n", __func__, i); /* close the port */ i = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0), QT_OPEN_CLOSE_CHANNEL, 0x40, 0, port_priv->device_port, NULL, 0, QT2_USB_TIMEOUT); if (i < 0) dev_err(&port->dev, "%s - close port failed %i\n", __func__, i); } static void qt2_disconnect(struct usb_serial *serial) { struct qt2_serial_private *serial_priv = usb_get_serial_data(serial); usb_kill_urb(serial_priv->read_urb); } static void qt2_process_status(struct usb_serial_port *port, unsigned char *ch) { switch (*ch) { case QT2_LINE_STATUS: qt2_update_lsr(port, ch + 1); break; case QT2_MODEM_STATUS: qt2_update_msr(port, ch + 1); break; } } static void qt2_process_read_urb(struct urb *urb) { struct usb_serial *serial; struct qt2_serial_private *serial_priv; struct usb_serial_port *port; bool escapeflag; unsigned char *ch; int i; unsigned char newport; int len = urb->actual_length; if (!len) return; ch = urb->transfer_buffer; serial = urb->context; serial_priv = usb_get_serial_data(serial); port = serial->port[serial_priv->current_port]; for (i = 0; i < urb->actual_length; i++) { ch = (unsigned char *)urb->transfer_buffer + i; if ((i <= (len - 3)) && (*ch == QT2_CONTROL_BYTE) && (*(ch + 1) == QT2_CONTROL_BYTE)) { escapeflag = false; switch (*(ch + 2)) { case QT2_LINE_STATUS: case QT2_MODEM_STATUS: if (i > (len - 4)) { dev_warn(&port->dev, "%s - status message too short\n", __func__); break; } qt2_process_status(port, ch + 2); i += 3; escapeflag = true; break; case QT2_XMIT_HOLD: if (i > (len - 5)) { dev_warn(&port->dev, "%s - xmit_empty message too short\n", __func__); break; } /* bytes_written = (ch[1] << 4) + ch[0]; */ i += 4; escapeflag = true; break; case QT2_CHANGE_PORT: if (i > (len - 4)) { dev_warn(&port->dev, "%s - change_port message too short\n", __func__); break; } tty_flip_buffer_push(&port->port); newport = *(ch + 3); if (newport >= serial->num_ports) { dev_err(&port->dev, "%s - port change to invalid port: %i\n", __func__, newport); break; } serial_priv->current_port = newport; port = serial->port[serial_priv->current_port]; i += 3; escapeflag = true; break; case QT2_REC_FLUSH: case QT2_XMIT_FLUSH: i += 2; escapeflag = true; break; case QT2_CONTROL_ESCAPE: tty_insert_flip_string(&port->port, ch, 2); i += 2; escapeflag = true; break; default: dev_warn(&port->dev, "%s - unsupported command %i\n", __func__, *(ch + 2)); break; } if (escapeflag) continue; } tty_insert_flip_char(&port->port, *ch, TTY_NORMAL); } tty_flip_buffer_push(&port->port); } static void qt2_write_bulk_callback(struct urb *urb) { struct usb_serial_port *port; struct qt2_port_private *port_priv; unsigned long flags; port = urb->context; port_priv = usb_get_serial_port_data(port); spin_lock_irqsave(&port_priv->urb_lock, flags); port_priv->urb_in_use = false; usb_serial_port_softint(port); spin_unlock_irqrestore(&port_priv->urb_lock, flags); } static void qt2_read_bulk_callback(struct urb *urb) { struct usb_serial *serial = urb->context; int status; if (urb->status) { dev_warn(&serial->dev->dev, "%s - non-zero urb status: %i\n", __func__, urb->status); return; } qt2_process_read_urb(urb); status = usb_submit_urb(urb, GFP_ATOMIC); if (status != 0) dev_err(&serial->dev->dev, "%s - resubmit read urb failed: %i\n", __func__, status); } static int qt2_setup_urbs(struct usb_serial *serial) { struct usb_serial_port *port0; struct qt2_serial_private *serial_priv; int status; port0 = serial->port[0]; 
serial_priv = usb_get_serial_data(serial); serial_priv->read_urb = usb_alloc_urb(0, GFP_KERNEL); if (!serial_priv->read_urb) return -ENOMEM; usb_fill_bulk_urb(serial_priv->read_urb, serial->dev, usb_rcvbulkpipe(serial->dev, port0->bulk_in_endpointAddress), serial_priv->read_buffer, QT2_READ_BUFFER_SIZE, qt2_read_bulk_callback, serial); status = usb_submit_urb(serial_priv->read_urb, GFP_KERNEL); if (status != 0) { dev_err(&serial->dev->dev, "%s - submit read urb failed %i\n", __func__, status); usb_free_urb(serial_priv->read_urb); return status; } return 0; } static int qt2_attach(struct usb_serial *serial) { struct qt2_serial_private *serial_priv; int status; /* power on unit */ status = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0), 0xc2, 0x40, 0x8000, 0, NULL, 0, QT2_USB_TIMEOUT); if (status < 0) { dev_err(&serial->dev->dev, "%s - failed to power on unit: %i\n", __func__, status); return status; } serial_priv = kzalloc(sizeof(*serial_priv), GFP_KERNEL); if (!serial_priv) return -ENOMEM; serial_priv->read_buffer = kmalloc(QT2_READ_BUFFER_SIZE, GFP_KERNEL); if (!serial_priv->read_buffer) { status = -ENOMEM; goto err_buf; } usb_set_serial_data(serial, serial_priv); status = qt2_setup_urbs(serial); if (status != 0) goto attach_failed; return 0; attach_failed: kfree(serial_priv->read_buffer); err_buf: kfree(serial_priv); return status; } static int qt2_port_probe(struct usb_serial_port *port) { struct usb_serial *serial = port->serial; struct qt2_port_private *port_priv; u8 bEndpointAddress; port_priv = kzalloc(sizeof(*port_priv), GFP_KERNEL); if (!port_priv) return -ENOMEM; spin_lock_init(&port_priv->lock); spin_lock_init(&port_priv->urb_lock); port_priv->port = port; port_priv->write_buffer = kmalloc(QT2_WRITE_BUFFER_SIZE, GFP_KERNEL); if (!port_priv->write_buffer) goto err_buf; port_priv->write_urb = usb_alloc_urb(0, GFP_KERNEL); if (!port_priv->write_urb) goto err_urb; bEndpointAddress = serial->port[0]->bulk_out_endpointAddress; usb_fill_bulk_urb(port_priv->write_urb, serial->dev, usb_sndbulkpipe(serial->dev, bEndpointAddress), port_priv->write_buffer, QT2_WRITE_BUFFER_SIZE, qt2_write_bulk_callback, port); usb_set_serial_port_data(port, port_priv); return 0; err_urb: kfree(port_priv->write_buffer); err_buf: kfree(port_priv); return -ENOMEM; } static void qt2_port_remove(struct usb_serial_port *port) { struct qt2_port_private *port_priv; port_priv = usb_get_serial_port_data(port); usb_free_urb(port_priv->write_urb); kfree(port_priv->write_buffer); kfree(port_priv); } static int qt2_tiocmget(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct usb_device *dev = port->serial->dev; struct qt2_port_private *port_priv = usb_get_serial_port_data(port); u8 *d; int r; d = kzalloc(2, GFP_KERNEL); if (!d) return -ENOMEM; r = qt2_getregister(dev, port_priv->device_port, UART_MCR, d); if (r < 0) goto mget_out; r = qt2_getregister(dev, port_priv->device_port, UART_MSR, d + 1); if (r < 0) goto mget_out; r = (d[0] & UART_MCR_DTR ? TIOCM_DTR : 0) | (d[0] & UART_MCR_RTS ? TIOCM_RTS : 0) | (d[1] & UART_MSR_CTS ? TIOCM_CTS : 0) | (d[1] & UART_MSR_DCD ? TIOCM_CAR : 0) | (d[1] & UART_MSR_RI ? TIOCM_RI : 0) | (d[1] & UART_MSR_DSR ? 
TIOCM_DSR : 0); mget_out: kfree(d); return r; } static int qt2_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct qt2_port_private *port_priv; port_priv = usb_get_serial_port_data(tty->driver_data); return update_mctrl(port_priv, set, clear); } static int qt2_break_ctl(struct tty_struct *tty, int break_state) { struct usb_serial_port *port = tty->driver_data; struct qt2_port_private *port_priv; int status; u16 val; port_priv = usb_get_serial_port_data(port); val = (break_state == -1) ? 1 : 0; status = qt2_control_msg(port->serial->dev, QT2_BREAK_CONTROL, val, port_priv->device_port); if (status < 0) { dev_warn(&port->dev, "%s - failed to send control message: %i\n", __func__, status); return status; } return 0; } static void qt2_dtr_rts(struct usb_serial_port *port, int on) { struct usb_device *dev = port->serial->dev; struct qt2_port_private *port_priv = usb_get_serial_port_data(port); /* Disable flow control */ if (!on) { if (qt2_setregister(dev, port_priv->device_port, UART_MCR, 0) < 0) dev_warn(&port->dev, "error from flowcontrol urb\n"); } /* drop RTS and DTR */ if (on) update_mctrl(port_priv, TIOCM_DTR | TIOCM_RTS, 0); else update_mctrl(port_priv, 0, TIOCM_DTR | TIOCM_RTS); } static void qt2_update_msr(struct usb_serial_port *port, unsigned char *ch) { struct qt2_port_private *port_priv; u8 newMSR = (u8) *ch; unsigned long flags; /* May be called from qt2_process_read_urb() for an unbound port. */ port_priv = usb_get_serial_port_data(port); if (!port_priv) return; spin_lock_irqsave(&port_priv->lock, flags); port_priv->shadowMSR = newMSR; spin_unlock_irqrestore(&port_priv->lock, flags); if (newMSR & UART_MSR_ANY_DELTA) { /* update input line counters */ if (newMSR & UART_MSR_DCTS) port->icount.cts++; if (newMSR & UART_MSR_DDSR) port->icount.dsr++; if (newMSR & UART_MSR_DDCD) port->icount.dcd++; if (newMSR & UART_MSR_TERI) port->icount.rng++; wake_up_interruptible(&port->port.delta_msr_wait); } } static void qt2_update_lsr(struct usb_serial_port *port, unsigned char *ch) { struct qt2_port_private *port_priv; struct async_icount *icount; unsigned long flags; u8 newLSR = (u8) *ch; /* May be called from qt2_process_read_urb() for an unbound port. 
*/ port_priv = usb_get_serial_port_data(port); if (!port_priv) return; if (newLSR & UART_LSR_BI) newLSR &= (u8) (UART_LSR_OE | UART_LSR_BI); spin_lock_irqsave(&port_priv->lock, flags); port_priv->shadowLSR = newLSR; spin_unlock_irqrestore(&port_priv->lock, flags); icount = &port->icount; if (newLSR & UART_LSR_BRK_ERROR_BITS) { if (newLSR & UART_LSR_BI) icount->brk++; if (newLSR & UART_LSR_OE) icount->overrun++; if (newLSR & UART_LSR_PE) icount->parity++; if (newLSR & UART_LSR_FE) icount->frame++; } } static unsigned int qt2_write_room(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct qt2_port_private *port_priv; unsigned long flags; unsigned int r; port_priv = usb_get_serial_port_data(port); spin_lock_irqsave(&port_priv->urb_lock, flags); if (port_priv->urb_in_use) r = 0; else r = QT2_WRITE_BUFFER_SIZE - QT2_WRITE_CONTROL_SIZE; spin_unlock_irqrestore(&port_priv->urb_lock, flags); return r; } static int qt2_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count) { struct qt2_port_private *port_priv; struct urb *write_urb; unsigned char *data; unsigned long flags; int status; int bytes_out = 0; port_priv = usb_get_serial_port_data(port); if (port_priv->write_urb == NULL) { dev_err(&port->dev, "%s - no output urb\n", __func__); return 0; } write_urb = port_priv->write_urb; count = min(count, QT2_WRITE_BUFFER_SIZE - QT2_WRITE_CONTROL_SIZE); data = write_urb->transfer_buffer; spin_lock_irqsave(&port_priv->urb_lock, flags); if (port_priv->urb_in_use) { dev_err(&port->dev, "qt2_write - urb is in use\n"); goto write_out; } *data++ = QT2_CONTROL_BYTE; *data++ = QT2_CONTROL_BYTE; *data++ = port_priv->device_port; put_unaligned_le16(count, data); data += 2; memcpy(data, buf, count); write_urb->transfer_buffer_length = count + QT2_WRITE_CONTROL_SIZE; status = usb_submit_urb(write_urb, GFP_ATOMIC); if (status == 0) { port_priv->urb_in_use = true; bytes_out += count; } write_out: spin_unlock_irqrestore(&port_priv->urb_lock, flags); return bytes_out; } static struct usb_serial_driver qt2_device = { .driver = { .name = "quatech-serial", }, .description = DRIVER_DESC, .id_table = id_table, .open = qt2_open, .close = qt2_close, .write = qt2_write, .write_room = qt2_write_room, .calc_num_ports = qt2_calc_num_ports, .attach = qt2_attach, .release = qt2_release, .disconnect = qt2_disconnect, .port_probe = qt2_port_probe, .port_remove = qt2_port_remove, .dtr_rts = qt2_dtr_rts, .break_ctl = qt2_break_ctl, .tiocmget = qt2_tiocmget, .tiocmset = qt2_tiocmset, .tiocmiwait = usb_serial_generic_tiocmiwait, .get_icount = usb_serial_generic_get_icount, .set_termios = qt2_set_termios, }; static struct usb_serial_driver *const serial_drivers[] = { &qt2_device, NULL }; module_usb_serial_driver(serial_drivers, id_table); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL v2");
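The qt2_write() path above frames every bulk transfer with a small header before the payload: two QT2_CONTROL_BYTE markers, the logical device port, and a little-endian 16-bit byte count (QT2_WRITE_CONTROL_SIZE bytes in total). A minimal stand-alone sketch of that framing follows; the control-byte value and the helper name are illustrative assumptions, not taken from the driver.

/* Hypothetical user-space sketch of the header that qt2_write() builds:
 * two control bytes, the device port, a little-endian length, then payload. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define SKETCH_CONTROL_BYTE        0x1b  /* assumed marker value */
#define SKETCH_WRITE_CONTROL_SIZE  5     /* 2 markers + port + 16-bit length */

size_t sketch_frame_write(uint8_t *out, size_t out_size, uint8_t device_port,
                          const uint8_t *buf, size_t count)
{
	if (out_size < count + SKETCH_WRITE_CONTROL_SIZE)
		return 0;
	out[0] = SKETCH_CONTROL_BYTE;
	out[1] = SKETCH_CONTROL_BYTE;
	out[2] = device_port;
	out[3] = count & 0xff;          /* like put_unaligned_le16(count, data) */
	out[4] = (count >> 8) & 0xff;
	memcpy(out + SKETCH_WRITE_CONTROL_SIZE, buf, count);
	return count + SKETCH_WRITE_CONTROL_SIZE;
}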
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Authors: Lotsa people, from code originally in tcp */ #ifndef _INET_HASHTABLES_H #define _INET_HASHTABLES_H #include <linux/interrupt.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/spinlock.h> #include <linux/types.h> #include <linux/wait.h> #include <net/inet_connection_sock.h> #include <net/inet_sock.h> #include <net/ip.h> #include <net/sock.h> #include <net/route.h> #include <net/tcp_states.h> #include <net/netns/hash.h> #include <linux/refcount.h> #include <asm/byteorder.h> /* This is for all connections with a full identity, no wildcards. * The 'e' prefix stands for Establish, but we really put all sockets * but LISTEN ones. */ struct inet_ehash_bucket { struct hlist_nulls_head chain; }; /* There are a few simple rules, which allow for local port reuse by * an application. In essence: * * 1) Sockets bound to different interfaces may share a local port. * Failing that, goto test 2. * 2) If all sockets have sk->sk_reuse set, and none of them are in * TCP_LISTEN state, the port may be shared. * Failing that, goto test 3.
* 3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local * address, and none of them are the same, the port may be * shared. * Failing this, the port cannot be shared. * * The interesting point, is test #2. This is what an FTP server does * all day. To optimize this case we use a specific flag bit defined * below. As we add sockets to a bind bucket list, we perform a * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN)) * As long as all sockets added to a bind bucket pass this test, * the flag bit will be set. * The resulting situation is that tcp_v[46]_verify_bind() can just check * for this flag bit, if it is set and the socket trying to bind has * sk->sk_reuse set, we don't even have to walk the owners list at all, * we return that it is ok to bind this socket to the requested local port. * * Sounds like a lot of work, but it is worth it. In a more naive * implementation (ie. current FreeBSD etc.) the entire list of ports * must be walked for each data port opened by an ftp server. Needless * to say, this does not scale at all. With a couple thousand FTP * users logged onto your box, isn't it nice to know that new data * ports are created in O(1) time? I thought so. ;-) -DaveM */ #define FASTREUSEPORT_ANY 1 #define FASTREUSEPORT_STRICT 2 struct inet_bind_bucket { possible_net_t ib_net; int l3mdev; unsigned short port; signed char fastreuse; signed char fastreuseport; kuid_t fastuid; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr fast_v6_rcv_saddr; #endif __be32 fast_rcv_saddr; unsigned short fast_sk_family; bool fast_ipv6_only; struct hlist_node node; struct hlist_head bhash2; }; struct inet_bind2_bucket { possible_net_t ib_net; int l3mdev; unsigned short port; #if IS_ENABLED(CONFIG_IPV6) unsigned short addr_type; struct in6_addr v6_rcv_saddr; #define rcv_saddr v6_rcv_saddr.s6_addr32[3] #else __be32 rcv_saddr; #endif /* Node in the bhash2 inet_bind_hashbucket chain */ struct hlist_node node; struct hlist_node bhash_node; /* List of sockets hashed to this bucket */ struct hlist_head owners; }; static inline struct net *ib_net(const struct inet_bind_bucket *ib) { return read_pnet(&ib->ib_net); } static inline struct net *ib2_net(const struct inet_bind2_bucket *ib) { return read_pnet(&ib->ib_net); } #define inet_bind_bucket_for_each(tb, head) \ hlist_for_each_entry(tb, head, node) struct inet_bind_hashbucket { spinlock_t lock; struct hlist_head chain; }; /* Sockets can be hashed in established or listening table. * We must use different 'nulls' end-of-chain value for all hash buckets : * A socket might transition from ESTABLISH to LISTEN state without * RCU grace period. A lookup in ehash table needs to handle this case. */ #define LISTENING_NULLS_BASE (1U << 29) struct inet_listen_hashbucket { spinlock_t lock; struct hlist_nulls_head nulls_head; }; /* This is for listening sockets, thus all sockets which possess wildcards. */ #define INET_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */ struct inet_hashinfo { /* This is for sockets with full identity only. Sockets here will * always be without wildcards and will have the following invariant: * * TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE * */ struct inet_ehash_bucket *ehash; spinlock_t *ehash_locks; unsigned int ehash_mask; unsigned int ehash_locks_mask; /* Ok, let's try this, I give up, we do need a local binding * TCP hash as well as the others for fast bind/connect. 
*/ struct kmem_cache *bind_bucket_cachep; /* This bind table is hashed by local port */ struct inet_bind_hashbucket *bhash; struct kmem_cache *bind2_bucket_cachep; /* This bind table is hashed by local port and sk->sk_rcv_saddr (ipv4) * or sk->sk_v6_rcv_saddr (ipv6). This 2nd bind table is used * primarily for expediting bind conflict resolution. */ struct inet_bind_hashbucket *bhash2; unsigned int bhash_size; /* The 2nd listener table hashed by local port and address */ unsigned int lhash2_mask; struct inet_listen_hashbucket *lhash2; bool pernet; } ____cacheline_aligned_in_smp; static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk) { #if IS_ENABLED(CONFIG_IP_DCCP) return sk->sk_prot->h.hashinfo ? : sock_net(sk)->ipv4.tcp_death_row.hashinfo; #else return sock_net(sk)->ipv4.tcp_death_row.hashinfo; #endif } static inline struct inet_listen_hashbucket * inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash) { return &h->lhash2[hash & h->lhash2_mask]; } static inline struct inet_ehash_bucket *inet_ehash_bucket( struct inet_hashinfo *hashinfo, unsigned int hash) { return &hashinfo->ehash[hash & hashinfo->ehash_mask]; } static inline spinlock_t *inet_ehash_lockp( struct inet_hashinfo *hashinfo, unsigned int hash) { return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask]; } int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo); static inline void inet_hashinfo2_free_mod(struct inet_hashinfo *h) { kfree(h->lhash2); h->lhash2 = NULL; } static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) { kvfree(hashinfo->ehash_locks); hashinfo->ehash_locks = NULL; } struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, unsigned int ehash_entries); void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo); struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, const unsigned short snum, int l3mdev); void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb); bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, unsigned short port, int l3mdev); struct inet_bind2_bucket * inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, struct inet_bind_bucket *tb, const struct sock *sk); void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb); struct inet_bind2_bucket * inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk); bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk); static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, const u32 bhash_size) { return (lport + net_hash_mix(net)) & (bhash_size - 1); } static inline struct inet_bind_hashbucket * inet_bhashfn_portaddr(const struct inet_hashinfo *hinfo, const struct sock *sk, const struct net *net, unsigned short port) { u32 hash; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) hash = ipv6_portaddr_hash(net, &sk->sk_v6_rcv_saddr, port); else #endif hash = ipv4_portaddr_hash(net, sk->sk_rcv_saddr, port); return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; } struct inet_bind_hashbucket * inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port); /* This should be called whenever a socket's sk_rcv_saddr (ipv4) or * 
sk_v6_rcv_saddr (ipv6) changes after it has been binded. The socket's * rcv_saddr field should already have been updated when this is called. */ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family); void inet_bhash2_reset_saddr(struct sock *sk); void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port); /* Caller must disable local BH processing. */ int __inet_inherit_port(const struct sock *sk, struct sock *child); void inet_put_port(struct sock *sk); void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long numentries, int scale, unsigned long low_limit, unsigned long high_limit); int inet_hashinfo2_init_mod(struct inet_hashinfo *h); bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk); int __inet_hash(struct sock *sk, struct sock *osk); int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); struct sock *__inet_lookup_listener(const struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif); static inline struct sock *inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif) { return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport, daddr, ntohs(dport), dif, sdif); } /* Socket demux engine toys. */ /* What happens here is ugly; there's a pair of adjacent fields in struct inet_sock; __be16 dport followed by __u16 num. We want to search by pair, so we combine the keys into a single 32bit value and compare with 32bit value read from &...->dport. Let's at least make sure that it's not mixed with anything else... On 64bit targets we combine comparisons with pair of adjacent __be32 fields in the same way. */ #ifdef __BIG_ENDIAN #define INET_COMBINED_PORTS(__sport, __dport) \ ((__force __portpair)(((__force __u32)(__be16)(__sport) << 16) | (__u32)(__dport))) #else /* __LITTLE_ENDIAN */ #define INET_COMBINED_PORTS(__sport, __dport) \ ((__force __portpair)(((__u32)(__dport) << 16) | (__force __u32)(__be16)(__sport))) #endif #ifdef __BIG_ENDIAN #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ const __addrpair __name = (__force __addrpair) ( \ (((__force __u64)(__be32)(__saddr)) << 32) | \ ((__force __u64)(__be32)(__daddr))) #else /* __LITTLE_ENDIAN */ #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ const __addrpair __name = (__force __addrpair) ( \ (((__force __u64)(__be32)(__daddr)) << 32) | \ ((__force __u64)(__be32)(__saddr))) #endif /* __BIG_ENDIAN */ static inline bool inet_match(const struct net *net, const struct sock *sk, const __addrpair cookie, const __portpair ports, int dif, int sdif) { if (!net_eq(sock_net(sk), net) || sk->sk_portpair != ports || sk->sk_addrpair != cookie) return false; /* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */ return inet_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif, sdif); } /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need * not check it for lookups anymore, thanks Alexey. 
-DaveM */ struct sock *__inet_lookup_established(const struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, const int dif, const int sdif); typedef u32 (inet_ehashfn_t)(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport); inet_ehashfn_t inet_ehashfn; INDIRECT_CALLABLE_DECLARE(inet_ehashfn_t udp_ehashfn); struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, inet_ehashfn_t *ehashfn); struct sock *inet_lookup_run_sk_lookup(const struct net *net, int protocol, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, u16 hnum, const int dif, inet_ehashfn_t *ehashfn); static inline struct sock * inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif) { return __inet_lookup_established(net, hashinfo, saddr, sport, daddr, ntohs(dport), dif, 0); } static inline struct sock *__inet_lookup(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif, const int sdif, bool *refcounted) { u16 hnum = ntohs(dport); struct sock *sk; sk = __inet_lookup_established(net, hashinfo, saddr, sport, daddr, hnum, dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport, daddr, hnum, dif, sdif); } static inline struct sock *inet_lookup(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif) { struct sock *sk; bool refcounted; sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, dport, dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; } static inline struct sock *inet_steal_sock(struct net *net, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, bool *refcounted, inet_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool prefetched; sk = skb_steal_sock(skb, refcounted, &prefetched); if (!sk) return NULL; if (!prefetched || !sk_fullsock(sk)) return sk; if (sk->sk_protocol == IPPROTO_TCP) { if (sk->sk_state != TCP_LISTEN) return sk; } else if (sk->sk_protocol == IPPROTO_UDP) { if (sk->sk_state != TCP_CLOSE) return sk; } else { return sk; } reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, ntohs(dport), ehashfn); if (!reuse_sk) return sk; /* We've chosen a new reuseport sock which is never refcounted. This * implies that sk also isn't refcounted. 
*/ WARN_ON_ONCE(*refcounted); return reuse_sk; } static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, const int sdif, bool *refcounted) { struct net *net = dev_net(skb_dst(skb)->dev); const struct iphdr *iph = ip_hdr(skb); struct sock *sk; sk = inet_steal_sock(net, skb, doff, iph->saddr, sport, iph->daddr, dport, refcounted, inet_ehashfn); if (IS_ERR(sk)) return NULL; if (sk) return sk; return __inet_lookup(net, hashinfo, skb, doff, iph->saddr, sport, iph->daddr, dport, inet_iif(skb), sdif, refcounted); } static inline void sk_daddr_set(struct sock *sk, __be32 addr) { sk->sk_daddr = addr; /* alias of inet_daddr */ #if IS_ENABLED(CONFIG_IPV6) ipv6_addr_set_v4mapped(addr, &sk->sk_v6_daddr); #endif } static inline void sk_rcv_saddr_set(struct sock *sk, __be32 addr) { sk->sk_rcv_saddr = addr; /* alias of inet_rcv_saddr */ #if IS_ENABLED(CONFIG_IPV6) ipv6_addr_set_v4mapped(addr, &sk->sk_v6_rcv_saddr); #endif } int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **)); int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); #endif /* _INET_HASHTABLES_H */
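The "Socket demux engine toys" comment above explains why the lookup fast path packs the two 16-bit ports into one 32-bit value (and, on 64-bit targets, the two addresses into one 64-bit value): inet_match() can then check both ports and both addresses with a single compare each. A stand-alone sketch of the idea, little-endian layout only and with invented names:

/* Sketch (not kernel code) of the packed demux keys described above. */
#include <stdbool.h>
#include <stdint.h>

struct demux_key {
	uint64_t addrs;  /* (daddr << 32) | saddr, both in network byte order */
	uint32_t ports;  /* (dport_host << 16) | sport_be, little-endian layout */
};

struct demux_key make_demux_key_le(uint32_t saddr_be, uint32_t daddr_be,
                                   uint16_t sport_be, uint16_t dport_host)
{
	struct demux_key k = {
		.addrs = ((uint64_t)daddr_be << 32) | saddr_be,
		.ports = ((uint32_t)dport_host << 16) | sport_be,
	};
	return k;
}

bool demux_match(const struct demux_key *sk_key, const struct demux_key *pkt_key)
{
	/* mirrors the shape of inet_match(): two comparisons cover four fields */
	return sk_key->addrs == pkt_key->addrs && sk_key->ports == pkt_key->ports;
}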
// SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/em_meta.c Metadata ematch * * Authors: Thomas Graf <tgraf@suug.ch> * * ========================================================================== * * The metadata ematch compares two meta objects where each object * represents either a meta value stored in the kernel or a static * value provided by userspace. The objects are not provided by * userspace itself but rather a definition providing the information * to build them. Every object is of a certain type which must be * equal to the object it is being compared to. * * The definition of an object consists of the type (meta type), an * identifier (meta id) and additional type specific information. * The meta id is either TCF_META_ID_VALUE for values provided by * userspace or an index into the meta operations table consisting of * function pointers to type specific meta data collectors returning * the value of the requested meta value. * * lvalue rvalue * +-----------+ +-----------+ * | type: INT | | type: INT | * def | id: DEV | | id: VALUE | * | data: | | data: 3 | * +-----------+ +-----------+ * | | * ---> meta_ops[INT][DEV](...) | * | | * ----------- | * V V * +-----------+ +-----------+ * | type: INT | | type: INT | * obj | id: DEV | | id: VALUE | * | data: 2 |<--data got filled out | data: 3 | * +-----------+ +-----------+ * | | * --------------> 2 equals 3 <-------------- * * This is a simplified schema; the complexity varies depending * on the meta type. Obviously, the length of the data must also * be provided for non-numeric types. * * Additionally, type dependent modifiers such as shift operators * or a mask may be applied to extend the functionality. As of now, * the variable length type supports shifting the byte string to * the right, eating up any number of octets and thus supporting * wildcard interface name comparisons such as "ppp%" matching * ppp0..9. * * NOTE: Certain meta values depend on other subsystems and are * only available if that subsystem is enabled in the kernel.
*/ #include <linux/slab.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/loadavg.h> #include <linux/string.h> #include <linux/skbuff.h> #include <linux/random.h> #include <linux/if_vlan.h> #include <linux/tc_ematch/tc_em_meta.h> #include <net/dst.h> #include <net/route.h> #include <net/pkt_cls.h> #include <net/sock.h> struct meta_obj { unsigned long value; unsigned int len; }; struct meta_value { struct tcf_meta_val hdr; unsigned long val; unsigned int len; }; struct meta_match { struct meta_value lvalue; struct meta_value rvalue; }; static inline int meta_id(struct meta_value *v) { return TCF_META_ID(v->hdr.kind); } static inline int meta_type(struct meta_value *v) { return TCF_META_TYPE(v->hdr.kind); } #define META_COLLECTOR(FUNC) static void meta_##FUNC(struct sk_buff *skb, \ struct tcf_pkt_info *info, struct meta_value *v, \ struct meta_obj *dst, int *err) /************************************************************************** * System status & misc **************************************************************************/ META_COLLECTOR(int_random) { get_random_bytes(&dst->value, sizeof(dst->value)); } static inline unsigned long fixed_loadavg(int load) { int rnd_load = load + (FIXED_1/200); int rnd_frac = ((rnd_load & (FIXED_1-1)) * 100) >> FSHIFT; return ((rnd_load >> FSHIFT) * 100) + rnd_frac; } META_COLLECTOR(int_loadavg_0) { dst->value = fixed_loadavg(avenrun[0]); } META_COLLECTOR(int_loadavg_1) { dst->value = fixed_loadavg(avenrun[1]); } META_COLLECTOR(int_loadavg_2) { dst->value = fixed_loadavg(avenrun[2]); } /************************************************************************** * Device names & indices **************************************************************************/ static inline int int_dev(struct net_device *dev, struct meta_obj *dst) { if (unlikely(dev == NULL)) return -1; dst->value = dev->ifindex; return 0; } static inline int var_dev(struct net_device *dev, struct meta_obj *dst) { if (unlikely(dev == NULL)) return -1; dst->value = (unsigned long) dev->name; dst->len = strlen(dev->name); return 0; } META_COLLECTOR(int_dev) { *err = int_dev(skb->dev, dst); } META_COLLECTOR(var_dev) { *err = var_dev(skb->dev, dst); } /************************************************************************** * vlan tag **************************************************************************/ META_COLLECTOR(int_vlan_tag) { unsigned short tag; if (skb_vlan_tag_present(skb)) dst->value = skb_vlan_tag_get(skb); else if (!__vlan_get_tag(skb, &tag)) dst->value = tag; else *err = -1; } /************************************************************************** * skb attributes **************************************************************************/ META_COLLECTOR(int_priority) { dst->value = skb->priority; } META_COLLECTOR(int_protocol) { /* Let userspace take care of the byte ordering */ dst->value = skb_protocol(skb, false); } META_COLLECTOR(int_pkttype) { dst->value = skb->pkt_type; } META_COLLECTOR(int_pktlen) { dst->value = skb->len; } META_COLLECTOR(int_datalen) { dst->value = skb->data_len; } META_COLLECTOR(int_maclen) { dst->value = skb->mac_len; } META_COLLECTOR(int_rxhash) { dst->value = skb_get_hash(skb); } /************************************************************************** * Netfilter **************************************************************************/ META_COLLECTOR(int_mark) { dst->value = skb->mark; } 
/************************************************************************** * Traffic Control **************************************************************************/ META_COLLECTOR(int_tcindex) { dst->value = skb->tc_index; } /************************************************************************** * Routing **************************************************************************/ META_COLLECTOR(int_rtclassid) { if (unlikely(skb_dst(skb) == NULL)) *err = -1; else #ifdef CONFIG_IP_ROUTE_CLASSID dst->value = skb_dst(skb)->tclassid; #else dst->value = 0; #endif } META_COLLECTOR(int_rtiif) { if (unlikely(skb_rtable(skb) == NULL)) *err = -1; else dst->value = inet_iif(skb); } /************************************************************************** * Socket Attributes **************************************************************************/ #define skip_nonlocal(skb) \ (unlikely(skb->sk == NULL)) META_COLLECTOR(int_sk_family) { if (skip_nonlocal(skb)) { *err = -1; return; } dst->value = skb->sk->sk_family; } META_COLLECTOR(int_sk_state) { if (skip_nonlocal(skb)) { *err = -1; return; } dst->value = skb->sk->sk_state; } META_COLLECTOR(int_sk_reuse) { if (skip_nonlocal(skb)) { *err = -1; return; } dst->value = skb->sk->sk_reuse; } META_COLLECTOR(int_sk_bound_if) { if (skip_nonlocal(skb)) { *err = -1; return; } /* No error if bound_dev_if is 0, legal userspace check */ dst->value = skb->sk->sk_bound_dev_if; } META_COLLECTOR(var_sk_bound_if) { int bound_dev_if; if (skip_nonlocal(skb)) { *err = -1; return; } bound_dev_if = READ_ONCE(skb->sk->sk_bound_dev_if); if (bound_dev_if == 0) { dst->value = (unsigned long) "any"; dst->len = 3; } else { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(skb->sk), bound_dev_if); *err = var_dev(dev, dst); rcu_read_unlock(); } } META_COLLECTOR(int_sk_refcnt) { if (skip_nonlocal(skb)) { *err = -1; return; } dst->value = refcount_read(&skb->sk->sk_refcnt); } META_COLLECTOR(int_sk_rcvbuf) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk->sk_rcvbuf; } META_COLLECTOR(int_sk_shutdown) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk->sk_shutdown; } META_COLLECTOR(int_sk_proto) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk->sk_protocol; } META_COLLECTOR(int_sk_type) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk->sk_type; } META_COLLECTOR(int_sk_rmem_alloc) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk_rmem_alloc_get(sk); } META_COLLECTOR(int_sk_wmem_alloc) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk_wmem_alloc_get(sk); } META_COLLECTOR(int_sk_omem_alloc) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = atomic_read(&sk->sk_omem_alloc); } META_COLLECTOR(int_sk_rcv_qlen) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk->sk_receive_queue.qlen; } META_COLLECTOR(int_sk_snd_qlen) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk->sk_write_queue.qlen; } META_COLLECTOR(int_sk_wmem_queued) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = READ_ONCE(sk->sk_wmem_queued); } META_COLLECTOR(int_sk_fwd_alloc) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } 
dst->value = sk_forward_alloc_get(sk); } META_COLLECTOR(int_sk_sndbuf) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk->sk_sndbuf; } META_COLLECTOR(int_sk_alloc) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = (__force int) sk->sk_allocation; } META_COLLECTOR(int_sk_hash) { if (skip_nonlocal(skb)) { *err = -1; return; } dst->value = skb->sk->sk_hash; } META_COLLECTOR(int_sk_lingertime) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = READ_ONCE(sk->sk_lingertime) / HZ; } META_COLLECTOR(int_sk_err_qlen) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk->sk_error_queue.qlen; } META_COLLECTOR(int_sk_ack_bl) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = READ_ONCE(sk->sk_ack_backlog); } META_COLLECTOR(int_sk_max_ack_bl) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = READ_ONCE(sk->sk_max_ack_backlog); } META_COLLECTOR(int_sk_prio) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = READ_ONCE(sk->sk_priority); } META_COLLECTOR(int_sk_rcvlowat) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = READ_ONCE(sk->sk_rcvlowat); } META_COLLECTOR(int_sk_rcvtimeo) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = READ_ONCE(sk->sk_rcvtimeo) / HZ; } META_COLLECTOR(int_sk_sndtimeo) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = READ_ONCE(sk->sk_sndtimeo) / HZ; } META_COLLECTOR(int_sk_sendmsg_off) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk->sk_frag.offset; } META_COLLECTOR(int_sk_write_pend) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk->sk_write_pending; } /************************************************************************** * Meta value collectors assignment table **************************************************************************/ struct meta_ops { void (*get)(struct sk_buff *, struct tcf_pkt_info *, struct meta_value *, struct meta_obj *, int *); }; #define META_ID(name) TCF_META_ID_##name #define META_FUNC(name) { .get = meta_##name } /* Meta value operations table listing all meta value collectors and * assigns them to a type and meta id. 
*/ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX + 1][TCF_META_ID_MAX + 1] = { [TCF_META_TYPE_VAR] = { [META_ID(DEV)] = META_FUNC(var_dev), [META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if), }, [TCF_META_TYPE_INT] = { [META_ID(RANDOM)] = META_FUNC(int_random), [META_ID(LOADAVG_0)] = META_FUNC(int_loadavg_0), [META_ID(LOADAVG_1)] = META_FUNC(int_loadavg_1), [META_ID(LOADAVG_2)] = META_FUNC(int_loadavg_2), [META_ID(DEV)] = META_FUNC(int_dev), [META_ID(PRIORITY)] = META_FUNC(int_priority), [META_ID(PROTOCOL)] = META_FUNC(int_protocol), [META_ID(PKTTYPE)] = META_FUNC(int_pkttype), [META_ID(PKTLEN)] = META_FUNC(int_pktlen), [META_ID(DATALEN)] = META_FUNC(int_datalen), [META_ID(MACLEN)] = META_FUNC(int_maclen), [META_ID(NFMARK)] = META_FUNC(int_mark), [META_ID(TCINDEX)] = META_FUNC(int_tcindex), [META_ID(RTCLASSID)] = META_FUNC(int_rtclassid), [META_ID(RTIIF)] = META_FUNC(int_rtiif), [META_ID(SK_FAMILY)] = META_FUNC(int_sk_family), [META_ID(SK_STATE)] = META_FUNC(int_sk_state), [META_ID(SK_REUSE)] = META_FUNC(int_sk_reuse), [META_ID(SK_BOUND_IF)] = META_FUNC(int_sk_bound_if), [META_ID(SK_REFCNT)] = META_FUNC(int_sk_refcnt), [META_ID(SK_RCVBUF)] = META_FUNC(int_sk_rcvbuf), [META_ID(SK_SNDBUF)] = META_FUNC(int_sk_sndbuf), [META_ID(SK_SHUTDOWN)] = META_FUNC(int_sk_shutdown), [META_ID(SK_PROTO)] = META_FUNC(int_sk_proto), [META_ID(SK_TYPE)] = META_FUNC(int_sk_type), [META_ID(SK_RMEM_ALLOC)] = META_FUNC(int_sk_rmem_alloc), [META_ID(SK_WMEM_ALLOC)] = META_FUNC(int_sk_wmem_alloc), [META_ID(SK_OMEM_ALLOC)] = META_FUNC(int_sk_omem_alloc), [META_ID(SK_WMEM_QUEUED)] = META_FUNC(int_sk_wmem_queued), [META_ID(SK_RCV_QLEN)] = META_FUNC(int_sk_rcv_qlen), [META_ID(SK_SND_QLEN)] = META_FUNC(int_sk_snd_qlen), [META_ID(SK_ERR_QLEN)] = META_FUNC(int_sk_err_qlen), [META_ID(SK_FORWARD_ALLOCS)] = META_FUNC(int_sk_fwd_alloc), [META_ID(SK_ALLOCS)] = META_FUNC(int_sk_alloc), [META_ID(SK_HASH)] = META_FUNC(int_sk_hash), [META_ID(SK_LINGERTIME)] = META_FUNC(int_sk_lingertime), [META_ID(SK_ACK_BACKLOG)] = META_FUNC(int_sk_ack_bl), [META_ID(SK_MAX_ACK_BACKLOG)] = META_FUNC(int_sk_max_ack_bl), [META_ID(SK_PRIO)] = META_FUNC(int_sk_prio), [META_ID(SK_RCVLOWAT)] = META_FUNC(int_sk_rcvlowat), [META_ID(SK_RCVTIMEO)] = META_FUNC(int_sk_rcvtimeo), [META_ID(SK_SNDTIMEO)] = META_FUNC(int_sk_sndtimeo), [META_ID(SK_SENDMSG_OFF)] = META_FUNC(int_sk_sendmsg_off), [META_ID(SK_WRITE_PENDING)] = META_FUNC(int_sk_write_pend), [META_ID(VLAN_TAG)] = META_FUNC(int_vlan_tag), [META_ID(RXHASH)] = META_FUNC(int_rxhash), } }; static inline struct meta_ops *meta_ops(struct meta_value *val) { return &__meta_ops[meta_type(val)][meta_id(val)]; } /************************************************************************** * Type specific operations for TCF_META_TYPE_VAR **************************************************************************/ static int meta_var_compare(struct meta_obj *a, struct meta_obj *b) { int r = a->len - b->len; if (r == 0) r = memcmp((void *) a->value, (void *) b->value, a->len); return r; } static int meta_var_change(struct meta_value *dst, struct nlattr *nla) { int len = nla_len(nla); dst->val = (unsigned long)kmemdup(nla_data(nla), len, GFP_KERNEL); if (dst->val == 0UL) return -ENOMEM; dst->len = len; return 0; } static void meta_var_destroy(struct meta_value *v) { kfree((void *) v->val); } static void meta_var_apply_extras(struct meta_value *v, struct meta_obj *dst) { int shift = v->hdr.shift; if (shift && shift < dst->len) dst->len -= shift; } static int meta_var_dump(struct sk_buff *skb, struct meta_value *v, int 
tlv) { if (v->val && v->len && nla_put(skb, tlv, v->len, (void *) v->val)) goto nla_put_failure; return 0; nla_put_failure: return -1; } /************************************************************************** * Type specific operations for TCF_META_TYPE_INT **************************************************************************/ static int meta_int_compare(struct meta_obj *a, struct meta_obj *b) { /* Let gcc optimize it, the unlikely is not really based on * some numbers but jump free code for mismatches seems * more logical. */ if (unlikely(a->value == b->value)) return 0; else if (a->value < b->value) return -1; else return 1; } static int meta_int_change(struct meta_value *dst, struct nlattr *nla) { if (nla_len(nla) >= sizeof(unsigned long)) { dst->val = *(unsigned long *) nla_data(nla); dst->len = sizeof(unsigned long); } else if (nla_len(nla) == sizeof(u32)) { dst->val = nla_get_u32(nla); dst->len = sizeof(u32); } else return -EINVAL; return 0; } static void meta_int_apply_extras(struct meta_value *v, struct meta_obj *dst) { if (v->hdr.shift) dst->value >>= v->hdr.shift; if (v->val) dst->value &= v->val; } static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv) { if (v->len == sizeof(unsigned long)) { if (nla_put(skb, tlv, sizeof(unsigned long), &v->val)) goto nla_put_failure; } else if (v->len == sizeof(u32)) { if (nla_put_u32(skb, tlv, v->val)) goto nla_put_failure; } return 0; nla_put_failure: return -1; } /************************************************************************** * Type specific operations table **************************************************************************/ struct meta_type_ops { void (*destroy)(struct meta_value *); int (*compare)(struct meta_obj *, struct meta_obj *); int (*change)(struct meta_value *, struct nlattr *); void (*apply_extras)(struct meta_value *, struct meta_obj *); int (*dump)(struct sk_buff *, struct meta_value *, int); }; static const struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = { [TCF_META_TYPE_VAR] = { .destroy = meta_var_destroy, .compare = meta_var_compare, .change = meta_var_change, .apply_extras = meta_var_apply_extras, .dump = meta_var_dump }, [TCF_META_TYPE_INT] = { .compare = meta_int_compare, .change = meta_int_change, .apply_extras = meta_int_apply_extras, .dump = meta_int_dump } }; static inline const struct meta_type_ops *meta_type_ops(struct meta_value *v) { return &__meta_type_ops[meta_type(v)]; } /************************************************************************** * Core **************************************************************************/ static int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info, struct meta_value *v, struct meta_obj *dst) { int err = 0; if (meta_id(v) == TCF_META_ID_VALUE) { dst->value = v->val; dst->len = v->len; return 0; } meta_ops(v)->get(skb, info, v, dst, &err); if (err < 0) return err; if (meta_type_ops(v)->apply_extras) meta_type_ops(v)->apply_extras(v, dst); return 0; } static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m, struct tcf_pkt_info *info) { int r; struct meta_match *meta = (struct meta_match *) m->data; struct meta_obj l_value, r_value; if (meta_get(skb, info, &meta->lvalue, &l_value) < 0 || meta_get(skb, info, &meta->rvalue, &r_value) < 0) return 0; r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value); switch (meta->lvalue.hdr.op) { case TCF_EM_OPND_EQ: return !r; case TCF_EM_OPND_LT: return r < 0; case TCF_EM_OPND_GT: return r > 0; } return 0; } static void meta_delete(struct meta_match 
*meta) { if (meta) { const struct meta_type_ops *ops = meta_type_ops(&meta->lvalue); if (ops && ops->destroy) { ops->destroy(&meta->lvalue); ops->destroy(&meta->rvalue); } } kfree(meta); } static inline int meta_change_data(struct meta_value *dst, struct nlattr *nla) { if (nla) { if (nla_len(nla) == 0) return -EINVAL; return meta_type_ops(dst)->change(dst, nla); } return 0; } static inline int meta_is_supported(struct meta_value *val) { return !meta_id(val) || meta_ops(val)->get; } static const struct nla_policy meta_policy[TCA_EM_META_MAX + 1] = { [TCA_EM_META_HDR] = { .len = sizeof(struct tcf_meta_hdr) }, }; static int em_meta_change(struct net *net, void *data, int len, struct tcf_ematch *m) { int err; struct nlattr *tb[TCA_EM_META_MAX + 1]; struct tcf_meta_hdr *hdr; struct meta_match *meta = NULL; err = nla_parse_deprecated(tb, TCA_EM_META_MAX, data, len, meta_policy, NULL); if (err < 0) goto errout; err = -EINVAL; if (tb[TCA_EM_META_HDR] == NULL) goto errout; hdr = nla_data(tb[TCA_EM_META_HDR]); if (TCF_META_TYPE(hdr->left.kind) != TCF_META_TYPE(hdr->right.kind) || TCF_META_TYPE(hdr->left.kind) > TCF_META_TYPE_MAX || TCF_META_ID(hdr->left.kind) > TCF_META_ID_MAX || TCF_META_ID(hdr->right.kind) > TCF_META_ID_MAX) goto errout; meta = kzalloc(sizeof(*meta), GFP_KERNEL); if (meta == NULL) { err = -ENOMEM; goto errout; } memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left)); memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right)); if (!meta_is_supported(&meta->lvalue) || !meta_is_supported(&meta->rvalue)) { err = -EOPNOTSUPP; goto errout; } if (meta_change_data(&meta->lvalue, tb[TCA_EM_META_LVALUE]) < 0 || meta_change_data(&meta->rvalue, tb[TCA_EM_META_RVALUE]) < 0) goto errout; m->datalen = sizeof(*meta); m->data = (unsigned long) meta; err = 0; errout: if (err && meta) meta_delete(meta); return err; } static void em_meta_destroy(struct tcf_ematch *m) { if (m) meta_delete((struct meta_match *) m->data); } static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em) { struct meta_match *meta = (struct meta_match *) em->data; struct tcf_meta_hdr hdr; const struct meta_type_ops *ops; memset(&hdr, 0, sizeof(hdr)); memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left)); memcpy(&hdr.right, &meta->rvalue.hdr, sizeof(hdr.right)); if (nla_put(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr)) goto nla_put_failure; ops = meta_type_ops(&meta->lvalue); if (ops->dump(skb, &meta->lvalue, TCA_EM_META_LVALUE) < 0 || ops->dump(skb, &meta->rvalue, TCA_EM_META_RVALUE) < 0) goto nla_put_failure; return 0; nla_put_failure: return -1; } static struct tcf_ematch_ops em_meta_ops = { .kind = TCF_EM_META, .change = em_meta_change, .match = em_meta_match, .destroy = em_meta_destroy, .dump = em_meta_dump, .owner = THIS_MODULE, .link = LIST_HEAD_INIT(em_meta_ops.link) }; static int __init init_em_meta(void) { return tcf_em_register(&em_meta_ops); } static void __exit exit_em_meta(void) { tcf_em_unregister(&em_meta_ops); } MODULE_DESCRIPTION("ematch classifier for various internal kernel metadata, skb metadata and sk metadata"); MODULE_LICENSE("GPL"); module_init(init_em_meta); module_exit(exit_em_meta); MODULE_ALIAS_TCF_EMATCH(TCF_EM_META);
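em_meta_match() above resolves both sides to meta objects, compares them with the type-specific compare(), and then turns the three-way result into a verdict according to the operand stored in the left value's header (TCF_EM_OPND_EQ, TCF_EM_OPND_LT or TCF_EM_OPND_GT). A small stand-alone sketch of that final step, integer type only and with invented names:

/* Sketch (not kernel code) of the operand handling in em_meta_match(). */
#include <stdbool.h>

enum sketch_opnd { SKETCH_OPND_EQ, SKETCH_OPND_LT, SKETCH_OPND_GT };

int sketch_int_compare(unsigned long a, unsigned long b)
{
	if (a == b)
		return 0;
	return a < b ? -1 : 1;
}

bool sketch_meta_verdict(enum sketch_opnd op, unsigned long lvalue, unsigned long rvalue)
{
	int r = sketch_int_compare(lvalue, rvalue);

	switch (op) {
	case SKETCH_OPND_EQ: return r == 0;
	case SKETCH_OPND_LT: return r < 0;
	case SKETCH_OPND_GT: return r > 0;
	}
	return false;
}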
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2007-2014 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/init.h>
#include <linux/module.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/time.h>
#include <linux/etherdevice.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/ethtool.h>
#include <linux/wait.h>
#include <asm/div64.h>
#include <linux/highmem.h>
#include <linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <linux/inetdevice.h>
#include <linux/list.h>
#include <linux/openvswitch.h>
#include <linux/rculist.h>
#include <linux/dmi.h>
#include <net/genetlink.h>
#include <net/gso.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/pkt_cls.h>

#include "datapath.h"
#include "drop.h"
#include "flow.h"
#include "flow_table.h"
#include "flow_netlink.h"
#include "meter.h"
#include "openvswitch_trace.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"

unsigned int ovs_net_id __read_mostly;

static struct genl_family dp_packet_genl_family;
static struct genl_family dp_flow_genl_family;
static struct genl_family dp_datapath_genl_family;

static const struct nla_policy flow_policy[];

static const struct genl_multicast_group ovs_dp_flow_multicast_group = {
	.name = OVS_FLOW_MCGROUP,
};

static const struct genl_multicast_group ovs_dp_datapath_multicast_group = {
	.name = OVS_DATAPATH_MCGROUP,
};

static const struct genl_multicast_group ovs_dp_vport_multicast_group = {
	.name = OVS_VPORT_MCGROUP,
};

/* Check if we need to build a reply message.
 * OVS userspace sets the NLM_F_ECHO flag if it needs the reply.
 */
static bool ovs_must_notify(struct genl_family *family, struct genl_info *info,
			    unsigned int group)
{
	return info->nlhdr->nlmsg_flags & NLM_F_ECHO ||
	       genl_has_listeners(family, genl_info_net(info), group);
}

static void ovs_notify(struct genl_family *family,
		       struct sk_buff *skb, struct genl_info *info)
{
	genl_notify(family, skb, info, 0, GFP_KERNEL);
}

/**
 * DOC: Locking:
 *
 * All writes, e.g. writes to device state (add/remove datapath, port, set
 * operations on vports, etc.) and writes to other state (flow table
 * modifications, setting miscellaneous datapath parameters, etc.), are
 * protected by ovs_lock.
 *
 * Reads are protected by RCU.
 *
 * There are a few special cases (mostly stats) that have their own
 * synchronization but they nest under all of the above and don't interact
 * with each other.
 *
 * The RTNL lock nests inside ovs_mutex.
 */
static DEFINE_MUTEX(ovs_mutex);

void ovs_lock(void)
{
	mutex_lock(&ovs_mutex);
}

void ovs_unlock(void)
{
	mutex_unlock(&ovs_mutex);
}

#ifdef CONFIG_LOCKDEP
int lockdep_ovsl_is_held(void)
{
	if (debug_locks)
		return lockdep_is_held(&ovs_mutex);
	else
		return 1;
}
#endif

static struct vport *new_vport(const struct vport_parms *);
static int queue_gso_packets(struct datapath *dp, struct sk_buff *,
			     const struct sw_flow_key *,
			     const struct dp_upcall_info *,
			     uint32_t cutlen);
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *,
				  const struct sw_flow_key *,
				  const struct dp_upcall_info *,
				  uint32_t cutlen);

static void ovs_dp_masks_rebalance(struct work_struct *work);

static int ovs_dp_set_upcall_portids(struct datapath *, const struct nlattr *);

/* Must be called with rcu_read_lock or ovs_mutex.
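 * The name returned is that of the datapath's local port (OVSP_LOCAL), so the
 * caller's lock must keep that vport alive for as long as the string is used.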
*/ const char *ovs_dp_name(const struct datapath *dp) { struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); return ovs_vport_name(vport); } static int get_dpifindex(const struct datapath *dp) { struct vport *local; int ifindex; rcu_read_lock(); local = ovs_vport_rcu(dp, OVSP_LOCAL); if (local) ifindex = local->dev->ifindex; else ifindex = 0; rcu_read_unlock(); return ifindex; } static void destroy_dp_rcu(struct rcu_head *rcu) { struct datapath *dp = container_of(rcu, struct datapath, rcu); ovs_flow_tbl_destroy(&dp->table); free_percpu(dp->stats_percpu); kfree(dp->ports); ovs_meters_exit(dp); kfree(rcu_dereference_raw(dp->upcall_portids)); kfree(dp); } static struct hlist_head *vport_hash_bucket(const struct datapath *dp, u16 port_no) { return &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)]; } /* Called with ovs_mutex or RCU read lock. */ struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no) { struct vport *vport; struct hlist_head *head; head = vport_hash_bucket(dp, port_no); hlist_for_each_entry_rcu(vport, head, dp_hash_node, lockdep_ovsl_is_held()) { if (vport->port_no == port_no) return vport; } return NULL; } /* Called with ovs_mutex. */ static struct vport *new_vport(const struct vport_parms *parms) { struct vport *vport; vport = ovs_vport_add(parms); if (!IS_ERR(vport)) { struct datapath *dp = parms->dp; struct hlist_head *head = vport_hash_bucket(dp, vport->port_no); hlist_add_head_rcu(&vport->dp_hash_node, head); } return vport; } static void ovs_vport_update_upcall_stats(struct sk_buff *skb, const struct dp_upcall_info *upcall_info, bool upcall_result) { struct vport *p = OVS_CB(skb)->input_vport; struct vport_upcall_stats_percpu *stats; if (upcall_info->cmd != OVS_PACKET_CMD_MISS && upcall_info->cmd != OVS_PACKET_CMD_ACTION) return; stats = this_cpu_ptr(p->upcall_stats); u64_stats_update_begin(&stats->syncp); if (upcall_result) u64_stats_inc(&stats->n_success); else u64_stats_inc(&stats->n_fail); u64_stats_update_end(&stats->syncp); } void ovs_dp_detach_port(struct vport *p) { ASSERT_OVSL(); /* First drop references to device. */ hlist_del_rcu(&p->dp_hash_node); /* Then destroy it. */ ovs_vport_del(p); } /* Must be called with rcu_read_lock. */ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) { const struct vport *p = OVS_CB(skb)->input_vport; struct datapath *dp = p->dp; struct sw_flow *flow; struct sw_flow_actions *sf_acts; struct dp_stats_percpu *stats; u64 *stats_counter; u32 n_mask_hit; u32 n_cache_hit; int error; stats = this_cpu_ptr(dp->stats_percpu); /* Look up flow. 
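 * The lookup uses the extracted key together with the skb hash and also
 * reports how many masks and mask-cache entries were hit; on a miss the
 * packet is sent to userspace as an OVS_PACKET_CMD_MISS upcall.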
*/ flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb), &n_mask_hit, &n_cache_hit); if (unlikely(!flow)) { struct dp_upcall_info upcall; memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_MISS; if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU) upcall.portid = ovs_dp_get_upcall_portid(dp, smp_processor_id()); else upcall.portid = ovs_vport_find_upcall_portid(p, skb); upcall.mru = OVS_CB(skb)->mru; error = ovs_dp_upcall(dp, skb, key, &upcall, 0); switch (error) { case 0: case -EAGAIN: case -ERESTARTSYS: case -EINTR: consume_skb(skb); break; default: kfree_skb(skb); break; } stats_counter = &stats->n_missed; goto out; } ovs_flow_stats_update(flow, key->tp.flags, skb); sf_acts = rcu_dereference(flow->sf_acts); error = ovs_execute_actions(dp, skb, sf_acts, key); if (unlikely(error)) net_dbg_ratelimited("ovs: action execution error on datapath %s: %d\n", ovs_dp_name(dp), error); stats_counter = &stats->n_hit; out: /* Update datapath statistics. */ u64_stats_update_begin(&stats->syncp); (*stats_counter)++; stats->n_mask_hit += n_mask_hit; stats->n_cache_hit += n_cache_hit; u64_stats_update_end(&stats->syncp); } int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info, uint32_t cutlen) { struct dp_stats_percpu *stats; int err; if (trace_ovs_dp_upcall_enabled()) trace_ovs_dp_upcall(dp, skb, key, upcall_info); if (upcall_info->portid == 0) { err = -ENOTCONN; goto err; } if (!skb_is_gso(skb)) err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); else err = queue_gso_packets(dp, skb, key, upcall_info, cutlen); ovs_vport_update_upcall_stats(skb, upcall_info, !err); if (err) goto err; return 0; err: stats = this_cpu_ptr(dp->stats_percpu); u64_stats_update_begin(&stats->syncp); stats->n_lost++; u64_stats_update_end(&stats->syncp); return err; } static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info, uint32_t cutlen) { unsigned int gso_type = skb_shinfo(skb)->gso_type; struct sw_flow_key later_key; struct sk_buff *segs, *nskb; int err; BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_GSO_CB_OFFSET); segs = __skb_gso_segment(skb, NETIF_F_SG, false); if (IS_ERR(segs)) return PTR_ERR(segs); if (segs == NULL) return -EINVAL; if (gso_type & SKB_GSO_UDP) { /* The initial flow key extracted by ovs_flow_key_extract() * in this case is for a first fragment, so we need to * properly mark later fragments. */ later_key = *key; later_key.ip.frag = OVS_FRAG_TYPE_LATER; } /* Queue all of the segments. */ skb_list_walk_safe(segs, skb, nskb) { if (gso_type & SKB_GSO_UDP && skb != segs) key = &later_key; err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); if (err) break; } /* Free all of the segments. 
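 * On error the remaining segments are dropped with kfree_skb(); otherwise
 * consume_skb() is used, since the segment data has been queued successfully.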
*/ skb_list_walk_safe(segs, skb, nskb) { if (err) kfree_skb(skb); else consume_skb(skb); } return err; } static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, unsigned int hdrlen, int actions_attrlen) { size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ + nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */ + nla_total_size(sizeof(unsigned int)) /* OVS_PACKET_ATTR_LEN */ + nla_total_size(sizeof(u64)); /* OVS_PACKET_ATTR_HASH */ /* OVS_PACKET_ATTR_USERDATA */ if (upcall_info->userdata) size += NLA_ALIGN(upcall_info->userdata->nla_len); /* OVS_PACKET_ATTR_EGRESS_TUN_KEY */ if (upcall_info->egress_tun_info) size += nla_total_size(ovs_tun_key_attr_size()); /* OVS_PACKET_ATTR_ACTIONS */ if (upcall_info->actions_len) size += nla_total_size(actions_attrlen); /* OVS_PACKET_ATTR_MRU */ if (upcall_info->mru) size += nla_total_size(sizeof(upcall_info->mru)); return size; } static void pad_packet(struct datapath *dp, struct sk_buff *skb) { if (!(dp->user_features & OVS_DP_F_UNALIGNED)) { size_t plen = NLA_ALIGN(skb->len) - skb->len; if (plen > 0) skb_put_zero(skb, plen); } } static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info, uint32_t cutlen) { struct ovs_header *upcall; struct sk_buff *nskb = NULL; struct sk_buff *user_skb = NULL; /* to be queued to userspace */ struct nlattr *nla; size_t len; unsigned int hlen; int err, dp_ifindex; u64 hash; dp_ifindex = get_dpifindex(dp); if (!dp_ifindex) return -ENODEV; if (skb_vlan_tag_present(skb)) { nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return -ENOMEM; nskb = __vlan_hwaccel_push_inside(nskb); if (!nskb) return -ENOMEM; skb = nskb; } if (nla_attr_size(skb->len) > USHRT_MAX) { err = -EFBIG; goto out; } /* Complete checksum if needed */ if (skb->ip_summed == CHECKSUM_PARTIAL && (err = skb_csum_hwoffload_help(skb, 0))) goto out; /* Older versions of OVS user space enforce alignment of the last * Netlink attribute to NLA_ALIGNTO which would require extensive * padding logic. Only perform zerocopy if padding is not required. 
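 * (When padding is required, pad_packet() above zero-fills the packet
 * attribute up to NLA_ALIGNTO after the linear copy.)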
*/ if (dp->user_features & OVS_DP_F_UNALIGNED) hlen = skb_zerocopy_headlen(skb); else hlen = skb->len; len = upcall_msg_size(upcall_info, hlen - cutlen, OVS_CB(skb)->acts_origlen); user_skb = genlmsg_new(len, GFP_ATOMIC); if (!user_skb) { err = -ENOMEM; goto out; } upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family, 0, upcall_info->cmd); if (!upcall) { err = -EINVAL; goto out; } upcall->dp_ifindex = dp_ifindex; err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb); if (err) goto out; if (upcall_info->userdata) __nla_put(user_skb, OVS_PACKET_ATTR_USERDATA, nla_len(upcall_info->userdata), nla_data(upcall_info->userdata)); if (upcall_info->egress_tun_info) { nla = nla_nest_start_noflag(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY); if (!nla) { err = -EMSGSIZE; goto out; } err = ovs_nla_put_tunnel_info(user_skb, upcall_info->egress_tun_info); if (err) goto out; nla_nest_end(user_skb, nla); } if (upcall_info->actions_len) { nla = nla_nest_start_noflag(user_skb, OVS_PACKET_ATTR_ACTIONS); if (!nla) { err = -EMSGSIZE; goto out; } err = ovs_nla_put_actions(upcall_info->actions, upcall_info->actions_len, user_skb); if (!err) nla_nest_end(user_skb, nla); else nla_nest_cancel(user_skb, nla); } /* Add OVS_PACKET_ATTR_MRU */ if (upcall_info->mru && nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU, upcall_info->mru)) { err = -ENOBUFS; goto out; } /* Add OVS_PACKET_ATTR_LEN when packet is truncated */ if (cutlen > 0 && nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN, skb->len)) { err = -ENOBUFS; goto out; } /* Add OVS_PACKET_ATTR_HASH */ hash = skb_get_hash_raw(skb); if (skb->sw_hash) hash |= OVS_PACKET_HASH_SW_BIT; if (skb->l4_hash) hash |= OVS_PACKET_HASH_L4_BIT; if (nla_put(user_skb, OVS_PACKET_ATTR_HASH, sizeof (u64), &hash)) { err = -ENOBUFS; goto out; } /* Only reserve room for attribute header, packet data is added * in skb_zerocopy() */ if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { err = -ENOBUFS; goto out; } nla->nla_len = nla_attr_size(skb->len - cutlen); err = skb_zerocopy(user_skb, skb, skb->len - cutlen, hlen); if (err) goto out; /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */ pad_packet(dp, user_skb); ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len; err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid); user_skb = NULL; out: if (err) skb_tx_error(skb); consume_skb(user_skb); consume_skb(nskb); return err; } static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) { struct ovs_header *ovs_header = genl_info_userhdr(info); struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct sw_flow_actions *acts; struct sk_buff *packet; struct sw_flow *flow; struct sw_flow_actions *sf_acts; struct datapath *dp; struct vport *input_vport; u16 mru = 0; u64 hash; int len; int err; bool log = !a[OVS_PACKET_ATTR_PROBE]; err = -EINVAL; if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] || !a[OVS_PACKET_ATTR_ACTIONS]) goto err; len = nla_len(a[OVS_PACKET_ATTR_PACKET]); packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL); err = -ENOMEM; if (!packet) goto err; skb_reserve(packet, NET_IP_ALIGN); nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len); /* Set packet's mru */ if (a[OVS_PACKET_ATTR_MRU]) { mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]); packet->ignore_df = 1; } OVS_CB(packet)->mru = mru; if (a[OVS_PACKET_ATTR_HASH]) { hash = nla_get_u64(a[OVS_PACKET_ATTR_HASH]); __skb_set_hash(packet, hash & 0xFFFFFFFFULL, !!(hash & OVS_PACKET_HASH_SW_BIT), !!(hash & 
OVS_PACKET_HASH_L4_BIT)); } /* Build an sw_flow for sending this packet. */ flow = ovs_flow_alloc(); err = PTR_ERR(flow); if (IS_ERR(flow)) goto err_kfree_skb; err = ovs_flow_key_extract_userspace(net, a[OVS_PACKET_ATTR_KEY], packet, &flow->key, log); if (err) goto err_flow_free; err = ovs_nla_copy_actions(net, a[OVS_PACKET_ATTR_ACTIONS], &flow->key, &acts, log); if (err) goto err_flow_free; rcu_assign_pointer(flow->sf_acts, acts); packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; rcu_read_lock(); dp = get_dp_rcu(net, ovs_header->dp_ifindex); err = -ENODEV; if (!dp) goto err_unlock; input_vport = ovs_vport_rcu(dp, flow->key.phy.in_port); if (!input_vport) input_vport = ovs_vport_rcu(dp, OVSP_LOCAL); if (!input_vport) goto err_unlock; packet->dev = input_vport->dev; OVS_CB(packet)->input_vport = input_vport; sf_acts = rcu_dereference(flow->sf_acts); local_bh_disable(); err = ovs_execute_actions(dp, packet, sf_acts, &flow->key); local_bh_enable(); rcu_read_unlock(); ovs_flow_free(flow, false); return err; err_unlock: rcu_read_unlock(); err_flow_free: ovs_flow_free(flow, false); err_kfree_skb: kfree_skb(packet); err: return err; } static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { [OVS_PACKET_ATTR_PACKET] = { .len = ETH_HLEN }, [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG }, [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 }, [OVS_PACKET_ATTR_HASH] = { .type = NLA_U64 }, }; static const struct genl_small_ops dp_packet_genl_ops[] = { { .cmd = OVS_PACKET_CMD_EXECUTE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_packet_cmd_execute } }; static struct genl_family dp_packet_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_PACKET_FAMILY, .version = OVS_PACKET_VERSION, .maxattr = OVS_PACKET_ATTR_MAX, .policy = packet_policy, .netnsok = true, .parallel_ops = true, .small_ops = dp_packet_genl_ops, .n_small_ops = ARRAY_SIZE(dp_packet_genl_ops), .resv_start_op = OVS_PACKET_CMD_EXECUTE + 1, .module = THIS_MODULE, }; static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, struct ovs_dp_megaflow_stats *mega_stats) { int i; memset(mega_stats, 0, sizeof(*mega_stats)); stats->n_flows = ovs_flow_tbl_count(&dp->table); mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table); stats->n_hit = stats->n_missed = stats->n_lost = 0; for_each_possible_cpu(i) { const struct dp_stats_percpu *percpu_stats; struct dp_stats_percpu local_stats; unsigned int start; percpu_stats = per_cpu_ptr(dp->stats_percpu, i); do { start = u64_stats_fetch_begin(&percpu_stats->syncp); local_stats = *percpu_stats; } while (u64_stats_fetch_retry(&percpu_stats->syncp, start)); stats->n_hit += local_stats.n_hit; stats->n_missed += local_stats.n_missed; stats->n_lost += local_stats.n_lost; mega_stats->n_mask_hit += local_stats.n_mask_hit; mega_stats->n_cache_hit += local_stats.n_cache_hit; } } static bool should_fill_key(const struct sw_flow_id *sfid, uint32_t ufid_flags) { return ovs_identifier_is_ufid(sfid) && !(ufid_flags & OVS_UFID_F_OMIT_KEY); } static bool should_fill_mask(uint32_t ufid_flags) { return !(ufid_flags & OVS_UFID_F_OMIT_MASK); } static bool should_fill_actions(uint32_t ufid_flags) { return !(ufid_flags & OVS_UFID_F_OMIT_ACTIONS); } static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts, const struct sw_flow_id 
*sfid, uint32_t ufid_flags) { size_t len = NLMSG_ALIGN(sizeof(struct ovs_header)); /* OVS_FLOW_ATTR_UFID, or unmasked flow key as fallback * see ovs_nla_put_identifier() */ if (sfid && ovs_identifier_is_ufid(sfid)) len += nla_total_size(sfid->ufid_len); else len += nla_total_size(ovs_key_attr_size()); /* OVS_FLOW_ATTR_KEY */ if (!sfid || should_fill_key(sfid, ufid_flags)) len += nla_total_size(ovs_key_attr_size()); /* OVS_FLOW_ATTR_MASK */ if (should_fill_mask(ufid_flags)) len += nla_total_size(ovs_key_attr_size()); /* OVS_FLOW_ATTR_ACTIONS */ if (should_fill_actions(ufid_flags)) len += nla_total_size(acts->orig_len); return len + nla_total_size_64bit(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */ + nla_total_size_64bit(8); /* OVS_FLOW_ATTR_USED */ } /* Called with ovs_mutex or RCU read lock. */ static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow, struct sk_buff *skb) { struct ovs_flow_stats stats; __be16 tcp_flags; unsigned long used; ovs_flow_stats_get(flow, &stats, &used, &tcp_flags); if (used && nla_put_u64_64bit(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used), OVS_FLOW_ATTR_PAD)) return -EMSGSIZE; if (stats.n_packets && nla_put_64bit(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats, OVS_FLOW_ATTR_PAD)) return -EMSGSIZE; if ((u8)ntohs(tcp_flags) && nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags))) return -EMSGSIZE; return 0; } /* Called with ovs_mutex or RCU read lock. */ static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow, struct sk_buff *skb, int skb_orig_len) { struct nlattr *start; int err; /* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if * this is the first flow to be dumped into 'skb'. This is unusual for * Netlink but individual action lists can be longer than * NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this. * The userspace caller can always fetch the actions separately if it * really wants them. (Most userspace callers in fact don't care.) * * This can only fail for dump operations because the skb is always * properly sized for single flows. */ start = nla_nest_start_noflag(skb, OVS_FLOW_ATTR_ACTIONS); if (start) { const struct sw_flow_actions *sf_acts; sf_acts = rcu_dereference_ovsl(flow->sf_acts); err = ovs_nla_put_actions(sf_acts->actions, sf_acts->actions_len, skb); if (!err) nla_nest_end(skb, start); else { if (skb_orig_len) return err; nla_nest_cancel(skb, start); } } else if (skb_orig_len) { return -EMSGSIZE; } return 0; } /* Called with ovs_mutex or RCU read lock. */ static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex, struct sk_buff *skb, u32 portid, u32 seq, u32 flags, u8 cmd, u32 ufid_flags) { const int skb_orig_len = skb->len; struct ovs_header *ovs_header; int err; ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, flags, cmd); if (!ovs_header) return -EMSGSIZE; ovs_header->dp_ifindex = dp_ifindex; err = ovs_nla_put_identifier(flow, skb); if (err) goto error; if (should_fill_key(&flow->id, ufid_flags)) { err = ovs_nla_put_masked_key(flow, skb); if (err) goto error; } if (should_fill_mask(ufid_flags)) { err = ovs_nla_put_mask(flow, skb); if (err) goto error; } err = ovs_flow_cmd_fill_stats(flow, skb); if (err) goto error; if (should_fill_actions(ufid_flags)) { err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len); if (err) goto error; } genlmsg_end(skb, ovs_header); return 0; error: genlmsg_cancel(skb, ovs_header); return err; } /* May not be called with RCU read lock. 
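 * The reply is allocated with GFP_KERNEL and may therefore sleep.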
*/ static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *acts, const struct sw_flow_id *sfid, struct genl_info *info, bool always, uint32_t ufid_flags) { struct sk_buff *skb; size_t len; if (!always && !ovs_must_notify(&dp_flow_genl_family, info, 0)) return NULL; len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags); skb = genlmsg_new(len, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); return skb; } /* Called with ovs_mutex. */ static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow, int dp_ifindex, struct genl_info *info, u8 cmd, bool always, u32 ufid_flags) { struct sk_buff *skb; int retval; skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts), &flow->id, info, always, ufid_flags); if (IS_ERR_OR_NULL(skb)) return skb; retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb, info->snd_portid, info->snd_seq, 0, cmd, ufid_flags); if (WARN_ON_ONCE(retval < 0)) { kfree_skb(skb); skb = ERR_PTR(retval); } return skb; } static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct ovs_header *ovs_header = genl_info_userhdr(info); struct sw_flow *flow = NULL, *new_flow; struct sw_flow_mask mask; struct sk_buff *reply; struct datapath *dp; struct sw_flow_key *key; struct sw_flow_actions *acts; struct sw_flow_match match; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); int error; bool log = !a[OVS_FLOW_ATTR_PROBE]; /* Must have key and actions. */ error = -EINVAL; if (!a[OVS_FLOW_ATTR_KEY]) { OVS_NLERR(log, "Flow key attr not present in new flow."); goto error; } if (!a[OVS_FLOW_ATTR_ACTIONS]) { OVS_NLERR(log, "Flow actions attr not present in new flow."); goto error; } /* Most of the time we need to allocate a new flow, do it before * locking. */ new_flow = ovs_flow_alloc(); if (IS_ERR(new_flow)) { error = PTR_ERR(new_flow); goto error; } /* Extract key. */ key = kzalloc(sizeof(*key), GFP_KERNEL); if (!key) { error = -ENOMEM; goto err_kfree_flow; } ovs_match_init(&match, key, false, &mask); error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto err_kfree_key; ovs_flow_mask_key(&new_flow->key, key, true, &mask); /* Extract flow identifier. */ error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], key, log); if (error) goto err_kfree_key; /* Validate actions. */ error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, &acts, log); if (error) { OVS_NLERR(log, "Flow actions may not be safe on all matching packets."); goto err_kfree_key; } reply = ovs_flow_cmd_alloc_info(acts, &new_flow->id, info, false, ufid_flags); if (IS_ERR(reply)) { error = PTR_ERR(reply); goto err_kfree_acts; } ovs_lock(); dp = get_dp(net, ovs_header->dp_ifindex); if (unlikely(!dp)) { error = -ENODEV; goto err_unlock_ovs; } /* Check if this is a duplicate flow */ if (ovs_identifier_is_ufid(&new_flow->id)) flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id); if (!flow) flow = ovs_flow_tbl_lookup(&dp->table, key); if (likely(!flow)) { rcu_assign_pointer(new_flow->sf_acts, acts); /* Put flow in bucket. 
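 * The actions were already attached to new_flow via rcu_assign_pointer(), so
 * on insertion failure acts is cleared before taking the error path to avoid
 * freeing them twice.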
*/ error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask); if (unlikely(error)) { acts = NULL; goto err_unlock_ovs; } if (unlikely(reply)) { error = ovs_flow_cmd_fill_info(new_flow, ovs_header->dp_ifindex, reply, info->snd_portid, info->snd_seq, 0, OVS_FLOW_CMD_NEW, ufid_flags); BUG_ON(error < 0); } ovs_unlock(); } else { struct sw_flow_actions *old_acts; /* Bail out if we're not allowed to modify an existing flow. * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL * because Generic Netlink treats the latter as a dump * request. We also accept NLM_F_EXCL in case that bug ever * gets fixed. */ if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL))) { error = -EEXIST; goto err_unlock_ovs; } /* The flow identifier has to be the same for flow updates. * Look for any overlapping flow. */ if (unlikely(!ovs_flow_cmp(flow, &match))) { if (ovs_identifier_is_key(&flow->id)) flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); else /* UFID matches but key is different */ flow = NULL; if (!flow) { error = -ENOENT; goto err_unlock_ovs; } } /* Update actions. */ old_acts = ovsl_dereference(flow->sf_acts); rcu_assign_pointer(flow->sf_acts, acts); if (unlikely(reply)) { error = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, reply, info->snd_portid, info->snd_seq, 0, OVS_FLOW_CMD_NEW, ufid_flags); BUG_ON(error < 0); } ovs_unlock(); ovs_nla_free_flow_actions_rcu(old_acts); ovs_flow_free(new_flow, false); } if (reply) ovs_notify(&dp_flow_genl_family, reply, info); kfree(key); return 0; err_unlock_ovs: ovs_unlock(); kfree_skb(reply); err_kfree_acts: ovs_nla_free_flow_actions(acts); err_kfree_key: kfree(key); err_kfree_flow: ovs_flow_free(new_flow, false); error: return error; } /* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */ static noinline_for_stack struct sw_flow_actions *get_flow_actions(struct net *net, const struct nlattr *a, const struct sw_flow_key *key, const struct sw_flow_mask *mask, bool log) { struct sw_flow_actions *acts; struct sw_flow_key masked_key; int error; ovs_flow_mask_key(&masked_key, key, true, mask); error = ovs_nla_copy_actions(net, a, &masked_key, &acts, log); if (error) { OVS_NLERR(log, "Actions may not be safe on all matching packets"); return ERR_PTR(error); } return acts; } /* Factor out match-init and action-copy to avoid * "Wframe-larger-than=1024" warning. Because mask is only * used to get actions, we new a function to save some * stack space. * * If there are not key and action attrs, we return 0 * directly. In the case, the caller will also not use the * match as before. If there is action attr, we try to get * actions and save them to *acts. Before returning from * the function, we reset the match->mask pointer. Because * we should not to return match object with dangling reference * to mask. 
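 * In short: callers must not rely on match->mask after this helper returns.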
* */ static noinline_for_stack int ovs_nla_init_match_and_action(struct net *net, struct sw_flow_match *match, struct sw_flow_key *key, struct nlattr **a, struct sw_flow_actions **acts, bool log) { struct sw_flow_mask mask; int error = 0; if (a[OVS_FLOW_ATTR_KEY]) { ovs_match_init(match, key, true, &mask); error = ovs_nla_get_match(net, match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto error; } if (a[OVS_FLOW_ATTR_ACTIONS]) { if (!a[OVS_FLOW_ATTR_KEY]) { OVS_NLERR(log, "Flow key attribute not present in set flow."); error = -EINVAL; goto error; } *acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], key, &mask, log); if (IS_ERR(*acts)) { error = PTR_ERR(*acts); goto error; } } /* On success, error is 0. */ error: match->mask = NULL; return error; } static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct ovs_header *ovs_header = genl_info_userhdr(info); struct sw_flow_key key; struct sw_flow *flow; struct sk_buff *reply = NULL; struct datapath *dp; struct sw_flow_actions *old_acts = NULL, *acts = NULL; struct sw_flow_match match; struct sw_flow_id sfid; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); int error = 0; bool log = !a[OVS_FLOW_ATTR_PROBE]; bool ufid_present; ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); if (!a[OVS_FLOW_ATTR_KEY] && !ufid_present) { OVS_NLERR(log, "Flow set message rejected, Key attribute missing."); return -EINVAL; } error = ovs_nla_init_match_and_action(net, &match, &key, a, &acts, log); if (error) goto error; if (acts) { /* Can allocate before locking if have acts. */ reply = ovs_flow_cmd_alloc_info(acts, &sfid, info, false, ufid_flags); if (IS_ERR(reply)) { error = PTR_ERR(reply); goto err_kfree_acts; } } ovs_lock(); dp = get_dp(net, ovs_header->dp_ifindex); if (unlikely(!dp)) { error = -ENODEV; goto err_unlock_ovs; } /* Check that the flow exists. */ if (ufid_present) flow = ovs_flow_tbl_lookup_ufid(&dp->table, &sfid); else flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); if (unlikely(!flow)) { error = -ENOENT; goto err_unlock_ovs; } /* Update actions, if present. */ if (likely(acts)) { old_acts = ovsl_dereference(flow->sf_acts); rcu_assign_pointer(flow->sf_acts, acts); if (unlikely(reply)) { error = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, reply, info->snd_portid, info->snd_seq, 0, OVS_FLOW_CMD_SET, ufid_flags); BUG_ON(error < 0); } } else { /* Could not alloc without acts before locking. */ reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info, OVS_FLOW_CMD_SET, false, ufid_flags); if (IS_ERR(reply)) { error = PTR_ERR(reply); goto err_unlock_ovs; } } /* Clear stats. 
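 * OVS_FLOW_ATTR_CLEAR is a flag attribute; when present, the flow's
 * statistics are reset while the flow and its actions are left in place.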
*/ if (a[OVS_FLOW_ATTR_CLEAR]) ovs_flow_stats_clear(flow); ovs_unlock(); if (reply) ovs_notify(&dp_flow_genl_family, reply, info); if (old_acts) ovs_nla_free_flow_actions_rcu(old_acts); return 0; err_unlock_ovs: ovs_unlock(); kfree_skb(reply); err_kfree_acts: ovs_nla_free_flow_actions(acts); error: return error; } static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = genl_info_userhdr(info); struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; struct sw_flow *flow; struct datapath *dp; struct sw_flow_match match; struct sw_flow_id ufid; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); int err = 0; bool log = !a[OVS_FLOW_ATTR_PROBE]; bool ufid_present; ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { ovs_match_init(&match, &key, true, NULL); err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, log); } else if (!ufid_present) { OVS_NLERR(log, "Flow get message rejected, Key attribute missing."); err = -EINVAL; } if (err) return err; ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { err = -ENODEV; goto unlock; } if (ufid_present) flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid); else flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); if (!flow) { err = -ENOENT; goto unlock; } reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info, OVS_FLOW_CMD_GET, true, ufid_flags); if (IS_ERR(reply)) { err = PTR_ERR(reply); goto unlock; } ovs_unlock(); return genlmsg_reply(reply, info); unlock: ovs_unlock(); return err; } static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = genl_info_userhdr(info); struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; struct sw_flow *flow = NULL; struct datapath *dp; struct sw_flow_match match; struct sw_flow_id ufid; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); int err; bool log = !a[OVS_FLOW_ATTR_PROBE]; bool ufid_present; ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { ovs_match_init(&match, &key, true, NULL); err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, log); if (unlikely(err)) return err; } ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); if (unlikely(!dp)) { err = -ENODEV; goto unlock; } if (unlikely(!a[OVS_FLOW_ATTR_KEY] && !ufid_present)) { err = ovs_flow_tbl_flush(&dp->table); goto unlock; } if (ufid_present) flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid); else flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); if (unlikely(!flow)) { err = -ENOENT; goto unlock; } ovs_flow_tbl_remove(&dp->table, flow); ovs_unlock(); reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *) flow->sf_acts, &flow->id, info, false, ufid_flags); if (likely(reply)) { if (!IS_ERR(reply)) { rcu_read_lock(); /*To keep RCU checker happy. 
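 * The flow has already been removed from the table and ovs_lock dropped, but
 * ovs_flow_cmd_fill_info() still uses RCU accessors for the actions, so hold
 * rcu_read_lock() across the call.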
*/ err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, reply, info->snd_portid, info->snd_seq, 0, OVS_FLOW_CMD_DEL, ufid_flags); rcu_read_unlock(); if (WARN_ON_ONCE(err < 0)) { kfree_skb(reply); goto out_free; } ovs_notify(&dp_flow_genl_family, reply, info); } else { netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0, PTR_ERR(reply)); } } out_free: ovs_flow_free(flow, true); return 0; unlock: ovs_unlock(); return err; } static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *a[__OVS_FLOW_ATTR_MAX]; struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh)); struct table_instance *ti; struct datapath *dp; u32 ufid_flags; int err; err = genlmsg_parse_deprecated(cb->nlh, &dp_flow_genl_family, a, OVS_FLOW_ATTR_MAX, flow_policy, NULL); if (err) return err; ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); rcu_read_lock(); dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { rcu_read_unlock(); return -ENODEV; } ti = rcu_dereference(dp->table.ti); for (;;) { struct sw_flow *flow; u32 bucket, obj; bucket = cb->args[0]; obj = cb->args[1]; flow = ovs_flow_tbl_dump_next(ti, &bucket, &obj); if (!flow) break; if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, OVS_FLOW_CMD_GET, ufid_flags) < 0) break; cb->args[0] = bucket; cb->args[1] = obj; } rcu_read_unlock(); return skb->len; } static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED }, [OVS_FLOW_ATTR_MASK] = { .type = NLA_NESTED }, [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED }, [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG }, [OVS_FLOW_ATTR_PROBE] = { .type = NLA_FLAG }, [OVS_FLOW_ATTR_UFID] = { .type = NLA_UNSPEC, .len = 1 }, [OVS_FLOW_ATTR_UFID_FLAGS] = { .type = NLA_U32 }, }; static const struct genl_small_ops dp_flow_genl_ops[] = { { .cmd = OVS_FLOW_CMD_NEW, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_flow_cmd_new }, { .cmd = OVS_FLOW_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_flow_cmd_del }, { .cmd = OVS_FLOW_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ .doit = ovs_flow_cmd_get, .dumpit = ovs_flow_cmd_dump }, { .cmd = OVS_FLOW_CMD_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. 
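 * (GENL_UNS_ADMIN_PERM checks CAP_NET_ADMIN in the user namespace that owns
 * the socket's network namespace.)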
*/ .doit = ovs_flow_cmd_set, }, }; static struct genl_family dp_flow_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_FLOW_FAMILY, .version = OVS_FLOW_VERSION, .maxattr = OVS_FLOW_ATTR_MAX, .policy = flow_policy, .netnsok = true, .parallel_ops = true, .small_ops = dp_flow_genl_ops, .n_small_ops = ARRAY_SIZE(dp_flow_genl_ops), .resv_start_op = OVS_FLOW_CMD_SET + 1, .mcgrps = &ovs_dp_flow_multicast_group, .n_mcgrps = 1, .module = THIS_MODULE, }; static size_t ovs_dp_cmd_msg_size(void) { size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header)); msgsize += nla_total_size(IFNAMSIZ); msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_stats)); msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_megaflow_stats)); msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */ msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_MASKS_CACHE_SIZE */ msgsize += nla_total_size(sizeof(u32) * nr_cpu_ids); /* OVS_DP_ATTR_PER_CPU_PIDS */ return msgsize; } /* Called with ovs_mutex. */ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, u32 portid, u32 seq, u32 flags, u8 cmd) { struct ovs_header *ovs_header; struct ovs_dp_stats dp_stats; struct ovs_dp_megaflow_stats dp_megaflow_stats; struct dp_nlsk_pids *pids = ovsl_dereference(dp->upcall_portids); int err, pids_len; ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family, flags, cmd); if (!ovs_header) goto error; ovs_header->dp_ifindex = get_dpifindex(dp); err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp)); if (err) goto nla_put_failure; get_dp_stats(dp, &dp_stats, &dp_megaflow_stats); if (nla_put_64bit(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), &dp_stats, OVS_DP_ATTR_PAD)) goto nla_put_failure; if (nla_put_64bit(skb, OVS_DP_ATTR_MEGAFLOW_STATS, sizeof(struct ovs_dp_megaflow_stats), &dp_megaflow_stats, OVS_DP_ATTR_PAD)) goto nla_put_failure; if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features)) goto nla_put_failure; if (nla_put_u32(skb, OVS_DP_ATTR_MASKS_CACHE_SIZE, ovs_flow_tbl_masks_cache_size(&dp->table))) goto nla_put_failure; if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU && pids) { pids_len = min(pids->n_pids, nr_cpu_ids) * sizeof(u32); if (nla_put(skb, OVS_DP_ATTR_PER_CPU_PIDS, pids_len, &pids->pids)) goto nla_put_failure; } genlmsg_end(skb, ovs_header); return 0; nla_put_failure: genlmsg_cancel(skb, ovs_header); error: return -EMSGSIZE; } static struct sk_buff *ovs_dp_cmd_alloc_info(void) { return genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL); } /* Called with rcu_read_lock or ovs_mutex. */ static struct datapath *lookup_datapath(struct net *net, const struct ovs_header *ovs_header, struct nlattr *a[OVS_DP_ATTR_MAX + 1]) { struct datapath *dp; if (!a[OVS_DP_ATTR_NAME]) dp = get_dp(net, ovs_header->dp_ifindex); else { struct vport *vport; vport = ovs_vport_locate(net, nla_data(a[OVS_DP_ATTR_NAME])); dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL; } return dp ? 
dp : ERR_PTR(-ENODEV); } static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *info) { struct datapath *dp; dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), info->attrs); if (IS_ERR(dp)) return; pr_warn("%s: Dropping previously announced user features\n", ovs_dp_name(dp)); dp->user_features = 0; } static int ovs_dp_set_upcall_portids(struct datapath *dp, const struct nlattr *ids) { struct dp_nlsk_pids *old, *dp_nlsk_pids; if (!nla_len(ids) || nla_len(ids) % sizeof(u32)) return -EINVAL; old = ovsl_dereference(dp->upcall_portids); dp_nlsk_pids = kmalloc(sizeof(*dp_nlsk_pids) + nla_len(ids), GFP_KERNEL); if (!dp_nlsk_pids) return -ENOMEM; dp_nlsk_pids->n_pids = nla_len(ids) / sizeof(u32); nla_memcpy(dp_nlsk_pids->pids, ids, nla_len(ids)); rcu_assign_pointer(dp->upcall_portids, dp_nlsk_pids); kfree_rcu(old, rcu); return 0; } u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id) { struct dp_nlsk_pids *dp_nlsk_pids; dp_nlsk_pids = rcu_dereference(dp->upcall_portids); if (dp_nlsk_pids) { if (cpu_id < dp_nlsk_pids->n_pids) { return dp_nlsk_pids->pids[cpu_id]; } else if (dp_nlsk_pids->n_pids > 0 && cpu_id >= dp_nlsk_pids->n_pids) { /* If the number of netlink PIDs is mismatched with * the number of CPUs as seen by the kernel, log this * and send the upcall to an arbitrary socket (0) in * order to not drop packets */ pr_info_ratelimited("cpu_id mismatch with handler threads"); return dp_nlsk_pids->pids[cpu_id % dp_nlsk_pids->n_pids]; } else { return 0; } } else { return 0; } } static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) { u32 user_features = 0, old_features = dp->user_features; int err; if (a[OVS_DP_ATTR_USER_FEATURES]) { user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]); if (user_features & ~(OVS_DP_F_VPORT_PIDS | OVS_DP_F_UNALIGNED | OVS_DP_F_TC_RECIRC_SHARING | OVS_DP_F_DISPATCH_UPCALL_PER_CPU)) return -EOPNOTSUPP; #if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT) if (user_features & OVS_DP_F_TC_RECIRC_SHARING) return -EOPNOTSUPP; #endif } if (a[OVS_DP_ATTR_MASKS_CACHE_SIZE]) { int err; u32 cache_size; cache_size = nla_get_u32(a[OVS_DP_ATTR_MASKS_CACHE_SIZE]); err = ovs_flow_tbl_masks_cache_resize(&dp->table, cache_size); if (err) return err; } dp->user_features = user_features; if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU && a[OVS_DP_ATTR_PER_CPU_PIDS]) { /* Upcall Netlink Port IDs have been updated */ err = ovs_dp_set_upcall_portids(dp, a[OVS_DP_ATTR_PER_CPU_PIDS]); if (err) return err; } if ((dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) && !(old_features & OVS_DP_F_TC_RECIRC_SHARING)) tc_skb_ext_tc_enable(); else if (!(dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) && (old_features & OVS_DP_F_TC_RECIRC_SHARING)) tc_skb_ext_tc_disable(); return 0; } static int ovs_dp_stats_init(struct datapath *dp) { dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu); if (!dp->stats_percpu) return -ENOMEM; return 0; } static int ovs_dp_vport_init(struct datapath *dp) { int i; dp->ports = kmalloc_array(DP_VPORT_HASH_BUCKETS, sizeof(struct hlist_head), GFP_KERNEL); if (!dp->ports) return -ENOMEM; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) INIT_HLIST_HEAD(&dp->ports[i]); return 0; } static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct vport_parms parms; struct sk_buff *reply; struct datapath *dp; struct vport *vport; struct ovs_net *ovs_net; int err; err = -EINVAL; if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) goto err; reply = 
ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; err = -ENOMEM; dp = kzalloc(sizeof(*dp), GFP_KERNEL); if (dp == NULL) goto err_destroy_reply; ovs_dp_set_net(dp, sock_net(skb->sk)); /* Allocate table. */ err = ovs_flow_tbl_init(&dp->table); if (err) goto err_destroy_dp; err = ovs_dp_stats_init(dp); if (err) goto err_destroy_table; err = ovs_dp_vport_init(dp); if (err) goto err_destroy_stats; err = ovs_meters_init(dp); if (err) goto err_destroy_ports; /* Set up our datapath device. */ parms.name = nla_data(a[OVS_DP_ATTR_NAME]); parms.type = OVS_VPORT_TYPE_INTERNAL; parms.options = NULL; parms.dp = dp; parms.port_no = OVSP_LOCAL; parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID]; parms.desired_ifindex = nla_get_s32_default(a[OVS_DP_ATTR_IFINDEX], 0); /* So far only local changes have been made, now need the lock. */ ovs_lock(); err = ovs_dp_change(dp, a); if (err) goto err_unlock_and_destroy_meters; vport = new_vport(&parms); if (IS_ERR(vport)) { err = PTR_ERR(vport); if (err == -EBUSY) err = -EEXIST; if (err == -EEXIST) { /* An outdated user space instance that does not understand * the concept of user_features has attempted to create a new * datapath and is likely to reuse it. Drop all user features. */ if (info->genlhdr->version < OVS_DP_VER_FEATURES) ovs_dp_reset_user_features(skb, info); } goto err_destroy_portids; } err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, info->snd_seq, 0, OVS_DP_CMD_NEW); BUG_ON(err < 0); ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id); list_add_tail_rcu(&dp->list_node, &ovs_net->dps); ovs_unlock(); ovs_notify(&dp_datapath_genl_family, reply, info); return 0; err_destroy_portids: kfree(rcu_dereference_raw(dp->upcall_portids)); err_unlock_and_destroy_meters: ovs_unlock(); ovs_meters_exit(dp); err_destroy_ports: kfree(dp->ports); err_destroy_stats: free_percpu(dp->stats_percpu); err_destroy_table: ovs_flow_tbl_destroy(&dp->table); err_destroy_dp: kfree(dp); err_destroy_reply: kfree_skb(reply); err: return err; } /* Called with ovs_mutex. */ static void __dp_destroy(struct datapath *dp) { struct flow_table *table = &dp->table; int i; if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) tc_skb_ext_tc_disable(); for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { struct vport *vport; struct hlist_node *n; hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) if (vport->port_no != OVSP_LOCAL) ovs_dp_detach_port(vport); } list_del_rcu(&dp->list_node); /* OVSP_LOCAL is datapath internal port. We need to make sure that * all ports in datapath are destroyed first before freeing datapath. */ ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); /* Flush sw_flow in the tables. RCU cb only releases resource * such as dp, ports and tables. That may avoid some issues * such as RCU usage warning. */ table_instance_flow_flush(table, ovsl_dereference(table->ti), ovsl_dereference(table->ufid_ti)); /* RCU destroy the ports, meters and flow tables. 
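 * destroy_dp_rcu() runs after a grace period and frees the flow table,
 * per-CPU stats, port array, meters, upcall portids and the datapath itself.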
*/ call_rcu(&dp->rcu, destroy_dp_rcu); } static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *reply; struct datapath *dp; int err; reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; ovs_lock(); dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), info->attrs); err = PTR_ERR(dp); if (IS_ERR(dp)) goto err_unlock_free; err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, info->snd_seq, 0, OVS_DP_CMD_DEL); BUG_ON(err < 0); __dp_destroy(dp); ovs_unlock(); ovs_notify(&dp_datapath_genl_family, reply, info); return 0; err_unlock_free: ovs_unlock(); kfree_skb(reply); return err; } static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *reply; struct datapath *dp; int err; reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; ovs_lock(); dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), info->attrs); err = PTR_ERR(dp); if (IS_ERR(dp)) goto err_unlock_free; err = ovs_dp_change(dp, info->attrs); if (err) goto err_unlock_free; err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, info->snd_seq, 0, OVS_DP_CMD_SET); BUG_ON(err < 0); ovs_unlock(); ovs_notify(&dp_datapath_genl_family, reply, info); return 0; err_unlock_free: ovs_unlock(); kfree_skb(reply); return err; } static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *reply; struct datapath *dp; int err; reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; ovs_lock(); dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), info->attrs); if (IS_ERR(dp)) { err = PTR_ERR(dp); goto err_unlock_free; } err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, info->snd_seq, 0, OVS_DP_CMD_GET); BUG_ON(err < 0); ovs_unlock(); return genlmsg_reply(reply, info); err_unlock_free: ovs_unlock(); kfree_skb(reply); return err; } static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id); struct datapath *dp; int skip = cb->args[0]; int i = 0; ovs_lock(); list_for_each_entry(dp, &ovs_net->dps, list_node) { if (i >= skip && ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, OVS_DP_CMD_GET) < 0) break; i++; } ovs_unlock(); cb->args[0] = i; return skb->len; } static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 }, [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 }, [OVS_DP_ATTR_MASKS_CACHE_SIZE] = NLA_POLICY_RANGE(NLA_U32, 0, PCPU_MIN_UNIT_SIZE / sizeof(struct mask_cache_entry)), [OVS_DP_ATTR_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 0), }; static const struct genl_small_ops dp_datapath_genl_ops[] = { { .cmd = OVS_DP_CMD_NEW, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_dp_cmd_new }, { .cmd = OVS_DP_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_dp_cmd_del }, { .cmd = OVS_DP_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ .doit = ovs_dp_cmd_get, .dumpit = ovs_dp_cmd_dump }, { .cmd = OVS_DP_CMD_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. 
*/ .doit = ovs_dp_cmd_set, }, }; static struct genl_family dp_datapath_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_DATAPATH_FAMILY, .version = OVS_DATAPATH_VERSION, .maxattr = OVS_DP_ATTR_MAX, .policy = datapath_policy, .netnsok = true, .parallel_ops = true, .small_ops = dp_datapath_genl_ops, .n_small_ops = ARRAY_SIZE(dp_datapath_genl_ops), .resv_start_op = OVS_DP_CMD_SET + 1, .mcgrps = &ovs_dp_datapath_multicast_group, .n_mcgrps = 1, .module = THIS_MODULE, }; /* Called with ovs_mutex or RCU read lock. */ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, struct net *net, u32 portid, u32 seq, u32 flags, u8 cmd, gfp_t gfp) { struct ovs_header *ovs_header; struct ovs_vport_stats vport_stats; struct net *net_vport; int err; ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family, flags, cmd); if (!ovs_header) return -EMSGSIZE; ovs_header->dp_ifindex = get_dpifindex(vport->dp); if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) || nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) || nla_put_string(skb, OVS_VPORT_ATTR_NAME, ovs_vport_name(vport)) || nla_put_u32(skb, OVS_VPORT_ATTR_IFINDEX, vport->dev->ifindex)) goto nla_put_failure; rcu_read_lock(); net_vport = dev_net_rcu(vport->dev); if (!net_eq(net, net_vport)) { int id = peernet2id_alloc(net, net_vport, GFP_ATOMIC); if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id)) goto nla_put_failure_unlock; } rcu_read_unlock(); ovs_vport_get_stats(vport, &vport_stats); if (nla_put_64bit(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats), &vport_stats, OVS_VPORT_ATTR_PAD)) goto nla_put_failure; if (ovs_vport_get_upcall_stats(vport, skb)) goto nla_put_failure; if (ovs_vport_get_upcall_portids(vport, skb)) goto nla_put_failure; err = ovs_vport_get_options(vport, skb); if (err == -EMSGSIZE) goto error; genlmsg_end(skb, ovs_header); return 0; nla_put_failure_unlock: rcu_read_unlock(); nla_put_failure: err = -EMSGSIZE; error: genlmsg_cancel(skb, ovs_header); return err; } static struct sk_buff *ovs_vport_cmd_alloc_info(void) { return nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); } /* Called with ovs_mutex, only via ovs_dp_notify_wq(). */ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, u32 portid, u32 seq, u8 cmd) { struct sk_buff *skb; int retval; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd, GFP_KERNEL); BUG_ON(retval < 0); return skb; } /* Called with ovs_mutex or RCU read lock. 
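 * Vports are looked up either by OVS_VPORT_ATTR_NAME or by datapath ifindex
 * plus OVS_VPORT_ATTR_PORT_NO; OVS_VPORT_ATTR_IFINDEX is rejected here with
 * -EOPNOTSUPP.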
*/ static struct vport *lookup_vport(struct net *net, const struct ovs_header *ovs_header, struct nlattr *a[OVS_VPORT_ATTR_MAX + 1]) { struct datapath *dp; struct vport *vport; if (a[OVS_VPORT_ATTR_IFINDEX]) return ERR_PTR(-EOPNOTSUPP); if (a[OVS_VPORT_ATTR_NAME]) { vport = ovs_vport_locate(net, nla_data(a[OVS_VPORT_ATTR_NAME])); if (!vport) return ERR_PTR(-ENODEV); if (ovs_header->dp_ifindex && ovs_header->dp_ifindex != get_dpifindex(vport->dp)) return ERR_PTR(-ENODEV); return vport; } else if (a[OVS_VPORT_ATTR_PORT_NO]) { u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]); if (port_no >= DP_MAX_PORTS) return ERR_PTR(-EFBIG); dp = get_dp(net, ovs_header->dp_ifindex); if (!dp) return ERR_PTR(-ENODEV); vport = ovs_vport_ovsl_rcu(dp, port_no); if (!vport) return ERR_PTR(-ENODEV); return vport; } else return ERR_PTR(-EINVAL); } static unsigned int ovs_get_max_headroom(struct datapath *dp) { unsigned int dev_headroom, max_headroom = 0; struct net_device *dev; struct vport *vport; int i; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node, lockdep_ovsl_is_held()) { dev = vport->dev; dev_headroom = netdev_get_fwd_headroom(dev); if (dev_headroom > max_headroom) max_headroom = dev_headroom; } } return max_headroom; } /* Called with ovs_mutex */ static void ovs_update_headroom(struct datapath *dp, unsigned int new_headroom) { struct vport *vport; int i; dp->max_headroom = new_headroom; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node, lockdep_ovsl_is_held()) netdev_set_rx_headroom(vport->dev, new_headroom); } } static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = genl_info_userhdr(info); struct vport_parms parms; struct sk_buff *reply; struct vport *vport; struct datapath *dp; unsigned int new_headroom; u32 port_no; int err; if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] || !a[OVS_VPORT_ATTR_UPCALL_PID]) return -EINVAL; parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]); if (a[OVS_VPORT_ATTR_IFINDEX] && parms.type != OVS_VPORT_TYPE_INTERNAL) return -EOPNOTSUPP; port_no = nla_get_u32_default(a[OVS_VPORT_ATTR_PORT_NO], 0); if (port_no >= DP_MAX_PORTS) return -EFBIG; reply = ovs_vport_cmd_alloc_info(); if (!reply) return -ENOMEM; ovs_lock(); restart: dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); err = -ENODEV; if (!dp) goto exit_unlock_free; if (port_no) { vport = ovs_vport_ovsl(dp, port_no); err = -EBUSY; if (vport) goto exit_unlock_free; } else { for (port_no = 1; ; port_no++) { if (port_no >= DP_MAX_PORTS) { err = -EFBIG; goto exit_unlock_free; } vport = ovs_vport_ovsl(dp, port_no); if (!vport) break; } } parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]); parms.options = a[OVS_VPORT_ATTR_OPTIONS]; parms.dp = dp; parms.port_no = port_no; parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID]; parms.desired_ifindex = nla_get_s32_default(a[OVS_VPORT_ATTR_IFINDEX], 0); vport = new_vport(&parms); err = PTR_ERR(vport); if (IS_ERR(vport)) { if (err == -EAGAIN) goto restart; goto exit_unlock_free; } err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_NEW, GFP_KERNEL); new_headroom = netdev_get_fwd_headroom(vport->dev); if (new_headroom > dp->max_headroom) ovs_update_headroom(dp, new_headroom); else netdev_set_rx_headroom(vport->dev, dp->max_headroom); BUG_ON(err < 0); ovs_unlock(); ovs_notify(&dp_vport_genl_family, reply, info); return 0; 
exit_unlock_free: ovs_unlock(); kfree_skb(reply); return err; } static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct sk_buff *reply; struct vport *vport; int err; reply = ovs_vport_cmd_alloc_info(); if (!reply) return -ENOMEM; ovs_lock(); vport = lookup_vport(sock_net(skb->sk), genl_info_userhdr(info), a); err = PTR_ERR(vport); if (IS_ERR(vport)) goto exit_unlock_free; if (a[OVS_VPORT_ATTR_TYPE] && nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type) { err = -EINVAL; goto exit_unlock_free; } if (a[OVS_VPORT_ATTR_OPTIONS]) { err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]); if (err) goto exit_unlock_free; } if (a[OVS_VPORT_ATTR_UPCALL_PID]) { struct nlattr *ids = a[OVS_VPORT_ATTR_UPCALL_PID]; err = ovs_vport_set_upcall_portids(vport, ids); if (err) goto exit_unlock_free; } err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_SET, GFP_KERNEL); BUG_ON(err < 0); ovs_unlock(); ovs_notify(&dp_vport_genl_family, reply, info); return 0; exit_unlock_free: ovs_unlock(); kfree_skb(reply); return err; } static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) { bool update_headroom = false; struct nlattr **a = info->attrs; struct sk_buff *reply; struct datapath *dp; struct vport *vport; unsigned int new_headroom; int err; reply = ovs_vport_cmd_alloc_info(); if (!reply) return -ENOMEM; ovs_lock(); vport = lookup_vport(sock_net(skb->sk), genl_info_userhdr(info), a); err = PTR_ERR(vport); if (IS_ERR(vport)) goto exit_unlock_free; if (vport->port_no == OVSP_LOCAL) { err = -EINVAL; goto exit_unlock_free; } err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_DEL, GFP_KERNEL); BUG_ON(err < 0); /* the vport deletion may trigger dp headroom update */ dp = vport->dp; if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom) update_headroom = true; netdev_reset_rx_headroom(vport->dev); ovs_dp_detach_port(vport); if (update_headroom) { new_headroom = ovs_get_max_headroom(dp); if (new_headroom < dp->max_headroom) ovs_update_headroom(dp, new_headroom); } ovs_unlock(); ovs_notify(&dp_vport_genl_family, reply, info); return 0; exit_unlock_free: ovs_unlock(); kfree_skb(reply); return err; } static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = genl_info_userhdr(info); struct sk_buff *reply; struct vport *vport; int err; reply = ovs_vport_cmd_alloc_info(); if (!reply) return -ENOMEM; rcu_read_lock(); vport = lookup_vport(sock_net(skb->sk), ovs_header, a); err = PTR_ERR(vport); if (IS_ERR(vport)) goto exit_unlock_free; err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_GET, GFP_ATOMIC); BUG_ON(err < 0); rcu_read_unlock(); return genlmsg_reply(reply, info); exit_unlock_free: rcu_read_unlock(); kfree_skb(reply); return err; } static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh)); struct datapath *dp; int bucket = cb->args[0], skip = cb->args[1]; int i, j = 0; rcu_read_lock(); dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { rcu_read_unlock(); return -ENODEV; } for (i = bucket; i < DP_VPORT_HASH_BUCKETS; i++) { struct vport *vport; j = 0; hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) { if (j >= skip && 
ovs_vport_cmd_fill_info(vport, skb, sock_net(skb->sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, OVS_VPORT_CMD_GET, GFP_ATOMIC) < 0) goto out; j++; } skip = 0; } out: rcu_read_unlock(); cb->args[0] = i; cb->args[1] = j; return skb->len; } static void ovs_dp_masks_rebalance(struct work_struct *work) { struct ovs_net *ovs_net = container_of(work, struct ovs_net, masks_rebalance.work); struct datapath *dp; ovs_lock(); list_for_each_entry(dp, &ovs_net->dps, list_node) ovs_flow_masks_rebalance(&dp->table); ovs_unlock(); schedule_delayed_work(&ovs_net->masks_rebalance, msecs_to_jiffies(DP_MASKS_REBALANCE_INTERVAL)); } static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_UNSPEC }, [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, [OVS_VPORT_ATTR_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 0), [OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 }, [OVS_VPORT_ATTR_UPCALL_STATS] = { .type = NLA_NESTED }, }; static const struct genl_small_ops dp_vport_genl_ops[] = { { .cmd = OVS_VPORT_CMD_NEW, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_vport_cmd_new }, { .cmd = OVS_VPORT_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_vport_cmd_del }, { .cmd = OVS_VPORT_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ .doit = ovs_vport_cmd_get, .dumpit = ovs_vport_cmd_dump }, { .cmd = OVS_VPORT_CMD_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. 
*/ .doit = ovs_vport_cmd_set, }, }; struct genl_family dp_vport_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_VPORT_FAMILY, .version = OVS_VPORT_VERSION, .maxattr = OVS_VPORT_ATTR_MAX, .policy = vport_policy, .netnsok = true, .parallel_ops = true, .small_ops = dp_vport_genl_ops, .n_small_ops = ARRAY_SIZE(dp_vport_genl_ops), .resv_start_op = OVS_VPORT_CMD_SET + 1, .mcgrps = &ovs_dp_vport_multicast_group, .n_mcgrps = 1, .module = THIS_MODULE, }; static struct genl_family * const dp_genl_families[] = { &dp_datapath_genl_family, &dp_vport_genl_family, &dp_flow_genl_family, &dp_packet_genl_family, &dp_meter_genl_family, #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) &dp_ct_limit_genl_family, #endif }; static void dp_unregister_genl(int n_families) { int i; for (i = 0; i < n_families; i++) genl_unregister_family(dp_genl_families[i]); } static int __init dp_register_genl(void) { int err; int i; for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) { err = genl_register_family(dp_genl_families[i]); if (err) goto error; } return 0; error: dp_unregister_genl(i); return err; } static int __net_init ovs_init_net(struct net *net) { struct ovs_net *ovs_net = net_generic(net, ovs_net_id); int err; INIT_LIST_HEAD(&ovs_net->dps); INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq); INIT_DELAYED_WORK(&ovs_net->masks_rebalance, ovs_dp_masks_rebalance); err = ovs_ct_init(net); if (err) return err; schedule_delayed_work(&ovs_net->masks_rebalance, msecs_to_jiffies(DP_MASKS_REBALANCE_INTERVAL)); return 0; } static void __net_exit list_vports_from_net(struct net *net, struct net *dnet, struct list_head *head) { struct ovs_net *ovs_net = net_generic(net, ovs_net_id); struct datapath *dp; list_for_each_entry(dp, &ovs_net->dps, list_node) { int i; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { struct vport *vport; hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) { if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL) continue; if (dev_net(vport->dev) == dnet) list_add(&vport->detach_list, head); } } } } static void __net_exit ovs_exit_net(struct net *dnet) { struct datapath *dp, *dp_next; struct ovs_net *ovs_net = net_generic(dnet, ovs_net_id); struct vport *vport, *vport_next; struct net *net; LIST_HEAD(head); ovs_lock(); ovs_ct_exit(dnet); list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) __dp_destroy(dp); down_read(&net_rwsem); for_each_net(net) list_vports_from_net(net, dnet, &head); up_read(&net_rwsem); /* Detach all vports from given namespace. 
*/ list_for_each_entry_safe(vport, vport_next, &head, detach_list) { list_del(&vport->detach_list); ovs_dp_detach_port(vport); } ovs_unlock(); cancel_delayed_work_sync(&ovs_net->masks_rebalance); cancel_work_sync(&ovs_net->dp_notify_work); } static struct pernet_operations ovs_net_ops = { .init = ovs_init_net, .exit = ovs_exit_net, .id = &ovs_net_id, .size = sizeof(struct ovs_net), }; static const char * const ovs_drop_reasons[] = { #define S(x) [(x) & ~SKB_DROP_REASON_SUBSYS_MASK] = (#x), OVS_DROP_REASONS(S) #undef S }; static struct drop_reason_list drop_reason_list_ovs = { .reasons = ovs_drop_reasons, .n_reasons = ARRAY_SIZE(ovs_drop_reasons), }; static int __init dp_init(void) { int err; BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > sizeof_field(struct sk_buff, cb)); pr_info("Open vSwitch switching datapath\n"); err = action_fifos_init(); if (err) goto error; err = ovs_internal_dev_rtnl_link_register(); if (err) goto error_action_fifos_exit; err = ovs_flow_init(); if (err) goto error_unreg_rtnl_link; err = ovs_vport_init(); if (err) goto error_flow_exit; err = register_pernet_device(&ovs_net_ops); if (err) goto error_vport_exit; err = register_netdevice_notifier(&ovs_dp_device_notifier); if (err) goto error_netns_exit; err = ovs_netdev_init(); if (err) goto error_unreg_notifier; err = dp_register_genl(); if (err < 0) goto error_unreg_netdev; drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_OPENVSWITCH, &drop_reason_list_ovs); return 0; error_unreg_netdev: ovs_netdev_exit(); error_unreg_notifier: unregister_netdevice_notifier(&ovs_dp_device_notifier); error_netns_exit: unregister_pernet_device(&ovs_net_ops); error_vport_exit: ovs_vport_exit(); error_flow_exit: ovs_flow_exit(); error_unreg_rtnl_link: ovs_internal_dev_rtnl_link_unregister(); error_action_fifos_exit: action_fifos_exit(); error: return err; } static void dp_cleanup(void) { dp_unregister_genl(ARRAY_SIZE(dp_genl_families)); ovs_netdev_exit(); unregister_netdevice_notifier(&ovs_dp_device_notifier); unregister_pernet_device(&ovs_net_ops); drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_OPENVSWITCH); rcu_barrier(); ovs_vport_exit(); ovs_flow_exit(); ovs_internal_dev_rtnl_link_unregister(); action_fifos_exit(); } module_init(dp_init); module_exit(dp_cleanup); MODULE_DESCRIPTION("Open vSwitch switching datapath"); MODULE_LICENSE("GPL"); MODULE_ALIAS_GENL_FAMILY(OVS_DATAPATH_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_METER_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_CT_LIMIT_FAMILY);
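/*
 * Illustrative sketch, not part of the original file: every datapath and
 * vport command handler above follows the same Generic Netlink pattern --
 * pre-allocate the reply skb with GFP_KERNEL *before* taking ovs_lock so no
 * sleeping allocation happens under the mutex, look the object up, fill the
 * pre-sized reply (a fill failure would be a sizing bug, hence BUG_ON), then
 * unlock and notify.  The command name OVS_DP_CMD_FOO and the function
 * ovs_dp_cmd_foo() below are hypothetical; the body only re-uses helpers
 * that appear in this file.
 */
#if 0   /* example only, never compiled */
static int ovs_dp_cmd_foo(struct sk_buff *skb, struct genl_info *info)
{
    struct sk_buff *reply;
    struct datapath *dp;
    int err;

    /* GFP_KERNEL allocation happens before ovs_lock() is taken. */
    reply = ovs_dp_cmd_alloc_info();
    if (!reply)
        return -ENOMEM;

    ovs_lock();
    dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info),
                 info->attrs);
    if (IS_ERR(dp)) {
        err = PTR_ERR(dp);
        goto err_unlock_free;
    }

    /* Fill the pre-sized reply; failure here would be a sizing bug. */
    err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
                   info->snd_seq, 0, OVS_DP_CMD_GET);
    BUG_ON(err < 0);

    ovs_unlock();
    ovs_notify(&dp_datapath_genl_family, reply, info);
    return 0;

err_unlock_free:
    ovs_unlock();
    kfree_skb(reply);
    return err;
}
#endif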
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) /* Copyright (C) 2016-2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * * SipHash: a fast short-input PRF * https://131002.net/siphash/ * * This implementation is specifically for SipHash2-4 for a secure PRF * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for * hashtables.
*/ #include <linux/siphash.h> #include <linux/unaligned.h> #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 #include <linux/dcache.h> #include <asm/word-at-a-time.h> #endif #define SIPROUND SIPHASH_PERMUTATION(v0, v1, v2, v3) #define PREAMBLE(len) \ u64 v0 = SIPHASH_CONST_0; \ u64 v1 = SIPHASH_CONST_1; \ u64 v2 = SIPHASH_CONST_2; \ u64 v3 = SIPHASH_CONST_3; \ u64 b = ((u64)(len)) << 56; \ v3 ^= key->key[1]; \ v2 ^= key->key[0]; \ v1 ^= key->key[1]; \ v0 ^= key->key[0]; #define POSTAMBLE \ v3 ^= b; \ SIPROUND; \ SIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ SIPROUND; \ SIPROUND; \ SIPROUND; \ SIPROUND; \ return (v0 ^ v1) ^ (v2 ^ v3); #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; PREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = le64_to_cpup(data); v3 ^= m; SIPROUND; SIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= le32_to_cpup(data); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } #endif POSTAMBLE } EXPORT_SYMBOL(__siphash_aligned); #endif u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; PREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = get_unaligned_le64(data); v3 ^= m; SIPROUND; SIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= get_unaligned_le32(end); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } #endif POSTAMBLE } EXPORT_SYMBOL(__siphash_unaligned); /** * siphash_1u64 - compute 64-bit siphash PRF value of a u64 * @first: first u64 * @key: the siphash key */ u64 siphash_1u64(const u64 first, const siphash_key_t *key) { PREAMBLE(8) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; POSTAMBLE } EXPORT_SYMBOL(siphash_1u64); /** * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64 * @first: first u64 * @second: second u64 * @key: the siphash key */ u64 siphash_2u64(const u64 first, const u64 second, const siphash_key_t *key) { PREAMBLE(16) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; POSTAMBLE } EXPORT_SYMBOL(siphash_2u64); /** * siphash_3u64 - compute 64-bit siphash PRF value of 3 u64 * @first: first u64 * @second: second u64 * @third: third u64 * @key: the siphash key */ u64 siphash_3u64(const u64 first, const u64 second, const u64 third, const siphash_key_t *key) { PREAMBLE(24) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; v3 ^= third; SIPROUND; SIPROUND; v0 ^= third; POSTAMBLE } EXPORT_SYMBOL(siphash_3u64); /** * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64 * @first: first u64 * @second: 
second u64 * @third: third u64 * @forth: forth u64 * @key: the siphash key */ u64 siphash_4u64(const u64 first, const u64 second, const u64 third, const u64 forth, const siphash_key_t *key) { PREAMBLE(32) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; v3 ^= third; SIPROUND; SIPROUND; v0 ^= third; v3 ^= forth; SIPROUND; SIPROUND; v0 ^= forth; POSTAMBLE } EXPORT_SYMBOL(siphash_4u64); u64 siphash_1u32(const u32 first, const siphash_key_t *key) { PREAMBLE(4) b |= first; POSTAMBLE } EXPORT_SYMBOL(siphash_1u32); u64 siphash_3u32(const u32 first, const u32 second, const u32 third, const siphash_key_t *key) { u64 combined = (u64)second << 32 | first; PREAMBLE(12) v3 ^= combined; SIPROUND; SIPROUND; v0 ^= combined; b |= third; POSTAMBLE } EXPORT_SYMBOL(siphash_3u32); #if BITS_PER_LONG == 64 /* Note that on 64-bit, we make HalfSipHash1-3 actually be SipHash1-3, for * performance reasons. On 32-bit, below, we actually implement HalfSipHash1-3. */ #define HSIPROUND SIPROUND #define HPREAMBLE(len) PREAMBLE(len) #define HPOSTAMBLE \ v3 ^= b; \ HSIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ HSIPROUND; \ HSIPROUND; \ HSIPROUND; \ return (v0 ^ v1) ^ (v2 ^ v3); #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; HPREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = le64_to_cpup(data); v3 ^= m; HSIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= le32_to_cpup(data); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } #endif HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_aligned); #endif u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; HPREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = get_unaligned_le64(data); v3 ^= m; HSIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= get_unaligned_le32(end); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } #endif HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_unaligned); /** * hsiphash_1u32 - compute 64-bit hsiphash PRF value of a u32 * @first: first u32 * @key: the hsiphash key */ u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) { HPREAMBLE(4) b |= first; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_1u32); /** * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 * @first: first u32 * @second: second u32 * @key: the hsiphash key */ u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(8) v3 ^= combined; HSIPROUND; v0 ^= combined; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_2u32); /** * hsiphash_3u32 - 
compute 32-bit hsiphash PRF value of 3 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @key: the hsiphash key */ u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(12) v3 ^= combined; HSIPROUND; v0 ^= combined; b |= third; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_3u32); /** * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @forth: forth u32 * @key: the hsiphash key */ u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, const u32 forth, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(16) v3 ^= combined; HSIPROUND; v0 ^= combined; combined = (u64)forth << 32 | third; v3 ^= combined; HSIPROUND; v0 ^= combined; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_4u32); #else #define HSIPROUND HSIPHASH_PERMUTATION(v0, v1, v2, v3) #define HPREAMBLE(len) \ u32 v0 = HSIPHASH_CONST_0; \ u32 v1 = HSIPHASH_CONST_1; \ u32 v2 = HSIPHASH_CONST_2; \ u32 v3 = HSIPHASH_CONST_3; \ u32 b = ((u32)(len)) << 24; \ v3 ^= key->key[1]; \ v2 ^= key->key[0]; \ v1 ^= key->key[1]; \ v0 ^= key->key[0]; #define HPOSTAMBLE \ v3 ^= b; \ HSIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ HSIPROUND; \ HSIPROUND; \ HSIPROUND; \ return v1 ^ v3; #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u32)); const u8 left = len & (sizeof(u32) - 1); u32 m; HPREAMBLE(len) for (; data != end; data += sizeof(u32)) { m = le32_to_cpup(data); v3 ^= m; HSIPROUND; v0 ^= m; } switch (left) { case 3: b |= ((u32)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_aligned); #endif u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u32)); const u8 left = len & (sizeof(u32) - 1); u32 m; HPREAMBLE(len) for (; data != end; data += sizeof(u32)) { m = get_unaligned_le32(data); v3 ^= m; HSIPROUND; v0 ^= m; } switch (left) { case 3: b |= ((u32)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_unaligned); /** * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32 * @first: first u32 * @key: the hsiphash key */ u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) { HPREAMBLE(4) v3 ^= first; HSIPROUND; v0 ^= first; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_1u32); /** * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 * @first: first u32 * @second: second u32 * @key: the hsiphash key */ u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) { HPREAMBLE(8) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_2u32); /** * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @key: the hsiphash key */ u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, const hsiphash_key_t *key) { HPREAMBLE(12) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; v3 ^= third; HSIPROUND; v0 ^= third; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_3u32); /** * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @forth: forth u32 * @key: the hsiphash 
key */ u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, const u32 forth, const hsiphash_key_t *key) { HPREAMBLE(16) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; v3 ^= third; HSIPROUND; v0 ^= third; v3 ^= forth; HSIPROUND; v0 ^= forth; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_4u32); #endif
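/*
 * Illustrative sketch, not part of the original file: typical use of the
 * siphash API declared in <linux/siphash.h>.  The key is generated once and
 * kept secret; siphash() handles arbitrary buffers, while the fixed-width
 * helpers defined above (siphash_1u64() etc.) are fast paths.  The names
 * example_key, example_hash_init() and example_hash_flow() are hypothetical.
 */
#if 0   /* example only, never compiled */
#include <linux/siphash.h>
#include <linux/random.h>

static siphash_key_t example_key __read_mostly;

static void example_hash_init(void)
{
    /* One-time secret key; without a secret key SipHash is just a hash. */
    get_random_bytes(&example_key, sizeof(example_key));
}

static u32 example_hash_flow(const void *hdr, size_t len, u64 cookie)
{
    /* Arbitrary-length input ... */
    u64 h = siphash(hdr, len, &example_key);

    /* ... combined with the fixed-width fast path for a single u64. */
    h ^= siphash_1u64(cookie, &example_key);

    /* Fold to 32 bits, e.g. for use as a hashtable bucket index. */
    return (u32)h ^ (u32)(h >> 32);
}
#endif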
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __KVM_X86_VMX_CAPS_H #define __KVM_X86_VMX_CAPS_H #include <asm/vmx.h> #include "../lapic.h" #include "../x86.h" #include "../pmu.h" #include "../cpuid.h" extern bool __read_mostly enable_vpid; extern bool __read_mostly flexpriority_enabled; extern bool __read_mostly enable_ept; extern bool __read_mostly enable_unrestricted_guest; extern bool __read_mostly enable_ept_ad_bits; extern bool __read_mostly enable_pml; extern bool __read_mostly enable_ipiv; extern int __read_mostly pt_mode; #define PT_MODE_SYSTEM 0 #define PT_MODE_HOST_GUEST 1 #define PMU_CAP_FW_WRITES (1ULL << 13) #define PMU_CAP_LBR_FMT 0x3f struct nested_vmx_msrs { /* * We only store the "true" versions of the VMX capability MSRs. We * generate the "non-true" versions by setting the must-be-1 bits * according to the SDM.
*/ u32 procbased_ctls_low; u32 procbased_ctls_high; u32 secondary_ctls_low; u32 secondary_ctls_high; u32 pinbased_ctls_low; u32 pinbased_ctls_high; u32 exit_ctls_low; u32 exit_ctls_high; u32 entry_ctls_low; u32 entry_ctls_high; u32 misc_low; u32 misc_high; u32 ept_caps; u32 vpid_caps; u64 basic; u64 cr0_fixed0; u64 cr0_fixed1; u64 cr4_fixed0; u64 cr4_fixed1; u64 vmcs_enum; u64 vmfunc_controls; }; struct vmcs_config { u64 basic; u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; u32 cpu_based_2nd_exec_ctrl; u64 cpu_based_3rd_exec_ctrl; u32 vmexit_ctrl; u32 vmentry_ctrl; u64 misc; struct nested_vmx_msrs nested; }; extern struct vmcs_config vmcs_config __ro_after_init; struct vmx_capability { u32 ept; u32 vpid; }; extern struct vmx_capability vmx_capability __ro_after_init; static inline bool cpu_has_vmx_basic_inout(void) { return vmcs_config.basic & VMX_BASIC_INOUT; } static inline bool cpu_has_virtual_nmis(void) { return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS && vmcs_config.cpu_based_exec_ctrl & CPU_BASED_NMI_WINDOW_EXITING; } static inline bool cpu_has_vmx_preemption_timer(void) { return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VMX_PREEMPTION_TIMER; } static inline bool cpu_has_vmx_posted_intr(void) { return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; } static inline bool cpu_has_load_ia32_efer(void) { return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER; } static inline bool cpu_has_load_perf_global_ctrl(void) { return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; } static inline bool cpu_has_vmx_mpx(void) { return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS; } static inline bool cpu_has_vmx_tpr_shadow(void) { return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; } static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu) { return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu); } static inline bool cpu_has_vmx_msr_bitmap(void) { return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; } static inline bool cpu_has_secondary_exec_ctrls(void) { return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; } static inline bool cpu_has_tertiary_exec_ctrls(void) { return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS; } static inline bool cpu_has_vmx_virtualize_apic_accesses(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; } static inline bool cpu_has_vmx_ept(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_EPT; } static inline bool vmx_umip_emulated(void) { return !boot_cpu_has(X86_FEATURE_UMIP) && (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_DESC); } static inline bool cpu_has_vmx_rdtscp(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_RDTSCP; } static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; } static inline bool cpu_has_vmx_vpid(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_VPID; } static inline bool cpu_has_vmx_wbinvd_exit(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_WBINVD_EXITING; } static inline bool cpu_has_vmx_unrestricted_guest(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_UNRESTRICTED_GUEST; } static inline bool cpu_has_vmx_apic_register_virt(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_APIC_REGISTER_VIRT; } static inline bool 
cpu_has_vmx_virtual_intr_delivery(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; } static inline bool cpu_has_vmx_ple(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PAUSE_LOOP_EXITING; } static inline bool cpu_has_vmx_rdrand(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_RDRAND_EXITING; } static inline bool cpu_has_vmx_invpcid(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_INVPCID; } static inline bool cpu_has_vmx_vmfunc(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_VMFUNC; } static inline bool cpu_has_vmx_shadow_vmcs(void) { /* check if the cpu supports writing r/o exit information fields */ if (!(vmcs_config.misc & VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) return false; return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_SHADOW_VMCS; } static inline bool cpu_has_vmx_encls_vmexit(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENCLS_EXITING; } static inline bool cpu_has_vmx_rdseed(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_RDSEED_EXITING; } static inline bool cpu_has_vmx_pml(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; } static inline bool cpu_has_vmx_xsaves(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_XSAVES; } static inline bool cpu_has_vmx_waitpkg(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; } static inline bool cpu_has_vmx_tsc_scaling(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_TSC_SCALING; } static inline bool cpu_has_vmx_bus_lock_detection(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_BUS_LOCK_DETECTION; } static inline bool cpu_has_vmx_apicv(void) { return cpu_has_vmx_apic_register_virt() && cpu_has_vmx_virtual_intr_delivery() && cpu_has_vmx_posted_intr(); } static inline bool cpu_has_vmx_ipiv(void) { return vmcs_config.cpu_based_3rd_exec_ctrl & TERTIARY_EXEC_IPI_VIRT; } static inline bool cpu_has_vmx_flexpriority(void) { return cpu_has_vmx_tpr_shadow() && cpu_has_vmx_virtualize_apic_accesses(); } static inline bool cpu_has_vmx_ept_execute_only(void) { return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; } static inline bool cpu_has_vmx_ept_4levels(void) { return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; } static inline bool cpu_has_vmx_ept_5levels(void) { return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT; } static inline bool cpu_has_vmx_ept_mt_wb(void) { return vmx_capability.ept & VMX_EPTP_WB_BIT; } static inline bool cpu_has_vmx_ept_2m_page(void) { return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; } static inline bool cpu_has_vmx_ept_1g_page(void) { return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; } static inline int ept_caps_to_lpage_level(u32 ept_caps) { if (ept_caps & VMX_EPT_1GB_PAGE_BIT) return PG_LEVEL_1G; if (ept_caps & VMX_EPT_2MB_PAGE_BIT) return PG_LEVEL_2M; return PG_LEVEL_4K; } static inline bool cpu_has_vmx_ept_ad_bits(void) { return vmx_capability.ept & VMX_EPT_AD_BIT; } static inline bool cpu_has_vmx_invept_context(void) { return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; } static inline bool cpu_has_vmx_invept_global(void) { return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; } static inline bool cpu_has_vmx_invvpid(void) { return vmx_capability.vpid & VMX_VPID_INVVPID_BIT; } static inline bool cpu_has_vmx_invvpid_individual_addr(void) { return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT; } 
static inline bool cpu_has_vmx_invvpid_single(void) { return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; } static inline bool cpu_has_vmx_invvpid_global(void) { return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; } static inline bool cpu_has_vmx_intel_pt(void) { return (vmcs_config.misc & VMX_MISC_INTEL_PT) && (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) && (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL); } /* * Processor Trace can operate in one of three modes: * a. system-wide: trace both host/guest and output to host buffer * b. host-only: only trace host and output to host buffer * c. host-guest: trace host and guest simultaneously and output to their * respective buffer * * KVM currently only supports (a) and (c). */ static inline bool vmx_pt_mode_is_system(void) { return pt_mode == PT_MODE_SYSTEM; } static inline bool vmx_pt_mode_is_host_guest(void) { return pt_mode == PT_MODE_HOST_GUEST; } static inline bool vmx_pebs_supported(void) { return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept; } static inline bool cpu_has_notify_vmexit(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_NOTIFY_VM_EXITING; } #endif /* __KVM_X86_VMX_CAPS_H */
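/*
 * Illustrative sketch, not part of the original file: the cpu_has_vmx_*()
 * helpers above are consumed by VMX setup code, which typically clears a
 * module parameter when the corresponding capability is missing.  The
 * function name example_clamp_features() is hypothetical; in KVM this kind
 * of clamping is done during hardware setup in arch/x86/kvm/vmx/vmx.c, and
 * the exact set of checks there may differ from this sketch.
 */
#if 0   /* example only, never compiled */
static void example_clamp_features(void)
{
    /* EPT-dependent features are meaningless without EPT itself. */
    if (!cpu_has_vmx_ept() || !cpu_has_vmx_ept_4levels()) {
        enable_ept = false;
        enable_unrestricted_guest = false;
    }

    /* Accessed/dirty bit tracking needs both the capability and EPT. */
    if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
        enable_ept_ad_bits = false;

    /* PML logs guest-physical addresses, so it also requires EPT. */
    if (!cpu_has_vmx_pml() || !enable_ept)
        enable_pml = false;

    /* IPI virtualization is a tertiary control gated by its own bit. */
    if (!cpu_has_vmx_ipiv())
        enable_ipiv = false;
}
#endif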
// SPDX-License-Identifier: GPL-2.0-only /* * Add configfs and memory store: Kyungchan Koh <kkc6196@fb.com> and * Shaohua Li <shli@fb.com> */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/sched.h> #include <linux/fs.h> #include <linux/init.h> #include "null_blk.h" #undef pr_fmt #define pr_fmt(fmt) "null_blk: " fmt #define FREE_BATCH 16 #define TICKS_PER_SEC 50ULL #define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC) #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION static DECLARE_FAULT_ATTR(null_timeout_attr); static DECLARE_FAULT_ATTR(null_requeue_attr); static DECLARE_FAULT_ATTR(null_init_hctx_attr); #endif static inline u64 mb_per_tick(int mbps) { return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); } /* * Status flags for nullb_device. * * CONFIGURED: Device has been configured and turned on. Cannot reconfigure.
* UP: Device is currently on and visible in userspace. * THROTTLED: Device is being throttled. * CACHE: Device is using a write-back cache. */ enum nullb_device_flags { NULLB_DEV_FL_CONFIGURED = 0, NULLB_DEV_FL_UP = 1, NULLB_DEV_FL_THROTTLED = 2, NULLB_DEV_FL_CACHE = 3, }; #define MAP_SZ ((PAGE_SIZE >> SECTOR_SHIFT) + 2) /* * nullb_page is a page in memory for nullb devices. * * @page: The page holding the data. * @bitmap: The bitmap represents which sector in the page has data. * Each bit represents one block size. For example, sector 8 * will use the 7th bit * The highest 2 bits of bitmap are for special purpose. LOCK means the cache * page is being flushing to storage. FREE means the cache page is freed and * should be skipped from flushing to storage. Please see * null_make_cache_space */ struct nullb_page { struct page *page; DECLARE_BITMAP(bitmap, MAP_SZ); }; #define NULLB_PAGE_LOCK (MAP_SZ - 1) #define NULLB_PAGE_FREE (MAP_SZ - 2) static LIST_HEAD(nullb_list); static struct mutex lock; static int null_major; static DEFINE_IDA(nullb_indexes); static struct blk_mq_tag_set tag_set; enum { NULL_IRQ_NONE = 0, NULL_IRQ_SOFTIRQ = 1, NULL_IRQ_TIMER = 2, }; static bool g_virt_boundary; module_param_named(virt_boundary, g_virt_boundary, bool, 0444); MODULE_PARM_DESC(virt_boundary, "Require a virtual boundary for the device. Default: False"); static int g_no_sched; module_param_named(no_sched, g_no_sched, int, 0444); MODULE_PARM_DESC(no_sched, "No io scheduler"); static int g_submit_queues = 1; module_param_named(submit_queues, g_submit_queues, int, 0444); MODULE_PARM_DESC(submit_queues, "Number of submission queues"); static int g_poll_queues = 1; module_param_named(poll_queues, g_poll_queues, int, 0444); MODULE_PARM_DESC(poll_queues, "Number of IOPOLL submission queues"); static int g_home_node = NUMA_NO_NODE; module_param_named(home_node, g_home_node, int, 0444); MODULE_PARM_DESC(home_node, "Home node for the device"); #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION /* * For more details about fault injection, please refer to * Documentation/fault-injection/fault-injection.rst. */ static char g_timeout_str[80]; module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444); MODULE_PARM_DESC(timeout, "Fault injection. timeout=<interval>,<probability>,<space>,<times>"); static char g_requeue_str[80]; module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444); MODULE_PARM_DESC(requeue, "Fault injection. requeue=<interval>,<probability>,<space>,<times>"); static char g_init_hctx_str[80]; module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444); MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=<interval>,<probability>,<space>,<times>"); #endif /* * Historic queue modes. * * These days nothing but NULL_Q_MQ is actually supported, but we keep it the * enum for error reporting. 
*/ enum { NULL_Q_BIO = 0, NULL_Q_RQ = 1, NULL_Q_MQ = 2, }; static int g_queue_mode = NULL_Q_MQ; static int null_param_store_val(const char *str, int *val, int min, int max) { int ret, new_val; ret = kstrtoint(str, 10, &new_val); if (ret) return -EINVAL; if (new_val < min || new_val > max) return -EINVAL; *val = new_val; return 0; } static int null_set_queue_mode(const char *str, const struct kernel_param *kp) { return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ); } static const struct kernel_param_ops null_queue_mode_param_ops = { .set = null_set_queue_mode, .get = param_get_int, }; device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444); MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)"); static int g_gb = 250; module_param_named(gb, g_gb, int, 0444); MODULE_PARM_DESC(gb, "Size in GB"); static int g_bs = 512; module_param_named(bs, g_bs, int, 0444); MODULE_PARM_DESC(bs, "Block size (in bytes)"); static int g_max_sectors; module_param_named(max_sectors, g_max_sectors, int, 0444); MODULE_PARM_DESC(max_sectors, "Maximum size of a command (in 512B sectors)"); static unsigned int nr_devices = 1; module_param(nr_devices, uint, 0444); MODULE_PARM_DESC(nr_devices, "Number of devices to register"); static bool g_blocking; module_param_named(blocking, g_blocking, bool, 0444); MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); static bool g_shared_tags; module_param_named(shared_tags, g_shared_tags, bool, 0444); MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); static bool g_shared_tag_bitmap; module_param_named(shared_tag_bitmap, g_shared_tag_bitmap, bool, 0444); MODULE_PARM_DESC(shared_tag_bitmap, "Use shared tag bitmap for all submission queues for blk-mq"); static int g_irqmode = NULL_IRQ_SOFTIRQ; static int null_set_irqmode(const char *str, const struct kernel_param *kp) { return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE, NULL_IRQ_TIMER); } static const struct kernel_param_ops null_irqmode_param_ops = { .set = null_set_irqmode, .get = param_get_int, }; device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444); MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); static unsigned long g_completion_nsec = 10000; module_param_named(completion_nsec, g_completion_nsec, ulong, 0444); MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns"); static int g_hw_queue_depth = 64; module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444); MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64"); static bool g_use_per_node_hctx; module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444); MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); static bool g_memory_backed; module_param_named(memory_backed, g_memory_backed, bool, 0444); MODULE_PARM_DESC(memory_backed, "Create a memory-backed block device. Default: false"); static bool g_discard; module_param_named(discard, g_discard, bool, 0444); MODULE_PARM_DESC(discard, "Support discard operations (requires memory-backed null_blk device). Default: false"); static unsigned long g_cache_size; module_param_named(cache_size, g_cache_size, ulong, 0444); MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. 
Default: 0 (none)"); static bool g_fua = true; module_param_named(fua, g_fua, bool, 0444); MODULE_PARM_DESC(fua, "Enable/disable FUA support when cache_size is used. Default: true"); static unsigned int g_mbps; module_param_named(mbps, g_mbps, uint, 0444); MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)"); static bool g_zoned; module_param_named(zoned, g_zoned, bool, S_IRUGO); MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false"); static unsigned long g_zone_size = 256; module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); static unsigned long g_zone_capacity; module_param_named(zone_capacity, g_zone_capacity, ulong, 0444); MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size"); static unsigned int g_zone_nr_conv; module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444); MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0"); static unsigned int g_zone_max_open; module_param_named(zone_max_open, g_zone_max_open, uint, 0444); MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)"); static unsigned int g_zone_max_active; module_param_named(zone_max_active, g_zone_max_active, uint, 0444); MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)"); static int g_zone_append_max_sectors = INT_MAX; module_param_named(zone_append_max_sectors, g_zone_append_max_sectors, int, 0444); MODULE_PARM_DESC(zone_append_max_sectors, "Maximum size of a zone append command (in 512B sectors). Specify 0 for zone append emulation"); static bool g_zone_full; module_param_named(zone_full, g_zone_full, bool, S_IRUGO); MODULE_PARM_DESC(zone_full, "Initialize the sequential write required zones of a zoned device to be full. Default: false"); static bool g_rotational; module_param_named(rotational, g_rotational, bool, S_IRUGO); MODULE_PARM_DESC(rotational, "Set the rotational feature for the device. Default: false"); static struct nullb_device *null_alloc_dev(void); static void null_free_dev(struct nullb_device *dev); static void null_del_dev(struct nullb *nullb); static int null_add_dev(struct nullb_device *dev); static struct nullb *null_find_dev_by_name(const char *name); static void null_free_device_storage(struct nullb_device *dev, bool is_cache); static inline struct nullb_device *to_nullb_device(struct config_item *item) { return item ? 
static struct nullb_device *null_alloc_dev(void);
static void null_free_dev(struct nullb_device *dev);
static void null_del_dev(struct nullb *nullb);
static int null_add_dev(struct nullb_device *dev);
static struct nullb *null_find_dev_by_name(const char *name);
static void null_free_device_storage(struct nullb_device *dev, bool is_cache);

static inline struct nullb_device *to_nullb_device(struct config_item *item)
{
	return item ? container_of(to_config_group(item), struct nullb_device, group) : NULL;
}

static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page)
{
	return snprintf(page, PAGE_SIZE, "%u\n", val);
}

static inline ssize_t
nullb_device_ulong_attr_show(unsigned long val, char *page)
{
	return snprintf(page, PAGE_SIZE, "%lu\n", val);
}

static inline ssize_t nullb_device_bool_attr_show(bool val, char *page)
{
	return snprintf(page, PAGE_SIZE, "%u\n", val);
}

static ssize_t nullb_device_uint_attr_store(unsigned int *val,
	const char *page, size_t count)
{
	unsigned int tmp;
	int result;

	result = kstrtouint(page, 0, &tmp);
	if (result < 0)
		return result;

	*val = tmp;
	return count;
}

static ssize_t nullb_device_ulong_attr_store(unsigned long *val,
	const char *page, size_t count)
{
	int result;
	unsigned long tmp;

	result = kstrtoul(page, 0, &tmp);
	if (result < 0)
		return result;

	*val = tmp;
	return count;
}

static ssize_t nullb_device_bool_attr_store(bool *val, const char *page,
	size_t count)
{
	bool tmp;
	int result;

	result = kstrtobool(page, &tmp);
	if (result < 0)
		return result;

	*val = tmp;
	return count;
}

/* The following macro should only be used with TYPE = {uint, ulong, bool}. */
#define NULLB_DEVICE_ATTR(NAME, TYPE, APPLY)				\
static ssize_t								\
nullb_device_##NAME##_show(struct config_item *item, char *page)	\
{									\
	return nullb_device_##TYPE##_attr_show(				\
				to_nullb_device(item)->NAME, page);	\
}									\
static ssize_t								\
nullb_device_##NAME##_store(struct config_item *item, const char *page,\
			    size_t count)				\
{									\
	int (*apply_fn)(struct nullb_device *dev, TYPE new_value) = APPLY;\
	struct nullb_device *dev = to_nullb_device(item);		\
	TYPE new_value = 0;						\
	int ret;							\
									\
	ret = nullb_device_##TYPE##_attr_store(&new_value, page, count);\
	if (ret < 0)							\
		return ret;						\
	if (apply_fn)							\
		ret = apply_fn(dev, new_value);				\
	else if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags))	\
		ret = -EBUSY;						\
	if (ret < 0)							\
		return ret;						\
	dev->NAME = new_value;						\
	return count;							\
}									\
CONFIGFS_ATTR(nullb_device_, NAME);
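/*
 * Each NULLB_DEVICE_ATTR() instance below exposes one field of struct
 * nullb_device through configfs. Attributes with an apply function (e.g.
 * submit_queues) may still be changed after the device has been powered
 * on; attributes without one return -EBUSY once NULLB_DEV_FL_CONFIGURED
 * is set. Typical userspace usage is roughly (a sketch, assuming configfs
 * is mounted at the conventional /sys/kernel/config location):
 *
 *	mkdir /sys/kernel/config/nullb/nullb0
 *	echo 4096 > /sys/kernel/config/nullb/nullb0/blocksize
 *	echo 1 > /sys/kernel/config/nullb/nullb0/memory_backed
 *	echo 1 > /sys/kernel/config/nullb/nullb0/power
 */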
static int nullb_update_nr_hw_queues(struct nullb_device *dev,
				     unsigned int submit_queues,
				     unsigned int poll_queues)
{
	struct blk_mq_tag_set *set;
	int ret, nr_hw_queues;

	if (!dev->nullb)
		return 0;

	/*
	 * Make sure at least one submit queue exists.
	 */
	if (!submit_queues)
		return -EINVAL;

	/*
	 * Make sure that null_init_hctx() does not access nullb->queues[] past
	 * the end of that array.
	 */
	if (submit_queues > nr_cpu_ids || poll_queues > g_poll_queues)
		return -EINVAL;

	/*
	 * Keep previous and new queue numbers in nullb_device for reference in
	 * the call back function null_map_queues().
	 */
	dev->prev_submit_queues = dev->submit_queues;
	dev->prev_poll_queues = dev->poll_queues;
	dev->submit_queues = submit_queues;
	dev->poll_queues = poll_queues;

	set = dev->nullb->tag_set;
	nr_hw_queues = submit_queues + poll_queues;
	blk_mq_update_nr_hw_queues(set, nr_hw_queues);
	ret = set->nr_hw_queues == nr_hw_queues ? 0 : -ENOMEM;

	if (ret) {
		/* on error, revert the queue numbers */
		dev->submit_queues = dev->prev_submit_queues;
		dev->poll_queues = dev->prev_poll_queues;
	}

	return ret;
}

static int nullb_apply_submit_queues(struct nullb_device *dev,
				     unsigned int submit_queues)
{
	int ret;

	mutex_lock(&lock);
	ret = nullb_update_nr_hw_queues(dev, submit_queues, dev->poll_queues);
	mutex_unlock(&lock);

	return ret;
}

static int nullb_apply_poll_queues(struct nullb_device *dev,
				   unsigned int poll_queues)
{
	int ret;

	mutex_lock(&lock);
	ret = nullb_update_nr_hw_queues(dev, dev->submit_queues, poll_queues);
	mutex_unlock(&lock);

	return ret;
}

NULLB_DEVICE_ATTR(size, ulong, NULL);
NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL);
NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues);
NULLB_DEVICE_ATTR(poll_queues, uint, nullb_apply_poll_queues);
NULLB_DEVICE_ATTR(home_node, uint, NULL);
NULLB_DEVICE_ATTR(queue_mode, uint, NULL);
NULLB_DEVICE_ATTR(blocksize, uint, NULL);
NULLB_DEVICE_ATTR(max_sectors, uint, NULL);
NULLB_DEVICE_ATTR(irqmode, uint, NULL);
NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL);
NULLB_DEVICE_ATTR(index, uint, NULL);
NULLB_DEVICE_ATTR(blocking, bool, NULL);
NULLB_DEVICE_ATTR(use_per_node_hctx, bool, NULL);
NULLB_DEVICE_ATTR(memory_backed, bool, NULL);
NULLB_DEVICE_ATTR(discard, bool, NULL);
NULLB_DEVICE_ATTR(mbps, uint, NULL);
NULLB_DEVICE_ATTR(cache_size, ulong, NULL);
NULLB_DEVICE_ATTR(zoned, bool, NULL);
NULLB_DEVICE_ATTR(zone_size, ulong, NULL);
NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL);
NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
NULLB_DEVICE_ATTR(zone_append_max_sectors, uint, NULL);
NULLB_DEVICE_ATTR(zone_full, bool, NULL);
NULLB_DEVICE_ATTR(virt_boundary, bool, NULL);
NULLB_DEVICE_ATTR(no_sched, bool, NULL);
NULLB_DEVICE_ATTR(shared_tags, bool, NULL);
NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL);
NULLB_DEVICE_ATTR(fua, bool, NULL);
NULLB_DEVICE_ATTR(rotational, bool, NULL);

static ssize_t nullb_device_power_show(struct config_item *item, char *page)
{
	return nullb_device_bool_attr_show(to_nullb_device(item)->power, page);
}

static ssize_t nullb_device_power_store(struct config_item *item,
				     const char *page, size_t count)
{
	struct nullb_device *dev = to_nullb_device(item);
	bool newp = false;
	ssize_t ret;

	ret = nullb_device_bool_attr_store(&newp, page, count);
	if (ret < 0)
		return ret;

	ret = count;
	mutex_lock(&lock);
	if (!dev->power && newp) {
		if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags))
			goto out;

		ret = null_add_dev(dev);
		if (ret) {
			clear_bit(NULLB_DEV_FL_UP, &dev->flags);
			goto out;
		}

		set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
		dev->power = newp;
		ret = count;
	} else if (dev->power && !newp) {
		if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) {
			dev->power = newp;
			null_del_dev(dev->nullb);
		}
		clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
	}

out:
	mutex_unlock(&lock);
	return ret;
}

CONFIGFS_ATTR(nullb_device_, power);
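/*
 * The badblocks attribute accepts a sector range in the form "+START-END"
 * to mark sectors bad and "-START-END" to clear them again, where
 * START <= END. For example (a sketch; the path assumes the conventional
 * configfs mount point):
 *
 *	echo "+0-7" > /sys/kernel/config/nullb/nullb0/badblocks
 *	echo "-0-7" > /sys/kernel/config/nullb/nullb0/badblocks
 */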
static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page)
{
	struct nullb_device *t_dev = to_nullb_device(item);

	return badblocks_show(&t_dev->badblocks, page, 0);
}

static ssize_t nullb_device_badblocks_store(struct config_item *item,
				     const char *page, size_t count)
{
	struct nullb_device *t_dev = to_nullb_device(item);
	char *orig, *buf, *tmp;
	u64 start, end;
	int ret;

	orig = kstrndup(page, count, GFP_KERNEL);
	if (!orig)
		return -ENOMEM;

	buf = strstrip(orig);

	ret = -EINVAL;
	if (buf[0] != '+' && buf[0] != '-')
		goto out;
	tmp = strchr(&buf[1], '-');
	if (!tmp)
		goto out;
	*tmp = '\0';
	ret = kstrtoull(buf + 1, 0, &start);
	if (ret)
		goto out;
	ret = kstrtoull(tmp + 1, 0, &end);
	if (ret)
		goto out;
	ret = -EINVAL;
	if (start > end)
		goto out;
	/* enable badblocks */
	cmpxchg(&t_dev->badblocks.shift, -1, 0);
	if