Total coverage: 354831 (18%)of 1975186
1751 872 873 1603 867 873 795 868 1651 123 102 25 25 870 869 869 862 3 3 3 2 868 3 863 863 867 2 857 864 101 101 764 764 764 762 3 852 625 626 625 627 627 371 312 801 322 323 806 1 811 811 811 8 8 8 804 1 801 800 1 801 28 802 799 799 799 801 217 799 799 148 362 797 419 793 795 798 797 794 2 795 796 797 12 614 621 5 5 612 590 798 801 21 10 22 800 181 172 4 167 2 677 1 1 802 802 811 812 811 810 2 1 806 1 807 811 1666 1670 1667 1671 1 1660 814 1674 7 7 7 6 7 7 7 7 7 7 7 6 8 7 6 6 5 6 104 104 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 /* Netfilter messages via netlink socket. Allows for user space * protocol helpers and general trouble making from userspace. * * (C) 2001 by Jay Schulist <jschlst@samba.org>, * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org> * (C) 2005-2017 by Pablo Neira Ayuso <pablo@netfilter.org> * * Initial netfilter messages via netlink development funded and * generally made possible by Network Robots, Inc. (www.networkrobots.com) * * Further development of this code funded by Astaro AG (http://www.astaro.com) * * This software may be used and distributed according to the terms * of the GNU General Public License, incorporated herein by reference. */ #include <linux/module.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/uaccess.h> #include <net/sock.h> #include <linux/init.h> #include <linux/sched/signal.h> #include <net/netlink.h> #include <net/netns/generic.h> #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); MODULE_DESCRIPTION("Netfilter messages via netlink socket"); #define nfnl_dereference_protected(id) \ rcu_dereference_protected(table[(id)].subsys, \ lockdep_nfnl_is_held((id))) #define NFNL_MAX_ATTR_COUNT 32 static unsigned int nfnetlink_pernet_id __read_mostly; #ifdef CONFIG_NF_CONNTRACK_EVENTS static DEFINE_SPINLOCK(nfnl_grp_active_lock); #endif struct nfnl_net { struct sock *nfnl; }; static struct { struct mutex mutex; const struct nfnetlink_subsystem __rcu *subsys; } table[NFNL_SUBSYS_COUNT]; static struct lock_class_key nfnl_lockdep_keys[NFNL_SUBSYS_COUNT]; static const char *const nfnl_lockdep_names[NFNL_SUBSYS_COUNT] = { [NFNL_SUBSYS_NONE] = "nfnl_subsys_none", [NFNL_SUBSYS_CTNETLINK] = "nfnl_subsys_ctnetlink", [NFNL_SUBSYS_CTNETLINK_EXP] = "nfnl_subsys_ctnetlink_exp", [NFNL_SUBSYS_QUEUE] = "nfnl_subsys_queue", [NFNL_SUBSYS_ULOG] = "nfnl_subsys_ulog", [NFNL_SUBSYS_OSF] = "nfnl_subsys_osf", [NFNL_SUBSYS_IPSET] = "nfnl_subsys_ipset", [NFNL_SUBSYS_ACCT] = "nfnl_subsys_acct", [NFNL_SUBSYS_CTNETLINK_TIMEOUT] = "nfnl_subsys_cttimeout", [NFNL_SUBSYS_CTHELPER] = "nfnl_subsys_cthelper", [NFNL_SUBSYS_NFTABLES] = "nfnl_subsys_nftables", [NFNL_SUBSYS_NFT_COMPAT] = "nfnl_subsys_nftcompat", [NFNL_SUBSYS_HOOK] = "nfnl_subsys_hook", }; static const int nfnl_group2type[NFNLGRP_MAX+1] = { [NFNLGRP_CONNTRACK_NEW] = NFNL_SUBSYS_CTNETLINK, [NFNLGRP_CONNTRACK_UPDATE] = NFNL_SUBSYS_CTNETLINK, [NFNLGRP_CONNTRACK_DESTROY] = NFNL_SUBSYS_CTNETLINK, [NFNLGRP_CONNTRACK_EXP_NEW] = NFNL_SUBSYS_CTNETLINK_EXP, [NFNLGRP_CONNTRACK_EXP_UPDATE] = NFNL_SUBSYS_CTNETLINK_EXP, [NFNLGRP_CONNTRACK_EXP_DESTROY] = NFNL_SUBSYS_CTNETLINK_EXP, [NFNLGRP_NFTABLES] = NFNL_SUBSYS_NFTABLES, [NFNLGRP_ACCT_QUOTA] = NFNL_SUBSYS_ACCT, [NFNLGRP_NFTRACE] = NFNL_SUBSYS_NFTABLES, }; static struct nfnl_net *nfnl_pernet(struct net *net) { return net_generic(net, nfnetlink_pernet_id); } void nfnl_lock(__u8 subsys_id) { mutex_lock(&table[subsys_id].mutex); } EXPORT_SYMBOL_GPL(nfnl_lock); void nfnl_unlock(__u8 subsys_id) { mutex_unlock(&table[subsys_id].mutex); } EXPORT_SYMBOL_GPL(nfnl_unlock); #ifdef CONFIG_PROVE_LOCKING bool lockdep_nfnl_is_held(u8 subsys_id) { return lockdep_is_held(&table[subsys_id].mutex); } EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held); #endif int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n) { u8 cb_id; /* Sanity-check attr_count size to avoid stack buffer overflow. */ for (cb_id = 0; cb_id < n->cb_count; cb_id++) if (WARN_ON(n->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT)) return -EINVAL; nfnl_lock(n->subsys_id); if (table[n->subsys_id].subsys) { nfnl_unlock(n->subsys_id); return -EBUSY; } rcu_assign_pointer(table[n->subsys_id].subsys, n); nfnl_unlock(n->subsys_id); return 0; } EXPORT_SYMBOL_GPL(nfnetlink_subsys_register); int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n) { nfnl_lock(n->subsys_id); table[n->subsys_id].subsys = NULL; nfnl_unlock(n->subsys_id); synchronize_rcu(); return 0; } EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister); static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u16 type) { u8 subsys_id = NFNL_SUBSYS_ID(type); if (subsys_id >= NFNL_SUBSYS_COUNT) return NULL; return rcu_dereference(table[subsys_id].subsys); } static inline const struct nfnl_callback * nfnetlink_find_client(u16 type, const struct nfnetlink_subsystem *ss) { u8 cb_id = NFNL_MSG_TYPE(type); if (cb_id >= ss->cb_count) return NULL; return &ss->cb[cb_id]; } int nfnetlink_has_listeners(struct net *net, unsigned int group) { struct nfnl_net *nfnlnet = nfnl_pernet(net); return netlink_has_listeners(nfnlnet->nfnl, group); } EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags) { struct nfnl_net *nfnlnet = nfnl_pernet(net); return nlmsg_notify(nfnlnet->nfnl, skb, portid, group, echo, flags); } EXPORT_SYMBOL_GPL(nfnetlink_send); int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error) { struct nfnl_net *nfnlnet = nfnl_pernet(net); return netlink_set_err(nfnlnet->nfnl, portid, group, error); } EXPORT_SYMBOL_GPL(nfnetlink_set_err); int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid) { struct nfnl_net *nfnlnet = nfnl_pernet(net); int err; err = nlmsg_unicast(nfnlnet->nfnl, skb, portid); if (err == -EAGAIN) err = -ENOBUFS; return err; } EXPORT_SYMBOL_GPL(nfnetlink_unicast); void nfnetlink_broadcast(struct net *net, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation) { struct nfnl_net *nfnlnet = nfnl_pernet(net); netlink_broadcast(nfnlnet->nfnl, skb, portid, group, allocation); } EXPORT_SYMBOL_GPL(nfnetlink_broadcast); /* Process one complete nfnetlink message. */ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); const struct nfnl_callback *nc; const struct nfnetlink_subsystem *ss; int type, err; /* All the messages must at least contain nfgenmsg */ if (nlmsg_len(nlh) < sizeof(struct nfgenmsg)) return 0; type = nlh->nlmsg_type; replay: rcu_read_lock(); ss = nfnetlink_get_subsys(type); if (!ss) { #ifdef CONFIG_MODULES rcu_read_unlock(); request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type)); rcu_read_lock(); ss = nfnetlink_get_subsys(type); if (!ss) #endif { rcu_read_unlock(); return -EINVAL; } } nc = nfnetlink_find_client(type, ss); if (!nc) { rcu_read_unlock(); return -EINVAL; } { int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nfnl_net *nfnlnet = nfnl_pernet(net); u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1]; struct nlattr *attr = (void *)nlh + min_len; int attrlen = nlh->nlmsg_len - min_len; __u8 subsys_id = NFNL_SUBSYS_ID(type); struct nfnl_info info = { .net = net, .sk = nfnlnet->nfnl, .nlh = nlh, .nfmsg = nlmsg_data(nlh), .extack = extack, }; /* Sanity-check NFNL_MAX_ATTR_COUNT */ if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) { rcu_read_unlock(); return -ENOMEM; } err = nla_parse_deprecated(cda, ss->cb[cb_id].attr_count, attr, attrlen, ss->cb[cb_id].policy, extack); if (err < 0) { rcu_read_unlock(); return err; } if (!nc->call) { rcu_read_unlock(); return -EINVAL; } switch (nc->type) { case NFNL_CB_RCU: err = nc->call(skb, &info, (const struct nlattr **)cda); rcu_read_unlock(); break; case NFNL_CB_MUTEX: rcu_read_unlock(); nfnl_lock(subsys_id); if (nfnl_dereference_protected(subsys_id) != ss || nfnetlink_find_client(type, ss) != nc) { nfnl_unlock(subsys_id); err = -EAGAIN; break; } err = nc->call(skb, &info, (const struct nlattr **)cda); nfnl_unlock(subsys_id); break; default: rcu_read_unlock(); err = -EINVAL; break; } if (err == -EAGAIN) goto replay; return err; } } struct nfnl_err { struct list_head head; struct nlmsghdr *nlh; int err; struct netlink_ext_ack extack; }; static int nfnl_err_add(struct list_head *list, struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack) { struct nfnl_err *nfnl_err; nfnl_err = kmalloc(sizeof(struct nfnl_err), GFP_KERNEL); if (nfnl_err == NULL) return -ENOMEM; nfnl_err->nlh = nlh; nfnl_err->err = err; nfnl_err->extack = *extack; list_add_tail(&nfnl_err->head, list); return 0; } static void nfnl_err_del(struct nfnl_err *nfnl_err) { list_del(&nfnl_err->head); kfree(nfnl_err); } static void nfnl_err_reset(struct list_head *err_list) { struct nfnl_err *nfnl_err, *next; list_for_each_entry_safe(nfnl_err, next, err_list, head) nfnl_err_del(nfnl_err); } static void nfnl_err_deliver(struct list_head *err_list, struct sk_buff *skb) { struct nfnl_err *nfnl_err, *next; list_for_each_entry_safe(nfnl_err, next, err_list, head) { netlink_ack(skb, nfnl_err->nlh, nfnl_err->err, &nfnl_err->extack); nfnl_err_del(nfnl_err); } } enum { NFNL_BATCH_FAILURE = (1 << 0), NFNL_BATCH_DONE = (1 << 1), NFNL_BATCH_REPLAY = (1 << 2), }; static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, u16 subsys_id, u32 genid) { struct sk_buff *oskb = skb; struct net *net = sock_net(skb->sk); const struct nfnetlink_subsystem *ss; const struct nfnl_callback *nc; struct netlink_ext_ack extack; LIST_HEAD(err_list); u32 status; int err; if (subsys_id >= NFNL_SUBSYS_COUNT) return netlink_ack(skb, nlh, -EINVAL, NULL); replay: status = 0; replay_abort: skb = netlink_skb_clone(oskb, GFP_KERNEL); if (!skb) return netlink_ack(oskb, nlh, -ENOMEM, NULL); nfnl_lock(subsys_id); ss = nfnl_dereference_protected(subsys_id); if (!ss) { #ifdef CONFIG_MODULES nfnl_unlock(subsys_id); request_module("nfnetlink-subsys-%d", subsys_id); nfnl_lock(subsys_id); ss = nfnl_dereference_protected(subsys_id); if (!ss) #endif { nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); return consume_skb(skb); } } if (!ss->valid_genid || !ss->commit || !ss->abort) { nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); return consume_skb(skb); } if (!try_module_get(ss->owner)) { nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); return consume_skb(skb); } if (!ss->valid_genid(net, genid)) { module_put(ss->owner); nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -ERESTART, NULL); return consume_skb(skb); } nfnl_unlock(subsys_id); if (nlh->nlmsg_flags & NLM_F_ACK) { memset(&extack, 0, sizeof(extack)); nfnl_err_add(&err_list, nlh, 0, &extack); } while (skb->len >= nlmsg_total_size(0)) { int msglen, type; if (fatal_signal_pending(current)) { nfnl_err_reset(&err_list); err = -EINTR; status = NFNL_BATCH_FAILURE; goto done; } memset(&extack, 0, sizeof(extack)); nlh = nlmsg_hdr(skb); err = 0; if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len || nlmsg_len(nlh) < sizeof(struct nfgenmsg)) { nfnl_err_reset(&err_list); status |= NFNL_BATCH_FAILURE; goto done; } /* Only requests are handled by the kernel */ if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) { err = -EINVAL; goto ack; } type = nlh->nlmsg_type; if (type == NFNL_MSG_BATCH_BEGIN) { /* Malformed: Batch begin twice */ nfnl_err_reset(&err_list); status |= NFNL_BATCH_FAILURE; goto done; } else if (type == NFNL_MSG_BATCH_END) { status |= NFNL_BATCH_DONE; goto done; } else if (type < NLMSG_MIN_TYPE) { err = -EINVAL; goto ack; } /* We only accept a batch with messages for the same * subsystem. */ if (NFNL_SUBSYS_ID(type) != subsys_id) { err = -EINVAL; goto ack; } nc = nfnetlink_find_client(type, ss); if (!nc) { err = -EINVAL; goto ack; } if (nc->type != NFNL_CB_BATCH) { err = -EINVAL; goto ack; } { int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nfnl_net *nfnlnet = nfnl_pernet(net); struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1]; struct nlattr *attr = (void *)nlh + min_len; u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); int attrlen = nlh->nlmsg_len - min_len; struct nfnl_info info = { .net = net, .sk = nfnlnet->nfnl, .nlh = nlh, .nfmsg = nlmsg_data(nlh), .extack = &extack, }; /* Sanity-check NFTA_MAX_ATTR */ if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) { err = -ENOMEM; goto ack; } err = nla_parse_deprecated(cda, ss->cb[cb_id].attr_count, attr, attrlen, ss->cb[cb_id].policy, &extack); if (err < 0) goto ack; err = nc->call(skb, &info, (const struct nlattr **)cda); /* The lock was released to autoload some module, we * have to abort and start from scratch using the * original skb. */ if (err == -EAGAIN) { status |= NFNL_BATCH_REPLAY; goto done; } } ack: if (nlh->nlmsg_flags & NLM_F_ACK || err) { /* Errors are delivered once the full batch has been * processed, this avoids that the same error is * reported several times when replaying the batch. */ if (err == -ENOMEM || nfnl_err_add(&err_list, nlh, err, &extack) < 0) { /* We failed to enqueue an error, reset the * list of errors and send OOM to userspace * pointing to the batch header. */ nfnl_err_reset(&err_list); netlink_ack(oskb, nlmsg_hdr(oskb), -ENOMEM, NULL); status |= NFNL_BATCH_FAILURE; goto done; } /* We don't stop processing the batch on errors, thus, * userspace gets all the errors that the batch * triggers. */ if (err) status |= NFNL_BATCH_FAILURE; } msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) msglen = skb->len; skb_pull(skb, msglen); } done: if (status & NFNL_BATCH_REPLAY) { ss->abort(net, oskb, NFNL_ABORT_AUTOLOAD); nfnl_err_reset(&err_list); consume_skb(skb); module_put(ss->owner); goto replay; } else if (status == NFNL_BATCH_DONE) { err = ss->commit(net, oskb); if (err == -EAGAIN) { status |= NFNL_BATCH_REPLAY; goto done; } else if (err) { ss->abort(net, oskb, NFNL_ABORT_NONE); netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL); } else if (nlh->nlmsg_flags & NLM_F_ACK) { memset(&extack, 0, sizeof(extack)); nfnl_err_add(&err_list, nlh, 0, &extack); } } else { enum nfnl_abort_action abort_action; if (status & NFNL_BATCH_FAILURE) abort_action = NFNL_ABORT_NONE; else abort_action = NFNL_ABORT_VALIDATE; err = ss->abort(net, oskb, abort_action); if (err == -EAGAIN) { nfnl_err_reset(&err_list); consume_skb(skb); module_put(ss->owner); status |= NFNL_BATCH_FAILURE; goto replay_abort; } } nfnl_err_deliver(&err_list, oskb); consume_skb(skb); module_put(ss->owner); } static const struct nla_policy nfnl_batch_policy[NFNL_BATCH_MAX + 1] = { [NFNL_BATCH_GENID] = { .type = NLA_U32 }, }; static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh) { int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nlattr *attr = (void *)nlh + min_len; struct nlattr *cda[NFNL_BATCH_MAX + 1]; int attrlen = nlh->nlmsg_len - min_len; struct nfgenmsg *nfgenmsg; int msglen, err; u32 gen_id = 0; u16 res_id; msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) msglen = skb->len; if (skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg)) return; err = nla_parse_deprecated(cda, NFNL_BATCH_MAX, attr, attrlen, nfnl_batch_policy, NULL); if (err < 0) { netlink_ack(skb, nlh, err, NULL); return; } if (cda[NFNL_BATCH_GENID]) gen_id = ntohl(nla_get_be32(cda[NFNL_BATCH_GENID])); nfgenmsg = nlmsg_data(nlh); skb_pull(skb, msglen); /* Work around old nft using host byte order */ if (nfgenmsg->res_id == (__force __be16)NFNL_SUBSYS_NFTABLES) res_id = NFNL_SUBSYS_NFTABLES; else res_id = ntohs(nfgenmsg->res_id); nfnetlink_rcv_batch(skb, nlh, res_id, gen_id); } static void nfnetlink_rcv(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); if (skb->len < NLMSG_HDRLEN || nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) return; if (!netlink_net_capable(skb, CAP_NET_ADMIN)) { netlink_ack(skb, nlh, -EPERM, NULL); return; } if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN) nfnetlink_rcv_skb_batch(skb, nlh); else netlink_rcv_skb(skb, nfnetlink_rcv_msg); } static void nfnetlink_bind_event(struct net *net, unsigned int group) { #ifdef CONFIG_NF_CONNTRACK_EVENTS int type, group_bit; u8 v; /* All NFNLGRP_CONNTRACK_* group bits fit into u8. * The other groups are not relevant and can be ignored. */ if (group >= 8) return; type = nfnl_group2type[group]; switch (type) { case NFNL_SUBSYS_CTNETLINK: break; case NFNL_SUBSYS_CTNETLINK_EXP: break; default: return; } group_bit = (1 << group); spin_lock(&nfnl_grp_active_lock); v = READ_ONCE(nf_ctnetlink_has_listener); if ((v & group_bit) == 0) { v |= group_bit; /* read concurrently without nfnl_grp_active_lock held. */ WRITE_ONCE(nf_ctnetlink_has_listener, v); } spin_unlock(&nfnl_grp_active_lock); #endif } static int nfnetlink_bind(struct net *net, int group) { const struct nfnetlink_subsystem *ss; int type; if (group <= NFNLGRP_NONE || group > NFNLGRP_MAX) return 0; type = nfnl_group2type[group]; rcu_read_lock(); ss = nfnetlink_get_subsys(type << 8); rcu_read_unlock(); if (!ss) request_module_nowait("nfnetlink-subsys-%d", type); nfnetlink_bind_event(net, group); return 0; } static void nfnetlink_unbind(struct net *net, int group) { #ifdef CONFIG_NF_CONNTRACK_EVENTS int type, group_bit; if (group <= NFNLGRP_NONE || group > NFNLGRP_MAX) return; type = nfnl_group2type[group]; switch (type) { case NFNL_SUBSYS_CTNETLINK: break; case NFNL_SUBSYS_CTNETLINK_EXP: break; default: return; } /* ctnetlink_has_listener is u8 */ if (group >= 8) return; group_bit = (1 << group); spin_lock(&nfnl_grp_active_lock); if (!nfnetlink_has_listeners(net, group)) { u8 v = READ_ONCE(nf_ctnetlink_has_listener); v &= ~group_bit; /* read concurrently without nfnl_grp_active_lock held. */ WRITE_ONCE(nf_ctnetlink_has_listener, v); } spin_unlock(&nfnl_grp_active_lock); #endif } static int __net_init nfnetlink_net_init(struct net *net) { struct nfnl_net *nfnlnet = nfnl_pernet(net); struct netlink_kernel_cfg cfg = { .groups = NFNLGRP_MAX, .input = nfnetlink_rcv, .bind = nfnetlink_bind, .unbind = nfnetlink_unbind, }; nfnlnet->nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg); if (!nfnlnet->nfnl) return -ENOMEM; return 0; } static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list) { struct nfnl_net *nfnlnet; struct net *net; list_for_each_entry(net, net_exit_list, exit_list) { nfnlnet = nfnl_pernet(net); netlink_kernel_release(nfnlnet->nfnl); } } static struct pernet_operations nfnetlink_net_ops = { .init = nfnetlink_net_init, .exit_batch = nfnetlink_net_exit_batch, .id = &nfnetlink_pernet_id, .size = sizeof(struct nfnl_net), }; static int __init nfnetlink_init(void) { int i; for (i = NFNLGRP_NONE + 1; i <= NFNLGRP_MAX; i++) BUG_ON(nfnl_group2type[i] == NFNL_SUBSYS_NONE); for (i=0; i<NFNL_SUBSYS_COUNT; i++) __mutex_init(&table[i].mutex, nfnl_lockdep_names[i], &nfnl_lockdep_keys[i]); return register_pernet_subsys(&nfnetlink_net_ops); } static void __exit nfnetlink_exit(void) { unregister_pernet_subsys(&nfnetlink_net_ops); } module_init(nfnetlink_init); module_exit(nfnetlink_exit);
1 229 1 229 4 229 9 229 2 229 210 228 6 3 229 4 229 229 245 243 245 245 241 244 8 8 8 229 227 224 229 221 9 9 5 1 9 9 9 8 6 10 9 9 9 10 9 9 5 9 9 9 5 9 7 7 6 7 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 // SPDX-License-Identifier: GPL-2.0 #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/statfs.h> #include <linux/security.h> #include <linux/uaccess.h> #include <linux/compat.h> #include "internal.h" static int flags_by_mnt(int mnt_flags) { int flags = 0; if (mnt_flags & MNT_READONLY) flags |= ST_RDONLY; if (mnt_flags & MNT_NOSUID) flags |= ST_NOSUID; if (mnt_flags & MNT_NODEV) flags |= ST_NODEV; if (mnt_flags & MNT_NOEXEC) flags |= ST_NOEXEC; if (mnt_flags & MNT_NOATIME) flags |= ST_NOATIME; if (mnt_flags & MNT_NODIRATIME) flags |= ST_NODIRATIME; if (mnt_flags & MNT_RELATIME) flags |= ST_RELATIME; if (mnt_flags & MNT_NOSYMFOLLOW) flags |= ST_NOSYMFOLLOW; return flags; } static int flags_by_sb(int s_flags) { int flags = 0; if (s_flags & SB_SYNCHRONOUS) flags |= ST_SYNCHRONOUS; if (s_flags & SB_MANDLOCK) flags |= ST_MANDLOCK; if (s_flags & SB_RDONLY) flags |= ST_RDONLY; return flags; } static int calculate_f_flags(struct vfsmount *mnt) { return ST_VALID | flags_by_mnt(mnt->mnt_flags) | flags_by_sb(mnt->mnt_sb->s_flags); } static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf) { int retval; if (!dentry->d_sb->s_op->statfs) return -ENOSYS; memset(buf, 0, sizeof(*buf)); retval = security_sb_statfs(dentry); if (retval) return retval; retval = dentry->d_sb->s_op->statfs(dentry, buf); if (retval == 0 && buf->f_frsize == 0) buf->f_frsize = buf->f_bsize; return retval; } int vfs_get_fsid(struct dentry *dentry, __kernel_fsid_t *fsid) { struct kstatfs st; int error; error = statfs_by_dentry(dentry, &st); if (error) return error; *fsid = st.f_fsid; return 0; } EXPORT_SYMBOL(vfs_get_fsid); int vfs_statfs(const struct path *path, struct kstatfs *buf) { int error; error = statfs_by_dentry(path->dentry, buf); if (!error) buf->f_flags = calculate_f_flags(path->mnt); return error; } EXPORT_SYMBOL(vfs_statfs); int user_statfs(const char __user *pathname, struct kstatfs *st) { struct path path; int error; unsigned int lookup_flags = LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (!error) { error = vfs_statfs(&path, st); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } } return error; } int fd_statfs(int fd, struct kstatfs *st) { CLASS(fd_raw, f)(fd); if (fd_empty(f)) return -EBADF; return vfs_statfs(&fd_file(f)->f_path, st); } static int do_statfs_native(struct kstatfs *st, struct statfs __user *p) { struct statfs buf; if (sizeof(buf) == sizeof(*st)) memcpy(&buf, st, sizeof(*st)); else { memset(&buf, 0, sizeof(buf)); if (sizeof buf.f_blocks == 4) { if ((st->f_blocks | st->f_bfree | st->f_bavail | st->f_bsize | st->f_frsize) & 0xffffffff00000000ULL) return -EOVERFLOW; /* * f_files and f_ffree may be -1; it's okay to stuff * that into 32 bits */ if (st->f_files != -1 && (st->f_files & 0xffffffff00000000ULL)) return -EOVERFLOW; if (st->f_ffree != -1 && (st->f_ffree & 0xffffffff00000000ULL)) return -EOVERFLOW; } buf.f_type = st->f_type; buf.f_bsize = st->f_bsize; buf.f_blocks = st->f_blocks; buf.f_bfree = st->f_bfree; buf.f_bavail = st->f_bavail; buf.f_files = st->f_files; buf.f_ffree = st->f_ffree; buf.f_fsid = st->f_fsid; buf.f_namelen = st->f_namelen; buf.f_frsize = st->f_frsize; buf.f_flags = st->f_flags; } if (copy_to_user(p, &buf, sizeof(buf))) return -EFAULT; return 0; } static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p) { struct statfs64 buf; if (sizeof(buf) == sizeof(*st)) memcpy(&buf, st, sizeof(*st)); else { memset(&buf, 0, sizeof(buf)); buf.f_type = st->f_type; buf.f_bsize = st->f_bsize; buf.f_blocks = st->f_blocks; buf.f_bfree = st->f_bfree; buf.f_bavail = st->f_bavail; buf.f_files = st->f_files; buf.f_ffree = st->f_ffree; buf.f_fsid = st->f_fsid; buf.f_namelen = st->f_namelen; buf.f_frsize = st->f_frsize; buf.f_flags = st->f_flags; } if (copy_to_user(p, &buf, sizeof(buf))) return -EFAULT; return 0; } SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf) { struct kstatfs st; int error = user_statfs(pathname, &st); if (!error) error = do_statfs_native(&st, buf); return error; } SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf) { struct kstatfs st; int error; if (sz != sizeof(*buf)) return -EINVAL; error = user_statfs(pathname, &st); if (!error) error = do_statfs64(&st, buf); return error; } SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf) { struct kstatfs st; int error = fd_statfs(fd, &st); if (!error) error = do_statfs_native(&st, buf); return error; } SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf) { struct kstatfs st; int error; if (sz != sizeof(*buf)) return -EINVAL; error = fd_statfs(fd, &st); if (!error) error = do_statfs64(&st, buf); return error; } static int vfs_ustat(dev_t dev, struct kstatfs *sbuf) { struct super_block *s = user_get_super(dev, false); int err; if (!s) return -EINVAL; err = statfs_by_dentry(s->s_root, sbuf); drop_super(s); return err; } SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf) { struct ustat tmp; struct kstatfs sbuf; int err = vfs_ustat(new_decode_dev(dev), &sbuf); if (err) return err; memset(&tmp,0,sizeof(struct ustat)); tmp.f_tfree = sbuf.f_bfree; if (IS_ENABLED(CONFIG_ARCH_32BIT_USTAT_F_TINODE)) tmp.f_tinode = min_t(u64, sbuf.f_ffree, UINT_MAX); else tmp.f_tinode = sbuf.f_ffree; return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0; } #ifdef CONFIG_COMPAT static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *kbuf) { struct compat_statfs buf; if (sizeof ubuf->f_blocks == 4) { if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) return -EOVERFLOW; /* f_files and f_ffree may be -1; it's okay * to stuff that into 32 bits */ if (kbuf->f_files != 0xffffffffffffffffULL && (kbuf->f_files & 0xffffffff00000000ULL)) return -EOVERFLOW; if (kbuf->f_ffree != 0xffffffffffffffffULL && (kbuf->f_ffree & 0xffffffff00000000ULL)) return -EOVERFLOW; } memset(&buf, 0, sizeof(struct compat_statfs)); buf.f_type = kbuf->f_type; buf.f_bsize = kbuf->f_bsize; buf.f_blocks = kbuf->f_blocks; buf.f_bfree = kbuf->f_bfree; buf.f_bavail = kbuf->f_bavail; buf.f_files = kbuf->f_files; buf.f_ffree = kbuf->f_ffree; buf.f_namelen = kbuf->f_namelen; buf.f_fsid.val[0] = kbuf->f_fsid.val[0]; buf.f_fsid.val[1] = kbuf->f_fsid.val[1]; buf.f_frsize = kbuf->f_frsize; buf.f_flags = kbuf->f_flags; if (copy_to_user(ubuf, &buf, sizeof(struct compat_statfs))) return -EFAULT; return 0; } /* * The following statfs calls are copies of code from fs/statfs.c and * should be checked against those from time to time */ COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf) { struct kstatfs tmp; int error = user_statfs(pathname, &tmp); if (!error) error = put_compat_statfs(buf, &tmp); return error; } COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf) { struct kstatfs tmp; int error = fd_statfs(fd, &tmp); if (!error) error = put_compat_statfs(buf, &tmp); return error; } static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) { struct compat_statfs64 buf; if ((kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) return -EOVERFLOW; memset(&buf, 0, sizeof(struct compat_statfs64)); buf.f_type = kbuf->f_type; buf.f_bsize = kbuf->f_bsize; buf.f_blocks = kbuf->f_blocks; buf.f_bfree = kbuf->f_bfree; buf.f_bavail = kbuf->f_bavail; buf.f_files = kbuf->f_files; buf.f_ffree = kbuf->f_ffree; buf.f_namelen = kbuf->f_namelen; buf.f_fsid.val[0] = kbuf->f_fsid.val[0]; buf.f_fsid.val[1] = kbuf->f_fsid.val[1]; buf.f_frsize = kbuf->f_frsize; buf.f_flags = kbuf->f_flags; if (copy_to_user(ubuf, &buf, sizeof(struct compat_statfs64))) return -EFAULT; return 0; } int kcompat_sys_statfs64(const char __user * pathname, compat_size_t sz, struct compat_statfs64 __user * buf) { struct kstatfs tmp; int error; if (sz != sizeof(*buf)) return -EINVAL; error = user_statfs(pathname, &tmp); if (!error) error = put_compat_statfs64(buf, &tmp); return error; } COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf) { return kcompat_sys_statfs64(pathname, sz, buf); } int kcompat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user * buf) { struct kstatfs tmp; int error; if (sz != sizeof(*buf)) return -EINVAL; error = fd_statfs(fd, &tmp); if (!error) error = put_compat_statfs64(buf, &tmp); return error; } COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf) { return kcompat_sys_fstatfs64(fd, sz, buf); } /* * This is a copy of sys_ustat, just dealing with a structure layout. * Given how simple this syscall is that apporach is more maintainable * than the various conversion hacks. */ COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u) { struct compat_ustat tmp; struct kstatfs sbuf; int err = vfs_ustat(new_decode_dev(dev), &sbuf); if (err) return err; memset(&tmp, 0, sizeof(struct compat_ustat)); tmp.f_tfree = sbuf.f_bfree; tmp.f_tinode = sbuf.f_ffree; if (copy_to_user(u, &tmp, sizeof(struct compat_ustat))) return -EFAULT; return 0; } #endif
86 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 // SPDX-License-Identifier: GPL-2.0-only #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/list_bl.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/mbcache.h> /* * Mbcache is a simple key-value store. Keys need not be unique, however * key-value pairs are expected to be unique (we use this fact in * mb_cache_entry_delete_or_get()). * * Ext2 and ext4 use this cache for deduplication of extended attribute blocks. * Ext4 also uses it for deduplication of xattr values stored in inodes. * They use hash of data as a key and provide a value that may represent a * block or inode number. That's why keys need not be unique (hash of different * data may be the same). However user provided value always uniquely * identifies a cache entry. * * We provide functions for creation and removal of entries, search by key, * and a special "delete entry with given key-value pair" operation. Fixed * size hash table is used for fast key lookups. */ struct mb_cache { /* Hash table of entries */ struct hlist_bl_head *c_hash; /* log2 of hash table size */ int c_bucket_bits; /* Maximum entries in cache to avoid degrading hash too much */ unsigned long c_max_entries; /* Protects c_list, c_entry_count */ spinlock_t c_list_lock; struct list_head c_list; /* Number of entries in cache */ unsigned long c_entry_count; struct shrinker *c_shrink; /* Work for shrinking when the cache has too many entries */ struct work_struct c_shrink_work; }; static struct kmem_cache *mb_entry_cache; static unsigned long mb_cache_shrink(struct mb_cache *cache, unsigned long nr_to_scan); static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache, u32 key) { return &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; } /* * Number of entries to reclaim synchronously when there are too many entries * in cache */ #define SYNC_SHRINK_BATCH 64 /* * mb_cache_entry_create - create entry in cache * @cache - cache where the entry should be created * @mask - gfp mask with which the entry should be allocated * @key - key of the entry * @value - value of the entry * @reusable - is the entry reusable by others? * * Creates entry in @cache with key @key and value @value. The function returns * -EBUSY if entry with the same key and value already exists in cache. * Otherwise 0 is returned. */ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, u64 value, bool reusable) { struct mb_cache_entry *entry, *dup; struct hlist_bl_node *dup_node; struct hlist_bl_head *head; /* Schedule background reclaim if there are too many entries */ if (cache->c_entry_count >= cache->c_max_entries) schedule_work(&cache->c_shrink_work); /* Do some sync reclaim if background reclaim cannot keep up */ if (cache->c_entry_count >= 2*cache->c_max_entries) mb_cache_shrink(cache, SYNC_SHRINK_BATCH); entry = kmem_cache_alloc(mb_entry_cache, mask); if (!entry) return -ENOMEM; INIT_LIST_HEAD(&entry->e_list); /* * We create entry with two references. One reference is kept by the * hash table, the other reference is used to protect us from * mb_cache_entry_delete_or_get() until the entry is fully setup. This * avoids nesting of cache->c_list_lock into hash table bit locks which * is problematic for RT. */ atomic_set(&entry->e_refcnt, 2); entry->e_key = key; entry->e_value = value; entry->e_flags = 0; if (reusable) set_bit(MBE_REUSABLE_B, &entry->e_flags); head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { if (dup->e_key == key && dup->e_value == value) { hlist_bl_unlock(head); kmem_cache_free(mb_entry_cache, entry); return -EBUSY; } } hlist_bl_add_head(&entry->e_hash_list, head); hlist_bl_unlock(head); spin_lock(&cache->c_list_lock); list_add_tail(&entry->e_list, &cache->c_list); cache->c_entry_count++; spin_unlock(&cache->c_list_lock); mb_cache_entry_put(cache, entry); return 0; } EXPORT_SYMBOL(mb_cache_entry_create); void __mb_cache_entry_free(struct mb_cache *cache, struct mb_cache_entry *entry) { struct hlist_bl_head *head; head = mb_cache_entry_head(cache, entry->e_key); hlist_bl_lock(head); hlist_bl_del(&entry->e_hash_list); hlist_bl_unlock(head); kmem_cache_free(mb_entry_cache, entry); } EXPORT_SYMBOL(__mb_cache_entry_free); /* * mb_cache_entry_wait_unused - wait to be the last user of the entry * * @entry - entry to work on * * Wait to be the last user of the entry. */ void mb_cache_entry_wait_unused(struct mb_cache_entry *entry) { wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 2); } EXPORT_SYMBOL(mb_cache_entry_wait_unused); static struct mb_cache_entry *__entry_find(struct mb_cache *cache, struct mb_cache_entry *entry, u32 key) { struct mb_cache_entry *old_entry = entry; struct hlist_bl_node *node; struct hlist_bl_head *head; head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); if (entry && !hlist_bl_unhashed(&entry->e_hash_list)) node = entry->e_hash_list.next; else node = hlist_bl_first(head); while (node) { entry = hlist_bl_entry(node, struct mb_cache_entry, e_hash_list); if (entry->e_key == key && test_bit(MBE_REUSABLE_B, &entry->e_flags) && atomic_inc_not_zero(&entry->e_refcnt)) goto out; node = node->next; } entry = NULL; out: hlist_bl_unlock(head); if (old_entry) mb_cache_entry_put(cache, old_entry); return entry; } /* * mb_cache_entry_find_first - find the first reusable entry with the given key * @cache: cache where we should search * @key: key to look for * * Search in @cache for a reusable entry with key @key. Grabs reference to the * first reusable entry found and returns the entry. */ struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, u32 key) { return __entry_find(cache, NULL, key); } EXPORT_SYMBOL(mb_cache_entry_find_first); /* * mb_cache_entry_find_next - find next reusable entry with the same key * @cache: cache where we should search * @entry: entry to start search from * * Finds next reusable entry in the hash chain which has the same key as @entry. * If @entry is unhashed (which can happen when deletion of entry races with the * search), finds the first reusable entry in the hash chain. The function drops * reference to @entry and returns with a reference to the found entry. */ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, struct mb_cache_entry *entry) { return __entry_find(cache, entry, entry->e_key); } EXPORT_SYMBOL(mb_cache_entry_find_next); /* * mb_cache_entry_get - get a cache entry by value (and key) * @cache - cache we work with * @key - key * @value - value */ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, u64 value) { struct hlist_bl_node *node; struct hlist_bl_head *head; struct mb_cache_entry *entry; head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(entry, node, head, e_hash_list) { if (entry->e_key == key && entry->e_value == value && atomic_inc_not_zero(&entry->e_refcnt)) goto out; } entry = NULL; out: hlist_bl_unlock(head); return entry; } EXPORT_SYMBOL(mb_cache_entry_get); /* mb_cache_entry_delete_or_get - remove a cache entry if it has no users * @cache - cache we work with * @key - key * @value - value * * Remove entry from cache @cache with key @key and value @value. The removal * happens only if the entry is unused. The function returns NULL in case the * entry was successfully removed or there's no entry in cache. Otherwise the * function grabs reference of the entry that we failed to delete because it * still has users and return it. */ struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache, u32 key, u64 value) { struct mb_cache_entry *entry; entry = mb_cache_entry_get(cache, key, value); if (!entry) return NULL; /* * Drop the ref we got from mb_cache_entry_get() and the initial hash * ref if we are the last user */ if (atomic_cmpxchg(&entry->e_refcnt, 2, 0) != 2) return entry; spin_lock(&cache->c_list_lock); if (!list_empty(&entry->e_list)) list_del_init(&entry->e_list); cache->c_entry_count--; spin_unlock(&cache->c_list_lock); __mb_cache_entry_free(cache, entry); return NULL; } EXPORT_SYMBOL(mb_cache_entry_delete_or_get); /* mb_cache_entry_touch - cache entry got used * @cache - cache the entry belongs to * @entry - entry that got used * * Marks entry as used to give hit higher chances of surviving in cache. */ void mb_cache_entry_touch(struct mb_cache *cache, struct mb_cache_entry *entry) { set_bit(MBE_REFERENCED_B, &entry->e_flags); } EXPORT_SYMBOL(mb_cache_entry_touch); static unsigned long mb_cache_count(struct shrinker *shrink, struct shrink_control *sc) { struct mb_cache *cache = shrink->private_data; return cache->c_entry_count; } /* Shrink number of entries in cache */ static unsigned long mb_cache_shrink(struct mb_cache *cache, unsigned long nr_to_scan) { struct mb_cache_entry *entry; unsigned long shrunk = 0; spin_lock(&cache->c_list_lock); while (nr_to_scan-- && !list_empty(&cache->c_list)) { entry = list_first_entry(&cache->c_list, struct mb_cache_entry, e_list); /* Drop initial hash reference if there is no user */ if (test_bit(MBE_REFERENCED_B, &entry->e_flags) || atomic_cmpxchg(&entry->e_refcnt, 1, 0) != 1) { clear_bit(MBE_REFERENCED_B, &entry->e_flags); list_move_tail(&entry->e_list, &cache->c_list); continue; } list_del_init(&entry->e_list); cache->c_entry_count--; spin_unlock(&cache->c_list_lock); __mb_cache_entry_free(cache, entry); shrunk++; cond_resched(); spin_lock(&cache->c_list_lock); } spin_unlock(&cache->c_list_lock); return shrunk; } static unsigned long mb_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { struct mb_cache *cache = shrink->private_data; return mb_cache_shrink(cache, sc->nr_to_scan); } /* We shrink 1/X of the cache when we have too many entries in it */ #define SHRINK_DIVISOR 16 static void mb_cache_shrink_worker(struct work_struct *work) { struct mb_cache *cache = container_of(work, struct mb_cache, c_shrink_work); mb_cache_shrink(cache, cache->c_max_entries / SHRINK_DIVISOR); } /* * mb_cache_create - create cache * @bucket_bits: log2 of the hash table size * * Create cache for keys with 2^bucket_bits hash entries. */ struct mb_cache *mb_cache_create(int bucket_bits) { struct mb_cache *cache; unsigned long bucket_count = 1UL << bucket_bits; unsigned long i; cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL); if (!cache) goto err_out; cache->c_bucket_bits = bucket_bits; cache->c_max_entries = bucket_count << 4; INIT_LIST_HEAD(&cache->c_list); spin_lock_init(&cache->c_list_lock); cache->c_hash = kmalloc_array(bucket_count, sizeof(struct hlist_bl_head), GFP_KERNEL); if (!cache->c_hash) { kfree(cache); goto err_out; } for (i = 0; i < bucket_count; i++) INIT_HLIST_BL_HEAD(&cache->c_hash[i]); cache->c_shrink = shrinker_alloc(0, "mbcache-shrinker"); if (!cache->c_shrink) { kfree(cache->c_hash); kfree(cache); goto err_out; } cache->c_shrink->count_objects = mb_cache_count; cache->c_shrink->scan_objects = mb_cache_scan; cache->c_shrink->private_data = cache; shrinker_register(cache->c_shrink); INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker); return cache; err_out: return NULL; } EXPORT_SYMBOL(mb_cache_create); /* * mb_cache_destroy - destroy cache * @cache: the cache to destroy * * Free all entries in cache and cache itself. Caller must make sure nobody * (except shrinker) can reach @cache when calling this. */ void mb_cache_destroy(struct mb_cache *cache) { struct mb_cache_entry *entry, *next; shrinker_free(cache->c_shrink); /* * We don't bother with any locking. Cache must not be used at this * point. */ list_for_each_entry_safe(entry, next, &cache->c_list, e_list) { list_del(&entry->e_list); WARN_ON(atomic_read(&entry->e_refcnt) != 1); mb_cache_entry_put(cache, entry); } kfree(cache->c_hash); kfree(cache); } EXPORT_SYMBOL(mb_cache_destroy); static int __init mbcache_init(void) { mb_entry_cache = KMEM_CACHE(mb_cache_entry, SLAB_RECLAIM_ACCOUNT); if (!mb_entry_cache) return -ENOMEM; return 0; } static void __exit mbcache_exit(void) { kmem_cache_destroy(mb_entry_cache); } module_init(mbcache_init) module_exit(mbcache_exit) MODULE_AUTHOR("Jan Kara <jack@suse.cz>"); MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); MODULE_LICENSE("GPL");
332 332 332 638 638 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) "callthunks: " fmt #include <linux/debugfs.h> #include <linux/kallsyms.h> #include <linux/memory.h> #include <linux/moduleloader.h> #include <linux/static_call.h> #include <asm/alternative.h> #include <asm/asm-offsets.h> #include <asm/cpu.h> #include <asm/ftrace.h> #include <asm/insn.h> #include <asm/kexec.h> #include <asm/nospec-branch.h> #include <asm/paravirt.h> #include <asm/sections.h> #include <asm/switch_to.h> #include <asm/sync_core.h> #include <asm/text-patching.h> #include <asm/xen/hypercall.h> static int __initdata_or_module debug_callthunks; #define MAX_PATCH_LEN (255-1) #define prdbg(fmt, args...) \ do { \ if (debug_callthunks) \ printk(KERN_DEBUG pr_fmt(fmt), ##args); \ } while(0) static int __init debug_thunks(char *str) { debug_callthunks = 1; return 1; } __setup("debug-callthunks", debug_thunks); #ifdef CONFIG_CALL_THUNKS_DEBUG DEFINE_PER_CPU(u64, __x86_call_count); DEFINE_PER_CPU(u64, __x86_ret_count); DEFINE_PER_CPU(u64, __x86_stuffs_count); DEFINE_PER_CPU(u64, __x86_ctxsw_count); EXPORT_PER_CPU_SYMBOL_GPL(__x86_ctxsw_count); EXPORT_PER_CPU_SYMBOL_GPL(__x86_call_count); #endif extern s32 __call_sites[], __call_sites_end[]; struct core_text { unsigned long base; unsigned long end; const char *name; }; static bool thunks_initialized __ro_after_init; static const struct core_text builtin_coretext = { .base = (unsigned long)_text, .end = (unsigned long)_etext, .name = "builtin", }; asm ( ".pushsection .rodata \n" ".global skl_call_thunk_template \n" "skl_call_thunk_template: \n" __stringify(INCREMENT_CALL_DEPTH)" \n" ".global skl_call_thunk_tail \n" "skl_call_thunk_tail: \n" ".popsection \n" ); extern u8 skl_call_thunk_template[]; extern u8 skl_call_thunk_tail[]; #define SKL_TMPL_SIZE \ ((unsigned int)(skl_call_thunk_tail - skl_call_thunk_template)) extern void error_entry(void); extern void xen_error_entry(void); extern void paranoid_entry(void); static inline bool within_coretext(const struct core_text *ct, void *addr) { unsigned long p = (unsigned long)addr; return ct->base <= p && p < ct->end; } static inline bool within_module_coretext(void *addr) { bool ret = false; #ifdef CONFIG_MODULES struct module *mod; preempt_disable(); mod = __module_address((unsigned long)addr); if (mod && within_module_core((unsigned long)addr, mod)) ret = true; preempt_enable(); #endif return ret; } static bool is_coretext(const struct core_text *ct, void *addr) { if (ct && within_coretext(ct, addr)) return true; if (within_coretext(&builtin_coretext, addr)) return true; return within_module_coretext(addr); } static bool skip_addr(void *dest) { if (dest == error_entry) return true; if (dest == paranoid_entry) return true; if (dest == xen_error_entry) return true; /* Does FILL_RSB... */ if (dest == __switch_to_asm) return true; /* Accounts directly */ if (dest == ret_from_fork) return true; #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_AMD_MEM_ENCRYPT) if (dest == soft_restart_cpu) return true; #endif #ifdef CONFIG_FUNCTION_TRACER if (dest == __fentry__) return true; #endif #ifdef CONFIG_KEXEC_CORE if (dest >= (void *)relocate_kernel && dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE) return true; #endif return false; } static __init_or_module void *call_get_dest(void *addr) { struct insn insn; void *dest; int ret; ret = insn_decode_kernel(&insn, addr); if (ret) return ERR_PTR(ret); /* Patched out call? */ if (insn.opcode.bytes[0] != CALL_INSN_OPCODE) return NULL; dest = addr + insn.length + insn.immediate.value; if (skip_addr(dest)) return NULL; return dest; } static const u8 nops[] = { 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, }; static void *patch_dest(void *dest, bool direct) { unsigned int tsize = SKL_TMPL_SIZE; u8 insn_buff[MAX_PATCH_LEN]; u8 *pad = dest - tsize; memcpy(insn_buff, skl_call_thunk_template, tsize); apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize); /* Already patched? */ if (!bcmp(pad, insn_buff, tsize)) return pad; /* Ensure there are nops */ if (bcmp(pad, nops, tsize)) { pr_warn_once("Invalid padding area for %pS\n", dest); return NULL; } if (direct) memcpy(pad, insn_buff, tsize); else text_poke_copy_locked(pad, insn_buff, tsize, true); return pad; } static __init_or_module void patch_call(void *addr, const struct core_text *ct) { void *pad, *dest; u8 bytes[8]; if (!within_coretext(ct, addr)) return; dest = call_get_dest(addr); if (!dest || WARN_ON_ONCE(IS_ERR(dest))) return; if (!is_coretext(ct, dest)) return; pad = patch_dest(dest, within_coretext(ct, dest)); if (!pad) return; prdbg("Patch call at: %pS %px to %pS %px -> %px \n", addr, addr, dest, dest, pad); __text_gen_insn(bytes, CALL_INSN_OPCODE, addr, pad, CALL_INSN_SIZE); text_poke_early(addr, bytes, CALL_INSN_SIZE); } static __init_or_module void patch_call_sites(s32 *start, s32 *end, const struct core_text *ct) { s32 *s; for (s = start; s < end; s++) patch_call((void *)s + *s, ct); } static __init_or_module void patch_alt_call_sites(struct alt_instr *start, struct alt_instr *end, const struct core_text *ct) { struct alt_instr *a; for (a = start; a < end; a++) patch_call((void *)&a->instr_offset + a->instr_offset, ct); } static __init_or_module void callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct) { prdbg("Patching call sites %s\n", ct->name); patch_call_sites(cs->call_start, cs->call_end, ct); patch_alt_call_sites(cs->alt_start, cs->alt_end, ct); prdbg("Patching call sites done%s\n", ct->name); } void __init callthunks_patch_builtin_calls(void) { struct callthunk_sites cs = { .call_start = __call_sites, .call_end = __call_sites_end, .alt_start = __alt_instructions, .alt_end = __alt_instructions_end }; if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) return; pr_info("Setting up call depth tracking\n"); mutex_lock(&text_mutex); callthunks_setup(&cs, &builtin_coretext); thunks_initialized = true; mutex_unlock(&text_mutex); } void *callthunks_translate_call_dest(void *dest) { void *target; lockdep_assert_held(&text_mutex); if (!thunks_initialized || skip_addr(dest)) return dest; if (!is_coretext(NULL, dest)) return dest; target = patch_dest(dest, false); return target ? : dest; } #ifdef CONFIG_BPF_JIT static bool is_callthunk(void *addr) { unsigned int tmpl_size = SKL_TMPL_SIZE; u8 insn_buff[MAX_PATCH_LEN]; unsigned long dest; u8 *pad; dest = roundup((unsigned long)addr, CONFIG_FUNCTION_ALIGNMENT); if (!thunks_initialized || skip_addr((void *)dest)) return false; pad = (void *)(dest - tmpl_size); memcpy(insn_buff, skl_call_thunk_template, tmpl_size); apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size); return !bcmp(pad, insn_buff, tmpl_size); } int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip) { unsigned int tmpl_size = SKL_TMPL_SIZE; u8 insn_buff[MAX_PATCH_LEN]; if (!thunks_initialized) return 0; /* Is function call target a thunk? */ if (func && is_callthunk(func)) return 0; memcpy(insn_buff, skl_call_thunk_template, tmpl_size); apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size); memcpy(*pprog, insn_buff, tmpl_size); *pprog += tmpl_size; return tmpl_size; } #endif #ifdef CONFIG_MODULES void noinline callthunks_patch_module_calls(struct callthunk_sites *cs, struct module *mod) { struct core_text ct = { .base = (unsigned long)mod->mem[MOD_TEXT].base, .end = (unsigned long)mod->mem[MOD_TEXT].base + mod->mem[MOD_TEXT].size, .name = mod->name, }; if (!thunks_initialized) return; mutex_lock(&text_mutex); callthunks_setup(cs, &ct); mutex_unlock(&text_mutex); } #endif /* CONFIG_MODULES */ #if defined(CONFIG_CALL_THUNKS_DEBUG) && defined(CONFIG_DEBUG_FS) static int callthunks_debug_show(struct seq_file *m, void *p) { unsigned long cpu = (unsigned long)m->private; seq_printf(m, "C: %16llu R: %16llu S: %16llu X: %16llu\n,", per_cpu(__x86_call_count, cpu), per_cpu(__x86_ret_count, cpu), per_cpu(__x86_stuffs_count, cpu), per_cpu(__x86_ctxsw_count, cpu)); return 0; } static int callthunks_debug_open(struct inode *inode, struct file *file) { return single_open(file, callthunks_debug_show, inode->i_private); } static const struct file_operations dfs_ops = { .open = callthunks_debug_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; static int __init callthunks_debugfs_init(void) { struct dentry *dir; unsigned long cpu; dir = debugfs_create_dir("callthunks", NULL); for_each_possible_cpu(cpu) { void *arg = (void *)cpu; char name [10]; sprintf(name, "cpu%lu", cpu); debugfs_create_file(name, 0644, dir, arg, &dfs_ops); } return 0; } __initcall(callthunks_debugfs_init); #endif
19 19 18 19 19 14 14 19 19 18 19 19 19 19 19 10 5 13 14 13 14 14 14 14 13 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor auditing functions * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. */ #include <linux/audit.h> #include <linux/socket.h> #include "include/apparmor.h" #include "include/audit.h" #include "include/policy.h" #include "include/policy_ns.h" #include "include/secid.h" const char *const audit_mode_names[] = { "normal", "quiet_denied", "quiet", "noquiet", "all" }; static const char *const aa_audit_type[] = { "AUDIT", "ALLOWED", "DENIED", "HINT", "STATUS", "ERROR", "KILLED", "AUTO" }; static const char *const aa_class_names[] = { "none", "unknown", "file", "cap", "net", "rlimits", "domain", "mount", "unknown", "ptrace", "signal", "xmatch", "unknown", "unknown", "net", "unknown", "label", "posix_mqueue", "io_uring", "module", "lsm", "namespace", "io_uring", "unknown", "unknown", "unknown", "unknown", "unknown", "unknown", "unknown", "unknown", "X", "dbus", }; /* * Currently AppArmor auditing is fed straight into the audit framework. * * TODO: * netlink interface for complain mode * user auditing, - send user auditing to netlink interface * system control of whether user audit messages go to system log */ /** * audit_pre() - core AppArmor function. * @ab: audit buffer to fill (NOT NULL) * @va: audit structure containing data to audit (NOT NULL) * * Record common AppArmor audit data from @va */ static void audit_pre(struct audit_buffer *ab, void *va) { struct apparmor_audit_data *ad = aad_of_va(va); if (aa_g_audit_header) { audit_log_format(ab, "apparmor=\"%s\"", aa_audit_type[ad->type]); } if (ad->op) audit_log_format(ab, " operation=\"%s\"", ad->op); if (ad->class) audit_log_format(ab, " class=\"%s\"", ad->class <= AA_CLASS_LAST ? aa_class_names[ad->class] : "unknown"); if (ad->info) { audit_log_format(ab, " info=\"%s\"", ad->info); if (ad->error) audit_log_format(ab, " error=%d", ad->error); } if (ad->subj_label) { struct aa_label *label = ad->subj_label; if (label_isprofile(label)) { struct aa_profile *profile = labels_profile(label); if (profile->ns != root_ns) { audit_log_format(ab, " namespace="); audit_log_untrustedstring(ab, profile->ns->base.hname); } audit_log_format(ab, " profile="); audit_log_untrustedstring(ab, profile->base.hname); } else { audit_log_format(ab, " label="); aa_label_xaudit(ab, root_ns, label, FLAG_VIEW_SUBNS, GFP_ATOMIC); } } if (ad->name) { audit_log_format(ab, " name="); audit_log_untrustedstring(ab, ad->name); } } /** * aa_audit_msg - Log a message to the audit subsystem * @type: audit type for the message * @ad: audit event structure (NOT NULL) * @cb: optional callback fn for type specific fields (MAYBE NULL) */ void aa_audit_msg(int type, struct apparmor_audit_data *ad, void (*cb) (struct audit_buffer *, void *)) { ad->type = type; common_lsm_audit(&ad->common, audit_pre, cb); } /** * aa_audit - Log a profile based audit event to the audit subsystem * @type: audit type for the message * @profile: profile to check against (NOT NULL) * @ad: audit event (NOT NULL) * @cb: optional callback fn for type specific fields (MAYBE NULL) * * Handle default message switching based off of audit mode flags * * Returns: error on failure */ int aa_audit(int type, struct aa_profile *profile, struct apparmor_audit_data *ad, void (*cb) (struct audit_buffer *, void *)) { AA_BUG(!profile); if (type == AUDIT_APPARMOR_AUTO) { if (likely(!ad->error)) { if (AUDIT_MODE(profile) != AUDIT_ALL) return 0; type = AUDIT_APPARMOR_AUDIT; } else if (COMPLAIN_MODE(profile)) type = AUDIT_APPARMOR_ALLOWED; else type = AUDIT_APPARMOR_DENIED; } if (AUDIT_MODE(profile) == AUDIT_QUIET || (type == AUDIT_APPARMOR_DENIED && AUDIT_MODE(profile) == AUDIT_QUIET_DENIED)) return ad->error; if (KILL_MODE(profile) && type == AUDIT_APPARMOR_DENIED) type = AUDIT_APPARMOR_KILL; ad->subj_label = &profile->label; aa_audit_msg(type, ad, cb); if (ad->type == AUDIT_APPARMOR_KILL) (void)send_sig_info(SIGKILL, NULL, ad->common.type == LSM_AUDIT_DATA_TASK && ad->common.u.tsk ? ad->common.u.tsk : current); if (ad->type == AUDIT_APPARMOR_ALLOWED) return complain_error(ad->error); return ad->error; } struct aa_audit_rule { struct aa_label *label; }; void aa_audit_rule_free(void *vrule) { struct aa_audit_rule *rule = vrule; if (rule) { if (!IS_ERR(rule->label)) aa_put_label(rule->label); kfree(rule); } } int aa_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule, gfp_t gfp) { struct aa_audit_rule *rule; switch (field) { case AUDIT_SUBJ_ROLE: if (op != Audit_equal && op != Audit_not_equal) return -EINVAL; break; default: return -EINVAL; } rule = kzalloc(sizeof(struct aa_audit_rule), gfp); if (!rule) return -ENOMEM; /* Currently rules are treated as coming from the root ns */ rule->label = aa_label_parse(&root_ns->unconfined->label, rulestr, gfp, true, false); if (IS_ERR(rule->label)) { int err = PTR_ERR(rule->label); aa_audit_rule_free(rule); return err; } *vrule = rule; return 0; } int aa_audit_rule_known(struct audit_krule *rule) { int i; for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; switch (f->type) { case AUDIT_SUBJ_ROLE: return 1; } } return 0; } int aa_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *vrule) { struct aa_audit_rule *rule = vrule; struct aa_label *label; int found = 0; label = prop->apparmor.label; if (!label) return -ENOENT; if (aa_label_is_subset(label, rule->label)) found = 1; switch (field) { case AUDIT_SUBJ_ROLE: switch (op) { case Audit_equal: return found; case Audit_not_equal: return !found; } } return 0; }
2055 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SMP_H #define _ASM_X86_SMP_H #ifndef __ASSEMBLY__ #include <linux/cpumask.h> #include <asm/cpumask.h> #include <asm/current.h> #include <asm/thread_info.h> DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); /* cpus sharing the last level cache: */ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid); struct task_struct; struct smp_ops { void (*smp_prepare_boot_cpu)(void); void (*smp_prepare_cpus)(unsigned max_cpus); void (*smp_cpus_done)(unsigned max_cpus); void (*stop_other_cpus)(int wait); void (*crash_stop_other_cpus)(void); void (*smp_send_reschedule)(int cpu); void (*cleanup_dead_cpu)(unsigned cpu); void (*poll_sync_state)(void); int (*kick_ap_alive)(unsigned cpu, struct task_struct *tidle); int (*cpu_disable)(void); void (*cpu_die)(unsigned int cpu); void (*play_dead)(void); void (*stop_this_cpu)(void); void (*send_call_func_ipi)(const struct cpumask *mask); void (*send_call_func_single_ipi)(int cpu); }; /* Globals due to paravirt */ extern void set_cpu_sibling_map(int cpu); #ifdef CONFIG_SMP extern struct smp_ops smp_ops; static inline void smp_send_stop(void) { smp_ops.stop_other_cpus(0); } static inline void stop_other_cpus(void) { smp_ops.stop_other_cpus(1); } static inline void smp_prepare_cpus(unsigned int max_cpus) { smp_ops.smp_prepare_cpus(max_cpus); } static inline void smp_cpus_done(unsigned int max_cpus) { smp_ops.smp_cpus_done(max_cpus); } static inline int __cpu_disable(void) { return smp_ops.cpu_disable(); } static inline void __cpu_die(unsigned int cpu) { if (smp_ops.cpu_die) smp_ops.cpu_die(cpu); } static inline void __noreturn play_dead(void) { smp_ops.play_dead(); BUG(); } static inline void arch_smp_send_reschedule(int cpu) { smp_ops.smp_send_reschedule(cpu); } static inline void arch_send_call_function_single_ipi(int cpu) { smp_ops.send_call_func_single_ipi(cpu); } static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) { smp_ops.send_call_func_ipi(mask); } void cpu_disable_common(void); void native_smp_prepare_boot_cpu(void); void smp_prepare_cpus_common(void); void native_smp_prepare_cpus(unsigned int max_cpus); void native_smp_cpus_done(unsigned int max_cpus); int common_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_kick_ap(unsigned int cpu, struct task_struct *tidle); int native_cpu_disable(void); void __noreturn hlt_play_dead(void); void native_play_dead(void); void play_dead_common(void); void wbinvd_on_cpu(int cpu); int wbinvd_on_all_cpus(void); void smp_kick_mwait_play_dead(void); void native_smp_send_reschedule(int cpu); void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); void smp_store_cpu_info(int id); asmlinkage __visible void smp_reboot_interrupt(void); __visible void smp_reschedule_interrupt(struct pt_regs *regs); __visible void smp_call_function_interrupt(struct pt_regs *regs); __visible void smp_call_function_single_interrupt(struct pt_regs *r); #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) #define cpu_acpi_id(cpu) per_cpu(x86_cpu_to_acpiid, cpu) /* * This function is needed by all SMP systems. It must _always_ be valid * from the initial startup. */ #define raw_smp_processor_id() this_cpu_read(pcpu_hot.cpu_number) #define __smp_processor_id() __this_cpu_read(pcpu_hot.cpu_number) #ifdef CONFIG_X86_32 extern int safe_smp_processor_id(void); #else # define safe_smp_processor_id() smp_processor_id() #endif static inline struct cpumask *cpu_llc_shared_mask(int cpu) { return per_cpu(cpu_llc_shared_map, cpu); } static inline struct cpumask *cpu_l2c_shared_mask(int cpu) { return per_cpu(cpu_l2c_shared_map, cpu); } #else /* !CONFIG_SMP */ #define wbinvd_on_cpu(cpu) wbinvd() static inline int wbinvd_on_all_cpus(void) { wbinvd(); return 0; } static inline struct cpumask *cpu_llc_shared_mask(int cpu) { return (struct cpumask *)cpumask_of(0); } #endif /* CONFIG_SMP */ #ifdef CONFIG_DEBUG_NMI_SELFTEST extern void nmi_selftest(void); #else #define nmi_selftest() do { } while (0) #endif extern unsigned int smpboot_control; extern unsigned long apic_mmio_base; #endif /* !__ASSEMBLY__ */ /* Control bits for startup_64 */ #define STARTUP_READ_APICID 0x80000000 /* Top 8 bits are reserved for control */ #define STARTUP_PARALLEL_MASK 0xFF000000 #endif /* _ASM_X86_SMP_H */
12 12 12 13 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. */ #ifndef __XFS_MOUNT_H__ #define __XFS_MOUNT_H__ struct xlog; struct xfs_inode; struct xfs_mru_cache; struct xfs_ail; struct xfs_quotainfo; struct xfs_da_geometry; struct xfs_perag; /* dynamic preallocation free space thresholds, 5% down to 1% */ enum { XFS_LOWSP_1_PCNT = 0, XFS_LOWSP_2_PCNT, XFS_LOWSP_3_PCNT, XFS_LOWSP_4_PCNT, XFS_LOWSP_5_PCNT, XFS_LOWSP_MAX, }; /* * Error Configuration * * Error classes define the subsystem the configuration belongs to. * Error numbers define the errors that are configurable. */ enum { XFS_ERR_METADATA, XFS_ERR_CLASS_MAX, }; enum { XFS_ERR_DEFAULT, XFS_ERR_EIO, XFS_ERR_ENOSPC, XFS_ERR_ENODEV, XFS_ERR_ERRNO_MAX, }; #define XFS_ERR_RETRY_FOREVER -1 /* * Although retry_timeout is in jiffies which is normally an unsigned long, * we limit the retry timeout to 86400 seconds, or one day. So even a * signed 32-bit long is sufficient for a HZ value up to 24855. Making it * signed lets us store the special "-1" value, meaning retry forever. */ struct xfs_error_cfg { struct xfs_kobj kobj; int max_retries; long retry_timeout; /* in jiffies, -1 = infinite */ }; /* * Per-cpu deferred inode inactivation GC lists. */ struct xfs_inodegc { struct xfs_mount *mp; struct llist_head list; struct delayed_work work; int error; /* approximate count of inodes in the list */ unsigned int items; unsigned int shrinker_hits; unsigned int cpu; }; /* * Container for each type of groups, used to look up individual groups and * describes the geometry. */ struct xfs_groups { struct xarray xa; /* * Maximum capacity of the group in FSBs. * * Each group is laid out densely in the daddr space. For the * degenerate case of a pre-rtgroups filesystem, the incore rtgroup * pretends to have a zero-block and zero-blklog rtgroup. */ uint32_t blocks; /* * Log(2) of the logical size of each group. * * Compared to the blocks field above this is rounded up to the next * power of two, and thus lays out the xfs_fsblock_t/xfs_rtblock_t * space sparsely with a hole from blocks to (1 << blklog) at the end * of each group. */ uint8_t blklog; /* * Mask to extract the group-relative block number from a FSB. * For a pre-rtgroups filesystem we pretend to have one very large * rtgroup, so this mask must be 64-bit. */ uint64_t blkmask; }; /* * The struct xfsmount layout is optimised to separate read-mostly variables * from variables that are frequently modified. We put the read-mostly variables * first, then place all the other variables at the end. * * Typically, read-mostly variables are those that are set at mount time and * never changed again, or only change rarely as a result of things like sysfs * knobs being tweaked. */ typedef struct xfs_mount { struct xfs_sb m_sb; /* copy of fs superblock */ struct super_block *m_super; struct xfs_ail *m_ail; /* fs active log item list */ struct xfs_buf *m_sb_bp; /* buffer for superblock */ struct xfs_buf *m_rtsb_bp; /* realtime superblock */ char *m_rtname; /* realtime device name */ char *m_logname; /* external log device name */ struct xfs_da_geometry *m_dir_geo; /* directory block geometry */ struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */ struct xlog *m_log; /* log specific stuff */ struct xfs_inode *m_rootip; /* pointer to root directory */ struct xfs_inode *m_metadirip; /* ptr to metadata directory */ struct xfs_inode *m_rtdirip; /* ptr to realtime metadir */ struct xfs_quotainfo *m_quotainfo; /* disk quota information */ struct xfs_buftarg *m_ddev_targp; /* data device */ struct xfs_buftarg *m_logdev_targp;/* log device */ struct xfs_buftarg *m_rtdev_targp; /* rt device */ void __percpu *m_inodegc; /* percpu inodegc structures */ struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct workqueue_struct *m_buf_workqueue; struct workqueue_struct *m_unwritten_workqueue; struct workqueue_struct *m_reclaim_workqueue; struct workqueue_struct *m_sync_workqueue; struct workqueue_struct *m_blockgc_wq; struct workqueue_struct *m_inodegc_wq; int m_bsize; /* fs logical block size */ uint8_t m_blkbit_log; /* blocklog + NBBY */ uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ uint8_t m_agno_log; /* log #ag's */ uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ int8_t m_rtxblklog; /* log2 of rextsize, if possible */ uint m_blockmask; /* sb_blocksize-1 */ uint m_blockwsize; /* sb_blocksize in words */ /* number of rt extents per rt bitmap block if rtgroups enabled */ unsigned int m_rtx_per_rbmblock; uint m_alloc_mxr[2]; /* max alloc btree records */ uint m_alloc_mnr[2]; /* min alloc btree records */ uint m_bmap_dmxr[2]; /* max bmap btree records */ uint m_bmap_dmnr[2]; /* min bmap btree records */ uint m_rmap_mxr[2]; /* max rmap btree records */ uint m_rmap_mnr[2]; /* min rmap btree records */ uint m_refc_mxr[2]; /* max refc btree records */ uint m_refc_mnr[2]; /* min refc btree records */ uint m_alloc_maxlevels; /* max alloc btree levels */ uint m_bm_maxlevels[2]; /* max bmap btree levels */ uint m_rmap_maxlevels; /* max rmap btree levels */ uint m_refc_maxlevels; /* max refcount btree level */ unsigned int m_agbtree_maxlevels; /* max level of all AG btrees */ xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */ uint m_alloc_set_aside; /* space we can't use */ uint m_ag_max_usable; /* max space per AG */ int m_dalign; /* stripe unit */ int m_swidth; /* stripe width */ xfs_agnumber_t m_maxagi; /* highest inode alloc group */ uint m_allocsize_log;/* min write size log bytes */ uint m_allocsize_blocks; /* min write size blocks */ int m_logbufs; /* number of log buffers */ int m_logbsize; /* size of each log buffer */ unsigned int m_rsumlevels; /* rt summary levels */ xfs_filblks_t m_rsumblocks; /* size of rt summary, FSBs */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_qflags; /* quota status flags */ uint64_t m_features; /* active filesystem features */ uint64_t m_low_space[XFS_LOWSP_MAX]; uint64_t m_low_rtexts[XFS_LOWSP_MAX]; uint64_t m_rtxblkmask; /* rt extent block mask */ struct xfs_ino_geometry m_ino_geo; /* inode geometry */ struct xfs_trans_resv m_resv; /* precomputed res values */ /* low free space thresholds */ unsigned long m_opstate; /* dynamic state flags */ bool m_always_cow; bool m_fail_unmount; bool m_finobt_nores; /* no per-AG finobt resv. */ bool m_update_sb; /* sb needs update in mount */ /* * Bitsets of per-fs metadata that have been checked and/or are sick. * Callers must hold m_sb_lock to access these two fields. */ uint8_t m_fs_checked; uint8_t m_fs_sick; /* * Bitsets of rt metadata that have been checked and/or are sick. * Callers must hold m_sb_lock to access this field. */ uint8_t m_rt_checked; uint8_t m_rt_sick; /* * End of read-mostly variables. Frequently written variables and locks * should be placed below this comment from now on. The first variable * here is marked as cacheline aligned so they it is separated from * the read-mostly variables. */ spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */ struct percpu_counter m_icount; /* allocated inodes counter */ struct percpu_counter m_ifree; /* free inodes counter */ struct percpu_counter m_fdblocks; /* free block counter */ struct percpu_counter m_frextents; /* free rt extent counter */ /* * Count of data device blocks reserved for delayed allocations, * including indlen blocks. Does not include allocated CoW staging * extents or anything related to the rt device. */ struct percpu_counter m_delalloc_blks; /* * RT version of the above. */ struct percpu_counter m_delalloc_rtextents; /* * Global count of allocation btree blocks in use across all AGs. Only * used when perag reservation is enabled. Helps prevent block * reservation from attempting to reserve allocation btree blocks. */ atomic64_t m_allocbt_blks; struct xfs_groups m_groups[XG_TYPE_MAX]; uint64_t m_resblks; /* total reserved blocks */ uint64_t m_resblks_avail;/* available reserved blocks */ uint64_t m_resblks_save; /* reserved blks @ remount,ro */ struct delayed_work m_reclaim_work; /* background inode reclaim */ struct dentry *m_debugfs; /* debugfs parent */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; struct xfs_kobj m_error_meta_kobj; struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX]; struct xstats m_stats; /* per-fs stats */ #ifdef CONFIG_XFS_ONLINE_SCRUB_STATS struct xchk_stats *m_scrub_stats; #endif xfs_agnumber_t m_agfrotor; /* last ag where space found */ atomic_t m_agirotor; /* last ag dir inode alloced */ atomic_t m_rtgrotor; /* last rtgroup rtpicked */ /* Memory shrinker to throttle and reprioritize inodegc */ struct shrinker *m_inodegc_shrinker; /* * Workqueue item so that we can coalesce multiple inode flush attempts * into a single flush. */ struct work_struct m_flush_inodes_work; /* * Generation of the filesysyem layout. This is incremented by each * growfs, and used by the pNFS server to ensure the client updates * its view of the block device once it gets a layout that might * reference the newly added blocks. Does not need to be persistent * as long as we only allow file system size increments, but if we * ever support shrinks it would have to be persisted in addition * to various other kinds of pain inflicted on the pNFS server. */ uint32_t m_generation; struct mutex m_growlock; /* growfs mutex */ #ifdef DEBUG /* * Frequency with which errors are injected. Replaces xfs_etest; the * value stored in here is the inverse of the frequency with which the * error triggers. 1 = always, 2 = half the time, etc. */ unsigned int *m_errortag; struct xfs_kobj m_errortag_kobj; #endif /* cpus that have inodes queued for inactivation */ struct cpumask m_inodegc_cpumask; /* Hook to feed dirent updates to an active online repair. */ struct xfs_hooks m_dir_update_hooks; } xfs_mount_t; #define M_IGEO(mp) (&(mp)->m_ino_geo) /* * Flags for m_features. * * These are all the active features in the filesystem, regardless of how * they are configured. */ #define XFS_FEAT_ATTR (1ULL << 0) /* xattrs present in fs */ #define XFS_FEAT_NLINK (1ULL << 1) /* 32 bit link counts */ #define XFS_FEAT_QUOTA (1ULL << 2) /* quota active */ #define XFS_FEAT_ALIGN (1ULL << 3) /* inode alignment */ #define XFS_FEAT_DALIGN (1ULL << 4) /* data alignment */ #define XFS_FEAT_LOGV2 (1ULL << 5) /* version 2 logs */ #define XFS_FEAT_SECTOR (1ULL << 6) /* sector size > 512 bytes */ #define XFS_FEAT_EXTFLG (1ULL << 7) /* unwritten extents */ #define XFS_FEAT_ASCIICI (1ULL << 8) /* ASCII only case-insens. */ #define XFS_FEAT_LAZYSBCOUNT (1ULL << 9) /* Superblk counters */ #define XFS_FEAT_ATTR2 (1ULL << 10) /* dynamic attr fork */ #define XFS_FEAT_PARENT (1ULL << 11) /* parent pointers */ #define XFS_FEAT_PROJID32 (1ULL << 12) /* 32 bit project id */ #define XFS_FEAT_CRC (1ULL << 13) /* metadata CRCs */ #define XFS_FEAT_V3INODES (1ULL << 14) /* Version 3 inodes */ #define XFS_FEAT_PQUOTINO (1ULL << 15) /* non-shared proj/grp quotas */ #define XFS_FEAT_FTYPE (1ULL << 16) /* inode type in dir */ #define XFS_FEAT_FINOBT (1ULL << 17) /* free inode btree */ #define XFS_FEAT_RMAPBT (1ULL << 18) /* reverse map btree */ #define XFS_FEAT_REFLINK (1ULL << 19) /* reflinked files */ #define XFS_FEAT_SPINODES (1ULL << 20) /* sparse inode chunks */ #define XFS_FEAT_META_UUID (1ULL << 21) /* metadata UUID */ #define XFS_FEAT_REALTIME (1ULL << 22) /* realtime device present */ #define XFS_FEAT_INOBTCNT (1ULL << 23) /* inobt block counts */ #define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */ #define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */ #define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */ #define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */ #define XFS_FEAT_METADIR (1ULL << 28) /* metadata directory tree */ /* Mount features */ #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ #define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */ #define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */ #define XFS_FEAT_LARGE_IOSIZE (1ULL << 51) /* report large preferred * I/O size in stat() */ #define XFS_FEAT_WSYNC (1ULL << 52) /* synchronous metadata ops */ #define XFS_FEAT_DIRSYNC (1ULL << 53) /* synchronous directory ops */ #define XFS_FEAT_DISCARD (1ULL << 54) /* discard unused blocks */ #define XFS_FEAT_GRPID (1ULL << 55) /* group-ID assigned from directory */ #define XFS_FEAT_SMALL_INUMS (1ULL << 56) /* user wants 32bit inodes */ #define XFS_FEAT_IKEEP (1ULL << 57) /* keep empty inode clusters*/ #define XFS_FEAT_SWALLOC (1ULL << 58) /* stripe width allocation */ #define XFS_FEAT_FILESTREAMS (1ULL << 59) /* use filestreams allocator */ #define XFS_FEAT_DAX_ALWAYS (1ULL << 60) /* DAX always enabled */ #define XFS_FEAT_DAX_NEVER (1ULL << 61) /* DAX never enabled */ #define XFS_FEAT_NORECOVERY (1ULL << 62) /* no recovery - dirty fs */ #define XFS_FEAT_NOUUID (1ULL << 63) /* ignore uuid during mount */ #define __XFS_HAS_FEAT(name, NAME) \ static inline bool xfs_has_ ## name (struct xfs_mount *mp) \ { \ return mp->m_features & XFS_FEAT_ ## NAME; \ } /* Some features can be added dynamically so they need a set wrapper, too. */ #define __XFS_ADD_FEAT(name, NAME) \ __XFS_HAS_FEAT(name, NAME); \ static inline void xfs_add_ ## name (struct xfs_mount *mp) \ { \ mp->m_features |= XFS_FEAT_ ## NAME; \ xfs_sb_version_add ## name(&mp->m_sb); \ } /* Superblock features */ __XFS_ADD_FEAT(attr, ATTR) __XFS_HAS_FEAT(nlink, NLINK) __XFS_ADD_FEAT(quota, QUOTA) __XFS_HAS_FEAT(dalign, DALIGN) __XFS_HAS_FEAT(sector, SECTOR) __XFS_HAS_FEAT(asciici, ASCIICI) __XFS_HAS_FEAT(parent, PARENT) __XFS_HAS_FEAT(ftype, FTYPE) __XFS_HAS_FEAT(finobt, FINOBT) __XFS_HAS_FEAT(rmapbt, RMAPBT) __XFS_HAS_FEAT(reflink, REFLINK) __XFS_HAS_FEAT(sparseinodes, SPINODES) __XFS_HAS_FEAT(metauuid, META_UUID) __XFS_HAS_FEAT(realtime, REALTIME) __XFS_HAS_FEAT(inobtcounts, INOBTCNT) __XFS_HAS_FEAT(bigtime, BIGTIME) __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) __XFS_HAS_FEAT(large_extent_counts, NREXT64) __XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE) __XFS_HAS_FEAT(metadir, METADIR) static inline bool xfs_has_rtgroups(struct xfs_mount *mp) { /* all metadir file systems also allow rtgroups */ return xfs_has_metadir(mp); } static inline bool xfs_has_rtsb(struct xfs_mount *mp) { /* all rtgroups filesystems with an rt section have an rtsb */ return xfs_has_rtgroups(mp) && xfs_has_realtime(mp); } /* * Some features are always on for v5 file systems, allow the compiler to * eliminiate dead code when building without v4 support. */ #define __XFS_HAS_V4_FEAT(name, NAME) \ static inline bool xfs_has_ ## name (struct xfs_mount *mp) \ { \ return !IS_ENABLED(CONFIG_XFS_SUPPORT_V4) || \ (mp->m_features & XFS_FEAT_ ## NAME); \ } #define __XFS_ADD_V4_FEAT(name, NAME) \ __XFS_HAS_V4_FEAT(name, NAME); \ static inline void xfs_add_ ## name (struct xfs_mount *mp) \ { \ if (IS_ENABLED(CONFIG_XFS_SUPPORT_V4)) { \ mp->m_features |= XFS_FEAT_ ## NAME; \ xfs_sb_version_add ## name(&mp->m_sb); \ } \ } __XFS_HAS_V4_FEAT(align, ALIGN) __XFS_HAS_V4_FEAT(logv2, LOGV2) __XFS_HAS_V4_FEAT(extflg, EXTFLG) __XFS_HAS_V4_FEAT(lazysbcount, LAZYSBCOUNT) __XFS_ADD_V4_FEAT(attr2, ATTR2) __XFS_ADD_V4_FEAT(projid32, PROJID32) __XFS_HAS_V4_FEAT(v3inodes, V3INODES) __XFS_HAS_V4_FEAT(crc, CRC) __XFS_HAS_V4_FEAT(pquotino, PQUOTINO) /* * Mount features * * These do not change dynamically - features that can come and go, such as 32 * bit inodes and read-only state, are kept as operational state rather than * features. */ __XFS_HAS_FEAT(noattr2, NOATTR2) __XFS_HAS_FEAT(noalign, NOALIGN) __XFS_HAS_FEAT(allocsize, ALLOCSIZE) __XFS_HAS_FEAT(large_iosize, LARGE_IOSIZE) __XFS_HAS_FEAT(wsync, WSYNC) __XFS_HAS_FEAT(dirsync, DIRSYNC) __XFS_HAS_FEAT(discard, DISCARD) __XFS_HAS_FEAT(grpid, GRPID) __XFS_HAS_FEAT(small_inums, SMALL_INUMS) __XFS_HAS_FEAT(ikeep, IKEEP) __XFS_HAS_FEAT(swalloc, SWALLOC) __XFS_HAS_FEAT(filestreams, FILESTREAMS) __XFS_HAS_FEAT(dax_always, DAX_ALWAYS) __XFS_HAS_FEAT(dax_never, DAX_NEVER) __XFS_HAS_FEAT(norecovery, NORECOVERY) __XFS_HAS_FEAT(nouuid, NOUUID) /* * Operational mount state flags * * Use these with atomic bit ops only! */ #define XFS_OPSTATE_UNMOUNTING 0 /* filesystem is unmounting */ #define XFS_OPSTATE_CLEAN 1 /* mount was clean */ #define XFS_OPSTATE_SHUTDOWN 2 /* stop all fs operations */ #define XFS_OPSTATE_INODE32 3 /* inode32 allocator active */ #define XFS_OPSTATE_READONLY 4 /* read-only fs */ /* * If set, inactivation worker threads will be scheduled to process queued * inodegc work. If not, queued inodes remain in memory waiting to be * processed. */ #define XFS_OPSTATE_INODEGC_ENABLED 5 /* * If set, background speculative prealloc gc worker threads will be scheduled * to process queued blockgc work. If not, inodes retain their preallocations * until explicitly deleted. */ #define XFS_OPSTATE_BLOCKGC_ENABLED 6 /* Kernel has logged a warning about pNFS being used on this fs. */ #define XFS_OPSTATE_WARNED_PNFS 7 /* Kernel has logged a warning about online fsck being used on this fs. */ #define XFS_OPSTATE_WARNED_SCRUB 8 /* Kernel has logged a warning about shrink being used on this fs. */ #define XFS_OPSTATE_WARNED_SHRINK 9 /* Kernel has logged a warning about logged xattr updates being used. */ #define XFS_OPSTATE_WARNED_LARP 10 /* Mount time quotacheck is running */ #define XFS_OPSTATE_QUOTACHECK_RUNNING 11 /* Do we want to clear log incompat flags? */ #define XFS_OPSTATE_UNSET_LOG_INCOMPAT 12 /* Filesystem can use logged extended attributes */ #define XFS_OPSTATE_USE_LARP 13 /* Kernel has logged a warning about blocksize > pagesize on this fs. */ #define XFS_OPSTATE_WARNED_LBS 14 /* Kernel has logged a warning about exchange-range being used on this fs. */ #define XFS_OPSTATE_WARNED_EXCHRANGE 15 /* Kernel has logged a warning about parent pointers being used on this fs. */ #define XFS_OPSTATE_WARNED_PPTR 16 /* Kernel has logged a warning about metadata dirs being used on this fs. */ #define XFS_OPSTATE_WARNED_METADIR 17 /* Filesystem should use qflags to determine quotaon status */ #define XFS_OPSTATE_RESUMING_QUOTAON 18 #define __XFS_IS_OPSTATE(name, NAME) \ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ { \ return test_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \ } \ static inline bool xfs_clear_ ## name (struct xfs_mount *mp) \ { \ return test_and_clear_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \ } \ static inline bool xfs_set_ ## name (struct xfs_mount *mp) \ { \ return test_and_set_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \ } __XFS_IS_OPSTATE(unmounting, UNMOUNTING) __XFS_IS_OPSTATE(clean, CLEAN) __XFS_IS_OPSTATE(shutdown, SHUTDOWN) __XFS_IS_OPSTATE(inode32, INODE32) __XFS_IS_OPSTATE(readonly, READONLY) __XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED) __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) #ifdef CONFIG_XFS_QUOTA __XFS_IS_OPSTATE(quotacheck_running, QUOTACHECK_RUNNING) __XFS_IS_OPSTATE(resuming_quotaon, RESUMING_QUOTAON) #else static inline bool xfs_is_quotacheck_running(struct xfs_mount *mp) { return false; } static inline bool xfs_is_resuming_quotaon(struct xfs_mount *mp) { return false; } static inline void xfs_set_resuming_quotaon(struct xfs_mount *m) { } static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp) { return false; } #endif /* CONFIG_XFS_QUOTA */ __XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT) __XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP) static inline bool xfs_should_warn(struct xfs_mount *mp, long nr) { return !test_and_set_bit(nr, &mp->m_opstate); } #define XFS_OPSTATE_STRINGS \ { (1UL << XFS_OPSTATE_UNMOUNTING), "unmounting" }, \ { (1UL << XFS_OPSTATE_CLEAN), "clean" }, \ { (1UL << XFS_OPSTATE_SHUTDOWN), "shutdown" }, \ { (1UL << XFS_OPSTATE_INODE32), "inode32" }, \ { (1UL << XFS_OPSTATE_READONLY), "read_only" }, \ { (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \ { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \ { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \ { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \ { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \ { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" }, \ { (1UL << XFS_OPSTATE_UNSET_LOG_INCOMPAT), "unset_log_incompat" }, \ { (1UL << XFS_OPSTATE_USE_LARP), "logged_xattrs" } /* * Max and min values for mount-option defined I/O * preallocation sizes. */ #define XFS_MAX_IO_LOG 30 /* 1G */ #define XFS_MIN_IO_LOG PAGE_SHIFT void xfs_do_force_shutdown(struct xfs_mount *mp, uint32_t flags, char *fname, int lnnum); #define xfs_force_shutdown(m,f) \ xfs_do_force_shutdown(m, f, __FILE__, __LINE__) #define SHUTDOWN_META_IO_ERROR (1u << 0) /* write attempt to metadata failed */ #define SHUTDOWN_LOG_IO_ERROR (1u << 1) /* write attempt to the log failed */ #define SHUTDOWN_FORCE_UMOUNT (1u << 2) /* shutdown from a forced unmount */ #define SHUTDOWN_CORRUPT_INCORE (1u << 3) /* corrupt in-memory structures */ #define SHUTDOWN_CORRUPT_ONDISK (1u << 4) /* corrupt metadata on device */ #define SHUTDOWN_DEVICE_REMOVED (1u << 5) /* device removed underneath us */ #define XFS_SHUTDOWN_STRINGS \ { SHUTDOWN_META_IO_ERROR, "metadata_io" }, \ { SHUTDOWN_LOG_IO_ERROR, "log_io" }, \ { SHUTDOWN_FORCE_UMOUNT, "force_umount" }, \ { SHUTDOWN_CORRUPT_INCORE, "corruption" }, \ { SHUTDOWN_DEVICE_REMOVED, "device_removed" } /* * Flags for xfs_mountfs */ #define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */ static inline xfs_agnumber_t xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d) { xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d); do_div(ld, mp->m_sb.sb_agblocks); return (xfs_agnumber_t) ld; } static inline xfs_agblock_t xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) { xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d); return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks); } extern void xfs_uuid_table_free(void); extern uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); extern void xfs_unmountfs(xfs_mount_t *); /* * Deltas for the block count can vary from 1 to very large, but lock contention * only occurs on frequent small block count updates such as in the delayed * allocation path for buffered writes (page a time updates). Hence we set * a large batch count (1024) to minimise global counter updates except when * we get near to ENOSPC and we have to be very accurate with our updates. */ #define XFS_FDBLOCKS_BATCH 1024 /* * Estimate the amount of free space that is not available to userspace and is * not explicitly reserved from the incore fdblocks. This includes: * * - The minimum number of blocks needed to support splitting a bmap btree * - The blocks currently in use by the freespace btrees because they record * the actual blocks that will fill per-AG metadata space reservations */ static inline uint64_t xfs_fdblocks_unavailable( struct xfs_mount *mp) { return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); } int xfs_dec_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, uint64_t delta, bool rsvd); void xfs_add_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, uint64_t delta); static inline int xfs_dec_fdblocks(struct xfs_mount *mp, uint64_t delta, bool reserved) { return xfs_dec_freecounter(mp, &mp->m_fdblocks, delta, reserved); } static inline void xfs_add_fdblocks(struct xfs_mount *mp, uint64_t delta) { xfs_add_freecounter(mp, &mp->m_fdblocks, delta); } static inline int xfs_dec_frextents(struct xfs_mount *mp, uint64_t delta) { return xfs_dec_freecounter(mp, &mp->m_frextents, delta, false); } static inline void xfs_add_frextents(struct xfs_mount *mp, uint64_t delta) { xfs_add_freecounter(mp, &mp->m_frextents, delta); } extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); extern bool xfs_fs_writable(struct xfs_mount *mp, int level); extern int xfs_sb_validate_fsb_count(struct xfs_sb *, uint64_t); extern int xfs_dev_is_read_only(struct xfs_mount *, char *); extern void xfs_set_low_space_thresholds(struct xfs_mount *); int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb, xfs_off_t count_fsb); struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp, int error_class, int error); void xfs_force_summary_recalc(struct xfs_mount *mp); int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature); bool xfs_clear_incompat_log_features(struct xfs_mount *mp); void xfs_mod_delalloc(struct xfs_inode *ip, int64_t data_delta, int64_t ind_delta); #endif /* __XFS_MOUNT_H__ */
5 2 16 16 16 15 29 29 16 11 16 13 29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/power/swap.c * * This file provides functions for reading the suspend image from * and writing it to a swap partition. * * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com> */ #define pr_fmt(fmt) "PM: " fmt #include <linux/module.h> #include <linux/file.h> #include <linux/delay.h> #include <linux/bitops.h> #include <linux/device.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/pm.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/cpumask.h> #include <linux/atomic.h> #include <linux/kthread.h> #include <linux/crc32.h> #include <linux/ktime.h> #include "power.h" #define HIBERNATE_SIG "S1SUSPEND" u32 swsusp_hardware_signature; /* * When reading an {un,}compressed image, we may restore pages in place, * in which case some architectures need these pages cleaning before they * can be executed. We don't know which pages these may be, so clean the lot. */ static bool clean_pages_on_read; static bool clean_pages_on_decompress; /* * The swap map is a data structure used for keeping track of each page * written to a swap partition. It consists of many swap_map_page * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. * These structures are stored on the swap and linked together with the * help of the .next_swap member. * * The swap map is created during suspend. The swap map pages are * allocated and populated one at a time, so we only need one memory * page to set up the entire structure. * * During resume we pick up all swap_map_page structures into a list. */ #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) /* * Number of free pages that are not high. */ static inline unsigned long low_free_pages(void) { return nr_free_pages() - nr_free_highpages(); } /* * Number of pages required to be kept free while writing the image. Always * half of all available low pages before the writing starts. */ static inline unsigned long reqd_free_pages(void) { return low_free_pages() / 2; } struct swap_map_page { sector_t entries[MAP_PAGE_ENTRIES]; sector_t next_swap; }; struct swap_map_page_list { struct swap_map_page *map; struct swap_map_page_list *next; }; /* * The swap_map_handle structure is used for handling swap in * a file-alike way */ struct swap_map_handle { struct swap_map_page *cur; struct swap_map_page_list *maps; sector_t cur_swap; sector_t first_sector; unsigned int k; unsigned long reqd_free_pages; u32 crc32; }; struct swsusp_header { char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) - sizeof(u32) - sizeof(u32)]; u32 hw_sig; u32 crc32; sector_t image; unsigned int flags; /* Flags to pass to the "boot" kernel */ char orig_sig[10]; char sig[10]; } __packed; static struct swsusp_header *swsusp_header; /* * The following functions are used for tracing the allocated * swap pages, so that they can be freed in case of an error. */ struct swsusp_extent { struct rb_node node; unsigned long start; unsigned long end; }; static struct rb_root swsusp_extents = RB_ROOT; static int swsusp_extents_insert(unsigned long swap_offset) { struct rb_node **new = &(swsusp_extents.rb_node); struct rb_node *parent = NULL; struct swsusp_extent *ext; /* Figure out where to put the new node */ while (*new) { ext = rb_entry(*new, struct swsusp_extent, node); parent = *new; if (swap_offset < ext->start) { /* Try to merge */ if (swap_offset == ext->start - 1) { ext->start--; return 0; } new = &((*new)->rb_left); } else if (swap_offset > ext->end) { /* Try to merge */ if (swap_offset == ext->end + 1) { ext->end++; return 0; } new = &((*new)->rb_right); } else { /* It already is in the tree */ return -EINVAL; } } /* Add the new node and rebalance the tree. */ ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL); if (!ext) return -ENOMEM; ext->start = swap_offset; ext->end = swap_offset; rb_link_node(&ext->node, parent, new); rb_insert_color(&ext->node, &swsusp_extents); return 0; } /* * alloc_swapdev_block - allocate a swap page and register that it has * been allocated, so that it can be freed in case of an error. */ sector_t alloc_swapdev_block(int swap) { unsigned long offset; offset = swp_offset(get_swap_page_of_type(swap)); if (offset) { if (swsusp_extents_insert(offset)) swap_free(swp_entry(swap, offset)); else return swapdev_block(swap, offset); } return 0; } /* * free_all_swap_pages - free swap pages allocated for saving image data. * It also frees the extents used to register which swap entries had been * allocated. */ void free_all_swap_pages(int swap) { struct rb_node *node; while ((node = swsusp_extents.rb_node)) { struct swsusp_extent *ext; ext = rb_entry(node, struct swsusp_extent, node); rb_erase(node, &swsusp_extents); swap_free_nr(swp_entry(swap, ext->start), ext->end - ext->start + 1); kfree(ext); } } int swsusp_swap_in_use(void) { return (swsusp_extents.rb_node != NULL); } /* * General things */ static unsigned short root_swap = 0xffff; static struct file *hib_resume_bdev_file; struct hib_bio_batch { atomic_t count; wait_queue_head_t wait; blk_status_t error; struct blk_plug plug; }; static void hib_init_batch(struct hib_bio_batch *hb) { atomic_set(&hb->count, 0); init_waitqueue_head(&hb->wait); hb->error = BLK_STS_OK; blk_start_plug(&hb->plug); } static void hib_finish_batch(struct hib_bio_batch *hb) { blk_finish_plug(&hb->plug); } static void hib_end_io(struct bio *bio) { struct hib_bio_batch *hb = bio->bi_private; struct page *page = bio_first_page_all(bio); if (bio->bi_status) { pr_alert("Read-error on swap-device (%u:%u:%Lu)\n", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); } if (bio_data_dir(bio) == WRITE) put_page(page); else if (clean_pages_on_read) flush_icache_range((unsigned long)page_address(page), (unsigned long)page_address(page) + PAGE_SIZE); if (bio->bi_status && !hb->error) hb->error = bio->bi_status; if (atomic_dec_and_test(&hb->count)) wake_up(&hb->wait); bio_put(bio); } static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr, struct hib_bio_batch *hb) { struct page *page = virt_to_page(addr); struct bio *bio; int error = 0; bio = bio_alloc(file_bdev(hib_resume_bdev_file), 1, opf, GFP_NOIO | __GFP_HIGH); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { pr_err("Adding page to bio failed at %llu\n", (unsigned long long)bio->bi_iter.bi_sector); bio_put(bio); return -EFAULT; } if (hb) { bio->bi_end_io = hib_end_io; bio->bi_private = hb; atomic_inc(&hb->count); submit_bio(bio); } else { error = submit_bio_wait(bio); bio_put(bio); } return error; } static int hib_wait_io(struct hib_bio_batch *hb) { /* * We are relying on the behavior of blk_plug that a thread with * a plug will flush the plug list before sleeping. */ wait_event(hb->wait, atomic_read(&hb->count) == 0); return blk_status_to_errno(hb->error); } /* * Saving part */ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); swsusp_header->image = handle->first_sector; if (swsusp_hardware_signature) { swsusp_header->hw_sig = swsusp_hardware_signature; flags |= SF_HW_SIG; } swsusp_header->flags = flags; if (flags & SF_CRC32_MODE) swsusp_header->crc32 = handle->crc32; error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { pr_err("Swap header not found!\n"); error = -ENODEV; } return error; } /* * Hold the swsusp_header flag. This is used in software_resume() in * 'kernel/power/hibernate' to check if the image is compressed and query * for the compression algorithm support(if so). */ unsigned int swsusp_header_flags; /** * swsusp_swap_check - check if the resume device is a swap device * and get its index (if so) * * This is called before saving image */ static int swsusp_swap_check(void) { int res; if (swsusp_resume_device) res = swap_type_of(swsusp_resume_device, swsusp_resume_block); else res = find_first_swap(&swsusp_resume_device); if (res < 0) return res; root_swap = res; hib_resume_bdev_file = bdev_file_open_by_dev(swsusp_resume_device, BLK_OPEN_WRITE, NULL, NULL); if (IS_ERR(hib_resume_bdev_file)) return PTR_ERR(hib_resume_bdev_file); return 0; } /** * write_page - Write one page to given swap location. * @buf: Address we're writing. * @offset: Offset of the swap page we're writing to. * @hb: bio completion batch */ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) { void *src; int ret; if (!offset) return -ENOSPC; if (hb) { src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (src) { copy_page(src, buf); } else { ret = hib_wait_io(hb); /* Free pages */ if (ret) return ret; src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (src) { copy_page(src, buf); } else { WARN_ON_ONCE(1); hb = NULL; /* Go synchronous */ src = buf; } } } else { src = buf; } return hib_submit_io(REQ_OP_WRITE | REQ_SYNC, offset, src, hb); } static void release_swap_writer(struct swap_map_handle *handle) { if (handle->cur) free_page((unsigned long)handle->cur); handle->cur = NULL; } static int get_swap_writer(struct swap_map_handle *handle) { int ret; ret = swsusp_swap_check(); if (ret) { if (ret != -ENOSPC) pr_err("Cannot find swap device, try swapon -a\n"); return ret; } handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); if (!handle->cur) { ret = -ENOMEM; goto err_close; } handle->cur_swap = alloc_swapdev_block(root_swap); if (!handle->cur_swap) { ret = -ENOSPC; goto err_rel; } handle->k = 0; handle->reqd_free_pages = reqd_free_pages(); handle->first_sector = handle->cur_swap; return 0; err_rel: release_swap_writer(handle); err_close: swsusp_close(); return ret; } static int swap_write_page(struct swap_map_handle *handle, void *buf, struct hib_bio_batch *hb) { int error; sector_t offset; if (!handle->cur) return -EINVAL; offset = alloc_swapdev_block(root_swap); error = write_page(buf, offset, hb); if (error) return error; handle->cur->entries[handle->k++] = offset; if (handle->k >= MAP_PAGE_ENTRIES) { offset = alloc_swapdev_block(root_swap); if (!offset) return -ENOSPC; handle->cur->next_swap = offset; error = write_page(handle->cur, handle->cur_swap, hb); if (error) goto out; clear_page(handle->cur); handle->cur_swap = offset; handle->k = 0; if (hb && low_free_pages() <= handle->reqd_free_pages) { error = hib_wait_io(hb); if (error) goto out; /* * Recalculate the number of required free pages, to * make sure we never take more than half. */ handle->reqd_free_pages = reqd_free_pages(); } } out: return error; } static int flush_swap_writer(struct swap_map_handle *handle) { if (handle->cur && handle->cur_swap) return write_page(handle->cur, handle->cur_swap, NULL); else return -EINVAL; } static int swap_writer_finish(struct swap_map_handle *handle, unsigned int flags, int error) { if (!error) { pr_info("S"); error = mark_swapfiles(handle, flags); pr_cont("|\n"); flush_swap_writer(handle); } if (error) free_all_swap_pages(root_swap); release_swap_writer(handle); swsusp_close(); return error; } /* * Bytes we need for compressed data in worst case. We assume(limitation) * this is the worst of all the compression algorithms. */ #define bytes_worst_compress(x) ((x) + ((x) / 16) + 64 + 3 + 2) /* We need to remember how much compressed data we need to read. */ #define CMP_HEADER sizeof(size_t) /* Number of pages/bytes we'll compress at one time. */ #define UNC_PAGES 32 #define UNC_SIZE (UNC_PAGES * PAGE_SIZE) /* Number of pages we need for compressed data (worst case). */ #define CMP_PAGES DIV_ROUND_UP(bytes_worst_compress(UNC_SIZE) + \ CMP_HEADER, PAGE_SIZE) #define CMP_SIZE (CMP_PAGES * PAGE_SIZE) /* Maximum number of threads for compression/decompression. */ #define CMP_THREADS 3 /* Minimum/maximum number of pages for read buffering. */ #define CMP_MIN_RD_PAGES 1024 #define CMP_MAX_RD_PAGES 8192 /** * save_image - save the suspend image data */ static int save_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_write) { unsigned int m; int ret; int nr_pages; int err2; struct hib_bio_batch hb; ktime_t start; ktime_t stop; hib_init_batch(&hb); pr_info("Saving image data pages (%u pages)...\n", nr_to_write); m = nr_to_write / 10; if (!m) m = 1; nr_pages = 0; start = ktime_get(); while (1) { ret = snapshot_read_next(snapshot); if (ret <= 0) break; ret = swap_write_page(handle, data_of(*snapshot), &hb); if (ret) break; if (!(nr_pages % m)) pr_info("Image saving progress: %3d%%\n", nr_pages / m * 10); nr_pages++; } err2 = hib_wait_io(&hb); hib_finish_batch(&hb); stop = ktime_get(); if (!ret) ret = err2; if (!ret) pr_info("Image saving done\n"); swsusp_show_speed(start, stop, nr_to_write, "Wrote"); return ret; } /* * Structure used for CRC32. */ struct crc_data { struct task_struct *thr; /* thread */ atomic_t ready; /* ready to start flag */ atomic_t stop; /* ready to stop flag */ unsigned run_threads; /* nr current threads */ wait_queue_head_t go; /* start crc update */ wait_queue_head_t done; /* crc update done */ u32 *crc32; /* points to handle's crc32 */ size_t *unc_len[CMP_THREADS]; /* uncompressed lengths */ unsigned char *unc[CMP_THREADS]; /* uncompressed data */ }; /* * CRC32 update function that runs in its own thread. */ static int crc32_threadfn(void *data) { struct crc_data *d = data; unsigned i; while (1) { wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } atomic_set(&d->ready, 0); for (i = 0; i < d->run_threads; i++) *d->crc32 = crc32_le(*d->crc32, d->unc[i], *d->unc_len[i]); atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; } /* * Structure used for data compression. */ struct cmp_data { struct task_struct *thr; /* thread */ struct crypto_comp *cc; /* crypto compressor stream */ atomic_t ready; /* ready to start flag */ atomic_t stop; /* ready to stop flag */ int ret; /* return code */ wait_queue_head_t go; /* start compression */ wait_queue_head_t done; /* compression done */ size_t unc_len; /* uncompressed length */ size_t cmp_len; /* compressed length */ unsigned char unc[UNC_SIZE]; /* uncompressed buffer */ unsigned char cmp[CMP_SIZE]; /* compressed buffer */ }; /* Indicates the image size after compression */ static atomic_t compressed_size = ATOMIC_INIT(0); /* * Compression function that runs in its own thread. */ static int compress_threadfn(void *data) { struct cmp_data *d = data; unsigned int cmp_len = 0; while (1) { wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; d->ret = -1; atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } atomic_set(&d->ready, 0); cmp_len = CMP_SIZE - CMP_HEADER; d->ret = crypto_comp_compress(d->cc, d->unc, d->unc_len, d->cmp + CMP_HEADER, &cmp_len); d->cmp_len = cmp_len; atomic_set(&compressed_size, atomic_read(&compressed_size) + d->cmp_len); atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; } /** * save_compressed_image - Save the suspend image data after compression. * @handle: Swap map handle to use for saving the image. * @snapshot: Image to read data from. * @nr_to_write: Number of pages to save. */ static int save_compressed_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_write) { unsigned int m; int ret = 0; int nr_pages; int err2; struct hib_bio_batch hb; ktime_t start; ktime_t stop; size_t off; unsigned thr, run_threads, nr_threads; unsigned char *page = NULL; struct cmp_data *data = NULL; struct crc_data *crc = NULL; hib_init_batch(&hb); atomic_set(&compressed_size, 0); /* * We'll limit the number of threads for compression to limit memory * footprint. */ nr_threads = num_online_cpus() - 1; nr_threads = clamp_val(nr_threads, 1, CMP_THREADS); page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH); if (!page) { pr_err("Failed to allocate %s page\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } data = vzalloc(array_size(nr_threads, sizeof(*data))); if (!data) { pr_err("Failed to allocate %s data\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } crc = kzalloc(sizeof(*crc), GFP_KERNEL); if (!crc) { pr_err("Failed to allocate crc\n"); ret = -ENOMEM; goto out_clean; } /* * Start the compression threads. */ for (thr = 0; thr < nr_threads; thr++) { init_waitqueue_head(&data[thr].go); init_waitqueue_head(&data[thr].done); data[thr].cc = crypto_alloc_comp(hib_comp_algo, 0, 0); if (IS_ERR_OR_NULL(data[thr].cc)) { pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc)); ret = -EFAULT; goto out_clean; } data[thr].thr = kthread_run(compress_threadfn, &data[thr], "image_compress/%u", thr); if (IS_ERR(data[thr].thr)) { data[thr].thr = NULL; pr_err("Cannot start compression threads\n"); ret = -ENOMEM; goto out_clean; } } /* * Start the CRC32 thread. */ init_waitqueue_head(&crc->go); init_waitqueue_head(&crc->done); handle->crc32 = 0; crc->crc32 = &handle->crc32; for (thr = 0; thr < nr_threads; thr++) { crc->unc[thr] = data[thr].unc; crc->unc_len[thr] = &data[thr].unc_len; } crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); if (IS_ERR(crc->thr)) { crc->thr = NULL; pr_err("Cannot start CRC32 thread\n"); ret = -ENOMEM; goto out_clean; } /* * Adjust the number of required free pages after all allocations have * been done. We don't want to run out of pages when writing. */ handle->reqd_free_pages = reqd_free_pages(); pr_info("Using %u thread(s) for %s compression\n", nr_threads, hib_comp_algo); pr_info("Compressing and saving image data (%u pages)...\n", nr_to_write); m = nr_to_write / 10; if (!m) m = 1; nr_pages = 0; start = ktime_get(); for (;;) { for (thr = 0; thr < nr_threads; thr++) { for (off = 0; off < UNC_SIZE; off += PAGE_SIZE) { ret = snapshot_read_next(snapshot); if (ret < 0) goto out_finish; if (!ret) break; memcpy(data[thr].unc + off, data_of(*snapshot), PAGE_SIZE); if (!(nr_pages % m)) pr_info("Image saving progress: %3d%%\n", nr_pages / m * 10); nr_pages++; } if (!off) break; data[thr].unc_len = off; atomic_set_release(&data[thr].ready, 1); wake_up(&data[thr].go); } if (!thr) break; crc->run_threads = thr; atomic_set_release(&crc->ready, 1); wake_up(&crc->go); for (run_threads = thr, thr = 0; thr < run_threads; thr++) { wait_event(data[thr].done, atomic_read_acquire(&data[thr].stop)); atomic_set(&data[thr].stop, 0); ret = data[thr].ret; if (ret < 0) { pr_err("%s compression failed\n", hib_comp_algo); goto out_finish; } if (unlikely(!data[thr].cmp_len || data[thr].cmp_len > bytes_worst_compress(data[thr].unc_len))) { pr_err("Invalid %s compressed length\n", hib_comp_algo); ret = -1; goto out_finish; } *(size_t *)data[thr].cmp = data[thr].cmp_len; /* * Given we are writing one page at a time to disk, we * copy that much from the buffer, although the last * bit will likely be smaller than full page. This is * OK - we saved the length of the compressed data, so * any garbage at the end will be discarded when we * read it. */ for (off = 0; off < CMP_HEADER + data[thr].cmp_len; off += PAGE_SIZE) { memcpy(page, data[thr].cmp + off, PAGE_SIZE); ret = swap_write_page(handle, page, &hb); if (ret) goto out_finish; } } wait_event(crc->done, atomic_read_acquire(&crc->stop)); atomic_set(&crc->stop, 0); } out_finish: err2 = hib_wait_io(&hb); stop = ktime_get(); if (!ret) ret = err2; if (!ret) pr_info("Image saving done\n"); swsusp_show_speed(start, stop, nr_to_write, "Wrote"); pr_info("Image size after compression: %d kbytes\n", (atomic_read(&compressed_size) / 1024)); out_clean: hib_finish_batch(&hb); if (crc) { if (crc->thr) kthread_stop(crc->thr); kfree(crc); } if (data) { for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) kthread_stop(data[thr].thr); if (data[thr].cc) crypto_free_comp(data[thr].cc); } vfree(data); } if (page) free_page((unsigned long)page); return ret; } /** * enough_swap - Make sure we have enough swap to save the image. * * Returns TRUE or FALSE after checking the total amount of swap * space available from the resume partition. */ static int enough_swap(unsigned int nr_pages) { unsigned int free_swap = count_swap_pages(root_swap, 1); unsigned int required; pr_debug("Free swap pages: %u\n", free_swap); required = PAGES_FOR_IO + nr_pages; return free_swap > required; } /** * swsusp_write - Write entire image and metadata. * @flags: flags to pass to the "boot" kernel in the image header * * It is important _NOT_ to umount filesystems at this point. We want * them synced (in case something goes wrong) but we DO not want to mark * filesystem clean: it is not. (And it does not matter, if we resume * correctly, we'll mark system clean, anyway.) */ int swsusp_write(unsigned int flags) { struct swap_map_handle handle; struct snapshot_handle snapshot; struct swsusp_info *header; unsigned long pages; int error; pages = snapshot_get_image_size(); error = get_swap_writer(&handle); if (error) { pr_err("Cannot get swap writer\n"); return error; } if (flags & SF_NOCOMPRESS_MODE) { if (!enough_swap(pages)) { pr_err("Not enough free swap\n"); error = -ENOSPC; goto out_finish; } } memset(&snapshot, 0, sizeof(struct snapshot_handle)); error = snapshot_read_next(&snapshot); if (error < (int)PAGE_SIZE) { if (error >= 0) error = -EFAULT; goto out_finish; } header = (struct swsusp_info *)data_of(snapshot); error = swap_write_page(&handle, header, NULL); if (!error) { error = (flags & SF_NOCOMPRESS_MODE) ? save_image(&handle, &snapshot, pages - 1) : save_compressed_image(&handle, &snapshot, pages - 1); } out_finish: error = swap_writer_finish(&handle, flags, error); return error; } /* * The following functions allow us to read data using a swap map * in a file-like way. */ static void release_swap_reader(struct swap_map_handle *handle) { struct swap_map_page_list *tmp; while (handle->maps) { if (handle->maps->map) free_page((unsigned long)handle->maps->map); tmp = handle->maps; handle->maps = handle->maps->next; kfree(tmp); } handle->cur = NULL; } static int get_swap_reader(struct swap_map_handle *handle, unsigned int *flags_p) { int error; struct swap_map_page_list *tmp, *last; sector_t offset; *flags_p = swsusp_header->flags; if (!swsusp_header->image) /* how can this happen? */ return -EINVAL; handle->cur = NULL; last = handle->maps = NULL; offset = swsusp_header->image; while (offset) { tmp = kzalloc(sizeof(*handle->maps), GFP_KERNEL); if (!tmp) { release_swap_reader(handle); return -ENOMEM; } if (!handle->maps) handle->maps = tmp; if (last) last->next = tmp; last = tmp; tmp->map = (struct swap_map_page *) __get_free_page(GFP_NOIO | __GFP_HIGH); if (!tmp->map) { release_swap_reader(handle); return -ENOMEM; } error = hib_submit_io(REQ_OP_READ, offset, tmp->map, NULL); if (error) { release_swap_reader(handle); return error; } offset = tmp->map->next_swap; } handle->k = 0; handle->cur = handle->maps->map; return 0; } static int swap_read_page(struct swap_map_handle *handle, void *buf, struct hib_bio_batch *hb) { sector_t offset; int error; struct swap_map_page_list *tmp; if (!handle->cur) return -EINVAL; offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; error = hib_submit_io(REQ_OP_READ, offset, buf, hb); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { handle->k = 0; free_page((unsigned long)handle->maps->map); tmp = handle->maps; handle->maps = handle->maps->next; kfree(tmp); if (!handle->maps) release_swap_reader(handle); else handle->cur = handle->maps->map; } return error; } static int swap_reader_finish(struct swap_map_handle *handle) { release_swap_reader(handle); return 0; } /** * load_image - load the image using the swap map handle * @handle and the snapshot handle @snapshot * (assume there are @nr_pages pages to load) */ static int load_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_read) { unsigned int m; int ret = 0; ktime_t start; ktime_t stop; struct hib_bio_batch hb; int err2; unsigned nr_pages; hib_init_batch(&hb); clean_pages_on_read = true; pr_info("Loading image data pages (%u pages)...\n", nr_to_read); m = nr_to_read / 10; if (!m) m = 1; nr_pages = 0; start = ktime_get(); for ( ; ; ) { ret = snapshot_write_next(snapshot); if (ret <= 0) break; ret = swap_read_page(handle, data_of(*snapshot), &hb); if (ret) break; if (snapshot->sync_read) ret = hib_wait_io(&hb); if (ret) break; if (!(nr_pages % m)) pr_info("Image loading progress: %3d%%\n", nr_pages / m * 10); nr_pages++; } err2 = hib_wait_io(&hb); hib_finish_batch(&hb); stop = ktime_get(); if (!ret) ret = err2; if (!ret) { pr_info("Image loading done\n"); ret = snapshot_write_finalize(snapshot); if (!ret && !snapshot_image_loaded(snapshot)) ret = -ENODATA; } swsusp_show_speed(start, stop, nr_to_read, "Read"); return ret; } /* * Structure used for data decompression. */ struct dec_data { struct task_struct *thr; /* thread */ struct crypto_comp *cc; /* crypto compressor stream */ atomic_t ready; /* ready to start flag */ atomic_t stop; /* ready to stop flag */ int ret; /* return code */ wait_queue_head_t go; /* start decompression */ wait_queue_head_t done; /* decompression done */ size_t unc_len; /* uncompressed length */ size_t cmp_len; /* compressed length */ unsigned char unc[UNC_SIZE]; /* uncompressed buffer */ unsigned char cmp[CMP_SIZE]; /* compressed buffer */ }; /* * Decompression function that runs in its own thread. */ static int decompress_threadfn(void *data) { struct dec_data *d = data; unsigned int unc_len = 0; while (1) { wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; d->ret = -1; atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } atomic_set(&d->ready, 0); unc_len = UNC_SIZE; d->ret = crypto_comp_decompress(d->cc, d->cmp + CMP_HEADER, d->cmp_len, d->unc, &unc_len); d->unc_len = unc_len; if (clean_pages_on_decompress) flush_icache_range((unsigned long)d->unc, (unsigned long)d->unc + d->unc_len); atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; } /** * load_compressed_image - Load compressed image data and decompress it. * @handle: Swap map handle to use for loading data. * @snapshot: Image to copy uncompressed data into. * @nr_to_read: Number of pages to load. */ static int load_compressed_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_read) { unsigned int m; int ret = 0; int eof = 0; struct hib_bio_batch hb; ktime_t start; ktime_t stop; unsigned nr_pages; size_t off; unsigned i, thr, run_threads, nr_threads; unsigned ring = 0, pg = 0, ring_size = 0, have = 0, want, need, asked = 0; unsigned long read_pages = 0; unsigned char **page = NULL; struct dec_data *data = NULL; struct crc_data *crc = NULL; hib_init_batch(&hb); /* * We'll limit the number of threads for decompression to limit memory * footprint. */ nr_threads = num_online_cpus() - 1; nr_threads = clamp_val(nr_threads, 1, CMP_THREADS); page = vmalloc(array_size(CMP_MAX_RD_PAGES, sizeof(*page))); if (!page) { pr_err("Failed to allocate %s page\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } data = vzalloc(array_size(nr_threads, sizeof(*data))); if (!data) { pr_err("Failed to allocate %s data\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } crc = kzalloc(sizeof(*crc), GFP_KERNEL); if (!crc) { pr_err("Failed to allocate crc\n"); ret = -ENOMEM; goto out_clean; } clean_pages_on_decompress = true; /* * Start the decompression threads. */ for (thr = 0; thr < nr_threads; thr++) { init_waitqueue_head(&data[thr].go); init_waitqueue_head(&data[thr].done); data[thr].cc = crypto_alloc_comp(hib_comp_algo, 0, 0); if (IS_ERR_OR_NULL(data[thr].cc)) { pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc)); ret = -EFAULT; goto out_clean; } data[thr].thr = kthread_run(decompress_threadfn, &data[thr], "image_decompress/%u", thr); if (IS_ERR(data[thr].thr)) { data[thr].thr = NULL; pr_err("Cannot start decompression threads\n"); ret = -ENOMEM; goto out_clean; } } /* * Start the CRC32 thread. */ init_waitqueue_head(&crc->go); init_waitqueue_head(&crc->done); handle->crc32 = 0; crc->crc32 = &handle->crc32; for (thr = 0; thr < nr_threads; thr++) { crc->unc[thr] = data[thr].unc; crc->unc_len[thr] = &data[thr].unc_len; } crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); if (IS_ERR(crc->thr)) { crc->thr = NULL; pr_err("Cannot start CRC32 thread\n"); ret = -ENOMEM; goto out_clean; } /* * Set the number of pages for read buffering. * This is complete guesswork, because we'll only know the real * picture once prepare_image() is called, which is much later on * during the image load phase. We'll assume the worst case and * say that none of the image pages are from high memory. */ if (low_free_pages() > snapshot_get_image_size()) read_pages = (low_free_pages() - snapshot_get_image_size()) / 2; read_pages = clamp_val(read_pages, CMP_MIN_RD_PAGES, CMP_MAX_RD_PAGES); for (i = 0; i < read_pages; i++) { page[i] = (void *)__get_free_page(i < CMP_PAGES ? GFP_NOIO | __GFP_HIGH : GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (!page[i]) { if (i < CMP_PAGES) { ring_size = i; pr_err("Failed to allocate %s pages\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } else { break; } } } want = ring_size = i; pr_info("Using %u thread(s) for %s decompression\n", nr_threads, hib_comp_algo); pr_info("Loading and decompressing image data (%u pages)...\n", nr_to_read); m = nr_to_read / 10; if (!m) m = 1; nr_pages = 0; start = ktime_get(); ret = snapshot_write_next(snapshot); if (ret <= 0) goto out_finish; for(;;) { for (i = 0; !eof && i < want; i++) { ret = swap_read_page(handle, page[ring], &hb); if (ret) { /* * On real read error, finish. On end of data, * set EOF flag and just exit the read loop. */ if (handle->cur && handle->cur->entries[handle->k]) { goto out_finish; } else { eof = 1; break; } } if (++ring >= ring_size) ring = 0; } asked += i; want -= i; /* * We are out of data, wait for some more. */ if (!have) { if (!asked) break; ret = hib_wait_io(&hb); if (ret) goto out_finish; have += asked; asked = 0; if (eof) eof = 2; } if (crc->run_threads) { wait_event(crc->done, atomic_read_acquire(&crc->stop)); atomic_set(&crc->stop, 0); crc->run_threads = 0; } for (thr = 0; have && thr < nr_threads; thr++) { data[thr].cmp_len = *(size_t *)page[pg]; if (unlikely(!data[thr].cmp_len || data[thr].cmp_len > bytes_worst_compress(UNC_SIZE))) { pr_err("Invalid %s compressed length\n", hib_comp_algo); ret = -1; goto out_finish; } need = DIV_ROUND_UP(data[thr].cmp_len + CMP_HEADER, PAGE_SIZE); if (need > have) { if (eof > 1) { ret = -1; goto out_finish; } break; } for (off = 0; off < CMP_HEADER + data[thr].cmp_len; off += PAGE_SIZE) { memcpy(data[thr].cmp + off, page[pg], PAGE_SIZE); have--; want++; if (++pg >= ring_size) pg = 0; } atomic_set_release(&data[thr].ready, 1); wake_up(&data[thr].go); } /* * Wait for more data while we are decompressing. */ if (have < CMP_PAGES && asked) { ret = hib_wait_io(&hb); if (ret) goto out_finish; have += asked; asked = 0; if (eof) eof = 2; } for (run_threads = thr, thr = 0; thr < run_threads; thr++) { wait_event(data[thr].done, atomic_read_acquire(&data[thr].stop)); atomic_set(&data[thr].stop, 0); ret = data[thr].ret; if (ret < 0) { pr_err("%s decompression failed\n", hib_comp_algo); goto out_finish; } if (unlikely(!data[thr].unc_len || data[thr].unc_len > UNC_SIZE || data[thr].unc_len & (PAGE_SIZE - 1))) { pr_err("Invalid %s uncompressed length\n", hib_comp_algo); ret = -1; goto out_finish; } for (off = 0; off < data[thr].unc_len; off += PAGE_SIZE) { memcpy(data_of(*snapshot), data[thr].unc + off, PAGE_SIZE); if (!(nr_pages % m)) pr_info("Image loading progress: %3d%%\n", nr_pages / m * 10); nr_pages++; ret = snapshot_write_next(snapshot); if (ret <= 0) { crc->run_threads = thr + 1; atomic_set_release(&crc->ready, 1); wake_up(&crc->go); goto out_finish; } } } crc->run_threads = thr; atomic_set_release(&crc->ready, 1); wake_up(&crc->go); } out_finish: if (crc->run_threads) { wait_event(crc->done, atomic_read_acquire(&crc->stop)); atomic_set(&crc->stop, 0); } stop = ktime_get(); if (!ret) { pr_info("Image loading done\n"); ret = snapshot_write_finalize(snapshot); if (!ret && !snapshot_image_loaded(snapshot)) ret = -ENODATA; if (!ret) { if (swsusp_header->flags & SF_CRC32_MODE) { if(handle->crc32 != swsusp_header->crc32) { pr_err("Invalid image CRC32!\n"); ret = -ENODATA; } } } } swsusp_show_speed(start, stop, nr_to_read, "Read"); out_clean: hib_finish_batch(&hb); for (i = 0; i < ring_size; i++) free_page((unsigned long)page[i]); if (crc) { if (crc->thr) kthread_stop(crc->thr); kfree(crc); } if (data) { for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) kthread_stop(data[thr].thr); if (data[thr].cc) crypto_free_comp(data[thr].cc); } vfree(data); } vfree(page); return ret; } /** * swsusp_read - read the hibernation image. * @flags_p: flags passed by the "frozen" kernel in the image header should * be written into this memory location */ int swsusp_read(unsigned int *flags_p) { int error; struct swap_map_handle handle; struct snapshot_handle snapshot; struct swsusp_info *header; memset(&snapshot, 0, sizeof(struct snapshot_handle)); error = snapshot_write_next(&snapshot); if (error < (int)PAGE_SIZE) return error < 0 ? error : -EFAULT; header = (struct swsusp_info *)data_of(snapshot); error = get_swap_reader(&handle, flags_p); if (error) goto end; if (!error) error = swap_read_page(&handle, header, NULL); if (!error) { error = (*flags_p & SF_NOCOMPRESS_MODE) ? load_image(&handle, &snapshot, header->pages - 1) : load_compressed_image(&handle, &snapshot, header->pages - 1); } swap_reader_finish(&handle); end: if (!error) pr_debug("Image successfully loaded\n"); else pr_debug("Error %d resuming\n", error); return error; } static void *swsusp_holder; /** * swsusp_check - Open the resume device and check for the swsusp signature. * @exclusive: Open the resume device exclusively. */ int swsusp_check(bool exclusive) { void *holder = exclusive ? &swsusp_holder : NULL; int error; hib_resume_bdev_file = bdev_file_open_by_dev(swsusp_resume_device, BLK_OPEN_READ, holder, NULL); if (!IS_ERR(hib_resume_bdev_file)) { clear_page(swsusp_header); error = hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL); if (error) goto put; if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); swsusp_header_flags = swsusp_header->flags; /* Reset swap signature now */ error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { error = -EINVAL; } if (!error && swsusp_header->flags & SF_HW_SIG && swsusp_header->hw_sig != swsusp_hardware_signature) { pr_info("Suspend image hardware signature mismatch (%08x now %08x); aborting resume.\n", swsusp_header->hw_sig, swsusp_hardware_signature); error = -EINVAL; } put: if (error) bdev_fput(hib_resume_bdev_file); else pr_debug("Image signature found, resuming\n"); } else { error = PTR_ERR(hib_resume_bdev_file); } if (error) pr_debug("Image not found (code %d)\n", error); return error; } /** * swsusp_close - close resume device. */ void swsusp_close(void) { if (IS_ERR(hib_resume_bdev_file)) { pr_debug("Image device not initialised\n"); return; } fput(hib_resume_bdev_file); } /** * swsusp_unmark - Unmark swsusp signature in the resume device */ #ifdef CONFIG_SUSPEND int swsusp_unmark(void) { int error; hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL); if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { pr_err("Cannot find swsusp signature!\n"); error = -ENODEV; } /* * We just returned from suspend, we don't need the image any more. */ free_all_swap_pages(root_swap); return error; } #endif static int __init swsusp_header_init(void) { swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); if (!swsusp_header) panic("Could not allocate memory for swsusp_header\n"); return 0; } core_initcall(swsusp_header_init);
35 8 8 7 7 8 9 9 9 9 9 9 9 9 9 9 2 2 17 17 16 16 16 16 17 10 10 10 9 10 9 9 7 9 9 10 4 4 4 4 3 24 24 24 24 2 23 23 23 23 23 34 34 15 34 34 34 34 34 2 32 27 27 36 34 34 32 2 1 46 46 46 36 46 4 2 1 1 2 1 2 2 1 3 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Bluetooth HCI UART driver * * Copyright (C) 2000-2001 Qualcomm Incorporated * Copyright (C) 2002-2003 Maxim Krasnyansky <maxk@qualcomm.com> * Copyright (C) 2004-2005 Marcel Holtmann <marcel@holtmann.org> */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/interrupt.h> #include <linux/ptrace.h> #include <linux/poll.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/signal.h> #include <linux/ioctl.h> #include <linux/skbuff.h> #include <linux/firmware.h> #include <linux/serdev.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include "btintel.h" #include "btbcm.h" #include "hci_uart.h" #define VERSION "2.3" static const struct hci_uart_proto *hup[HCI_UART_MAX_PROTO]; int hci_uart_register_proto(const struct hci_uart_proto *p) { if (p->id >= HCI_UART_MAX_PROTO) return -EINVAL; if (hup[p->id]) return -EEXIST; hup[p->id] = p; BT_INFO("HCI UART protocol %s registered", p->name); return 0; } int hci_uart_unregister_proto(const struct hci_uart_proto *p) { if (p->id >= HCI_UART_MAX_PROTO) return -EINVAL; if (!hup[p->id]) return -EINVAL; hup[p->id] = NULL; return 0; } static const struct hci_uart_proto *hci_uart_get_proto(unsigned int id) { if (id >= HCI_UART_MAX_PROTO) return NULL; return hup[id]; } static inline void hci_uart_tx_complete(struct hci_uart *hu, int pkt_type) { struct hci_dev *hdev = hu->hdev; /* Update HCI stat counters */ switch (pkt_type) { case HCI_COMMAND_PKT: hdev->stat.cmd_tx++; break; case HCI_ACLDATA_PKT: hdev->stat.acl_tx++; break; case HCI_SCODATA_PKT: hdev->stat.sco_tx++; break; } } static inline struct sk_buff *hci_uart_dequeue(struct hci_uart *hu) { struct sk_buff *skb = hu->tx_skb; if (!skb) { percpu_down_read(&hu->proto_lock); if (test_bit(HCI_UART_PROTO_READY, &hu->flags)) skb = hu->proto->dequeue(hu); percpu_up_read(&hu->proto_lock); } else { hu->tx_skb = NULL; } return skb; } int hci_uart_tx_wakeup(struct hci_uart *hu) { /* This may be called in an IRQ context, so we can't sleep. Therefore * we try to acquire the lock only, and if that fails we assume the * tty is being closed because that is the only time the write lock is * acquired. If, however, at some point in the future the write lock * is also acquired in other situations, then this must be revisited. */ if (!percpu_down_read_trylock(&hu->proto_lock)) return 0; if (!test_bit(HCI_UART_PROTO_READY, &hu->flags)) goto no_schedule; set_bit(HCI_UART_TX_WAKEUP, &hu->tx_state); if (test_and_set_bit(HCI_UART_SENDING, &hu->tx_state)) goto no_schedule; BT_DBG(""); schedule_work(&hu->write_work); no_schedule: percpu_up_read(&hu->proto_lock); return 0; } EXPORT_SYMBOL_GPL(hci_uart_tx_wakeup); static void hci_uart_write_work(struct work_struct *work) { struct hci_uart *hu = container_of(work, struct hci_uart, write_work); struct tty_struct *tty = hu->tty; struct hci_dev *hdev = hu->hdev; struct sk_buff *skb; /* REVISIT: should we cope with bad skbs or ->write() returning * and error value ? */ restart: clear_bit(HCI_UART_TX_WAKEUP, &hu->tx_state); while ((skb = hci_uart_dequeue(hu))) { int len; set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); len = tty->ops->write(tty, skb->data, skb->len); hdev->stat.byte_tx += len; skb_pull(skb, len); if (skb->len) { hu->tx_skb = skb; break; } hci_uart_tx_complete(hu, hci_skb_pkt_type(skb)); kfree_skb(skb); } clear_bit(HCI_UART_SENDING, &hu->tx_state); if (test_bit(HCI_UART_TX_WAKEUP, &hu->tx_state)) goto restart; wake_up_bit(&hu->tx_state, HCI_UART_SENDING); } void hci_uart_init_work(struct work_struct *work) { struct hci_uart *hu = container_of(work, struct hci_uart, init_ready); int err; struct hci_dev *hdev; if (!test_and_clear_bit(HCI_UART_INIT_PENDING, &hu->hdev_flags)) return; err = hci_register_dev(hu->hdev); if (err < 0) { BT_ERR("Can't register HCI device"); clear_bit(HCI_UART_PROTO_READY, &hu->flags); hu->proto->close(hu); hdev = hu->hdev; hu->hdev = NULL; hci_free_dev(hdev); return; } set_bit(HCI_UART_REGISTERED, &hu->flags); } int hci_uart_init_ready(struct hci_uart *hu) { if (!test_bit(HCI_UART_INIT_PENDING, &hu->hdev_flags)) return -EALREADY; schedule_work(&hu->init_ready); return 0; } int hci_uart_wait_until_sent(struct hci_uart *hu) { return wait_on_bit_timeout(&hu->tx_state, HCI_UART_SENDING, TASK_INTERRUPTIBLE, msecs_to_jiffies(2000)); } /* ------- Interface to HCI layer ------ */ /* Reset device */ static int hci_uart_flush(struct hci_dev *hdev) { struct hci_uart *hu = hci_get_drvdata(hdev); struct tty_struct *tty = hu->tty; BT_DBG("hdev %p tty %p", hdev, tty); if (hu->tx_skb) { kfree_skb(hu->tx_skb); hu->tx_skb = NULL; } /* Flush any pending characters in the driver and discipline. */ tty_ldisc_flush(tty); tty_driver_flush_buffer(tty); percpu_down_read(&hu->proto_lock); if (test_bit(HCI_UART_PROTO_READY, &hu->flags)) hu->proto->flush(hu); percpu_up_read(&hu->proto_lock); return 0; } /* Initialize device */ static int hci_uart_open(struct hci_dev *hdev) { BT_DBG("%s %p", hdev->name, hdev); /* Undo clearing this from hci_uart_close() */ hdev->flush = hci_uart_flush; return 0; } /* Close device */ static int hci_uart_close(struct hci_dev *hdev) { BT_DBG("hdev %p", hdev); hci_uart_flush(hdev); hdev->flush = NULL; return 0; } /* Send frames from HCI layer */ static int hci_uart_send_frame(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_uart *hu = hci_get_drvdata(hdev); BT_DBG("%s: type %d len %d", hdev->name, hci_skb_pkt_type(skb), skb->len); percpu_down_read(&hu->proto_lock); if (!test_bit(HCI_UART_PROTO_READY, &hu->flags)) { percpu_up_read(&hu->proto_lock); return -EUNATCH; } hu->proto->enqueue(hu, skb); percpu_up_read(&hu->proto_lock); hci_uart_tx_wakeup(hu); return 0; } /* Check the underlying device or tty has flow control support */ bool hci_uart_has_flow_control(struct hci_uart *hu) { /* serdev nodes check if the needed operations are present */ if (hu->serdev) return true; if (hu->tty->driver->ops->tiocmget && hu->tty->driver->ops->tiocmset) return true; return false; } /* Flow control or un-flow control the device */ void hci_uart_set_flow_control(struct hci_uart *hu, bool enable) { struct tty_struct *tty = hu->tty; struct ktermios ktermios; int status; unsigned int set = 0; unsigned int clear = 0; if (hu->serdev) { serdev_device_set_flow_control(hu->serdev, !enable); serdev_device_set_rts(hu->serdev, !enable); return; } if (enable) { /* Disable hardware flow control */ ktermios = tty->termios; ktermios.c_cflag &= ~CRTSCTS; tty_set_termios(tty, &ktermios); BT_DBG("Disabling hardware flow control: %s", (tty->termios.c_cflag & CRTSCTS) ? "failed" : "success"); /* Clear RTS to prevent the device from sending */ /* Most UARTs need OUT2 to enable interrupts */ status = tty->driver->ops->tiocmget(tty); BT_DBG("Current tiocm 0x%x", status); set &= ~(TIOCM_OUT2 | TIOCM_RTS); clear = ~set; set &= TIOCM_DTR | TIOCM_RTS | TIOCM_OUT1 | TIOCM_OUT2 | TIOCM_LOOP; clear &= TIOCM_DTR | TIOCM_RTS | TIOCM_OUT1 | TIOCM_OUT2 | TIOCM_LOOP; status = tty->driver->ops->tiocmset(tty, set, clear); BT_DBG("Clearing RTS: %s", status ? "failed" : "success"); } else { /* Set RTS to allow the device to send again */ status = tty->driver->ops->tiocmget(tty); BT_DBG("Current tiocm 0x%x", status); set |= (TIOCM_OUT2 | TIOCM_RTS); clear = ~set; set &= TIOCM_DTR | TIOCM_RTS | TIOCM_OUT1 | TIOCM_OUT2 | TIOCM_LOOP; clear &= TIOCM_DTR | TIOCM_RTS | TIOCM_OUT1 | TIOCM_OUT2 | TIOCM_LOOP; status = tty->driver->ops->tiocmset(tty, set, clear); BT_DBG("Setting RTS: %s", status ? "failed" : "success"); /* Re-enable hardware flow control */ ktermios = tty->termios; ktermios.c_cflag |= CRTSCTS; tty_set_termios(tty, &ktermios); BT_DBG("Enabling hardware flow control: %s", !(tty->termios.c_cflag & CRTSCTS) ? "failed" : "success"); } } void hci_uart_set_speeds(struct hci_uart *hu, unsigned int init_speed, unsigned int oper_speed) { hu->init_speed = init_speed; hu->oper_speed = oper_speed; } void hci_uart_set_baudrate(struct hci_uart *hu, unsigned int speed) { struct tty_struct *tty = hu->tty; struct ktermios ktermios; ktermios = tty->termios; ktermios.c_cflag &= ~CBAUD; tty_termios_encode_baud_rate(&ktermios, speed, speed); /* tty_set_termios() return not checked as it is always 0 */ tty_set_termios(tty, &ktermios); BT_DBG("%s: New tty speeds: %d/%d", hu->hdev->name, tty->termios.c_ispeed, tty->termios.c_ospeed); } static int hci_uart_setup(struct hci_dev *hdev) { struct hci_uart *hu = hci_get_drvdata(hdev); struct hci_rp_read_local_version *ver; struct sk_buff *skb; unsigned int speed; int err; /* Init speed if any */ if (hu->init_speed) speed = hu->init_speed; else if (hu->proto->init_speed) speed = hu->proto->init_speed; else speed = 0; if (speed) hci_uart_set_baudrate(hu, speed); /* Operational speed if any */ if (hu->oper_speed) speed = hu->oper_speed; else if (hu->proto->oper_speed) speed = hu->proto->oper_speed; else speed = 0; if (hu->proto->set_baudrate && speed) { err = hu->proto->set_baudrate(hu, speed); if (!err) hci_uart_set_baudrate(hu, speed); } if (hu->proto->setup) return hu->proto->setup(hu); if (!test_bit(HCI_UART_VND_DETECT, &hu->hdev_flags)) return 0; skb = __hci_cmd_sync(hdev, HCI_OP_READ_LOCAL_VERSION, 0, NULL, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { BT_ERR("%s: Reading local version information failed (%ld)", hdev->name, PTR_ERR(skb)); return 0; } if (skb->len != sizeof(*ver)) { BT_ERR("%s: Event length mismatch for version information", hdev->name); goto done; } ver = (struct hci_rp_read_local_version *)skb->data; switch (le16_to_cpu(ver->manufacturer)) { #ifdef CONFIG_BT_HCIUART_INTEL case 2: hdev->set_bdaddr = btintel_set_bdaddr; btintel_check_bdaddr(hdev); break; #endif #ifdef CONFIG_BT_HCIUART_BCM case 15: hdev->set_bdaddr = btbcm_set_bdaddr; btbcm_check_bdaddr(hdev); break; #endif default: break; } done: kfree_skb(skb); return 0; } /* ------ LDISC part ------ */ /* hci_uart_tty_open * * Called when line discipline changed to HCI_UART. * * Arguments: * tty pointer to tty info structure * Return Value: * 0 if success, otherwise error code */ static int hci_uart_tty_open(struct tty_struct *tty) { struct hci_uart *hu; BT_DBG("tty %p", tty); if (!capable(CAP_NET_ADMIN)) return -EPERM; /* Error if the tty has no write op instead of leaving an exploitable * hole */ if (tty->ops->write == NULL) return -EOPNOTSUPP; hu = kzalloc(sizeof(*hu), GFP_KERNEL); if (!hu) { BT_ERR("Can't allocate control structure"); return -ENFILE; } if (percpu_init_rwsem(&hu->proto_lock)) { BT_ERR("Can't allocate semaphore structure"); kfree(hu); return -ENOMEM; } tty->disc_data = hu; hu->tty = tty; tty->receive_room = 65536; /* disable alignment support by default */ hu->alignment = 1; hu->padding = 0; /* Use serial port speed as oper_speed */ hu->oper_speed = tty->termios.c_ospeed; INIT_WORK(&hu->init_ready, hci_uart_init_work); INIT_WORK(&hu->write_work, hci_uart_write_work); /* Flush any pending characters in the driver */ tty_driver_flush_buffer(tty); return 0; } /* hci_uart_tty_close() * * Called when the line discipline is changed to something * else, the tty is closed, or the tty detects a hangup. */ static void hci_uart_tty_close(struct tty_struct *tty) { struct hci_uart *hu = tty->disc_data; struct hci_dev *hdev; BT_DBG("tty %p", tty); /* Detach from the tty */ tty->disc_data = NULL; if (!hu) return; hdev = hu->hdev; if (hdev) hci_uart_close(hdev); if (test_bit(HCI_UART_PROTO_READY, &hu->flags)) { percpu_down_write(&hu->proto_lock); clear_bit(HCI_UART_PROTO_READY, &hu->flags); percpu_up_write(&hu->proto_lock); cancel_work_sync(&hu->init_ready); cancel_work_sync(&hu->write_work); if (hdev) { if (test_bit(HCI_UART_REGISTERED, &hu->flags)) hci_unregister_dev(hdev); hci_free_dev(hdev); } hu->proto->close(hu); } clear_bit(HCI_UART_PROTO_SET, &hu->flags); percpu_free_rwsem(&hu->proto_lock); kfree(hu); } /* hci_uart_tty_wakeup() * * Callback for transmit wakeup. Called when low level * device driver can accept more send data. * * Arguments: tty pointer to associated tty instance data * Return Value: None */ static void hci_uart_tty_wakeup(struct tty_struct *tty) { struct hci_uart *hu = tty->disc_data; BT_DBG(""); if (!hu) return; clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); if (tty != hu->tty) return; if (test_bit(HCI_UART_PROTO_READY, &hu->flags)) hci_uart_tx_wakeup(hu); } /* hci_uart_tty_receive() * * Called by tty low level driver when receive data is * available. * * Arguments: tty pointer to tty instance data * data pointer to received data * flags pointer to flags for data * count count of received data in bytes * * Return Value: None */ static void hci_uart_tty_receive(struct tty_struct *tty, const u8 *data, const u8 *flags, size_t count) { struct hci_uart *hu = tty->disc_data; if (!hu || tty != hu->tty) return; percpu_down_read(&hu->proto_lock); if (!test_bit(HCI_UART_PROTO_READY, &hu->flags)) { percpu_up_read(&hu->proto_lock); return; } /* It does not need a lock here as it is already protected by a mutex in * tty caller */ hu->proto->recv(hu, data, count); percpu_up_read(&hu->proto_lock); if (hu->hdev) hu->hdev->stat.byte_rx += count; tty_unthrottle(tty); } static int hci_uart_register_dev(struct hci_uart *hu) { struct hci_dev *hdev; int err; BT_DBG(""); /* Initialize and register HCI device */ hdev = hci_alloc_dev(); if (!hdev) { BT_ERR("Can't allocate HCI device"); return -ENOMEM; } hu->hdev = hdev; hdev->bus = HCI_UART; hci_set_drvdata(hdev, hu); /* Only when vendor specific setup callback is provided, consider * the manufacturer information valid. This avoids filling in the * value for Ericsson when nothing is specified. */ if (hu->proto->setup) hdev->manufacturer = hu->proto->manufacturer; hdev->open = hci_uart_open; hdev->close = hci_uart_close; hdev->flush = hci_uart_flush; hdev->send = hci_uart_send_frame; hdev->setup = hci_uart_setup; SET_HCIDEV_DEV(hdev, hu->tty->dev); if (test_bit(HCI_UART_RAW_DEVICE, &hu->hdev_flags)) set_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks); if (test_bit(HCI_UART_EXT_CONFIG, &hu->hdev_flags)) set_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks); if (!test_bit(HCI_UART_RESET_ON_INIT, &hu->hdev_flags)) set_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks); /* Only call open() for the protocol after hdev is fully initialized as * open() (or a timer/workqueue it starts) may attempt to reference it. */ err = hu->proto->open(hu); if (err) { hu->hdev = NULL; hci_free_dev(hdev); return err; } if (test_bit(HCI_UART_INIT_PENDING, &hu->hdev_flags)) return 0; if (hci_register_dev(hdev) < 0) { BT_ERR("Can't register HCI device"); hu->proto->close(hu); hu->hdev = NULL; hci_free_dev(hdev); return -ENODEV; } set_bit(HCI_UART_REGISTERED, &hu->flags); return 0; } static int hci_uart_set_proto(struct hci_uart *hu, int id) { const struct hci_uart_proto *p; int err; p = hci_uart_get_proto(id); if (!p) return -EPROTONOSUPPORT; hu->proto = p; err = hci_uart_register_dev(hu); if (err) { return err; } set_bit(HCI_UART_PROTO_READY, &hu->flags); return 0; } static int hci_uart_set_flags(struct hci_uart *hu, unsigned long flags) { unsigned long valid_flags = BIT(HCI_UART_RAW_DEVICE) | BIT(HCI_UART_RESET_ON_INIT) | BIT(HCI_UART_INIT_PENDING) | BIT(HCI_UART_EXT_CONFIG) | BIT(HCI_UART_VND_DETECT); if (flags & ~valid_flags) return -EINVAL; hu->hdev_flags = flags; return 0; } /* hci_uart_tty_ioctl() * * Process IOCTL system call for the tty device. * * Arguments: * * tty pointer to tty instance data * cmd IOCTL command code * arg argument for IOCTL call (cmd dependent) * * Return Value: Command dependent */ static int hci_uart_tty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct hci_uart *hu = tty->disc_data; int err = 0; BT_DBG(""); /* Verify the status of the device */ if (!hu) return -EBADF; switch (cmd) { case HCIUARTSETPROTO: if (!test_and_set_bit(HCI_UART_PROTO_SET, &hu->flags)) { err = hci_uart_set_proto(hu, arg); if (err) clear_bit(HCI_UART_PROTO_SET, &hu->flags); } else err = -EBUSY; break; case HCIUARTGETPROTO: if (test_bit(HCI_UART_PROTO_SET, &hu->flags) && test_bit(HCI_UART_PROTO_READY, &hu->flags)) err = hu->proto->id; else err = -EUNATCH; break; case HCIUARTGETDEVICE: if (test_bit(HCI_UART_REGISTERED, &hu->flags)) err = hu->hdev->id; else err = -EUNATCH; break; case HCIUARTSETFLAGS: if (test_bit(HCI_UART_PROTO_SET, &hu->flags)) err = -EBUSY; else err = hci_uart_set_flags(hu, arg); break; case HCIUARTGETFLAGS: err = hu->hdev_flags; break; default: err = n_tty_ioctl_helper(tty, cmd, arg); break; } return err; } /* * We don't provide read/write/poll interface for user space. */ static ssize_t hci_uart_tty_read(struct tty_struct *tty, struct file *file, u8 *buf, size_t nr, void **cookie, unsigned long offset) { return 0; } static ssize_t hci_uart_tty_write(struct tty_struct *tty, struct file *file, const u8 *data, size_t count) { return 0; } static struct tty_ldisc_ops hci_uart_ldisc = { .owner = THIS_MODULE, .num = N_HCI, .name = "n_hci", .open = hci_uart_tty_open, .close = hci_uart_tty_close, .read = hci_uart_tty_read, .write = hci_uart_tty_write, .ioctl = hci_uart_tty_ioctl, .compat_ioctl = hci_uart_tty_ioctl, .receive_buf = hci_uart_tty_receive, .write_wakeup = hci_uart_tty_wakeup, }; static int __init hci_uart_init(void) { int err; BT_INFO("HCI UART driver ver %s", VERSION); /* Register the tty discipline */ err = tty_register_ldisc(&hci_uart_ldisc); if (err) { BT_ERR("HCI line discipline registration failed. (%d)", err); return err; } #ifdef CONFIG_BT_HCIUART_H4 h4_init(); #endif #ifdef CONFIG_BT_HCIUART_BCSP bcsp_init(); #endif #ifdef CONFIG_BT_HCIUART_LL ll_init(); #endif #ifdef CONFIG_BT_HCIUART_ATH3K ath_init(); #endif #ifdef CONFIG_BT_HCIUART_3WIRE h5_init(); #endif #ifdef CONFIG_BT_HCIUART_INTEL intel_init(); #endif #ifdef CONFIG_BT_HCIUART_BCM bcm_init(); #endif #ifdef CONFIG_BT_HCIUART_QCA qca_init(); #endif #ifdef CONFIG_BT_HCIUART_AG6XX ag6xx_init(); #endif #ifdef CONFIG_BT_HCIUART_MRVL mrvl_init(); #endif #ifdef CONFIG_BT_HCIUART_AML aml_init(); #endif return 0; } static void __exit hci_uart_exit(void) { #ifdef CONFIG_BT_HCIUART_H4 h4_deinit(); #endif #ifdef CONFIG_BT_HCIUART_BCSP bcsp_deinit(); #endif #ifdef CONFIG_BT_HCIUART_LL ll_deinit(); #endif #ifdef CONFIG_BT_HCIUART_ATH3K ath_deinit(); #endif #ifdef CONFIG_BT_HCIUART_3WIRE h5_deinit(); #endif #ifdef CONFIG_BT_HCIUART_INTEL intel_deinit(); #endif #ifdef CONFIG_BT_HCIUART_BCM bcm_deinit(); #endif #ifdef CONFIG_BT_HCIUART_QCA qca_deinit(); #endif #ifdef CONFIG_BT_HCIUART_AG6XX ag6xx_deinit(); #endif #ifdef CONFIG_BT_HCIUART_MRVL mrvl_deinit(); #endif #ifdef CONFIG_BT_HCIUART_AML aml_deinit(); #endif tty_unregister_ldisc(&hci_uart_ldisc); } module_init(hci_uart_init); module_exit(hci_uart_exit); MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>"); MODULE_DESCRIPTION("Bluetooth HCI UART driver ver " VERSION); MODULE_VERSION(VERSION); MODULE_LICENSE("GPL"); MODULE_ALIAS_LDISC(N_HCI);
3 2 70 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 #ifndef _NET_FLOW_OFFLOAD_H #define _NET_FLOW_OFFLOAD_H #include <linux/kernel.h> #include <linux/list.h> #include <linux/netlink.h> #include <net/flow_dissector.h> struct flow_match { struct flow_dissector *dissector; void *mask; void *key; }; struct flow_match_meta { struct flow_dissector_key_meta *key, *mask; }; struct flow_match_basic { struct flow_dissector_key_basic *key, *mask; }; struct flow_match_control { struct flow_dissector_key_control *key, *mask; }; struct flow_match_eth_addrs { struct flow_dissector_key_eth_addrs *key, *mask; }; struct flow_match_vlan { struct flow_dissector_key_vlan *key, *mask; }; struct flow_match_arp { struct flow_dissector_key_arp *key, *mask; }; struct flow_match_ipv4_addrs { struct flow_dissector_key_ipv4_addrs *key, *mask; }; struct flow_match_ipv6_addrs { struct flow_dissector_key_ipv6_addrs *key, *mask; }; struct flow_match_ip { struct flow_dissector_key_ip *key, *mask; }; struct flow_match_ports { struct flow_dissector_key_ports *key, *mask; }; struct flow_match_ports_range { struct flow_dissector_key_ports_range *key, *mask; }; struct flow_match_icmp { struct flow_dissector_key_icmp *key, *mask; }; struct flow_match_tcp { struct flow_dissector_key_tcp *key, *mask; }; struct flow_match_ipsec { struct flow_dissector_key_ipsec *key, *mask; }; struct flow_match_mpls { struct flow_dissector_key_mpls *key, *mask; }; struct flow_match_enc_keyid { struct flow_dissector_key_keyid *key, *mask; }; struct flow_match_enc_opts { struct flow_dissector_key_enc_opts *key, *mask; }; struct flow_match_ct { struct flow_dissector_key_ct *key, *mask; }; struct flow_match_pppoe { struct flow_dissector_key_pppoe *key, *mask; }; struct flow_match_l2tpv3 { struct flow_dissector_key_l2tpv3 *key, *mask; }; struct flow_rule; void flow_rule_match_meta(const struct flow_rule *rule, struct flow_match_meta *out); void flow_rule_match_basic(const struct flow_rule *rule, struct flow_match_basic *out); void flow_rule_match_control(const struct flow_rule *rule, struct flow_match_control *out); void flow_rule_match_eth_addrs(const struct flow_rule *rule, struct flow_match_eth_addrs *out); void flow_rule_match_vlan(const struct flow_rule *rule, struct flow_match_vlan *out); void flow_rule_match_cvlan(const struct flow_rule *rule, struct flow_match_vlan *out); void flow_rule_match_arp(const struct flow_rule *rule, struct flow_match_arp *out); void flow_rule_match_ipv4_addrs(const struct flow_rule *rule, struct flow_match_ipv4_addrs *out); void flow_rule_match_ipv6_addrs(const struct flow_rule *rule, struct flow_match_ipv6_addrs *out); void flow_rule_match_ip(const struct flow_rule *rule, struct flow_match_ip *out); void flow_rule_match_ports(const struct flow_rule *rule, struct flow_match_ports *out); void flow_rule_match_ports_range(const struct flow_rule *rule, struct flow_match_ports_range *out); void flow_rule_match_tcp(const struct flow_rule *rule, struct flow_match_tcp *out); void flow_rule_match_ipsec(const struct flow_rule *rule, struct flow_match_ipsec *out); void flow_rule_match_icmp(const struct flow_rule *rule, struct flow_match_icmp *out); void flow_rule_match_mpls(const struct flow_rule *rule, struct flow_match_mpls *out); void flow_rule_match_enc_control(const struct flow_rule *rule, struct flow_match_control *out); void flow_rule_match_enc_ipv4_addrs(const struct flow_rule *rule, struct flow_match_ipv4_addrs *out); void flow_rule_match_enc_ipv6_addrs(const struct flow_rule *rule, struct flow_match_ipv6_addrs *out); void flow_rule_match_enc_ip(const struct flow_rule *rule, struct flow_match_ip *out); void flow_rule_match_enc_ports(const struct flow_rule *rule, struct flow_match_ports *out); void flow_rule_match_enc_keyid(const struct flow_rule *rule, struct flow_match_enc_keyid *out); void flow_rule_match_enc_opts(const struct flow_rule *rule, struct flow_match_enc_opts *out); void flow_rule_match_ct(const struct flow_rule *rule, struct flow_match_ct *out); void flow_rule_match_pppoe(const struct flow_rule *rule, struct flow_match_pppoe *out); void flow_rule_match_l2tpv3(const struct flow_rule *rule, struct flow_match_l2tpv3 *out); enum flow_action_id { FLOW_ACTION_ACCEPT = 0, FLOW_ACTION_DROP, FLOW_ACTION_TRAP, FLOW_ACTION_GOTO, FLOW_ACTION_REDIRECT, FLOW_ACTION_MIRRED, FLOW_ACTION_REDIRECT_INGRESS, FLOW_ACTION_MIRRED_INGRESS, FLOW_ACTION_VLAN_PUSH, FLOW_ACTION_VLAN_POP, FLOW_ACTION_VLAN_MANGLE, FLOW_ACTION_TUNNEL_ENCAP, FLOW_ACTION_TUNNEL_DECAP, FLOW_ACTION_MANGLE, FLOW_ACTION_ADD, FLOW_ACTION_CSUM, FLOW_ACTION_MARK, FLOW_ACTION_PTYPE, FLOW_ACTION_PRIORITY, FLOW_ACTION_RX_QUEUE_MAPPING, FLOW_ACTION_WAKE, FLOW_ACTION_QUEUE, FLOW_ACTION_SAMPLE, FLOW_ACTION_POLICE, FLOW_ACTION_CT, FLOW_ACTION_CT_METADATA, FLOW_ACTION_MPLS_PUSH, FLOW_ACTION_MPLS_POP, FLOW_ACTION_MPLS_MANGLE, FLOW_ACTION_GATE, FLOW_ACTION_PPPOE_PUSH, FLOW_ACTION_JUMP, FLOW_ACTION_PIPE, FLOW_ACTION_VLAN_PUSH_ETH, FLOW_ACTION_VLAN_POP_ETH, FLOW_ACTION_CONTINUE, NUM_FLOW_ACTIONS, }; /* This is mirroring enum pedit_header_type definition for easy mapping between * tc pedit action. Legacy TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK is mapped to * FLOW_ACT_MANGLE_UNSPEC, which is supported by no driver. */ enum flow_action_mangle_base { FLOW_ACT_MANGLE_UNSPEC = 0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, FLOW_ACT_MANGLE_HDR_TYPE_IP4, FLOW_ACT_MANGLE_HDR_TYPE_IP6, FLOW_ACT_MANGLE_HDR_TYPE_TCP, FLOW_ACT_MANGLE_HDR_TYPE_UDP, }; enum flow_action_hw_stats_bit { FLOW_ACTION_HW_STATS_IMMEDIATE_BIT, FLOW_ACTION_HW_STATS_DELAYED_BIT, FLOW_ACTION_HW_STATS_DISABLED_BIT, FLOW_ACTION_HW_STATS_NUM_BITS }; enum flow_action_hw_stats { FLOW_ACTION_HW_STATS_IMMEDIATE = BIT(FLOW_ACTION_HW_STATS_IMMEDIATE_BIT), FLOW_ACTION_HW_STATS_DELAYED = BIT(FLOW_ACTION_HW_STATS_DELAYED_BIT), FLOW_ACTION_HW_STATS_ANY = FLOW_ACTION_HW_STATS_IMMEDIATE | FLOW_ACTION_HW_STATS_DELAYED, FLOW_ACTION_HW_STATS_DISABLED = BIT(FLOW_ACTION_HW_STATS_DISABLED_BIT), FLOW_ACTION_HW_STATS_DONT_CARE = BIT(FLOW_ACTION_HW_STATS_NUM_BITS) - 1, }; typedef void (*action_destr)(void *priv); struct flow_action_cookie { u32 cookie_len; u8 cookie[]; }; struct flow_action_cookie *flow_action_cookie_create(void *data, unsigned int len, gfp_t gfp); void flow_action_cookie_destroy(struct flow_action_cookie *cookie); struct flow_action_entry { enum flow_action_id id; u32 hw_index; unsigned long cookie; u64 miss_cookie; enum flow_action_hw_stats hw_stats; action_destr destructor; void *destructor_priv; union { u32 chain_index; /* FLOW_ACTION_GOTO */ struct net_device *dev; /* FLOW_ACTION_REDIRECT */ struct { /* FLOW_ACTION_VLAN */ u16 vid; __be16 proto; u8 prio; } vlan; struct { /* FLOW_ACTION_VLAN_PUSH_ETH */ unsigned char dst[ETH_ALEN]; unsigned char src[ETH_ALEN]; } vlan_push_eth; struct { /* FLOW_ACTION_MANGLE */ /* FLOW_ACTION_ADD */ enum flow_action_mangle_base htype; u32 offset; u32 mask; u32 val; } mangle; struct ip_tunnel_info *tunnel; /* FLOW_ACTION_TUNNEL_ENCAP */ u32 csum_flags; /* FLOW_ACTION_CSUM */ u32 mark; /* FLOW_ACTION_MARK */ u16 ptype; /* FLOW_ACTION_PTYPE */ u16 rx_queue; /* FLOW_ACTION_RX_QUEUE_MAPPING */ u32 priority; /* FLOW_ACTION_PRIORITY */ struct { /* FLOW_ACTION_QUEUE */ u32 ctx; u32 index; u8 vf; } queue; struct { /* FLOW_ACTION_SAMPLE */ struct psample_group *psample_group; u32 rate; u32 trunc_size; bool truncate; } sample; struct { /* FLOW_ACTION_POLICE */ u32 burst; u64 rate_bytes_ps; u64 peakrate_bytes_ps; u32 avrate; u16 overhead; u64 burst_pkt; u64 rate_pkt_ps; u32 mtu; struct { enum flow_action_id act_id; u32 extval; } exceed, notexceed; } police; struct { /* FLOW_ACTION_CT */ int action; u16 zone; struct nf_flowtable *flow_table; } ct; struct { unsigned long cookie; u32 mark; u32 labels[4]; bool orig_dir; } ct_metadata; struct { /* FLOW_ACTION_MPLS_PUSH */ u32 label; __be16 proto; u8 tc; u8 bos; u8 ttl; } mpls_push; struct { /* FLOW_ACTION_MPLS_POP */ __be16 proto; } mpls_pop; struct { /* FLOW_ACTION_MPLS_MANGLE */ u32 label; u8 tc; u8 bos; u8 ttl; } mpls_mangle; struct { s32 prio; u64 basetime; u64 cycletime; u64 cycletimeext; u32 num_entries; struct action_gate_entry *entries; } gate; struct { /* FLOW_ACTION_PPPOE_PUSH */ u16 sid; } pppoe; }; struct flow_action_cookie *user_cookie; /* user defined action cookie */ }; struct flow_action { unsigned int num_entries; struct flow_action_entry entries[] __counted_by(num_entries); }; static inline bool flow_action_has_entries(const struct flow_action *action) { return action->num_entries; } /** * flow_offload_has_one_action() - check if exactly one action is present * @action: tc filter flow offload action * * Return: true if exactly one action is present. */ static inline bool flow_offload_has_one_action(const struct flow_action *action) { return action->num_entries == 1; } static inline bool flow_action_is_last_entry(const struct flow_action *action, const struct flow_action_entry *entry) { return entry == &action->entries[action->num_entries - 1]; } #define flow_action_for_each(__i, __act, __actions) \ for (__i = 0, __act = &(__actions)->entries[0]; \ __i < (__actions)->num_entries; \ __act = &(__actions)->entries[++__i]) static inline bool flow_action_mixed_hw_stats_check(const struct flow_action *action, struct netlink_ext_ack *extack) { const struct flow_action_entry *action_entry; u8 last_hw_stats; int i; if (flow_offload_has_one_action(action)) return true; flow_action_for_each(i, action_entry, action) { if (i && action_entry->hw_stats != last_hw_stats) { NL_SET_ERR_MSG_MOD(extack, "Mixing HW stats types for actions is not supported"); return false; } last_hw_stats = action_entry->hw_stats; } return true; } static inline const struct flow_action_entry * flow_action_first_entry_get(const struct flow_action *action) { WARN_ON(!flow_action_has_entries(action)); return &action->entries[0]; } static inline bool __flow_action_hw_stats_check(const struct flow_action *action, struct netlink_ext_ack *extack, bool check_allow_bit, enum flow_action_hw_stats_bit allow_bit) { const struct flow_action_entry *action_entry; if (!flow_action_has_entries(action)) return true; if (!flow_action_mixed_hw_stats_check(action, extack)) return false; action_entry = flow_action_first_entry_get(action); /* Zero is not a legal value for hw_stats, catch anyone passing it */ WARN_ON_ONCE(!action_entry->hw_stats); if (!check_allow_bit && ~action_entry->hw_stats & FLOW_ACTION_HW_STATS_ANY) { NL_SET_ERR_MSG_MOD(extack, "Driver supports only default HW stats type \"any\""); return false; } else if (check_allow_bit && !(action_entry->hw_stats & BIT(allow_bit))) { NL_SET_ERR_MSG_MOD(extack, "Driver does not support selected HW stats type"); return false; } return true; } static inline bool flow_action_hw_stats_check(const struct flow_action *action, struct netlink_ext_ack *extack, enum flow_action_hw_stats_bit allow_bit) { return __flow_action_hw_stats_check(action, extack, true, allow_bit); } static inline bool flow_action_basic_hw_stats_check(const struct flow_action *action, struct netlink_ext_ack *extack) { return __flow_action_hw_stats_check(action, extack, false, 0); } struct flow_rule { struct flow_match match; struct flow_action action; }; struct flow_rule *flow_rule_alloc(unsigned int num_actions); static inline bool flow_rule_match_key(const struct flow_rule *rule, enum flow_dissector_key_id key) { return dissector_uses_key(rule->match.dissector, key); } /** * flow_rule_is_supp_control_flags() - check for supported control flags * @supp_flags: control flags supported by driver * @ctrl_flags: control flags present in rule * @extack: The netlink extended ACK for reporting errors. * * Return: true if only supported control flags are set, false otherwise. */ static inline bool flow_rule_is_supp_control_flags(const u32 supp_flags, const u32 ctrl_flags, struct netlink_ext_ack *extack) { if (likely((ctrl_flags & ~supp_flags) == 0)) return true; NL_SET_ERR_MSG_FMT_MOD(extack, "Unsupported match on control.flags %#x", ctrl_flags); return false; } /** * flow_rule_is_supp_enc_control_flags() - check for supported control flags * @supp_enc_flags: encapsulation control flags supported by driver * @enc_ctrl_flags: encapsulation control flags present in rule * @extack: The netlink extended ACK for reporting errors. * * Return: true if only supported control flags are set, false otherwise. */ static inline bool flow_rule_is_supp_enc_control_flags(const u32 supp_enc_flags, const u32 enc_ctrl_flags, struct netlink_ext_ack *extack) { if (likely((enc_ctrl_flags & ~supp_enc_flags) == 0)) return true; NL_SET_ERR_MSG_FMT_MOD(extack, "Unsupported match on enc_control.flags %#x", enc_ctrl_flags); return false; } /** * flow_rule_has_control_flags() - check for presence of any control flags * @ctrl_flags: control flags present in rule * @extack: The netlink extended ACK for reporting errors. * * Return: true if control flags are set, false otherwise. */ static inline bool flow_rule_has_control_flags(const u32 ctrl_flags, struct netlink_ext_ack *extack) { return !flow_rule_is_supp_control_flags(0, ctrl_flags, extack); } /** * flow_rule_has_enc_control_flags() - check for presence of any control flags * @enc_ctrl_flags: encapsulation control flags present in rule * @extack: The netlink extended ACK for reporting errors. * * Return: true if control flags are set, false otherwise. */ static inline bool flow_rule_has_enc_control_flags(const u32 enc_ctrl_flags, struct netlink_ext_ack *extack) { return !flow_rule_is_supp_enc_control_flags(0, enc_ctrl_flags, extack); } /** * flow_rule_match_has_control_flags() - match and check for any control flags * @rule: The flow_rule under evaluation. * @extack: The netlink extended ACK for reporting errors. * * Return: true if control flags are set, false otherwise. */ static inline bool flow_rule_match_has_control_flags(struct flow_rule *rule, struct netlink_ext_ack *extack) { struct flow_match_control match; if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CONTROL)) return false; flow_rule_match_control(rule, &match); return flow_rule_has_control_flags(match.mask->flags, extack); } struct flow_stats { u64 pkts; u64 bytes; u64 drops; u64 lastused; enum flow_action_hw_stats used_hw_stats; bool used_hw_stats_valid; }; static inline void flow_stats_update(struct flow_stats *flow_stats, u64 bytes, u64 pkts, u64 drops, u64 lastused, enum flow_action_hw_stats used_hw_stats) { flow_stats->pkts += pkts; flow_stats->bytes += bytes; flow_stats->drops += drops; flow_stats->lastused = max_t(u64, flow_stats->lastused, lastused); /* The driver should pass value with a maximum of one bit set. * Passing FLOW_ACTION_HW_STATS_ANY is invalid. */ WARN_ON(used_hw_stats == FLOW_ACTION_HW_STATS_ANY); flow_stats->used_hw_stats |= used_hw_stats; flow_stats->used_hw_stats_valid = true; } enum flow_block_command { FLOW_BLOCK_BIND, FLOW_BLOCK_UNBIND, }; enum flow_block_binder_type { FLOW_BLOCK_BINDER_TYPE_UNSPEC, FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS, FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS, FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP, FLOW_BLOCK_BINDER_TYPE_RED_MARK, }; struct flow_block { struct list_head cb_list; }; struct netlink_ext_ack; struct flow_block_offload { enum flow_block_command command; enum flow_block_binder_type binder_type; bool block_shared; bool unlocked_driver_cb; struct net *net; struct flow_block *block; struct list_head cb_list; struct list_head *driver_block_list; struct netlink_ext_ack *extack; struct Qdisc *sch; struct list_head *cb_list_head; }; enum tc_setup_type; typedef int flow_setup_cb_t(enum tc_setup_type type, void *type_data, void *cb_priv); struct flow_block_cb; struct flow_block_indr { struct list_head list; struct net_device *dev; struct Qdisc *sch; enum flow_block_binder_type binder_type; void *data; void *cb_priv; void (*cleanup)(struct flow_block_cb *block_cb); }; struct flow_block_cb { struct list_head driver_list; struct list_head list; flow_setup_cb_t *cb; void *cb_ident; void *cb_priv; void (*release)(void *cb_priv); struct flow_block_indr indr; unsigned int refcnt; }; struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb, void *cb_ident, void *cb_priv, void (*release)(void *cb_priv)); struct flow_block_cb *flow_indr_block_cb_alloc(flow_setup_cb_t *cb, void *cb_ident, void *cb_priv, void (*release)(void *cb_priv), struct flow_block_offload *bo, struct net_device *dev, struct Qdisc *sch, void *data, void *indr_cb_priv, void (*cleanup)(struct flow_block_cb *block_cb)); void flow_block_cb_free(struct flow_block_cb *block_cb); struct flow_block_cb *flow_block_cb_lookup(struct flow_block *block, flow_setup_cb_t *cb, void *cb_ident); void *flow_block_cb_priv(struct flow_block_cb *block_cb); void flow_block_cb_incref(struct flow_block_cb *block_cb); unsigned int flow_block_cb_decref(struct flow_block_cb *block_cb); static inline void flow_block_cb_add(struct flow_block_cb *block_cb, struct flow_block_offload *offload) { list_add_tail(&block_cb->list, &offload->cb_list); } static inline void flow_block_cb_remove(struct flow_block_cb *block_cb, struct flow_block_offload *offload) { list_move(&block_cb->list, &offload->cb_list); } static inline void flow_indr_block_cb_remove(struct flow_block_cb *block_cb, struct flow_block_offload *offload) { list_del(&block_cb->indr.list); list_move(&block_cb->list, &offload->cb_list); } bool flow_block_cb_is_busy(flow_setup_cb_t *cb, void *cb_ident, struct list_head *driver_block_list); int flow_block_cb_setup_simple(struct flow_block_offload *f, struct list_head *driver_list, flow_setup_cb_t *cb, void *cb_ident, void *cb_priv, bool ingress_only); enum flow_cls_command { FLOW_CLS_REPLACE, FLOW_CLS_DESTROY, FLOW_CLS_STATS, FLOW_CLS_TMPLT_CREATE, FLOW_CLS_TMPLT_DESTROY, }; struct flow_cls_common_offload { u32 chain_index; __be16 protocol; u32 prio; bool skip_sw; struct netlink_ext_ack *extack; }; struct flow_cls_offload { struct flow_cls_common_offload common; enum flow_cls_command command; bool use_act_stats; unsigned long cookie; struct flow_rule *rule; struct flow_stats stats; u32 classid; }; enum offload_act_command { FLOW_ACT_REPLACE, FLOW_ACT_DESTROY, FLOW_ACT_STATS, }; struct flow_offload_action { struct netlink_ext_ack *extack; /* NULL in FLOW_ACT_STATS process*/ enum offload_act_command command; enum flow_action_id id; u32 index; unsigned long cookie; struct flow_stats stats; struct flow_action action; }; struct flow_offload_action *offload_action_alloc(unsigned int num_actions); static inline struct flow_rule * flow_cls_offload_flow_rule(struct flow_cls_offload *flow_cmd) { return flow_cmd->rule; } static inline void flow_block_init(struct flow_block *flow_block) { INIT_LIST_HEAD(&flow_block->cb_list); } typedef int flow_indr_block_bind_cb_t(struct net_device *dev, struct Qdisc *sch, void *cb_priv, enum tc_setup_type type, void *type_data, void *data, void (*cleanup)(struct flow_block_cb *block_cb)); int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv); void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv, void (*release)(void *cb_priv)); int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, enum tc_setup_type type, void *data, struct flow_block_offload *bo, void (*cleanup)(struct flow_block_cb *block_cb)); bool flow_indr_dev_exists(void); #endif /* _NET_FLOW_OFFLOAD_H */
7 19 152 431 7 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. */ #ifndef IB_ADDR_H #define IB_ADDR_H #include <linux/ethtool.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/socket.h> #include <linux/if_vlan.h> #include <net/ipv6.h> #include <net/if_inet6.h> #include <net/ip.h> #include <rdma/ib_verbs.h> #include <rdma/ib_pack.h> #include <net/net_namespace.h> /** * struct rdma_dev_addr - Contains resolved RDMA hardware addresses * @src_dev_addr: Source MAC address. * @dst_dev_addr: Destination MAC address. * @broadcast: Broadcast address of the device. * @dev_type: The interface hardware type of the device. * @bound_dev_if: An optional device interface index. * @transport: The transport type used. * @net: Network namespace containing the bound_dev_if net_dev. * @sgid_attr: GID attribute to use for identified SGID */ struct rdma_dev_addr { unsigned char src_dev_addr[MAX_ADDR_LEN]; unsigned char dst_dev_addr[MAX_ADDR_LEN]; unsigned char broadcast[MAX_ADDR_LEN]; unsigned short dev_type; int bound_dev_if; enum rdma_transport_type transport; struct net *net; const struct ib_gid_attr *sgid_attr; enum rdma_network_type network; int hoplimit; }; /** * rdma_translate_ip - Translate a local IP address to an RDMA hardware * address. * * The dev_addr->net field must be initialized. */ int rdma_translate_ip(const struct sockaddr *addr, struct rdma_dev_addr *dev_addr); /** * rdma_resolve_ip - Resolve source and destination IP addresses to * RDMA hardware addresses. * @src_addr: An optional source address to use in the resolution. If a * source address is not provided, a usable address will be returned via * the callback. * @dst_addr: The destination address to resolve. * @addr: A reference to a data location that will receive the resolved * addresses. The data location must remain valid until the callback has * been invoked. The net field of the addr struct must be valid. * @timeout_ms: Amount of time to wait for the address resolution to complete. * @callback: Call invoked once address resolution has completed, timed out, * or been canceled. A status of 0 indicates success. * @resolve_by_gid_attr: Resolve the ip based on the GID attribute from * rdma_dev_addr. * @context: User-specified context associated with the call. */ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, struct rdma_dev_addr *addr, unsigned long timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), bool resolve_by_gid_attr, void *context); void rdma_addr_cancel(struct rdma_dev_addr *addr); int rdma_addr_size(const struct sockaddr *addr); int rdma_addr_size_in6(struct sockaddr_in6 *addr); int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr); static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr) { return ((u16)dev_addr->broadcast[8] << 8) | (u16)dev_addr->broadcast[9]; } static inline void ib_addr_set_pkey(struct rdma_dev_addr *dev_addr, u16 pkey) { dev_addr->broadcast[8] = pkey >> 8; dev_addr->broadcast[9] = (unsigned char) pkey; } static inline void ib_addr_get_mgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(gid, dev_addr->broadcast + 4, sizeof *gid); } static inline int rdma_addr_gid_offset(struct rdma_dev_addr *dev_addr) { return dev_addr->dev_type == ARPHRD_INFINIBAND ? 4 : 0; } static inline u16 rdma_vlan_dev_vlan_id(const struct net_device *dev) { return is_vlan_dev(dev) ? vlan_dev_vlan_id(dev) : 0xffff; } static inline int rdma_ip2gid(struct sockaddr *addr, union ib_gid *gid) { switch (addr->sa_family) { case AF_INET: ipv6_addr_set_v4mapped(((struct sockaddr_in *) addr)->sin_addr.s_addr, (struct in6_addr *)gid); break; case AF_INET6: *(struct in6_addr *)&gid->raw = ((struct sockaddr_in6 *)addr)->sin6_addr; break; default: return -EINVAL; } return 0; } /* Important - sockaddr should be a union of sockaddr_in and sockaddr_in6 */ static inline void rdma_gid2ip(struct sockaddr *out, const union ib_gid *gid) { if (ipv6_addr_v4mapped((struct in6_addr *)gid)) { struct sockaddr_in *out_in = (struct sockaddr_in *)out; memset(out_in, 0, sizeof(*out_in)); out_in->sin_family = AF_INET; memcpy(&out_in->sin_addr.s_addr, gid->raw + 12, 4); } else { struct sockaddr_in6 *out_in = (struct sockaddr_in6 *)out; memset(out_in, 0, sizeof(*out_in)); out_in->sin6_family = AF_INET6; memcpy(&out_in->sin6_addr.s6_addr, gid->raw, 16); } } /* * rdma_get/set_sgid/dgid() APIs are applicable to IB, and iWarp. * They are not applicable to RoCE. * RoCE GIDs are derived from the IP addresses. */ static inline void rdma_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(*gid)); } static inline void rdma_addr_set_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid); } static inline void rdma_addr_get_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(gid, dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof *gid); } static inline void rdma_addr_set_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid); } static inline enum ib_mtu iboe_get_mtu(int mtu) { /* * Reduce IB headers from effective IBoE MTU. */ mtu = mtu - (IB_GRH_BYTES + IB_UDP_BYTES + IB_BTH_BYTES + IB_EXT_XRC_BYTES + IB_EXT_ATOMICETH_BYTES + IB_ICRC_BYTES); if (mtu >= ib_mtu_enum_to_int(IB_MTU_4096)) return IB_MTU_4096; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_2048)) return IB_MTU_2048; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_1024)) return IB_MTU_1024; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_512)) return IB_MTU_512; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_256)) return IB_MTU_256; else return 0; } static inline int rdma_link_local_addr(struct in6_addr *addr) { if (addr->s6_addr32[0] == htonl(0xfe800000) && addr->s6_addr32[1] == 0) return 1; return 0; } static inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac) { memcpy(mac, &addr->s6_addr[8], 3); memcpy(mac + 3, &addr->s6_addr[13], 3); mac[0] ^= 2; } static inline int rdma_is_multicast_addr(struct in6_addr *addr) { __be32 ipv4_addr; if (addr->s6_addr[0] == 0xff) return 1; ipv4_addr = addr->s6_addr32[3]; return (ipv6_addr_v4mapped(addr) && ipv4_is_multicast(ipv4_addr)); } static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac) { int i; mac[0] = 0x33; mac[1] = 0x33; for (i = 2; i < 6; ++i) mac[i] = addr->s6_addr[i + 10]; } static inline u16 rdma_get_vlan_id(union ib_gid *dgid) { u16 vid; vid = dgid->raw[11] << 8 | dgid->raw[12]; return vid < 0x1000 ? vid : 0xffff; } static inline struct net_device *rdma_vlan_dev_real_dev(const struct net_device *dev) { return is_vlan_dev(dev) ? vlan_dev_real_dev(dev) : NULL; } #endif /* IB_ADDR_H */
48 41 30 2 1 1 76 266 126 8 1 7 7 7 43 1 1 1 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> * Martin Josefsson <gandalf@wlug.westbo.se> * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ #ifndef _IP_SET_H #define _IP_SET_H #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/stringify.h> #include <linux/vmalloc.h> #include <net/netlink.h> #include <uapi/linux/netfilter/ipset/ip_set.h> #define _IP_SET_MODULE_DESC(a, b, c) \ MODULE_DESCRIPTION(a " type of IP sets, revisions " b "-" c) #define IP_SET_MODULE_DESC(a, b, c) \ _IP_SET_MODULE_DESC(a, __stringify(b), __stringify(c)) /* Set features */ enum ip_set_feature { IPSET_TYPE_IP_FLAG = 0, IPSET_TYPE_IP = (1 << IPSET_TYPE_IP_FLAG), IPSET_TYPE_PORT_FLAG = 1, IPSET_TYPE_PORT = (1 << IPSET_TYPE_PORT_FLAG), IPSET_TYPE_MAC_FLAG = 2, IPSET_TYPE_MAC = (1 << IPSET_TYPE_MAC_FLAG), IPSET_TYPE_IP2_FLAG = 3, IPSET_TYPE_IP2 = (1 << IPSET_TYPE_IP2_FLAG), IPSET_TYPE_NAME_FLAG = 4, IPSET_TYPE_NAME = (1 << IPSET_TYPE_NAME_FLAG), IPSET_TYPE_IFACE_FLAG = 5, IPSET_TYPE_IFACE = (1 << IPSET_TYPE_IFACE_FLAG), IPSET_TYPE_MARK_FLAG = 6, IPSET_TYPE_MARK = (1 << IPSET_TYPE_MARK_FLAG), IPSET_TYPE_NOMATCH_FLAG = 7, IPSET_TYPE_NOMATCH = (1 << IPSET_TYPE_NOMATCH_FLAG), /* Strictly speaking not a feature, but a flag for dumping: * this settype must be dumped last */ IPSET_DUMP_LAST_FLAG = 8, IPSET_DUMP_LAST = (1 << IPSET_DUMP_LAST_FLAG), }; /* Set extensions */ enum ip_set_extension { IPSET_EXT_BIT_TIMEOUT = 0, IPSET_EXT_TIMEOUT = (1 << IPSET_EXT_BIT_TIMEOUT), IPSET_EXT_BIT_COUNTER = 1, IPSET_EXT_COUNTER = (1 << IPSET_EXT_BIT_COUNTER), IPSET_EXT_BIT_COMMENT = 2, IPSET_EXT_COMMENT = (1 << IPSET_EXT_BIT_COMMENT), IPSET_EXT_BIT_SKBINFO = 3, IPSET_EXT_SKBINFO = (1 << IPSET_EXT_BIT_SKBINFO), /* Mark set with an extension which needs to call destroy */ IPSET_EXT_BIT_DESTROY = 7, IPSET_EXT_DESTROY = (1 << IPSET_EXT_BIT_DESTROY), }; #define SET_WITH_TIMEOUT(s) ((s)->extensions & IPSET_EXT_TIMEOUT) #define SET_WITH_COUNTER(s) ((s)->extensions & IPSET_EXT_COUNTER) #define SET_WITH_COMMENT(s) ((s)->extensions & IPSET_EXT_COMMENT) #define SET_WITH_SKBINFO(s) ((s)->extensions & IPSET_EXT_SKBINFO) #define SET_WITH_FORCEADD(s) ((s)->flags & IPSET_CREATE_FLAG_FORCEADD) /* Extension id, in size order */ enum ip_set_ext_id { IPSET_EXT_ID_COUNTER = 0, IPSET_EXT_ID_TIMEOUT, IPSET_EXT_ID_SKBINFO, IPSET_EXT_ID_COMMENT, IPSET_EXT_ID_MAX, }; struct ip_set; /* Extension type */ struct ip_set_ext_type { /* Destroy extension private data (can be NULL) */ void (*destroy)(struct ip_set *set, void *ext); enum ip_set_extension type; enum ipset_cadt_flags flag; /* Size and minimal alignment */ u8 len; u8 align; }; extern const struct ip_set_ext_type ip_set_extensions[]; struct ip_set_counter { atomic64_t bytes; atomic64_t packets; }; struct ip_set_comment_rcu { struct rcu_head rcu; char str[]; }; struct ip_set_comment { struct ip_set_comment_rcu __rcu *c; }; struct ip_set_skbinfo { u32 skbmark; u32 skbmarkmask; u32 skbprio; u16 skbqueue; u16 __pad; }; struct ip_set_ext { struct ip_set_skbinfo skbinfo; u64 packets; u64 bytes; char *comment; u32 timeout; u8 packets_op; u8 bytes_op; bool target; }; #define ext_timeout(e, s) \ ((unsigned long *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_TIMEOUT])) #define ext_counter(e, s) \ ((struct ip_set_counter *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COUNTER])) #define ext_comment(e, s) \ ((struct ip_set_comment *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COMMENT])) #define ext_skbinfo(e, s) \ ((struct ip_set_skbinfo *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_SKBINFO])) typedef int (*ipset_adtfn)(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 cmdflags); /* Kernel API function options */ struct ip_set_adt_opt { u8 family; /* Actual protocol family */ u8 dim; /* Dimension of match/target */ u8 flags; /* Direction and negation flags */ u32 cmdflags; /* Command-like flags */ struct ip_set_ext ext; /* Extensions */ }; /* Set type, variant-specific part */ struct ip_set_type_variant { /* Kernelspace: test/add/del entries * returns negative error code, * zero for no match/success to add/delete * positive for matching element */ int (*kadt)(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt); /* Userspace: test/add/del entries * returns negative error code, * zero for no match/success to add/delete * positive for matching element */ int (*uadt)(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried); /* Low level add/del/test functions */ ipset_adtfn adt[IPSET_ADT_MAX]; /* When adding entries and set is full, try to resize the set */ int (*resize)(struct ip_set *set, bool retried); /* Destroy the set */ void (*destroy)(struct ip_set *set); /* Flush the elements */ void (*flush)(struct ip_set *set); /* Expire entries before listing */ void (*expire)(struct ip_set *set); /* List set header data */ int (*head)(struct ip_set *set, struct sk_buff *skb); /* List elements */ int (*list)(const struct ip_set *set, struct sk_buff *skb, struct netlink_callback *cb); /* Keep listing private when resizing runs parallel */ void (*uref)(struct ip_set *set, struct netlink_callback *cb, bool start); /* Return true if "b" set is the same as "a" * according to the create set parameters */ bool (*same_set)(const struct ip_set *a, const struct ip_set *b); /* Cancel ongoing garbage collectors before destroying the set*/ void (*cancel_gc)(struct ip_set *set); /* Region-locking is used */ bool region_lock; }; struct ip_set_region { spinlock_t lock; /* Region lock */ size_t ext_size; /* Size of the dynamic extensions */ u32 elements; /* Number of elements vs timeout */ }; /* Max range where every element is added/deleted in one step */ #define IPSET_MAX_RANGE (1<<14) /* The max revision number supported by any set type + 1 */ #define IPSET_REVISION_MAX 9 /* The core set type structure */ struct ip_set_type { struct list_head list; /* Typename */ char name[IPSET_MAXNAMELEN]; /* Protocol version */ u8 protocol; /* Set type dimension */ u8 dimension; /* * Supported family: may be NFPROTO_UNSPEC for both * NFPROTO_IPV4/NFPROTO_IPV6. */ u8 family; /* Type revisions */ u8 revision_min, revision_max; /* Revision-specific supported (create) flags */ u8 create_flags[IPSET_REVISION_MAX+1]; /* Set features to control swapping */ u16 features; /* Create set */ int (*create)(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags); /* Attribute policies */ const struct nla_policy create_policy[IPSET_ATTR_CREATE_MAX + 1]; const struct nla_policy adt_policy[IPSET_ATTR_ADT_MAX + 1]; /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; }; /* register and unregister set type */ extern int ip_set_type_register(struct ip_set_type *set_type); extern void ip_set_type_unregister(struct ip_set_type *set_type); /* A generic IP set */ struct ip_set { /* For call_cru in destroy */ struct rcu_head rcu; /* The name of the set */ char name[IPSET_MAXNAMELEN]; /* Lock protecting the set data */ spinlock_t lock; /* References to the set */ u32 ref; /* References to the set for netlink events like dump, * ref can be swapped out by ip_set_swap */ u32 ref_netlink; /* The core set type */ struct ip_set_type *type; /* The type variant doing the real job */ const struct ip_set_type_variant *variant; /* The actual INET family of the set */ u8 family; /* The type revision */ u8 revision; /* Extensions */ u8 extensions; /* Create flags */ u8 flags; /* Default timeout value, if enabled */ u32 timeout; /* Number of elements (vs timeout) */ u32 elements; /* Size of the dynamic extensions (vs timeout) */ size_t ext_size; /* Element data size */ size_t dsize; /* Offsets to extensions in elements */ size_t offset[IPSET_EXT_ID_MAX]; /* The type specific data */ void *data; }; static inline void ip_set_ext_destroy(struct ip_set *set, void *data) { /* Check that the extension is enabled for the set and * call it's destroy function for its extension part in data. */ if (SET_WITH_COMMENT(set)) { struct ip_set_comment *c = ext_comment(data, set); ip_set_extensions[IPSET_EXT_ID_COMMENT].destroy(set, c); } } int ip_set_put_flags(struct sk_buff *skb, struct ip_set *set); /* Netlink CB args */ enum { IPSET_CB_NET = 0, /* net namespace */ IPSET_CB_PROTO, /* ipset protocol */ IPSET_CB_DUMP, /* dump single set/all sets */ IPSET_CB_INDEX, /* set index */ IPSET_CB_PRIVATE, /* set private data */ IPSET_CB_ARG0, /* type specific */ }; /* register and unregister set references */ extern ip_set_id_t ip_set_get_byname(struct net *net, const char *name, struct ip_set **set); extern void ip_set_put_byindex(struct net *net, ip_set_id_t index); extern void ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name); extern ip_set_id_t ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index); extern void ip_set_nfnl_put(struct net *net, ip_set_id_t index); /* API for iptables set match, and SET target */ extern int ip_set_add(ip_set_id_t id, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt); extern int ip_set_del(ip_set_id_t id, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt); extern int ip_set_test(ip_set_id_t id, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt); /* Utility functions */ extern void *ip_set_alloc(size_t size); extern void ip_set_free(void *members); extern int ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr); extern int ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr); extern size_t ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len, size_t align); extern int ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext *ext); extern int ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, const void *e, bool active); extern bool ip_set_match_extensions(struct ip_set *set, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags, void *data); static inline int ip_set_get_hostipaddr4(struct nlattr *nla, u32 *ipaddr) { __be32 ip; int ret = ip_set_get_ipaddr4(nla, &ip); if (ret) return ret; *ipaddr = ntohl(ip); return 0; } /* Ignore IPSET_ERR_EXIST errors if asked to do so? */ static inline bool ip_set_eexist(int ret, u32 flags) { return ret == -IPSET_ERR_EXIST && (flags & IPSET_FLAG_EXIST); } /* Match elements marked with nomatch */ static inline bool ip_set_enomatch(int ret, u32 flags, enum ipset_adt adt, struct ip_set *set) { return adt == IPSET_TEST && (set->type->features & IPSET_TYPE_NOMATCH) && ((flags >> 16) & IPSET_FLAG_NOMATCH) && (ret > 0 || ret == -ENOTEMPTY); } /* Check the NLA_F_NET_BYTEORDER flag */ static inline bool ip_set_attr_netorder(struct nlattr *tb[], int type) { return tb[type] && (tb[type]->nla_type & NLA_F_NET_BYTEORDER); } static inline bool ip_set_optattr_netorder(struct nlattr *tb[], int type) { return !tb[type] || (tb[type]->nla_type & NLA_F_NET_BYTEORDER); } /* Useful converters */ static inline u32 ip_set_get_h32(const struct nlattr *attr) { return ntohl(nla_get_be32(attr)); } static inline u16 ip_set_get_h16(const struct nlattr *attr) { return ntohs(nla_get_be16(attr)); } static inline int nla_put_ipaddr4(struct sk_buff *skb, int type, __be32 ipaddr) { struct nlattr *__nested = nla_nest_start(skb, type); int ret; if (!__nested) return -EMSGSIZE; ret = nla_put_in_addr(skb, IPSET_ATTR_IPADDR_IPV4, ipaddr); if (!ret) nla_nest_end(skb, __nested); return ret; } static inline int nla_put_ipaddr6(struct sk_buff *skb, int type, const struct in6_addr *ipaddrptr) { struct nlattr *__nested = nla_nest_start(skb, type); int ret; if (!__nested) return -EMSGSIZE; ret = nla_put_in6_addr(skb, IPSET_ATTR_IPADDR_IPV6, ipaddrptr); if (!ret) nla_nest_end(skb, __nested); return ret; } /* Get address from skbuff */ static inline __be32 ip4addr(const struct sk_buff *skb, bool src) { return src ? ip_hdr(skb)->saddr : ip_hdr(skb)->daddr; } static inline void ip4addrptr(const struct sk_buff *skb, bool src, __be32 *addr) { *addr = src ? ip_hdr(skb)->saddr : ip_hdr(skb)->daddr; } static inline void ip6addrptr(const struct sk_buff *skb, bool src, struct in6_addr *addr) { memcpy(addr, src ? &ipv6_hdr(skb)->saddr : &ipv6_hdr(skb)->daddr, sizeof(*addr)); } /* How often should the gc be run by default */ #define IPSET_GC_TIME (3 * 60) /* Timeout period depending on the timeout value of the given set */ #define IPSET_GC_PERIOD(timeout) \ ((timeout/3) ? min_t(u32, (timeout)/3, IPSET_GC_TIME) : 1) /* Entry is set with no timeout value */ #define IPSET_ELEM_PERMANENT 0 /* Set is defined with timeout support: timeout value may be 0 */ #define IPSET_NO_TIMEOUT UINT_MAX /* Max timeout value, see msecs_to_jiffies() in jiffies.h */ #define IPSET_MAX_TIMEOUT (UINT_MAX >> 1)/MSEC_PER_SEC #define ip_set_adt_opt_timeout(opt, set) \ ((opt)->ext.timeout != IPSET_NO_TIMEOUT ? (opt)->ext.timeout : (set)->timeout) static inline unsigned int ip_set_timeout_uget(struct nlattr *tb) { unsigned int timeout = ip_set_get_h32(tb); /* Normalize to fit into jiffies */ if (timeout > IPSET_MAX_TIMEOUT) timeout = IPSET_MAX_TIMEOUT; return timeout; } static inline bool ip_set_timeout_expired(const unsigned long *t) { return *t != IPSET_ELEM_PERMANENT && time_is_before_jiffies(*t); } static inline void ip_set_timeout_set(unsigned long *timeout, u32 value) { unsigned long t; if (!value) { *timeout = IPSET_ELEM_PERMANENT; return; } t = msecs_to_jiffies(value * MSEC_PER_SEC) + jiffies; if (t == IPSET_ELEM_PERMANENT) /* Bingo! :-) */ t--; *timeout = t; } void ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment, const struct ip_set_ext *ext); static inline void ip_set_init_counter(struct ip_set_counter *counter, const struct ip_set_ext *ext) { if (ext->bytes != ULLONG_MAX) atomic64_set(&(counter)->bytes, (long long)(ext->bytes)); if (ext->packets != ULLONG_MAX) atomic64_set(&(counter)->packets, (long long)(ext->packets)); } static inline void ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo, const struct ip_set_ext *ext) { *skbinfo = ext->skbinfo; } static inline void nf_inet_addr_mask_inplace(union nf_inet_addr *a1, const union nf_inet_addr *mask) { a1->all[0] &= mask->all[0]; a1->all[1] &= mask->all[1]; a1->all[2] &= mask->all[2]; a1->all[3] &= mask->all[3]; } #define IP_SET_INIT_KEXT(skb, opt, set) \ { .bytes = (skb)->len, .packets = 1, .target = true,\ .timeout = ip_set_adt_opt_timeout(opt, set) } #define IP_SET_INIT_UEXT(set) \ { .bytes = ULLONG_MAX, .packets = ULLONG_MAX, \ .timeout = (set)->timeout } #define IPSET_CONCAT(a, b) a##b #define IPSET_TOKEN(a, b) IPSET_CONCAT(a, b) #endif /*_IP_SET_H */
10 2 3 8 12 10 144 12 623 47 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM compaction #if !defined(_TRACE_COMPACTION_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_COMPACTION_H #include <linux/types.h> #include <linux/list.h> #include <linux/tracepoint.h> #include <trace/events/mmflags.h> DECLARE_EVENT_CLASS(mm_compaction_isolate_template, TP_PROTO( unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_scanned, unsigned long nr_taken), TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken), TP_STRUCT__entry( __field(unsigned long, start_pfn) __field(unsigned long, end_pfn) __field(unsigned long, nr_scanned) __field(unsigned long, nr_taken) ), TP_fast_assign( __entry->start_pfn = start_pfn; __entry->end_pfn = end_pfn; __entry->nr_scanned = nr_scanned; __entry->nr_taken = nr_taken; ), TP_printk("range=(0x%lx ~ 0x%lx) nr_scanned=%lu nr_taken=%lu", __entry->start_pfn, __entry->end_pfn, __entry->nr_scanned, __entry->nr_taken) ); DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_migratepages, TP_PROTO( unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_scanned, unsigned long nr_taken), TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) ); DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, TP_PROTO( unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_scanned, unsigned long nr_taken), TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) ); DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_fast_isolate_freepages, TP_PROTO( unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_scanned, unsigned long nr_taken), TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) ); #ifdef CONFIG_COMPACTION TRACE_EVENT(mm_compaction_migratepages, TP_PROTO(unsigned int nr_migratepages, unsigned int nr_succeeded), TP_ARGS(nr_migratepages, nr_succeeded), TP_STRUCT__entry( __field(unsigned long, nr_migrated) __field(unsigned long, nr_failed) ), TP_fast_assign( __entry->nr_migrated = nr_succeeded; __entry->nr_failed = nr_migratepages - nr_succeeded; ), TP_printk("nr_migrated=%lu nr_failed=%lu", __entry->nr_migrated, __entry->nr_failed) ); TRACE_EVENT(mm_compaction_begin, TP_PROTO(struct compact_control *cc, unsigned long zone_start, unsigned long zone_end, bool sync), TP_ARGS(cc, zone_start, zone_end, sync), TP_STRUCT__entry( __field(unsigned long, zone_start) __field(unsigned long, migrate_pfn) __field(unsigned long, free_pfn) __field(unsigned long, zone_end) __field(bool, sync) ), TP_fast_assign( __entry->zone_start = zone_start; __entry->migrate_pfn = cc->migrate_pfn; __entry->free_pfn = cc->free_pfn; __entry->zone_end = zone_end; __entry->sync = sync; ), TP_printk("zone_start=0x%lx migrate_pfn=0x%lx free_pfn=0x%lx zone_end=0x%lx, mode=%s", __entry->zone_start, __entry->migrate_pfn, __entry->free_pfn, __entry->zone_end, __entry->sync ? "sync" : "async") ); TRACE_EVENT(mm_compaction_end, TP_PROTO(struct compact_control *cc, unsigned long zone_start, unsigned long zone_end, bool sync, int status), TP_ARGS(cc, zone_start, zone_end, sync, status), TP_STRUCT__entry( __field(unsigned long, zone_start) __field(unsigned long, migrate_pfn) __field(unsigned long, free_pfn) __field(unsigned long, zone_end) __field(bool, sync) __field(int, status) ), TP_fast_assign( __entry->zone_start = zone_start; __entry->migrate_pfn = cc->migrate_pfn; __entry->free_pfn = cc->free_pfn; __entry->zone_end = zone_end; __entry->sync = sync; __entry->status = status; ), TP_printk("zone_start=0x%lx migrate_pfn=0x%lx free_pfn=0x%lx zone_end=0x%lx, mode=%s status=%s", __entry->zone_start, __entry->migrate_pfn, __entry->free_pfn, __entry->zone_end, __entry->sync ? "sync" : "async", __print_symbolic(__entry->status, COMPACTION_STATUS)) ); TRACE_EVENT(mm_compaction_try_to_compact_pages, TP_PROTO( int order, gfp_t gfp_mask, int prio), TP_ARGS(order, gfp_mask, prio), TP_STRUCT__entry( __field(int, order) __field(unsigned long, gfp_mask) __field(int, prio) ), TP_fast_assign( __entry->order = order; __entry->gfp_mask = (__force unsigned long)gfp_mask; __entry->prio = prio; ), TP_printk("order=%d gfp_mask=%s priority=%d", __entry->order, show_gfp_flags(__entry->gfp_mask), __entry->prio) ); DECLARE_EVENT_CLASS(mm_compaction_suitable_template, TP_PROTO(struct zone *zone, int order, int ret), TP_ARGS(zone, order, ret), TP_STRUCT__entry( __field(int, nid) __field(enum zone_type, idx) __field(int, order) __field(int, ret) ), TP_fast_assign( __entry->nid = zone_to_nid(zone); __entry->idx = zone_idx(zone); __entry->order = order; __entry->ret = ret; ), TP_printk("node=%d zone=%-8s order=%d ret=%s", __entry->nid, __print_symbolic(__entry->idx, ZONE_TYPE), __entry->order, __print_symbolic(__entry->ret, COMPACTION_STATUS)) ); DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_finished, TP_PROTO(struct zone *zone, int order, int ret), TP_ARGS(zone, order, ret) ); DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_suitable, TP_PROTO(struct zone *zone, int order, int ret), TP_ARGS(zone, order, ret) ); DECLARE_EVENT_CLASS(mm_compaction_defer_template, TP_PROTO(struct zone *zone, int order), TP_ARGS(zone, order), TP_STRUCT__entry( __field(int, nid) __field(enum zone_type, idx) __field(int, order) __field(unsigned int, considered) __field(unsigned int, defer_shift) __field(int, order_failed) ), TP_fast_assign( __entry->nid = zone_to_nid(zone); __entry->idx = zone_idx(zone); __entry->order = order; __entry->considered = zone->compact_considered; __entry->defer_shift = zone->compact_defer_shift; __entry->order_failed = zone->compact_order_failed; ), TP_printk("node=%d zone=%-8s order=%d order_failed=%d consider=%u limit=%lu", __entry->nid, __print_symbolic(__entry->idx, ZONE_TYPE), __entry->order, __entry->order_failed, __entry->considered, 1UL << __entry->defer_shift) ); DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_deferred, TP_PROTO(struct zone *zone, int order), TP_ARGS(zone, order) ); DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_compaction, TP_PROTO(struct zone *zone, int order), TP_ARGS(zone, order) ); DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_reset, TP_PROTO(struct zone *zone, int order), TP_ARGS(zone, order) ); TRACE_EVENT(mm_compaction_kcompactd_sleep, TP_PROTO(int nid), TP_ARGS(nid), TP_STRUCT__entry( __field(int, nid) ), TP_fast_assign( __entry->nid = nid; ), TP_printk("nid=%d", __entry->nid) ); DECLARE_EVENT_CLASS(kcompactd_wake_template, TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), TP_ARGS(nid, order, highest_zoneidx), TP_STRUCT__entry( __field(int, nid) __field(int, order) __field(enum zone_type, highest_zoneidx) ), TP_fast_assign( __entry->nid = nid; __entry->order = order; __entry->highest_zoneidx = highest_zoneidx; ), /* * classzone_idx is previous name of the highest_zoneidx. * Reason not to change it is the ABI requirement of the tracepoint. */ TP_printk("nid=%d order=%d classzone_idx=%-8s", __entry->nid, __entry->order, __print_symbolic(__entry->highest_zoneidx, ZONE_TYPE)) ); DEFINE_EVENT(kcompactd_wake_template, mm_compaction_wakeup_kcompactd, TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), TP_ARGS(nid, order, highest_zoneidx) ); DEFINE_EVENT(kcompactd_wake_template, mm_compaction_kcompactd_wake, TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), TP_ARGS(nid, order, highest_zoneidx) ); #endif #endif /* _TRACE_COMPACTION_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
425 3 17 23 1 17 48 464 4 4 4 1 1 1 201 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 /* SPDX-License-Identifier: GPL-2.0 */ #if !defined(_TRACE_KVM_MAIN_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_KVM_MAIN_H #include <linux/tracepoint.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm #define ERSN(x) { KVM_EXIT_##x, "KVM_EXIT_" #x } #define kvm_trace_exit_reason \ ERSN(UNKNOWN), ERSN(EXCEPTION), ERSN(IO), ERSN(HYPERCALL), \ ERSN(DEBUG), ERSN(HLT), ERSN(MMIO), ERSN(IRQ_WINDOW_OPEN), \ ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \ ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\ ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \ ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH), ERSN(EPR),\ ERSN(SYSTEM_EVENT), ERSN(S390_STSI), ERSN(IOAPIC_EOI), \ ERSN(HYPERV), ERSN(ARM_NISV), ERSN(X86_RDMSR), ERSN(X86_WRMSR) TRACE_EVENT(kvm_userspace_exit, TP_PROTO(__u32 reason, int errno), TP_ARGS(reason, errno), TP_STRUCT__entry( __field( __u32, reason ) __field( int, errno ) ), TP_fast_assign( __entry->reason = reason; __entry->errno = errno; ), TP_printk("reason %s (%d)", __entry->errno < 0 ? (__entry->errno == -EINTR ? "restart" : "error") : __print_symbolic(__entry->reason, kvm_trace_exit_reason), __entry->errno < 0 ? -__entry->errno : __entry->reason) ); TRACE_EVENT(kvm_vcpu_wakeup, TP_PROTO(__u64 ns, bool waited, bool valid), TP_ARGS(ns, waited, valid), TP_STRUCT__entry( __field( __u64, ns ) __field( bool, waited ) __field( bool, valid ) ), TP_fast_assign( __entry->ns = ns; __entry->waited = waited; __entry->valid = valid; ), TP_printk("%s time %lld ns, polling %s", __entry->waited ? "wait" : "poll", __entry->ns, __entry->valid ? "valid" : "invalid") ); #if defined(CONFIG_HAVE_KVM_IRQCHIP) TRACE_EVENT(kvm_set_irq, TP_PROTO(unsigned int gsi, int level, int irq_source_id), TP_ARGS(gsi, level, irq_source_id), TP_STRUCT__entry( __field( unsigned int, gsi ) __field( int, level ) __field( int, irq_source_id ) ), TP_fast_assign( __entry->gsi = gsi; __entry->level = level; __entry->irq_source_id = irq_source_id; ), TP_printk("gsi %u level %d source %d", __entry->gsi, __entry->level, __entry->irq_source_id) ); #endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */ #if defined(__KVM_HAVE_IOAPIC) #define kvm_deliver_mode \ {0x0, "Fixed"}, \ {0x1, "LowPrio"}, \ {0x2, "SMI"}, \ {0x3, "Res3"}, \ {0x4, "NMI"}, \ {0x5, "INIT"}, \ {0x6, "SIPI"}, \ {0x7, "ExtINT"} TRACE_EVENT(kvm_ioapic_set_irq, TP_PROTO(__u64 e, int pin, bool coalesced), TP_ARGS(e, pin, coalesced), TP_STRUCT__entry( __field( __u64, e ) __field( int, pin ) __field( bool, coalesced ) ), TP_fast_assign( __entry->e = e; __entry->pin = pin; __entry->coalesced = coalesced; ), TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s", __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e, __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), (__entry->e & (1<<11)) ? "logical" : "physical", (__entry->e & (1<<15)) ? "level" : "edge", (__entry->e & (1<<16)) ? "|masked" : "", __entry->coalesced ? " (coalesced)" : "") ); TRACE_EVENT(kvm_ioapic_delayed_eoi_inj, TP_PROTO(__u64 e), TP_ARGS(e), TP_STRUCT__entry( __field( __u64, e ) ), TP_fast_assign( __entry->e = e; ), TP_printk("dst %x vec %u (%s|%s|%s%s)", (u8)(__entry->e >> 56), (u8)__entry->e, __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), (__entry->e & (1<<11)) ? "logical" : "physical", (__entry->e & (1<<15)) ? "level" : "edge", (__entry->e & (1<<16)) ? "|masked" : "") ); TRACE_EVENT(kvm_msi_set_irq, TP_PROTO(__u64 address, __u64 data), TP_ARGS(address, data), TP_STRUCT__entry( __field( __u64, address ) __field( __u64, data ) ), TP_fast_assign( __entry->address = address; __entry->data = data; ), TP_printk("dst %llx vec %u (%s|%s|%s%s)", (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00), (u8)__entry->data, __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode), (__entry->address & (1<<2)) ? "logical" : "physical", (__entry->data & (1<<15)) ? "level" : "edge", (__entry->address & (1<<3)) ? "|rh" : "") ); #define kvm_irqchips \ {KVM_IRQCHIP_PIC_MASTER, "PIC master"}, \ {KVM_IRQCHIP_PIC_SLAVE, "PIC slave"}, \ {KVM_IRQCHIP_IOAPIC, "IOAPIC"} #endif /* defined(__KVM_HAVE_IOAPIC) */ #if defined(CONFIG_HAVE_KVM_IRQCHIP) #ifdef kvm_irqchips #define kvm_ack_irq_string "irqchip %s pin %u" #define kvm_ack_irq_parm __print_symbolic(__entry->irqchip, kvm_irqchips), __entry->pin #else #define kvm_ack_irq_string "irqchip %d pin %u" #define kvm_ack_irq_parm __entry->irqchip, __entry->pin #endif TRACE_EVENT(kvm_ack_irq, TP_PROTO(unsigned int irqchip, unsigned int pin), TP_ARGS(irqchip, pin), TP_STRUCT__entry( __field( unsigned int, irqchip ) __field( unsigned int, pin ) ), TP_fast_assign( __entry->irqchip = irqchip; __entry->pin = pin; ), TP_printk(kvm_ack_irq_string, kvm_ack_irq_parm) ); #endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */ #define KVM_TRACE_MMIO_READ_UNSATISFIED 0 #define KVM_TRACE_MMIO_READ 1 #define KVM_TRACE_MMIO_WRITE 2 #define kvm_trace_symbol_mmio \ { KVM_TRACE_MMIO_READ_UNSATISFIED, "unsatisfied-read" }, \ { KVM_TRACE_MMIO_READ, "read" }, \ { KVM_TRACE_MMIO_WRITE, "write" } TRACE_EVENT(kvm_mmio, TP_PROTO(int type, int len, u64 gpa, void *val), TP_ARGS(type, len, gpa, val), TP_STRUCT__entry( __field( u32, type ) __field( u32, len ) __field( u64, gpa ) __field( u64, val ) ), TP_fast_assign( __entry->type = type; __entry->len = len; __entry->gpa = gpa; __entry->val = 0; if (val) memcpy(&__entry->val, val, min_t(u32, sizeof(__entry->val), len)); ), TP_printk("mmio %s len %u gpa 0x%llx val 0x%llx", __print_symbolic(__entry->type, kvm_trace_symbol_mmio), __entry->len, __entry->gpa, __entry->val) ); #define KVM_TRACE_IOCSR_READ_UNSATISFIED 0 #define KVM_TRACE_IOCSR_READ 1 #define KVM_TRACE_IOCSR_WRITE 2 #define kvm_trace_symbol_iocsr \ { KVM_TRACE_IOCSR_READ_UNSATISFIED, "unsatisfied-read" }, \ { KVM_TRACE_IOCSR_READ, "read" }, \ { KVM_TRACE_IOCSR_WRITE, "write" } TRACE_EVENT(kvm_iocsr, TP_PROTO(int type, int len, u64 gpa, void *val), TP_ARGS(type, len, gpa, val), TP_STRUCT__entry( __field( u32, type ) __field( u32, len ) __field( u64, gpa ) __field( u64, val ) ), TP_fast_assign( __entry->type = type; __entry->len = len; __entry->gpa = gpa; __entry->val = 0; if (val) memcpy(&__entry->val, val, min_t(u32, sizeof(__entry->val), len)); ), TP_printk("iocsr %s len %u gpa 0x%llx val 0x%llx", __print_symbolic(__entry->type, kvm_trace_symbol_iocsr), __entry->len, __entry->gpa, __entry->val) ); #define kvm_fpu_load_symbol \ {0, "unload"}, \ {1, "load"} TRACE_EVENT(kvm_fpu, TP_PROTO(int load), TP_ARGS(load), TP_STRUCT__entry( __field( u32, load ) ), TP_fast_assign( __entry->load = load; ), TP_printk("%s", __print_symbolic(__entry->load, kvm_fpu_load_symbol)) ); #ifdef CONFIG_KVM_ASYNC_PF DECLARE_EVENT_CLASS(kvm_async_get_page_class, TP_PROTO(u64 gva, u64 gfn), TP_ARGS(gva, gfn), TP_STRUCT__entry( __field(__u64, gva) __field(u64, gfn) ), TP_fast_assign( __entry->gva = gva; __entry->gfn = gfn; ), TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn) ); DEFINE_EVENT(kvm_async_get_page_class, kvm_try_async_get_page, TP_PROTO(u64 gva, u64 gfn), TP_ARGS(gva, gfn) ); DEFINE_EVENT(kvm_async_get_page_class, kvm_async_pf_repeated_fault, TP_PROTO(u64 gva, u64 gfn), TP_ARGS(gva, gfn) ); DECLARE_EVENT_CLASS(kvm_async_pf_nopresent_ready, TP_PROTO(u64 token, u64 gva), TP_ARGS(token, gva), TP_STRUCT__entry( __field(__u64, token) __field(__u64, gva) ), TP_fast_assign( __entry->token = token; __entry->gva = gva; ), TP_printk("token %#llx gva %#llx", __entry->token, __entry->gva) ); DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_not_present, TP_PROTO(u64 token, u64 gva), TP_ARGS(token, gva) ); DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_ready, TP_PROTO(u64 token, u64 gva), TP_ARGS(token, gva) ); TRACE_EVENT( kvm_async_pf_completed, TP_PROTO(unsigned long address, u64 gva), TP_ARGS(address, gva), TP_STRUCT__entry( __field(unsigned long, address) __field(u64, gva) ), TP_fast_assign( __entry->address = address; __entry->gva = gva; ), TP_printk("gva %#llx address %#lx", __entry->gva, __entry->address) ); #endif TRACE_EVENT(kvm_halt_poll_ns, TP_PROTO(bool grow, unsigned int vcpu_id, unsigned int new, unsigned int old), TP_ARGS(grow, vcpu_id, new, old), TP_STRUCT__entry( __field(bool, grow) __field(unsigned int, vcpu_id) __field(unsigned int, new) __field(unsigned int, old) ), TP_fast_assign( __entry->grow = grow; __entry->vcpu_id = vcpu_id; __entry->new = new; __entry->old = old; ), TP_printk("vcpu %u: halt_poll_ns %u (%s %u)", __entry->vcpu_id, __entry->new, __entry->grow ? "grow" : "shrink", __entry->old) ); #define trace_kvm_halt_poll_ns_grow(vcpu_id, new, old) \ trace_kvm_halt_poll_ns(true, vcpu_id, new, old) #define trace_kvm_halt_poll_ns_shrink(vcpu_id, new, old) \ trace_kvm_halt_poll_ns(false, vcpu_id, new, old) TRACE_EVENT(kvm_dirty_ring_push, TP_PROTO(struct kvm_dirty_ring *ring, u32 slot, u64 offset), TP_ARGS(ring, slot, offset), TP_STRUCT__entry( __field(int, index) __field(u32, dirty_index) __field(u32, reset_index) __field(u32, slot) __field(u64, offset) ), TP_fast_assign( __entry->index = ring->index; __entry->dirty_index = ring->dirty_index; __entry->reset_index = ring->reset_index; __entry->slot = slot; __entry->offset = offset; ), TP_printk("ring %d: dirty 0x%x reset 0x%x " "slot %u offset 0x%llx (used %u)", __entry->index, __entry->dirty_index, __entry->reset_index, __entry->slot, __entry->offset, __entry->dirty_index - __entry->reset_index) ); TRACE_EVENT(kvm_dirty_ring_reset, TP_PROTO(struct kvm_dirty_ring *ring), TP_ARGS(ring), TP_STRUCT__entry( __field(int, index) __field(u32, dirty_index) __field(u32, reset_index) ), TP_fast_assign( __entry->index = ring->index; __entry->dirty_index = ring->dirty_index; __entry->reset_index = ring->reset_index; ), TP_printk("ring %d: dirty 0x%x reset 0x%x (used %u)", __entry->index, __entry->dirty_index, __entry->reset_index, __entry->dirty_index - __entry->reset_index) ); TRACE_EVENT(kvm_dirty_ring_exit, TP_PROTO(struct kvm_vcpu *vcpu), TP_ARGS(vcpu), TP_STRUCT__entry( __field(int, vcpu_id) ), TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; ), TP_printk("vcpu %d", __entry->vcpu_id) ); TRACE_EVENT(kvm_unmap_hva_range, TP_PROTO(unsigned long start, unsigned long end), TP_ARGS(start, end), TP_STRUCT__entry( __field( unsigned long, start ) __field( unsigned long, end ) ), TP_fast_assign( __entry->start = start; __entry->end = end; ), TP_printk("mmu notifier unmap range: %#016lx -- %#016lx", __entry->start, __entry->end) ); TRACE_EVENT(kvm_age_hva, TP_PROTO(unsigned long start, unsigned long end), TP_ARGS(start, end), TP_STRUCT__entry( __field( unsigned long, start ) __field( unsigned long, end ) ), TP_fast_assign( __entry->start = start; __entry->end = end; ), TP_printk("mmu notifier age hva: %#016lx -- %#016lx", __entry->start, __entry->end) ); TRACE_EVENT(kvm_test_age_hva, TP_PROTO(unsigned long hva), TP_ARGS(hva), TP_STRUCT__entry( __field( unsigned long, hva ) ), TP_fast_assign( __entry->hva = hva; ), TP_printk("mmu notifier test age hva: %#016lx", __entry->hva) ); #endif /* _TRACE_KVM_MAIN_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
49 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM dccp #if !defined(_TRACE_DCCP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_DCCP_H #include <net/sock.h> #include "dccp.h" #include "ccids/ccid3.h" #include <linux/tracepoint.h> #include <trace/events/net_probe_common.h> TRACE_EVENT(dccp_probe, TP_PROTO(struct sock *sk, size_t size), TP_ARGS(sk, size), TP_STRUCT__entry( /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(__u16, sport) __field(__u16, dport) __field(__u16, size) __field(__u16, tx_s) __field(__u32, tx_rtt) __field(__u32, tx_p) __field(__u32, tx_x_calc) __field(__u64, tx_x_recv) __field(__u64, tx_x) __field(__u32, tx_t_ipi) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); struct ccid3_hc_tx_sock *hc = NULL; if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3) hc = ccid3_hc_tx_sk(sk); memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS(__entry, inet, sk); /* For filtering use */ __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->size = size; if (hc) { __entry->tx_s = hc->tx_s; __entry->tx_rtt = hc->tx_rtt; __entry->tx_p = hc->tx_p; __entry->tx_x_calc = hc->tx_x_calc; __entry->tx_x_recv = hc->tx_x_recv >> 6; __entry->tx_x = hc->tx_x >> 6; __entry->tx_t_ipi = hc->tx_t_ipi; } else { __entry->tx_s = 0; memset_startat(__entry, 0, tx_rtt); } ), TP_printk("src=%pISpc dest=%pISpc size=%d tx_s=%d tx_rtt=%d " "tx_p=%d tx_x_calc=%u tx_x_recv=%llu tx_x=%llu tx_t_ipi=%d", __entry->saddr, __entry->daddr, __entry->size, __entry->tx_s, __entry->tx_rtt, __entry->tx_p, __entry->tx_x_calc, __entry->tx_x_recv, __entry->tx_x, __entry->tx_t_ipi) ); #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h>
2 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 /* SPDX-License-Identifier: GPL-2.0-only */ /* * v4l2-dv-timings - Internal header with dv-timings helper functions * * Copyright 2013 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #ifndef __V4L2_DV_TIMINGS_H #define __V4L2_DV_TIMINGS_H #include <linux/debugfs.h> #include <linux/videodev2.h> /** * v4l2_calc_timeperframe - helper function to calculate timeperframe based * v4l2_dv_timings fields. * @t: Timings for the video mode. * * Calculates the expected timeperframe using the pixel clock value and * horizontal/vertical measures. This means that v4l2_dv_timings structure * must be correctly and fully filled. */ struct v4l2_fract v4l2_calc_timeperframe(const struct v4l2_dv_timings *t); /* * v4l2_dv_timings_presets: list of all dv_timings presets. */ extern const struct v4l2_dv_timings v4l2_dv_timings_presets[]; /** * typedef v4l2_check_dv_timings_fnc - timings check callback * * @t: the v4l2_dv_timings struct. * @handle: a handle from the driver. * * Returns true if the given timings are valid. */ typedef bool v4l2_check_dv_timings_fnc(const struct v4l2_dv_timings *t, void *handle); /** * v4l2_valid_dv_timings() - are these timings valid? * * @t: the v4l2_dv_timings struct. * @cap: the v4l2_dv_timings_cap capabilities. * @fnc: callback to check if this timing is OK. May be NULL. * @fnc_handle: a handle that is passed on to @fnc. * * Returns true if the given dv_timings struct is supported by the * hardware capabilities and the callback function (if non-NULL), returns * false otherwise. */ bool v4l2_valid_dv_timings(const struct v4l2_dv_timings *t, const struct v4l2_dv_timings_cap *cap, v4l2_check_dv_timings_fnc fnc, void *fnc_handle); /** * v4l2_enum_dv_timings_cap() - Helper function to enumerate possible DV * timings based on capabilities * * @t: the v4l2_enum_dv_timings struct. * @cap: the v4l2_dv_timings_cap capabilities. * @fnc: callback to check if this timing is OK. May be NULL. * @fnc_handle: a handle that is passed on to @fnc. * * This enumerates dv_timings using the full list of possible CEA-861 and DMT * timings, filtering out any timings that are not supported based on the * hardware capabilities and the callback function (if non-NULL). * * If a valid timing for the given index is found, it will fill in @t and * return 0, otherwise it returns -EINVAL. */ int v4l2_enum_dv_timings_cap(struct v4l2_enum_dv_timings *t, const struct v4l2_dv_timings_cap *cap, v4l2_check_dv_timings_fnc fnc, void *fnc_handle); /** * v4l2_find_dv_timings_cap() - Find the closest timings struct * * @t: the v4l2_enum_dv_timings struct. * @cap: the v4l2_dv_timings_cap capabilities. * @pclock_delta: maximum delta between t->pixelclock and the timing struct * under consideration. * @fnc: callback to check if a given timings struct is OK. May be NULL. * @fnc_handle: a handle that is passed on to @fnc. * * This function tries to map the given timings to an entry in the * full list of possible CEA-861 and DMT timings, filtering out any timings * that are not supported based on the hardware capabilities and the callback * function (if non-NULL). * * On success it will fill in @t with the found timings and it returns true. * On failure it will return false. */ bool v4l2_find_dv_timings_cap(struct v4l2_dv_timings *t, const struct v4l2_dv_timings_cap *cap, unsigned pclock_delta, v4l2_check_dv_timings_fnc fnc, void *fnc_handle); /** * v4l2_find_dv_timings_cea861_vic() - find timings based on CEA-861 VIC * @t: the timings data. * @vic: CEA-861 VIC code * * On success it will fill in @t with the found timings and it returns true. * On failure it will return false. */ bool v4l2_find_dv_timings_cea861_vic(struct v4l2_dv_timings *t, u8 vic); /** * v4l2_match_dv_timings() - do two timings match? * * @measured: the measured timings data. * @standard: the timings according to the standard. * @pclock_delta: maximum delta in Hz between standard->pixelclock and * the measured timings. * @match_reduced_fps: if true, then fail if V4L2_DV_FL_REDUCED_FPS does not * match. * * Returns true if the two timings match, returns false otherwise. */ bool v4l2_match_dv_timings(const struct v4l2_dv_timings *measured, const struct v4l2_dv_timings *standard, unsigned pclock_delta, bool match_reduced_fps); /** * v4l2_print_dv_timings() - log the contents of a dv_timings struct * @dev_prefix:device prefix for each log line. * @prefix: additional prefix for each log line, may be NULL. * @t: the timings data. * @detailed: if true, give a detailed log. */ void v4l2_print_dv_timings(const char *dev_prefix, const char *prefix, const struct v4l2_dv_timings *t, bool detailed); /** * v4l2_detect_cvt - detect if the given timings follow the CVT standard * * @frame_height: the total height of the frame (including blanking) in lines. * @hfreq: the horizontal frequency in Hz. * @vsync: the height of the vertical sync in lines. * @active_width: active width of image (does not include blanking). This * information is needed only in case of version 2 of reduced blanking. * In other cases, this parameter does not have any effect on timings. * @polarities: the horizontal and vertical polarities (same as struct * v4l2_bt_timings polarities). * @interlaced: if this flag is true, it indicates interlaced format * @cap: the v4l2_dv_timings_cap capabilities. * @fmt: the resulting timings. * * This function will attempt to detect if the given values correspond to a * valid CVT format. If so, then it will return true, and fmt will be filled * in with the found CVT timings. */ bool v4l2_detect_cvt(unsigned int frame_height, unsigned int hfreq, unsigned int vsync, unsigned int active_width, u32 polarities, bool interlaced, const struct v4l2_dv_timings_cap *cap, struct v4l2_dv_timings *fmt); /** * v4l2_detect_gtf - detect if the given timings follow the GTF standard * * @frame_height: the total height of the frame (including blanking) in lines. * @hfreq: the horizontal frequency in Hz. * @vsync: the height of the vertical sync in lines. * @polarities: the horizontal and vertical polarities (same as struct * v4l2_bt_timings polarities). * @interlaced: if this flag is true, it indicates interlaced format * @aspect: preferred aspect ratio. GTF has no method of determining the * aspect ratio in order to derive the image width from the * image height, so it has to be passed explicitly. Usually * the native screen aspect ratio is used for this. If it * is not filled in correctly, then 16:9 will be assumed. * @cap: the v4l2_dv_timings_cap capabilities. * @fmt: the resulting timings. * * This function will attempt to detect if the given values correspond to a * valid GTF format. If so, then it will return true, and fmt will be filled * in with the found GTF timings. */ bool v4l2_detect_gtf(unsigned int frame_height, unsigned int hfreq, unsigned int vsync, u32 polarities, bool interlaced, struct v4l2_fract aspect, const struct v4l2_dv_timings_cap *cap, struct v4l2_dv_timings *fmt); /** * v4l2_calc_aspect_ratio - calculate the aspect ratio based on bytes * 0x15 and 0x16 from the EDID. * * @hor_landscape: byte 0x15 from the EDID. * @vert_portrait: byte 0x16 from the EDID. * * Determines the aspect ratio from the EDID. * See VESA Enhanced EDID standard, release A, rev 2, section 3.6.2: * "Horizontal and Vertical Screen Size or Aspect Ratio" */ struct v4l2_fract v4l2_calc_aspect_ratio(u8 hor_landscape, u8 vert_portrait); /** * v4l2_dv_timings_aspect_ratio - calculate the aspect ratio based on the * v4l2_dv_timings information. * * @t: the timings data. */ struct v4l2_fract v4l2_dv_timings_aspect_ratio(const struct v4l2_dv_timings *t); /** * can_reduce_fps - check if conditions for reduced fps are true. * @bt: v4l2 timing structure * * For different timings reduced fps is allowed if the following conditions * are met: * * - For CVT timings: if reduced blanking v2 (vsync == 8) is true. * - For CEA861 timings: if %V4L2_DV_FL_CAN_REDUCE_FPS flag is true. */ static inline bool can_reduce_fps(struct v4l2_bt_timings *bt) { if ((bt->standards & V4L2_DV_BT_STD_CVT) && (bt->vsync == 8)) return true; if ((bt->standards & V4L2_DV_BT_STD_CEA861) && (bt->flags & V4L2_DV_FL_CAN_REDUCE_FPS)) return true; return false; } /** * struct v4l2_hdmi_colorimetry - describes the HDMI colorimetry information * @colorspace: enum v4l2_colorspace, the colorspace * @ycbcr_enc: enum v4l2_ycbcr_encoding, Y'CbCr encoding * @quantization: enum v4l2_quantization, colorspace quantization * @xfer_func: enum v4l2_xfer_func, colorspace transfer function */ struct v4l2_hdmi_colorimetry { enum v4l2_colorspace colorspace; enum v4l2_ycbcr_encoding ycbcr_enc; enum v4l2_quantization quantization; enum v4l2_xfer_func xfer_func; }; struct hdmi_avi_infoframe; struct hdmi_vendor_infoframe; struct v4l2_hdmi_colorimetry v4l2_hdmi_rx_colorimetry(const struct hdmi_avi_infoframe *avi, const struct hdmi_vendor_infoframe *hdmi, unsigned int height); u16 v4l2_get_edid_phys_addr(const u8 *edid, unsigned int size, unsigned int *offset); void v4l2_set_edid_phys_addr(u8 *edid, unsigned int size, u16 phys_addr); u16 v4l2_phys_addr_for_input(u16 phys_addr, u8 input); int v4l2_phys_addr_validate(u16 phys_addr, u16 *parent, u16 *port); /* Add support for exporting InfoFrames to debugfs */ /* * HDMI InfoFrames start with a 3 byte header, then a checksum, * followed by the actual IF payload. * * The payload length is limited to 30 bytes according to the HDMI spec, * but since the length is encoded in 5 bits, it can be 31 bytes theoretically. * So set the max length as 31 + 3 (header) + 1 (checksum) = 35. */ #define V4L2_DEBUGFS_IF_MAX_LEN (35) #define V4L2_DEBUGFS_IF_AVI BIT(0) #define V4L2_DEBUGFS_IF_AUDIO BIT(1) #define V4L2_DEBUGFS_IF_SPD BIT(2) #define V4L2_DEBUGFS_IF_HDMI BIT(3) typedef ssize_t (*v4l2_debugfs_if_read_t)(u32 type, void *priv, struct file *filp, char __user *ubuf, size_t count, loff_t *ppos); struct v4l2_debugfs_if { struct dentry *if_dir; void *priv; v4l2_debugfs_if_read_t if_read; }; #ifdef CONFIG_DEBUG_FS struct v4l2_debugfs_if *v4l2_debugfs_if_alloc(struct dentry *root, u32 if_types, void *priv, v4l2_debugfs_if_read_t if_read); void v4l2_debugfs_if_free(struct v4l2_debugfs_if *infoframes); #else static inline struct v4l2_debugfs_if *v4l2_debugfs_if_alloc(struct dentry *root, u32 if_types, void *priv, v4l2_debugfs_if_read_t if_read) { return NULL; } static inline void v4l2_debugfs_if_free(struct v4l2_debugfs_if *infoframes) { } #endif #endif
1 1 2 3 3 3 3 1 3 3 3 1 2 2 2 2 2 2 2 2 2 1 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:net type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 Range as input support for IPv4 added */ /* 2 nomatch flag support added */ /* 3 Counters support added */ /* 4 Comments support added */ /* 5 Forceadd support added */ /* 6 skbinfo support added */ #define IPSET_TYPE_REV_MAX 7 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net"); /* Type specific function prefix */ #define HTYPE hash_net #define IP_SET_HASH_WITH_NETS /* IPv4 variant */ /* Member elements */ struct hash_net4_elem { __be32 ip; u16 padding0; u8 nomatch; u8 cidr; }; /* Common functions */ static bool hash_net4_data_equal(const struct hash_net4_elem *ip1, const struct hash_net4_elem *ip2, u32 *multi) { return ip1->ip == ip2->ip && ip1->cidr == ip2->cidr; } static int hash_net4_do_data_match(const struct hash_net4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static void hash_net4_data_set_flags(struct hash_net4_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } static void hash_net4_data_reset_flags(struct hash_net4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr) { elem->ip &= ip_set_netmask(cidr); elem->cidr = cidr; } static bool hash_net4_data_list(struct sk_buff *skb, const struct hash_net4_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_net4_data_next(struct hash_net4_elem *next, const struct hash_net4_elem *d) { next->ip = d->ip; } #define MTYPE hash_net4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_net4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net4_elem e = { .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (e.cidr == 0) return -EINVAL; if (adt == IPSET_TEST) e.cidr = HOST_MASK; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); e.ip &= ip_set_netmask(e.cidr); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_net4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net4_elem e = { .cidr = HOST_MASK }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!e.cidr || e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { e.ip = htonl(ip & ip_set_hostmask(e.cidr)); ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } ip_to = ip; if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip_to < ip) swap(ip, ip_to); if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; } if (retried) ip = ntohl(h->next.ip); do { i++; e.ip = htonl(ip); if (i > IPSET_MAX_RANGE) { hash_net4_data_next(&h->next, &e); return -ERANGE; } ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } while (ip++ < ip_to); return ret; } /* IPv6 variant */ struct hash_net6_elem { union nf_inet_addr ip; u16 padding0; u8 nomatch; u8 cidr; }; /* Common functions */ static bool hash_net6_data_equal(const struct hash_net6_elem *ip1, const struct hash_net6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ip1->cidr == ip2->cidr; } static int hash_net6_do_data_match(const struct hash_net6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static void hash_net6_data_set_flags(struct hash_net6_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } static void hash_net6_data_reset_flags(struct hash_net6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_net6_data_netmask(struct hash_net6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip, cidr); elem->cidr = cidr; } static bool hash_net6_data_list(struct sk_buff *skb, const struct hash_net6_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_net6_data_next(struct hash_net6_elem *next, const struct hash_net6_elem *d) { } #undef MTYPE #undef HOST_MASK #define MTYPE hash_net6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_net6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net6_elem e = { .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (e.cidr == 0) return -EINVAL; if (adt == IPSET_TEST) e.cidr = HOST_MASK; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); ip6_netmask(&e.ip, e.cidr); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net6_elem e = { .cidr = HOST_MASK }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!e.cidr || e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ip6_netmask(&e.ip, e.cidr); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } static struct ip_set_type hash_net_type __read_mostly = { .name = "hash:net", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_NOMATCH, .dimension = IPSET_DIM_ONE, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_net_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_net_init(void) { return ip_set_type_register(&hash_net_type); } static void __exit hash_net_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_net_type); } module_init(hash_net_init); module_exit(hash_net_fini);
27 27 27 27 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_TC_CT_H #define __NET_TC_CT_H #include <net/act_api.h> #include <uapi/linux/tc_act/tc_ct.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_conntrack_labels.h> struct tcf_ct_params { struct nf_conntrack_helper *helper; struct nf_conn *tmpl; u16 zone; u32 mark; u32 mark_mask; u32 labels[NF_CT_LABELS_MAX_SIZE / sizeof(u32)]; u32 labels_mask[NF_CT_LABELS_MAX_SIZE / sizeof(u32)]; struct nf_nat_range2 range; bool ipv4_range; bool put_labels; u16 ct_action; struct rcu_head rcu; struct tcf_ct_flow_table *ct_ft; struct nf_flowtable *nf_ft; }; struct tcf_ct { struct tc_action common; struct tcf_ct_params __rcu *params; }; #define to_ct(a) ((struct tcf_ct *)a) #define to_ct_params(a) \ ((struct tcf_ct_params *) \ rcu_dereference_protected(to_ct(a)->params, \ lockdep_is_held(&a->tcfa_lock))) static inline uint16_t tcf_ct_zone(const struct tc_action *a) { return to_ct_params(a)->zone; } static inline int tcf_ct_action(const struct tc_action *a) { return to_ct_params(a)->ct_action; } static inline struct nf_flowtable *tcf_ct_ft(const struct tc_action *a) { return to_ct_params(a)->nf_ft; } static inline struct nf_conntrack_helper *tcf_ct_helper(const struct tc_action *a) { return to_ct_params(a)->helper; } #else static inline uint16_t tcf_ct_zone(const struct tc_action *a) { return 0; } static inline int tcf_ct_action(const struct tc_action *a) { return 0; } static inline struct nf_flowtable *tcf_ct_ft(const struct tc_action *a) { return NULL; } static inline struct nf_conntrack_helper *tcf_ct_helper(const struct tc_action *a) { return NULL; } #endif /* CONFIG_NF_CONNTRACK */ #if IS_ENABLED(CONFIG_NET_ACT_CT) static inline void tcf_ct_flow_table_restore_skb(struct sk_buff *skb, unsigned long cookie) { enum ip_conntrack_info ctinfo = cookie & NFCT_INFOMASK; struct nf_conn *ct; ct = (struct nf_conn *)(cookie & NFCT_PTRMASK); nf_conntrack_get(&ct->ct_general); nf_ct_set(skb, ct, ctinfo); } #else static inline void tcf_ct_flow_table_restore_skb(struct sk_buff *skb, unsigned long cookie) { } #endif static inline bool is_tcf_ct(const struct tc_action *a) { #if defined(CONFIG_NET_CLS_ACT) && IS_ENABLED(CONFIG_NF_CONNTRACK) if (a->ops && a->ops->id == TCA_ID_CT) return true; #endif return false; } #endif /* __NET_TC_CT_H */
1 1 1 3 3 3 3 2 2 2 2 2 3 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 5 1 1 1 2 2 1 1 5 6 6 6 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 // SPDX-License-Identifier: GPL-2.0-only /* * Texas Instruments' Bluetooth HCILL UART protocol * * HCILL (HCI Low Level) is a Texas Instruments' power management * protocol extension to H4. * * Copyright (C) 2007 Texas Instruments, Inc. * * Written by Ohad Ben-Cohen <ohad@bencohen.org> * * Acknowledgements: * This file is based on hci_h4.c, which was written * by Maxim Krasnyansky and Marcel Holtmann. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/firmware.h> #include <linux/interrupt.h> #include <linux/ptrace.h> #include <linux/poll.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/signal.h> #include <linux/ioctl.h> #include <linux/of.h> #include <linux/serdev.h> #include <linux/skbuff.h> #include <linux/ti_wilink_st.h> #include <linux/clk.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <linux/gpio/consumer.h> #include <linux/nvmem-consumer.h> #include "hci_uart.h" /* Vendor-specific HCI commands */ #define HCI_VS_WRITE_BD_ADDR 0xfc06 #define HCI_VS_UPDATE_UART_HCI_BAUDRATE 0xff36 /* HCILL commands */ #define HCILL_GO_TO_SLEEP_IND 0x30 #define HCILL_GO_TO_SLEEP_ACK 0x31 #define HCILL_WAKE_UP_IND 0x32 #define HCILL_WAKE_UP_ACK 0x33 /* HCILL states */ enum hcill_states_e { HCILL_ASLEEP, HCILL_ASLEEP_TO_AWAKE, HCILL_AWAKE, HCILL_AWAKE_TO_ASLEEP }; struct ll_device { struct hci_uart hu; struct serdev_device *serdev; struct gpio_desc *enable_gpio; struct clk *ext_clk; bdaddr_t bdaddr; }; struct ll_struct { struct sk_buff *rx_skb; struct sk_buff_head txq; spinlock_t hcill_lock; /* HCILL state lock */ unsigned long hcill_state; /* HCILL power state */ struct sk_buff_head tx_wait_q; /* HCILL wait queue */ }; /* * Builds and sends an HCILL command packet. * These are very simple packets with only 1 cmd byte */ static int send_hcill_cmd(u8 cmd, struct hci_uart *hu) { int err = 0; struct sk_buff *skb = NULL; struct ll_struct *ll = hu->priv; BT_DBG("hu %p cmd 0x%x", hu, cmd); /* allocate packet */ skb = bt_skb_alloc(1, GFP_ATOMIC); if (!skb) { BT_ERR("cannot allocate memory for HCILL packet"); err = -ENOMEM; goto out; } /* prepare packet */ skb_put_u8(skb, cmd); /* send packet */ skb_queue_tail(&ll->txq, skb); out: return err; } /* Initialize protocol */ static int ll_open(struct hci_uart *hu) { struct ll_struct *ll; BT_DBG("hu %p", hu); ll = kzalloc(sizeof(*ll), GFP_KERNEL); if (!ll) return -ENOMEM; skb_queue_head_init(&ll->txq); skb_queue_head_init(&ll->tx_wait_q); spin_lock_init(&ll->hcill_lock); ll->hcill_state = HCILL_AWAKE; hu->priv = ll; if (hu->serdev) { struct ll_device *lldev = serdev_device_get_drvdata(hu->serdev); if (!IS_ERR(lldev->ext_clk)) clk_prepare_enable(lldev->ext_clk); } return 0; } /* Flush protocol data */ static int ll_flush(struct hci_uart *hu) { struct ll_struct *ll = hu->priv; BT_DBG("hu %p", hu); skb_queue_purge(&ll->tx_wait_q); skb_queue_purge(&ll->txq); return 0; } /* Close protocol */ static int ll_close(struct hci_uart *hu) { struct ll_struct *ll = hu->priv; BT_DBG("hu %p", hu); skb_queue_purge(&ll->tx_wait_q); skb_queue_purge(&ll->txq); kfree_skb(ll->rx_skb); if (hu->serdev) { struct ll_device *lldev = serdev_device_get_drvdata(hu->serdev); gpiod_set_value_cansleep(lldev->enable_gpio, 0); clk_disable_unprepare(lldev->ext_clk); } hu->priv = NULL; kfree(ll); return 0; } /* * internal function, which does common work of the device wake up process: * 1. places all pending packets (waiting in tx_wait_q list) in txq list. * 2. changes internal state to HCILL_AWAKE. * Note: assumes that hcill_lock spinlock is taken, * shouldn't be called otherwise! */ static void __ll_do_awake(struct ll_struct *ll) { struct sk_buff *skb = NULL; while ((skb = skb_dequeue(&ll->tx_wait_q))) skb_queue_tail(&ll->txq, skb); ll->hcill_state = HCILL_AWAKE; } /* * Called upon a wake-up-indication from the device */ static void ll_device_want_to_wakeup(struct hci_uart *hu) { unsigned long flags; struct ll_struct *ll = hu->priv; BT_DBG("hu %p", hu); /* lock hcill state */ spin_lock_irqsave(&ll->hcill_lock, flags); switch (ll->hcill_state) { case HCILL_ASLEEP_TO_AWAKE: /* * This state means that both the host and the BRF chip * have simultaneously sent a wake-up-indication packet. * Traditionally, in this case, receiving a wake-up-indication * was enough and an additional wake-up-ack wasn't needed. * This has changed with the BRF6350, which does require an * explicit wake-up-ack. Other BRF versions, which do not * require an explicit ack here, do accept it, thus it is * perfectly safe to always send one. */ BT_DBG("dual wake-up-indication"); fallthrough; case HCILL_ASLEEP: /* acknowledge device wake up */ if (send_hcill_cmd(HCILL_WAKE_UP_ACK, hu) < 0) { BT_ERR("cannot acknowledge device wake up"); goto out; } break; default: /* any other state is illegal */ BT_ERR("received HCILL_WAKE_UP_IND in state %ld", ll->hcill_state); break; } /* send pending packets and change state to HCILL_AWAKE */ __ll_do_awake(ll); out: spin_unlock_irqrestore(&ll->hcill_lock, flags); /* actually send the packets */ hci_uart_tx_wakeup(hu); } /* * Called upon a sleep-indication from the device */ static void ll_device_want_to_sleep(struct hci_uart *hu) { unsigned long flags; struct ll_struct *ll = hu->priv; BT_DBG("hu %p", hu); /* lock hcill state */ spin_lock_irqsave(&ll->hcill_lock, flags); /* sanity check */ if (ll->hcill_state != HCILL_AWAKE) BT_ERR("ERR: HCILL_GO_TO_SLEEP_IND in state %ld", ll->hcill_state); /* acknowledge device sleep */ if (send_hcill_cmd(HCILL_GO_TO_SLEEP_ACK, hu) < 0) { BT_ERR("cannot acknowledge device sleep"); goto out; } /* update state */ ll->hcill_state = HCILL_ASLEEP; out: spin_unlock_irqrestore(&ll->hcill_lock, flags); /* actually send the sleep ack packet */ hci_uart_tx_wakeup(hu); } /* * Called upon wake-up-acknowledgement from the device */ static void ll_device_woke_up(struct hci_uart *hu) { unsigned long flags; struct ll_struct *ll = hu->priv; BT_DBG("hu %p", hu); /* lock hcill state */ spin_lock_irqsave(&ll->hcill_lock, flags); /* sanity check */ if (ll->hcill_state != HCILL_ASLEEP_TO_AWAKE) BT_ERR("received HCILL_WAKE_UP_ACK in state %ld", ll->hcill_state); /* send pending packets and change state to HCILL_AWAKE */ __ll_do_awake(ll); spin_unlock_irqrestore(&ll->hcill_lock, flags); /* actually send the packets */ hci_uart_tx_wakeup(hu); } /* Enqueue frame for transmission (padding, crc, etc) */ /* may be called from two simultaneous tasklets */ static int ll_enqueue(struct hci_uart *hu, struct sk_buff *skb) { unsigned long flags = 0; struct ll_struct *ll = hu->priv; BT_DBG("hu %p skb %p", hu, skb); /* Prepend skb with frame type */ memcpy(skb_push(skb, 1), &hci_skb_pkt_type(skb), 1); /* lock hcill state */ spin_lock_irqsave(&ll->hcill_lock, flags); /* act according to current state */ switch (ll->hcill_state) { case HCILL_AWAKE: BT_DBG("device awake, sending normally"); skb_queue_tail(&ll->txq, skb); break; case HCILL_ASLEEP: BT_DBG("device asleep, waking up and queueing packet"); /* save packet for later */ skb_queue_tail(&ll->tx_wait_q, skb); /* awake device */ if (send_hcill_cmd(HCILL_WAKE_UP_IND, hu) < 0) { BT_ERR("cannot wake up device"); break; } ll->hcill_state = HCILL_ASLEEP_TO_AWAKE; break; case HCILL_ASLEEP_TO_AWAKE: BT_DBG("device waking up, queueing packet"); /* transient state; just keep packet for later */ skb_queue_tail(&ll->tx_wait_q, skb); break; default: BT_ERR("illegal hcill state: %ld (losing packet)", ll->hcill_state); dev_kfree_skb_irq(skb); break; } spin_unlock_irqrestore(&ll->hcill_lock, flags); return 0; } static int ll_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_uart *hu = hci_get_drvdata(hdev); struct ll_struct *ll = hu->priv; switch (hci_skb_pkt_type(skb)) { case HCILL_GO_TO_SLEEP_IND: BT_DBG("HCILL_GO_TO_SLEEP_IND packet"); ll_device_want_to_sleep(hu); break; case HCILL_GO_TO_SLEEP_ACK: /* shouldn't happen */ bt_dev_err(hdev, "received HCILL_GO_TO_SLEEP_ACK in state %ld", ll->hcill_state); break; case HCILL_WAKE_UP_IND: BT_DBG("HCILL_WAKE_UP_IND packet"); ll_device_want_to_wakeup(hu); break; case HCILL_WAKE_UP_ACK: BT_DBG("HCILL_WAKE_UP_ACK packet"); ll_device_woke_up(hu); break; } kfree_skb(skb); return 0; } #define LL_RECV_SLEEP_IND \ .type = HCILL_GO_TO_SLEEP_IND, \ .hlen = 0, \ .loff = 0, \ .lsize = 0, \ .maxlen = 0 #define LL_RECV_SLEEP_ACK \ .type = HCILL_GO_TO_SLEEP_ACK, \ .hlen = 0, \ .loff = 0, \ .lsize = 0, \ .maxlen = 0 #define LL_RECV_WAKE_IND \ .type = HCILL_WAKE_UP_IND, \ .hlen = 0, \ .loff = 0, \ .lsize = 0, \ .maxlen = 0 #define LL_RECV_WAKE_ACK \ .type = HCILL_WAKE_UP_ACK, \ .hlen = 0, \ .loff = 0, \ .lsize = 0, \ .maxlen = 0 static const struct h4_recv_pkt ll_recv_pkts[] = { { H4_RECV_ACL, .recv = hci_recv_frame }, { H4_RECV_SCO, .recv = hci_recv_frame }, { H4_RECV_EVENT, .recv = hci_recv_frame }, { LL_RECV_SLEEP_IND, .recv = ll_recv_frame }, { LL_RECV_SLEEP_ACK, .recv = ll_recv_frame }, { LL_RECV_WAKE_IND, .recv = ll_recv_frame }, { LL_RECV_WAKE_ACK, .recv = ll_recv_frame }, }; /* Recv data */ static int ll_recv(struct hci_uart *hu, const void *data, int count) { struct ll_struct *ll = hu->priv; if (!test_bit(HCI_UART_REGISTERED, &hu->flags)) return -EUNATCH; ll->rx_skb = h4_recv_buf(hu->hdev, ll->rx_skb, data, count, ll_recv_pkts, ARRAY_SIZE(ll_recv_pkts)); if (IS_ERR(ll->rx_skb)) { int err = PTR_ERR(ll->rx_skb); bt_dev_err(hu->hdev, "Frame reassembly failed (%d)", err); ll->rx_skb = NULL; return err; } return count; } static struct sk_buff *ll_dequeue(struct hci_uart *hu) { struct ll_struct *ll = hu->priv; return skb_dequeue(&ll->txq); } #if IS_ENABLED(CONFIG_SERIAL_DEV_BUS) static int read_local_version(struct hci_dev *hdev) { int err = 0; unsigned short version = 0; struct sk_buff *skb; struct hci_rp_read_local_version *ver; skb = __hci_cmd_sync(hdev, HCI_OP_READ_LOCAL_VERSION, 0, NULL, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { bt_dev_err(hdev, "Reading TI version information failed (%ld)", PTR_ERR(skb)); return PTR_ERR(skb); } if (skb->len != sizeof(*ver)) { err = -EILSEQ; goto out; } ver = (struct hci_rp_read_local_version *)skb->data; if (le16_to_cpu(ver->manufacturer) != 13) { err = -ENODEV; goto out; } version = le16_to_cpu(ver->lmp_subver); out: if (err) bt_dev_err(hdev, "Failed to read TI version info: %d", err); kfree_skb(skb); return err ? err : version; } static int send_command_from_firmware(struct ll_device *lldev, struct hci_command *cmd) { struct sk_buff *skb; if (cmd->opcode == HCI_VS_UPDATE_UART_HCI_BAUDRATE) { /* ignore remote change * baud rate HCI VS command */ bt_dev_warn(lldev->hu.hdev, "change remote baud rate command in firmware"); return 0; } if (cmd->prefix != 1) bt_dev_dbg(lldev->hu.hdev, "command type %d", cmd->prefix); skb = __hci_cmd_sync(lldev->hu.hdev, cmd->opcode, cmd->plen, &cmd->speed, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { bt_dev_err(lldev->hu.hdev, "send command failed"); return PTR_ERR(skb); } kfree_skb(skb); return 0; } /* * download_firmware - * internal function which parses through the .bts firmware * script file intreprets SEND, DELAY actions only as of now */ static int download_firmware(struct ll_device *lldev) { unsigned short chip, min_ver, maj_ver; int version, err, len; unsigned char *ptr, *action_ptr; unsigned char bts_scr_name[40]; /* 40 char long bts scr name? */ const struct firmware *fw; struct hci_command *cmd; version = read_local_version(lldev->hu.hdev); if (version < 0) return version; chip = (version & 0x7C00) >> 10; min_ver = (version & 0x007F); maj_ver = (version & 0x0380) >> 7; if (version & 0x8000) maj_ver |= 0x0008; snprintf(bts_scr_name, sizeof(bts_scr_name), "ti-connectivity/TIInit_%d.%d.%d.bts", chip, maj_ver, min_ver); err = request_firmware(&fw, bts_scr_name, &lldev->serdev->dev); if (err || !fw->data || !fw->size) { bt_dev_err(lldev->hu.hdev, "request_firmware failed(errno %d) for %s", err, bts_scr_name); return -EINVAL; } ptr = (void *)fw->data; len = fw->size; /* bts_header to remove out magic number and * version */ ptr += sizeof(struct bts_header); len -= sizeof(struct bts_header); while (len > 0 && ptr) { bt_dev_dbg(lldev->hu.hdev, " action size %d, type %d ", ((struct bts_action *)ptr)->size, ((struct bts_action *)ptr)->type); action_ptr = &(((struct bts_action *)ptr)->data[0]); switch (((struct bts_action *)ptr)->type) { case ACTION_SEND_COMMAND: /* action send */ bt_dev_dbg(lldev->hu.hdev, "S"); cmd = (struct hci_command *)action_ptr; err = send_command_from_firmware(lldev, cmd); if (err) goto out_rel_fw; break; case ACTION_WAIT_EVENT: /* wait */ /* no need to wait as command was synchronous */ bt_dev_dbg(lldev->hu.hdev, "W"); break; case ACTION_DELAY: /* sleep */ bt_dev_info(lldev->hu.hdev, "sleep command in scr"); msleep(((struct bts_action_delay *)action_ptr)->msec); break; } len -= (sizeof(struct bts_action) + ((struct bts_action *)ptr)->size); ptr += sizeof(struct bts_action) + ((struct bts_action *)ptr)->size; } out_rel_fw: /* fw download complete */ release_firmware(fw); return err; } static int ll_set_bdaddr(struct hci_dev *hdev, const bdaddr_t *bdaddr) { bdaddr_t bdaddr_swapped; struct sk_buff *skb; /* HCI_VS_WRITE_BD_ADDR (at least on a CC2560A chip) expects the BD * address to be MSB first, but bdaddr_t has the convention of being * LSB first. */ baswap(&bdaddr_swapped, bdaddr); skb = __hci_cmd_sync(hdev, HCI_VS_WRITE_BD_ADDR, sizeof(bdaddr_t), &bdaddr_swapped, HCI_INIT_TIMEOUT); if (!IS_ERR(skb)) kfree_skb(skb); return PTR_ERR_OR_ZERO(skb); } static int ll_setup(struct hci_uart *hu) { int err, retry = 3; struct ll_device *lldev; struct serdev_device *serdev = hu->serdev; u32 speed; if (!serdev) return 0; lldev = serdev_device_get_drvdata(serdev); hu->hdev->set_bdaddr = ll_set_bdaddr; serdev_device_set_flow_control(serdev, true); do { /* Reset the Bluetooth device */ gpiod_set_value_cansleep(lldev->enable_gpio, 0); msleep(5); gpiod_set_value_cansleep(lldev->enable_gpio, 1); mdelay(100); err = serdev_device_wait_for_cts(serdev, true, 200); if (err) { bt_dev_err(hu->hdev, "Failed to get CTS"); return err; } err = download_firmware(lldev); if (!err) break; /* Toggle BT_EN and retry */ bt_dev_err(hu->hdev, "download firmware failed, retrying..."); } while (retry--); if (err) return err; /* Set BD address if one was specified at probe */ if (!bacmp(&lldev->bdaddr, BDADDR_NONE)) { /* This means that there was an error getting the BD address * during probe, so mark the device as having a bad address. */ set_bit(HCI_QUIRK_INVALID_BDADDR, &hu->hdev->quirks); } else if (bacmp(&lldev->bdaddr, BDADDR_ANY)) { err = ll_set_bdaddr(hu->hdev, &lldev->bdaddr); if (err) set_bit(HCI_QUIRK_INVALID_BDADDR, &hu->hdev->quirks); } /* Operational speed if any */ if (hu->oper_speed) speed = hu->oper_speed; else if (hu->proto->oper_speed) speed = hu->proto->oper_speed; else speed = 0; if (speed) { __le32 speed_le = cpu_to_le32(speed); struct sk_buff *skb; skb = __hci_cmd_sync(hu->hdev, HCI_VS_UPDATE_UART_HCI_BAUDRATE, sizeof(speed_le), &speed_le, HCI_INIT_TIMEOUT); if (!IS_ERR(skb)) { kfree_skb(skb); serdev_device_set_baudrate(serdev, speed); } } return 0; } static const struct hci_uart_proto llp; static int hci_ti_probe(struct serdev_device *serdev) { struct hci_uart *hu; struct ll_device *lldev; struct nvmem_cell *bdaddr_cell; u32 max_speed = 3000000; lldev = devm_kzalloc(&serdev->dev, sizeof(struct ll_device), GFP_KERNEL); if (!lldev) return -ENOMEM; hu = &lldev->hu; serdev_device_set_drvdata(serdev, lldev); lldev->serdev = hu->serdev = serdev; lldev->enable_gpio = devm_gpiod_get_optional(&serdev->dev, "enable", GPIOD_OUT_LOW); if (IS_ERR(lldev->enable_gpio)) return PTR_ERR(lldev->enable_gpio); lldev->ext_clk = devm_clk_get(&serdev->dev, "ext_clock"); if (IS_ERR(lldev->ext_clk) && PTR_ERR(lldev->ext_clk) != -ENOENT) return PTR_ERR(lldev->ext_clk); of_property_read_u32(serdev->dev.of_node, "max-speed", &max_speed); hci_uart_set_speeds(hu, 115200, max_speed); /* optional BD address from nvram */ bdaddr_cell = nvmem_cell_get(&serdev->dev, "bd-address"); if (IS_ERR(bdaddr_cell)) { int err = PTR_ERR(bdaddr_cell); if (err == -EPROBE_DEFER) return err; /* ENOENT means there is no matching nvmem cell and ENOSYS * means that nvmem is not enabled in the kernel configuration. */ if (err != -ENOENT && err != -ENOSYS) { /* If there was some other error, give userspace a * chance to fix the problem instead of failing to load * the driver. Using BDADDR_NONE as a flag that is * tested later in the setup function. */ dev_warn(&serdev->dev, "Failed to get \"bd-address\" nvmem cell (%d)\n", err); bacpy(&lldev->bdaddr, BDADDR_NONE); } } else { bdaddr_t *bdaddr; size_t len; bdaddr = nvmem_cell_read(bdaddr_cell, &len); nvmem_cell_put(bdaddr_cell); if (IS_ERR(bdaddr)) { dev_err(&serdev->dev, "Failed to read nvmem bd-address\n"); return PTR_ERR(bdaddr); } if (len != sizeof(bdaddr_t)) { dev_err(&serdev->dev, "Invalid nvmem bd-address length\n"); kfree(bdaddr); return -EINVAL; } /* As per the device tree bindings, the value from nvmem is * expected to be MSB first, but in the kernel it is expected * that bdaddr_t is LSB first. */ baswap(&lldev->bdaddr, bdaddr); kfree(bdaddr); } return hci_uart_register_device(hu, &llp); } static void hci_ti_remove(struct serdev_device *serdev) { struct ll_device *lldev = serdev_device_get_drvdata(serdev); hci_uart_unregister_device(&lldev->hu); } static const struct of_device_id hci_ti_of_match[] = { { .compatible = "ti,cc2560" }, { .compatible = "ti,wl1271-st" }, { .compatible = "ti,wl1273-st" }, { .compatible = "ti,wl1281-st" }, { .compatible = "ti,wl1283-st" }, { .compatible = "ti,wl1285-st" }, { .compatible = "ti,wl1801-st" }, { .compatible = "ti,wl1805-st" }, { .compatible = "ti,wl1807-st" }, { .compatible = "ti,wl1831-st" }, { .compatible = "ti,wl1835-st" }, { .compatible = "ti,wl1837-st" }, {}, }; MODULE_DEVICE_TABLE(of, hci_ti_of_match); static struct serdev_device_driver hci_ti_drv = { .driver = { .name = "hci-ti", .of_match_table = hci_ti_of_match, }, .probe = hci_ti_probe, .remove = hci_ti_remove, }; #else #define ll_setup NULL #endif static const struct hci_uart_proto llp = { .id = HCI_UART_LL, .name = "LL", .setup = ll_setup, .open = ll_open, .close = ll_close, .recv = ll_recv, .enqueue = ll_enqueue, .dequeue = ll_dequeue, .flush = ll_flush, }; int __init ll_init(void) { serdev_device_driver_register(&hci_ti_drv); return hci_uart_register_proto(&llp); } int __exit ll_deinit(void) { serdev_device_driver_unregister(&hci_ti_drv); return hci_uart_unregister_proto(&llp); }
15 15 15 15 15 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 // SPDX-License-Identifier: GPL-2.0 /* * linux/net/sunrpc/auth_null.c * * AUTH_NULL authentication. Really :-) * * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de> */ #include <linux/types.h> #include <linux/module.h> #include <linux/sunrpc/clnt.h> #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif static struct rpc_auth null_auth; static struct rpc_cred null_cred; static struct rpc_auth * nul_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { refcount_inc(&null_auth.au_count); return &null_auth; } static void nul_destroy(struct rpc_auth *auth) { } /* * Lookup NULL creds for current process */ static struct rpc_cred * nul_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { return get_rpccred(&null_cred); } /* * Destroy cred handle. */ static void nul_destroy_cred(struct rpc_cred *cred) { } /* * Match cred handle against current process */ static int nul_match(struct auth_cred *acred, struct rpc_cred *cred, int taskflags) { return 1; } /* * Marshal credential. */ static int nul_marshal(struct rpc_task *task, struct xdr_stream *xdr) { __be32 *p; p = xdr_reserve_space(xdr, 4 * sizeof(*p)); if (!p) return -EMSGSIZE; /* Credential */ *p++ = rpc_auth_null; *p++ = xdr_zero; /* Verifier */ *p++ = rpc_auth_null; *p = xdr_zero; return 0; } /* * Refresh credential. This is a no-op for AUTH_NULL */ static int nul_refresh(struct rpc_task *task) { set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_rqstp->rq_cred->cr_flags); return 0; } static int nul_validate(struct rpc_task *task, struct xdr_stream *xdr) { __be32 *p; p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (!p) return -EIO; if (*p++ != rpc_auth_null) return -EIO; if (*p != xdr_zero) return -EIO; return 0; } const struct rpc_authops authnull_ops = { .owner = THIS_MODULE, .au_flavor = RPC_AUTH_NULL, .au_name = "NULL", .create = nul_create, .destroy = nul_destroy, .lookup_cred = nul_lookup_cred, }; static struct rpc_auth null_auth = { .au_cslack = NUL_CALLSLACK, .au_rslack = NUL_REPLYSLACK, .au_verfsize = NUL_REPLYSLACK, .au_ralign = NUL_REPLYSLACK, .au_ops = &authnull_ops, .au_flavor = RPC_AUTH_NULL, .au_count = REFCOUNT_INIT(1), }; static const struct rpc_credops null_credops = { .cr_name = "AUTH_NULL", .crdestroy = nul_destroy_cred, .crmatch = nul_match, .crmarshal = nul_marshal, .crwrap_req = rpcauth_wrap_req_encode, .crrefresh = nul_refresh, .crvalidate = nul_validate, .crunwrap_resp = rpcauth_unwrap_resp_decode, }; static struct rpc_cred null_cred = { .cr_lru = LIST_HEAD_INIT(null_cred.cr_lru), .cr_auth = &null_auth, .cr_ops = &null_credops, .cr_count = REFCOUNT_INIT(2), .cr_flags = 1UL << RPCAUTH_CRED_UPTODATE, };
39 44 25 32 45 17 17 17 29 28 29 74 24 72 72 72 7 39 10 16 1 18 18 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ #ifndef _LINUX_SKMSG_H #define _LINUX_SKMSG_H #include <linux/bpf.h> #include <linux/filter.h> #include <linux/scatterlist.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/tcp.h> #include <net/strparser.h> #define MAX_MSG_FRAGS MAX_SKB_FRAGS #define NR_MSG_FRAG_IDS (MAX_MSG_FRAGS + 1) enum __sk_action { __SK_DROP = 0, __SK_PASS, __SK_REDIRECT, __SK_NONE, }; struct sk_msg_sg { u32 start; u32 curr; u32 end; u32 size; u32 copybreak; DECLARE_BITMAP(copy, MAX_MSG_FRAGS + 2); /* The extra two elements: * 1) used for chaining the front and sections when the list becomes * partitioned (e.g. end < start). The crypto APIs require the * chaining; * 2) to chain tailer SG entries after the message. */ struct scatterlist data[MAX_MSG_FRAGS + 2]; }; /* UAPI in filter.c depends on struct sk_msg_sg being first element. */ struct sk_msg { struct sk_msg_sg sg; void *data; void *data_end; u32 apply_bytes; u32 cork_bytes; u32 flags; struct sk_buff *skb; struct sock *sk_redir; struct sock *sk; struct list_head list; }; struct sk_psock_progs { struct bpf_prog *msg_parser; struct bpf_prog *stream_parser; struct bpf_prog *stream_verdict; struct bpf_prog *skb_verdict; struct bpf_link *msg_parser_link; struct bpf_link *stream_parser_link; struct bpf_link *stream_verdict_link; struct bpf_link *skb_verdict_link; }; enum sk_psock_state_bits { SK_PSOCK_TX_ENABLED, SK_PSOCK_RX_STRP_ENABLED, }; struct sk_psock_link { struct list_head list; struct bpf_map *map; void *link_raw; }; struct sk_psock_work_state { u32 len; u32 off; }; struct sk_psock { struct sock *sk; struct sock *sk_redir; u32 apply_bytes; u32 cork_bytes; u32 eval; bool redir_ingress; /* undefined if sk_redir is null */ struct sk_msg *cork; struct sk_psock_progs progs; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) struct strparser strp; #endif struct sk_buff_head ingress_skb; struct list_head ingress_msg; spinlock_t ingress_lock; unsigned long state; struct list_head link; spinlock_t link_lock; refcount_t refcnt; void (*saved_unhash)(struct sock *sk); void (*saved_destroy)(struct sock *sk); void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); /* psock_update_sk_prot may be called with restore=false many times * so the handler must be safe for this case. It will be called * exactly once with restore=true when the psock is being destroyed * and psock refcnt is zero, but before an RCU grace period. */ int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, bool restore); struct proto *sk_proto; struct mutex work_mutex; struct sk_psock_work_state work_state; struct delayed_work work; struct sock *sk_pair; struct rcu_work rwork; }; int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, int elem_first_coalesce); int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, u32 off, u32 len); void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len); int sk_msg_free(struct sock *sk, struct sk_msg *msg); int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg); void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes); void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg, u32 bytes); void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes); void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes); int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); bool sk_msg_is_readable(struct sock *sk); static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) { WARN_ON(i == msg->sg.end && bytes); } static inline void sk_msg_apply_bytes(struct sk_psock *psock, u32 bytes) { if (psock->apply_bytes) { if (psock->apply_bytes < bytes) psock->apply_bytes = 0; else psock->apply_bytes -= bytes; } } static inline u32 sk_msg_iter_dist(u32 start, u32 end) { return end >= start ? end - start : end + (NR_MSG_FRAG_IDS - start); } #define sk_msg_iter_var_prev(var) \ do { \ if (var == 0) \ var = NR_MSG_FRAG_IDS - 1; \ else \ var--; \ } while (0) #define sk_msg_iter_var_next(var) \ do { \ var++; \ if (var == NR_MSG_FRAG_IDS) \ var = 0; \ } while (0) #define sk_msg_iter_prev(msg, which) \ sk_msg_iter_var_prev(msg->sg.which) #define sk_msg_iter_next(msg, which) \ sk_msg_iter_var_next(msg->sg.which) static inline void sk_msg_init(struct sk_msg *msg) { BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != NR_MSG_FRAG_IDS); memset(msg, 0, sizeof(*msg)); sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS); } static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, int which, u32 size) { dst->sg.data[which] = src->sg.data[which]; dst->sg.data[which].length = size; dst->sg.size += size; src->sg.size -= size; src->sg.data[which].length -= size; src->sg.data[which].offset += size; } static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src) { memcpy(dst, src, sizeof(*src)); sk_msg_init(src); } static inline bool sk_msg_full(const struct sk_msg *msg) { return sk_msg_iter_dist(msg->sg.start, msg->sg.end) == MAX_MSG_FRAGS; } static inline u32 sk_msg_elem_used(const struct sk_msg *msg) { return sk_msg_iter_dist(msg->sg.start, msg->sg.end); } static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which) { return &msg->sg.data[which]; } static inline struct scatterlist sk_msg_elem_cpy(struct sk_msg *msg, int which) { return msg->sg.data[which]; } static inline struct page *sk_msg_page(struct sk_msg *msg, int which) { return sg_page(sk_msg_elem(msg, which)); } static inline bool sk_msg_to_ingress(const struct sk_msg *msg) { return msg->flags & BPF_F_INGRESS; } static inline void sk_msg_compute_data_pointers(struct sk_msg *msg) { struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start); if (test_bit(msg->sg.start, msg->sg.copy)) { msg->data = NULL; msg->data_end = NULL; } else { msg->data = sg_virt(sge); msg->data_end = msg->data + sge->length; } } static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, u32 len, u32 offset) { struct scatterlist *sge; get_page(page); sge = sk_msg_elem(msg, msg->sg.end); sg_set_page(sge, page, len, offset); sg_unmark_end(sge); __set_bit(msg->sg.end, msg->sg.copy); msg->sg.size += len; sk_msg_iter_next(msg, end); } static inline void sk_msg_sg_copy(struct sk_msg *msg, u32 i, bool copy_state) { do { if (copy_state) __set_bit(i, msg->sg.copy); else __clear_bit(i, msg->sg.copy); sk_msg_iter_var_next(i); if (i == msg->sg.end) break; } while (1); } static inline void sk_msg_sg_copy_set(struct sk_msg *msg, u32 start) { sk_msg_sg_copy(msg, start, true); } static inline void sk_msg_sg_copy_clear(struct sk_msg *msg, u32 start) { sk_msg_sg_copy(msg, start, false); } static inline struct sk_psock *sk_psock(const struct sock *sk) { return __rcu_dereference_sk_user_data_with_flags(sk, SK_USER_DATA_PSOCK); } static inline void sk_psock_set_state(struct sk_psock *psock, enum sk_psock_state_bits bit) { set_bit(bit, &psock->state); } static inline void sk_psock_clear_state(struct sk_psock *psock, enum sk_psock_state_bits bit) { clear_bit(bit, &psock->state); } static inline bool sk_psock_test_state(const struct sk_psock *psock, enum sk_psock_state_bits bit) { return test_bit(bit, &psock->state); } static inline void sock_drop(struct sock *sk, struct sk_buff *skb) { sk_drops_add(sk, skb); kfree_skb(skb); } static inline bool sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { bool ret; spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { list_add_tail(&msg->list, &psock->ingress_msg); ret = true; } else { sk_msg_free(psock->sk, msg); kfree(msg); ret = false; } spin_unlock_bh(&psock->ingress_lock); return ret; } static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) { struct sk_msg *msg; spin_lock_bh(&psock->ingress_lock); msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); if (msg) list_del(&msg->list); spin_unlock_bh(&psock->ingress_lock); return msg; } static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock) { struct sk_msg *msg; spin_lock_bh(&psock->ingress_lock); msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); spin_unlock_bh(&psock->ingress_lock); return msg; } static inline struct sk_msg *sk_psock_next_msg(struct sk_psock *psock, struct sk_msg *msg) { struct sk_msg *ret; spin_lock_bh(&psock->ingress_lock); if (list_is_last(&msg->list, &psock->ingress_msg)) ret = NULL; else ret = list_next_entry(msg, list); spin_unlock_bh(&psock->ingress_lock); return ret; } static inline bool sk_psock_queue_empty(const struct sk_psock *psock) { return psock ? list_empty(&psock->ingress_msg) : true; } static inline void kfree_sk_msg(struct sk_msg *msg) { if (msg->skb) consume_skb(msg->skb); kfree(msg); } static inline void sk_psock_report_error(struct sk_psock *psock, int err) { struct sock *sk = psock->sk; sk->sk_err = err; sk_error_report(sk); } struct sk_psock *sk_psock_init(struct sock *sk, int node); void sk_psock_stop(struct sk_psock *psock); #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); #else static inline int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) { return -EOPNOTSUPP; } static inline void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { } static inline void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) { } #endif void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg); /* * This specialized allocator has to be a macro for its allocations to be * accounted separately (to have a separate alloc_tag). The typecast is * intentional to enforce typesafety. */ #define sk_psock_init_link() \ ((struct sk_psock_link *)kzalloc(sizeof(struct sk_psock_link), \ GFP_ATOMIC | __GFP_NOWARN)) static inline void sk_psock_free_link(struct sk_psock_link *link) { kfree(link); } struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock); static inline void sk_psock_cork_free(struct sk_psock *psock) { if (psock->cork) { sk_msg_free(psock->sk, psock->cork); kfree(psock->cork); psock->cork = NULL; } } static inline void sk_psock_restore_proto(struct sock *sk, struct sk_psock *psock) { if (psock->psock_update_sk_prot) psock->psock_update_sk_prot(sk, psock, true); } static inline struct sk_psock *sk_psock_get(struct sock *sk) { struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (psock && !refcount_inc_not_zero(&psock->refcnt)) psock = NULL; rcu_read_unlock(); return psock; } void sk_psock_drop(struct sock *sk, struct sk_psock *psock); static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) { if (refcount_dec_and_test(&psock->refcnt)) sk_psock_drop(sk, psock); } static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock) { read_lock_bh(&sk->sk_callback_lock); if (psock->saved_data_ready) psock->saved_data_ready(sk); else sk->sk_data_ready(sk); read_unlock_bh(&sk->sk_callback_lock); } static inline void psock_set_prog(struct bpf_prog **pprog, struct bpf_prog *prog) { prog = xchg(pprog, prog); if (prog) bpf_prog_put(prog); } static inline int psock_replace_prog(struct bpf_prog **pprog, struct bpf_prog *prog, struct bpf_prog *old) { if (cmpxchg(pprog, old, prog) != old) return -ENOENT; if (old) bpf_prog_put(old); return 0; } static inline void psock_progs_drop(struct sk_psock_progs *progs) { psock_set_prog(&progs->msg_parser, NULL); psock_set_prog(&progs->stream_parser, NULL); psock_set_prog(&progs->stream_verdict, NULL); psock_set_prog(&progs->skb_verdict, NULL); } int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); static inline bool sk_psock_strp_enabled(struct sk_psock *psock) { if (!psock) return false; return !!psock->saved_data_ready; } #if IS_ENABLED(CONFIG_NET_SOCK_MSG) #define BPF_F_STRPARSER (1UL << 1) /* We only have two bits so far. */ #define BPF_F_PTR_MASK ~(BPF_F_INGRESS | BPF_F_STRPARSER) static inline bool skb_bpf_strparser(const struct sk_buff *skb) { unsigned long sk_redir = skb->_sk_redir; return sk_redir & BPF_F_STRPARSER; } static inline void skb_bpf_set_strparser(struct sk_buff *skb) { skb->_sk_redir |= BPF_F_STRPARSER; } static inline bool skb_bpf_ingress(const struct sk_buff *skb) { unsigned long sk_redir = skb->_sk_redir; return sk_redir & BPF_F_INGRESS; } static inline void skb_bpf_set_ingress(struct sk_buff *skb) { skb->_sk_redir |= BPF_F_INGRESS; } static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir, bool ingress) { skb->_sk_redir = (unsigned long)sk_redir; if (ingress) skb->_sk_redir |= BPF_F_INGRESS; } static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb) { unsigned long sk_redir = skb->_sk_redir; return (struct sock *)(sk_redir & BPF_F_PTR_MASK); } static inline void skb_bpf_redirect_clear(struct sk_buff *skb) { skb->_sk_redir = 0; } #endif /* CONFIG_NET_SOCK_MSG */ #endif /* _LINUX_SKMSG_H */
316 315 311 315 316 315 316 36 314 309 53 54 54 54 54 313 316 313 315 2 2 2 2 2 2 310 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 310 2 3 3 3 36 36 36 36 36 36 36 3 36 36 36 36 35 36 36 36 36 36 35 311 24 310 10 9 9 9 9 9 9 310 9 9 311 310 311 310 310 310 311 311 311 9 9 310 303 309 310 209 211 311 308 310 221 310 308 310 311 311 310 9 309 309 311 309 36 35 36 36 35 36 28 28 28 28 311 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright © 2006-2014 Intel Corporation. * * Authors: David Woodhouse <dwmw2@infradead.org>, * Ashok Raj <ashok.raj@intel.com>, * Shaohua Li <shaohua.li@intel.com>, * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>, * Fenghua Yu <fenghua.yu@intel.com> * Joerg Roedel <jroedel@suse.de> */ #define pr_fmt(fmt) "DMAR: " fmt #define dev_fmt(fmt) pr_fmt(fmt) #include <linux/crash_dump.h> #include <linux/dma-direct.h> #include <linux/dmi.h> #include <linux/memory.h> #include <linux/pci.h> #include <linux/pci-ats.h> #include <linux/spinlock.h> #include <linux/syscore_ops.h> #include <linux/tboot.h> #include <uapi/linux/iommufd.h> #include "iommu.h" #include "../dma-iommu.h" #include "../irq_remapping.h" #include "../iommu-pages.h" #include "pasid.h" #include "cap_audit.h" #include "perfmon.h" #define ROOT_SIZE VTD_PAGE_SIZE #define CONTEXT_SIZE VTD_PAGE_SIZE #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) #define IOAPIC_RANGE_START (0xfee00000) #define IOAPIC_RANGE_END (0xfeefffff) #define IOVA_START_ADDR (0x1000) #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR to match. That way, we can use 'unsigned long' for PFNs with impunity. */ #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) static void __init check_tylersburg_isoch(void); static int rwbf_quirk; /* * set to 1 to panic kernel if can't successfully enable VT-d * (used when kernel is launched w/ TXT) */ static int force_on = 0; static int intel_iommu_tboot_noforce; static int no_platform_optin; #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) /* * Take a root_entry and return the Lower Context Table Pointer (LCTP) * if marked present. */ static phys_addr_t root_entry_lctp(struct root_entry *re) { if (!(re->lo & 1)) return 0; return re->lo & VTD_PAGE_MASK; } /* * Take a root_entry and return the Upper Context Table Pointer (UCTP) * if marked present. */ static phys_addr_t root_entry_uctp(struct root_entry *re) { if (!(re->hi & 1)) return 0; return re->hi & VTD_PAGE_MASK; } static int device_rid_cmp_key(const void *key, const struct rb_node *node) { struct device_domain_info *info = rb_entry(node, struct device_domain_info, node); const u16 *rid_lhs = key; if (*rid_lhs < PCI_DEVID(info->bus, info->devfn)) return -1; if (*rid_lhs > PCI_DEVID(info->bus, info->devfn)) return 1; return 0; } static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs) { struct device_domain_info *info = rb_entry(lhs, struct device_domain_info, node); u16 key = PCI_DEVID(info->bus, info->devfn); return device_rid_cmp_key(&key, rhs); } /* * Looks up an IOMMU-probed device using its source ID. * * Returns the pointer to the device if there is a match. Otherwise, * returns NULL. * * Note that this helper doesn't guarantee that the device won't be * released by the iommu subsystem after being returned. The caller * should use its own synchronization mechanism to avoid the device * being released during its use if its possibly the case. */ struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid) { struct device_domain_info *info = NULL; struct rb_node *node; unsigned long flags; spin_lock_irqsave(&iommu->device_rbtree_lock, flags); node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key); if (node) info = rb_entry(node, struct device_domain_info, node); spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); return info ? info->dev : NULL; } static int device_rbtree_insert(struct intel_iommu *iommu, struct device_domain_info *info) { struct rb_node *curr; unsigned long flags; spin_lock_irqsave(&iommu->device_rbtree_lock, flags); curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp); spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); if (WARN_ON(curr)) return -EEXIST; return 0; } static void device_rbtree_remove(struct device_domain_info *info) { struct intel_iommu *iommu = info->iommu; unsigned long flags; spin_lock_irqsave(&iommu->device_rbtree_lock, flags); rb_erase(&info->node, &iommu->device_rbtree); spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); } struct dmar_rmrr_unit { struct list_head list; /* list of rmrr units */ struct acpi_dmar_header *hdr; /* ACPI header */ u64 base_address; /* reserved base address*/ u64 end_address; /* reserved end address */ struct dmar_dev_scope *devices; /* target devices */ int devices_cnt; /* target device count */ }; struct dmar_atsr_unit { struct list_head list; /* list of ATSR units */ struct acpi_dmar_header *hdr; /* ACPI header */ struct dmar_dev_scope *devices; /* target devices */ int devices_cnt; /* target device count */ u8 include_all:1; /* include all ports */ }; struct dmar_satc_unit { struct list_head list; /* list of SATC units */ struct acpi_dmar_header *hdr; /* ACPI header */ struct dmar_dev_scope *devices; /* target devices */ struct intel_iommu *iommu; /* the corresponding iommu */ int devices_cnt; /* target device count */ u8 atc_required:1; /* ATS is required */ }; static LIST_HEAD(dmar_atsr_units); static LIST_HEAD(dmar_rmrr_units); static LIST_HEAD(dmar_satc_units); #define for_each_rmrr_units(rmrr) \ list_for_each_entry(rmrr, &dmar_rmrr_units, list) static void intel_iommu_domain_free(struct iommu_domain *domain); int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); int intel_iommu_enabled = 0; EXPORT_SYMBOL_GPL(intel_iommu_enabled); static int intel_iommu_superpage = 1; static int iommu_identity_mapping; static int iommu_skip_te_disable; static int disable_igfx_iommu; #define IDENTMAP_AZALIA 4 const struct iommu_ops intel_iommu_ops; static const struct iommu_dirty_ops intel_dirty_ops; static bool translation_pre_enabled(struct intel_iommu *iommu) { return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED); } static void clear_translation_pre_enabled(struct intel_iommu *iommu) { iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED; } static void init_translation_status(struct intel_iommu *iommu) { u32 gsts; gsts = readl(iommu->reg + DMAR_GSTS_REG); if (gsts & DMA_GSTS_TES) iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; } static int __init intel_iommu_setup(char *str) { if (!str) return -EINVAL; while (*str) { if (!strncmp(str, "on", 2)) { dmar_disabled = 0; pr_info("IOMMU enabled\n"); } else if (!strncmp(str, "off", 3)) { dmar_disabled = 1; no_platform_optin = 1; pr_info("IOMMU disabled\n"); } else if (!strncmp(str, "igfx_off", 8)) { disable_igfx_iommu = 1; pr_info("Disable GFX device mapping\n"); } else if (!strncmp(str, "forcedac", 8)) { pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); iommu_dma_forcedac = true; } else if (!strncmp(str, "strict", 6)) { pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); iommu_set_dma_strict(); } else if (!strncmp(str, "sp_off", 6)) { pr_info("Disable supported super page\n"); intel_iommu_superpage = 0; } else if (!strncmp(str, "sm_on", 5)) { pr_info("Enable scalable mode if hardware supports\n"); intel_iommu_sm = 1; } else if (!strncmp(str, "sm_off", 6)) { pr_info("Scalable mode is disallowed\n"); intel_iommu_sm = 0; } else if (!strncmp(str, "tboot_noforce", 13)) { pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); intel_iommu_tboot_noforce = 1; } else { pr_notice("Unknown option - '%s'\n", str); } str += strcspn(str, ","); while (*str == ',') str++; } return 1; } __setup("intel_iommu=", intel_iommu_setup); static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn) { int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; return !(addr_width < BITS_PER_LONG && pfn >> addr_width); } /* * Calculate the Supported Adjusted Guest Address Widths of an IOMMU. * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of * the returned SAGAW. */ static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu) { unsigned long fl_sagaw, sl_sagaw; fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0); sl_sagaw = cap_sagaw(iommu->cap); /* Second level only. */ if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) return sl_sagaw; /* First level only. */ if (!ecap_slts(iommu->ecap)) return fl_sagaw; return fl_sagaw & sl_sagaw; } static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) { unsigned long sagaw; int agaw; sagaw = __iommu_calculate_sagaw(iommu); for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) { if (test_bit(agaw, &sagaw)) break; } return agaw; } /* * Calculate max SAGAW for each iommu. */ int iommu_calculate_max_sagaw(struct intel_iommu *iommu) { return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); } /* * calculate agaw for each iommu. * "SAGAW" may be different across iommus, use a default agaw, and * get a supported less agaw for iommus that don't support the default agaw. */ int iommu_calculate_agaw(struct intel_iommu *iommu) { return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); } static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) { return sm_supported(iommu) ? ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); } /* Return the super pagesize bitmap if supported. */ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) { unsigned long bitmap = 0; /* * 1-level super page supports page size of 2MiB, 2-level super page * supports page size of both 2MiB and 1GiB. */ if (domain->iommu_superpage == 1) bitmap |= SZ_2M; else if (domain->iommu_superpage == 2) bitmap |= SZ_2M | SZ_1G; return bitmap; } struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, u8 devfn, int alloc) { struct root_entry *root = &iommu->root_entry[bus]; struct context_entry *context; u64 *entry; /* * Except that the caller requested to allocate a new entry, * returning a copied context entry makes no sense. */ if (!alloc && context_copied(iommu, bus, devfn)) return NULL; entry = &root->lo; if (sm_supported(iommu)) { if (devfn >= 0x80) { devfn -= 0x80; entry = &root->hi; } devfn *= 2; } if (*entry & 1) context = phys_to_virt(*entry & VTD_PAGE_MASK); else { unsigned long phy_addr; if (!alloc) return NULL; context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC); if (!context) return NULL; __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); phy_addr = virt_to_phys((void *)context); *entry = phy_addr | 1; __iommu_flush_cache(iommu, entry, sizeof(*entry)); } return &context[devfn]; } /** * is_downstream_to_pci_bridge - test if a device belongs to the PCI * sub-hierarchy of a candidate PCI-PCI bridge * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy * @bridge: the candidate PCI-PCI bridge * * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. */ static bool is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) { struct pci_dev *pdev, *pbridge; if (!dev_is_pci(dev) || !dev_is_pci(bridge)) return false; pdev = to_pci_dev(dev); pbridge = to_pci_dev(bridge); if (pbridge->subordinate && pbridge->subordinate->number <= pdev->bus->number && pbridge->subordinate->busn_res.end >= pdev->bus->number) return true; return false; } static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) { struct dmar_drhd_unit *drhd; u32 vtbar; int rc; /* We know that this device on this chipset has its own IOMMU. * If we find it under a different IOMMU, then the BIOS is lying * to us. Hope that the IOMMU for this device is actually * disabled, and it needs no translation... */ rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); if (rc) { /* "can't" happen */ dev_info(&pdev->dev, "failed to run vt-d quirk\n"); return false; } vtbar &= 0xffff0000; /* we know that the this iommu should be at offset 0xa000 from vtbar */ drhd = dmar_find_matched_drhd_unit(pdev); if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); return true; } return false; } static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) { if (!iommu || iommu->drhd->ignored) return true; if (dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(dev); if (pdev->vendor == PCI_VENDOR_ID_INTEL && pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && quirk_ioat_snb_local_iommu(pdev)) return true; } return false; } static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn) { struct dmar_drhd_unit *drhd = NULL; struct pci_dev *pdev = NULL; struct intel_iommu *iommu; struct device *tmp; u16 segment = 0; int i; if (!dev) return NULL; if (dev_is_pci(dev)) { struct pci_dev *pf_pdev; pdev = pci_real_dma_dev(to_pci_dev(dev)); /* VFs aren't listed in scope tables; we need to look up * the PF instead to find the IOMMU. */ pf_pdev = pci_physfn(pdev); dev = &pf_pdev->dev; segment = pci_domain_nr(pdev->bus); } else if (has_acpi_companion(dev)) dev = &ACPI_COMPANION(dev)->dev; rcu_read_lock(); for_each_iommu(iommu, drhd) { if (pdev && segment != drhd->segment) continue; for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, tmp) { if (tmp == dev) { /* For a VF use its original BDF# not that of the PF * which we used for the IOMMU lookup. Strictly speaking * we could do this for all PCI devices; we only need to * get the BDF# from the scope table for ACPI matches. */ if (pdev && pdev->is_virtfn) goto got_pdev; if (bus && devfn) { *bus = drhd->devices[i].bus; *devfn = drhd->devices[i].devfn; } goto out; } if (is_downstream_to_pci_bridge(dev, tmp)) goto got_pdev; } if (pdev && drhd->include_all) { got_pdev: if (bus && devfn) { *bus = pdev->bus->number; *devfn = pdev->devfn; } goto out; } } iommu = NULL; out: if (iommu_is_dummy(iommu, dev)) iommu = NULL; rcu_read_unlock(); return iommu; } static void domain_flush_cache(struct dmar_domain *domain, void *addr, int size) { if (!domain->iommu_coherency) clflush_cache_range(addr, size); } static void free_context_table(struct intel_iommu *iommu) { struct context_entry *context; int i; if (!iommu->root_entry) return; for (i = 0; i < ROOT_ENTRY_NR; i++) { context = iommu_context_addr(iommu, i, 0, 0); if (context) iommu_free_page(context); if (!sm_supported(iommu)) continue; context = iommu_context_addr(iommu, i, 0x80, 0); if (context) iommu_free_page(context); } iommu_free_page(iommu->root_entry); iommu->root_entry = NULL; } #ifdef CONFIG_DMAR_DEBUG static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn, struct dma_pte *parent, int level) { struct dma_pte *pte; int offset; while (1) { offset = pfn_level_offset(pfn, level); pte = &parent[offset]; pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); if (!dma_pte_present(pte)) { pr_info("page table not present at level %d\n", level - 1); break; } if (level == 1 || dma_pte_superpage(pte)) break; parent = phys_to_virt(dma_pte_addr(pte)); level--; } } void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, unsigned long long addr, u32 pasid) { struct pasid_dir_entry *dir, *pde; struct pasid_entry *entries, *pte; struct context_entry *ctx_entry; struct root_entry *rt_entry; int i, dir_index, index, level; u8 devfn = source_id & 0xff; u8 bus = source_id >> 8; struct dma_pte *pgtable; pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); /* root entry dump */ if (!iommu->root_entry) { pr_info("root table is not present\n"); return; } rt_entry = &iommu->root_entry[bus]; if (sm_supported(iommu)) pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", rt_entry->hi, rt_entry->lo); else pr_info("root entry: 0x%016llx", rt_entry->lo); /* context entry dump */ ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); if (!ctx_entry) { pr_info("context table is not present\n"); return; } pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", ctx_entry->hi, ctx_entry->lo); /* legacy mode does not require PASID entries */ if (!sm_supported(iommu)) { if (!context_present(ctx_entry)) { pr_info("legacy mode page table is not present\n"); return; } level = agaw_to_level(ctx_entry->hi & 7); pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); goto pgtable_walk; } if (!context_present(ctx_entry)) { pr_info("pasid directory table is not present\n"); return; } /* get the pointer to pasid directory entry */ dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); /* For request-without-pasid, get the pasid from context entry */ if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID) pasid = IOMMU_NO_PASID; dir_index = pasid >> PASID_PDE_SHIFT; pde = &dir[dir_index]; pr_info("pasid dir entry: 0x%016llx\n", pde->val); /* get the pointer to the pasid table entry */ entries = get_pasid_table_from_pde(pde); if (!entries) { pr_info("pasid table is not present\n"); return; } index = pasid & PASID_PTE_MASK; pte = &entries[index]; for (i = 0; i < ARRAY_SIZE(pte->val); i++) pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); if (!pasid_pte_is_present(pte)) { pr_info("scalable mode page table is not present\n"); return; } if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) { level = pte->val[2] & BIT_ULL(2) ? 5 : 4; pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK); } else { level = agaw_to_level((pte->val[0] >> 2) & 0x7); pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK); } pgtable_walk: pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level); } #endif static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, unsigned long pfn, int *target_level, gfp_t gfp) { struct dma_pte *parent, *pte; int level = agaw_to_level(domain->agaw); int offset; if (!domain_pfn_supported(domain, pfn)) /* Address beyond IOMMU's addressing capabilities. */ return NULL; parent = domain->pgd; while (1) { void *tmp_page; offset = pfn_level_offset(pfn, level); pte = &parent[offset]; if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) break; if (level == *target_level) break; if (!dma_pte_present(pte)) { uint64_t pteval, tmp; tmp_page = iommu_alloc_page_node(domain->nid, gfp); if (!tmp_page) return NULL; domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; if (domain->use_first_level) pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; tmp = 0ULL; if (!try_cmpxchg64(&pte->val, &tmp, pteval)) /* Someone else set it while we were thinking; use theirs. */ iommu_free_page(tmp_page); else domain_flush_cache(domain, pte, sizeof(*pte)); } if (level == 1) break; parent = phys_to_virt(dma_pte_addr(pte)); level--; } if (!*target_level) *target_level = level; return pte; } /* return address's pte at specific level */ static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, unsigned long pfn, int level, int *large_page) { struct dma_pte *parent, *pte; int total = agaw_to_level(domain->agaw); int offset; parent = domain->pgd; while (level <= total) { offset = pfn_level_offset(pfn, total); pte = &parent[offset]; if (level == total) return pte; if (!dma_pte_present(pte)) { *large_page = total; break; } if (dma_pte_superpage(pte)) { *large_page = total; return pte; } parent = phys_to_virt(dma_pte_addr(pte)); total--; } return NULL; } /* clear last level pte, a tlb flush should be followed */ static void dma_pte_clear_range(struct dmar_domain *domain, unsigned long start_pfn, unsigned long last_pfn) { unsigned int large_page; struct dma_pte *first_pte, *pte; if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || WARN_ON(start_pfn > last_pfn)) return; /* we don't need lock here; nobody else touches the iova range */ do { large_page = 1; first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); if (!pte) { start_pfn = align_to_level(start_pfn + 1, large_page + 1); continue; } do { dma_clear_pte(pte); start_pfn += lvl_to_nr_pages(large_page); pte++; } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); domain_flush_cache(domain, first_pte, (void *)pte - (void *)first_pte); } while (start_pfn && start_pfn <= last_pfn); } static void dma_pte_free_level(struct dmar_domain *domain, int level, int retain_level, struct dma_pte *pte, unsigned long pfn, unsigned long start_pfn, unsigned long last_pfn) { pfn = max(start_pfn, pfn); pte = &pte[pfn_level_offset(pfn, level)]; do { unsigned long level_pfn; struct dma_pte *level_pte; if (!dma_pte_present(pte) || dma_pte_superpage(pte)) goto next; level_pfn = pfn & level_mask(level); level_pte = phys_to_virt(dma_pte_addr(pte)); if (level > 2) { dma_pte_free_level(domain, level - 1, retain_level, level_pte, level_pfn, start_pfn, last_pfn); } /* * Free the page table if we're below the level we want to * retain and the range covers the entire table. */ if (level < retain_level && !(start_pfn > level_pfn || last_pfn < level_pfn + level_size(level) - 1)) { dma_clear_pte(pte); domain_flush_cache(domain, pte, sizeof(*pte)); iommu_free_page(level_pte); } next: pfn += level_size(level); } while (!first_pte_in_page(++pte) && pfn <= last_pfn); } /* * clear last level (leaf) ptes and free page table pages below the * level we wish to keep intact. */ static void dma_pte_free_pagetable(struct dmar_domain *domain, unsigned long start_pfn, unsigned long last_pfn, int retain_level) { dma_pte_clear_range(domain, start_pfn, last_pfn); /* We don't need lock here; nobody else touches the iova range */ dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, domain->pgd, 0, start_pfn, last_pfn); /* free pgd */ if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { iommu_free_page(domain->pgd); domain->pgd = NULL; } } /* When a page at a given level is being unlinked from its parent, we don't need to *modify* it at all. All we need to do is make a list of all the pages which can be freed just as soon as we've flushed the IOTLB and we know the hardware page-walk will no longer touch them. The 'pte' argument is the *parent* PTE, pointing to the page that is to be freed. */ static void dma_pte_list_pagetables(struct dmar_domain *domain, int level, struct dma_pte *pte, struct list_head *freelist) { struct page *pg; pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT); list_add_tail(&pg->lru, freelist); if (level == 1) return; pte = page_address(pg); do { if (dma_pte_present(pte) && !dma_pte_superpage(pte)) dma_pte_list_pagetables(domain, level - 1, pte, freelist); pte++; } while (!first_pte_in_page(pte)); } static void dma_pte_clear_level(struct dmar_domain *domain, int level, struct dma_pte *pte, unsigned long pfn, unsigned long start_pfn, unsigned long last_pfn, struct list_head *freelist) { struct dma_pte *first_pte = NULL, *last_pte = NULL; pfn = max(start_pfn, pfn); pte = &pte[pfn_level_offset(pfn, level)]; do { unsigned long level_pfn = pfn & level_mask(level); if (!dma_pte_present(pte)) goto next; /* If range covers entire pagetable, free it */ if (start_pfn <= level_pfn && last_pfn >= level_pfn + level_size(level) - 1) { /* These suborbinate page tables are going away entirely. Don't bother to clear them; we're just going to *free* them. */ if (level > 1 && !dma_pte_superpage(pte)) dma_pte_list_pagetables(domain, level - 1, pte, freelist); dma_clear_pte(pte); if (!first_pte) first_pte = pte; last_pte = pte; } else if (level > 1) { /* Recurse down into a level that isn't *entirely* obsolete */ dma_pte_clear_level(domain, level - 1, phys_to_virt(dma_pte_addr(pte)), level_pfn, start_pfn, last_pfn, freelist); } next: pfn = level_pfn + level_size(level); } while (!first_pte_in_page(++pte) && pfn <= last_pfn); if (first_pte) domain_flush_cache(domain, first_pte, (void *)++last_pte - (void *)first_pte); } /* We can't just free the pages because the IOMMU may still be walking the page tables, and may have cached the intermediate levels. The pages can only be freed after the IOTLB flush has been done. */ static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn, unsigned long last_pfn, struct list_head *freelist) { if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || WARN_ON(start_pfn > last_pfn)) return; /* we don't need lock here; nobody else touches the iova range */ dma_pte_clear_level(domain, agaw_to_level(domain->agaw), domain->pgd, 0, start_pfn, last_pfn, freelist); /* free pgd */ if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { struct page *pgd_page = virt_to_page(domain->pgd); list_add_tail(&pgd_page->lru, freelist); domain->pgd = NULL; } } /* iommu handling */ static int iommu_alloc_root_entry(struct intel_iommu *iommu) { struct root_entry *root; root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC); if (!root) { pr_err("Allocating root entry for %s failed\n", iommu->name); return -ENOMEM; } __iommu_flush_cache(iommu, root, ROOT_SIZE); iommu->root_entry = root; return 0; } static void iommu_set_root_entry(struct intel_iommu *iommu) { u64 addr; u32 sts; unsigned long flag; addr = virt_to_phys(iommu->root_entry); if (sm_supported(iommu)) addr |= DMA_RTADDR_SMT; raw_spin_lock_irqsave(&iommu->register_lock, flag); dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_RTPS), sts); raw_spin_unlock_irqrestore(&iommu->register_lock, flag); /* * Hardware invalidates all DMA remapping hardware translation * caches as part of SRTP flow. */ if (cap_esrtps(iommu->cap)) return; iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); if (sm_supported(iommu)) qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); } void iommu_flush_write_buffer(struct intel_iommu *iommu) { u32 val; unsigned long flag; if (!rwbf_quirk && !cap_rwbf(iommu->cap)) return; raw_spin_lock_irqsave(&iommu->register_lock, flag); writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (!(val & DMA_GSTS_WBFS)), val); raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } /* return value determine if we need a write buffer flush */ static void __iommu_flush_context(struct intel_iommu *iommu, u16 did, u16 source_id, u8 function_mask, u64 type) { u64 val = 0; unsigned long flag; switch (type) { case DMA_CCMD_GLOBAL_INVL: val = DMA_CCMD_GLOBAL_INVL; break; case DMA_CCMD_DOMAIN_INVL: val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); break; case DMA_CCMD_DEVICE_INVL: val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); break; default: pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n", iommu->name, type); return; } val |= DMA_CCMD_ICC; raw_spin_lock_irqsave(&iommu->register_lock, flag); dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, dmar_readq, (!(val & DMA_CCMD_ICC)), val); raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type) { int tlb_offset = ecap_iotlb_offset(iommu->ecap); u64 val = 0, val_iva = 0; unsigned long flag; switch (type) { case DMA_TLB_GLOBAL_FLUSH: /* global flush doesn't need set IVA_REG */ val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; break; case DMA_TLB_DSI_FLUSH: val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); break; case DMA_TLB_PSI_FLUSH: val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); /* IH bit is passed in as part of address */ val_iva = size_order | addr; break; default: pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n", iommu->name, type); return; } if (cap_write_drain(iommu->cap)) val |= DMA_TLB_WRITE_DRAIN; raw_spin_lock_irqsave(&iommu->register_lock, flag); /* Note: Only uses first TLB reg currently */ if (val_iva) dmar_writeq(iommu->reg + tlb_offset, val_iva); dmar_writeq(iommu->reg + tlb_offset + 8, val); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, tlb_offset + 8, dmar_readq, (!(val & DMA_TLB_IVT)), val); raw_spin_unlock_irqrestore(&iommu->register_lock, flag); /* check IOTLB invalidation granularity */ if (DMA_TLB_IAIG(val) == 0) pr_err("Flush IOTLB failed\n"); if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) pr_debug("TLB flush request %Lx, actual %Lx\n", (unsigned long long)DMA_TLB_IIRG(type), (unsigned long long)DMA_TLB_IAIG(val)); } static struct device_domain_info * domain_lookup_dev_info(struct dmar_domain *domain, struct intel_iommu *iommu, u8 bus, u8 devfn) { struct device_domain_info *info; unsigned long flags; spin_lock_irqsave(&domain->lock, flags); list_for_each_entry(info, &domain->devices, link) { if (info->iommu == iommu && info->bus == bus && info->devfn == devfn) { spin_unlock_irqrestore(&domain->lock, flags); return info; } } spin_unlock_irqrestore(&domain->lock, flags); return NULL; } /* * The extra devTLB flush quirk impacts those QAT devices with PCI device * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device() * check because it applies only to the built-in QAT devices and it doesn't * grant additional privileges. */ #define BUGGY_QAT_DEVID_MASK 0x4940 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev) { if (pdev->vendor != PCI_VENDOR_ID_INTEL) return false; if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK) return false; return true; } static void iommu_enable_pci_caps(struct device_domain_info *info) { struct pci_dev *pdev; if (!dev_is_pci(info->dev)) return; pdev = to_pci_dev(info->dev); if (info->ats_supported && pci_ats_page_aligned(pdev) && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) info->ats_enabled = 1; } static void iommu_disable_pci_caps(struct device_domain_info *info) { struct pci_dev *pdev; if (!dev_is_pci(info->dev)) return; pdev = to_pci_dev(info->dev); if (info->ats_enabled) { pci_disable_ats(pdev); info->ats_enabled = 0; } } static void intel_flush_iotlb_all(struct iommu_domain *domain) { cache_tag_flush_all(to_dmar_domain(domain)); } static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) { u32 pmen; unsigned long flags; if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) return; raw_spin_lock_irqsave(&iommu->register_lock, flags); pmen = readl(iommu->reg + DMAR_PMEN_REG); pmen &= ~DMA_PMEN_EPM; writel(pmen, iommu->reg + DMAR_PMEN_REG); /* wait for the protected region status bit to clear */ IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, readl, !(pmen & DMA_PMEN_PRS), pmen); raw_spin_unlock_irqrestore(&iommu->register_lock, flags); } static void iommu_enable_translation(struct intel_iommu *iommu) { u32 sts; unsigned long flags; raw_spin_lock_irqsave(&iommu->register_lock, flags); iommu->gcmd |= DMA_GCMD_TE; writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_TES), sts); raw_spin_unlock_irqrestore(&iommu->register_lock, flags); } static void iommu_disable_translation(struct intel_iommu *iommu) { u32 sts; unsigned long flag; if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) return; raw_spin_lock_irqsave(&iommu->register_lock, flag); iommu->gcmd &= ~DMA_GCMD_TE; writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (!(sts & DMA_GSTS_TES)), sts); raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } static int iommu_init_domains(struct intel_iommu *iommu) { u32 ndomains; ndomains = cap_ndoms(iommu->cap); pr_debug("%s: Number of Domains supported <%d>\n", iommu->name, ndomains); spin_lock_init(&iommu->lock); iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL); if (!iommu->domain_ids) return -ENOMEM; /* * If Caching mode is set, then invalid translations are tagged * with domain-id 0, hence we need to pre-allocate it. We also * use domain-id 0 as a marker for non-allocated domain-id, so * make sure it is not used for a real domain. */ set_bit(0, iommu->domain_ids); /* * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid * entry for first-level or pass-through translation modes should * be programmed with a domain id different from those used for * second-level or nested translation. We reserve a domain id for * this purpose. This domain id is also used for identity domain * in legacy mode. */ set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); return 0; } static void disable_dmar_iommu(struct intel_iommu *iommu) { if (!iommu->domain_ids) return; /* * All iommu domains must have been detached from the devices, * hence there should be no domain IDs in use. */ if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap)) > NUM_RESERVED_DID)) return; if (iommu->gcmd & DMA_GCMD_TE) iommu_disable_translation(iommu); } static void free_dmar_iommu(struct intel_iommu *iommu) { if (iommu->domain_ids) { bitmap_free(iommu->domain_ids); iommu->domain_ids = NULL; } if (iommu->copied_tables) { bitmap_free(iommu->copied_tables); iommu->copied_tables = NULL; } /* free context mapping */ free_context_table(iommu); if (ecap_prs(iommu->ecap)) intel_iommu_finish_prq(iommu); } /* * Check and return whether first level is used by default for * DMA translation. */ static bool first_level_by_default(struct intel_iommu *iommu) { /* Only SL is available in legacy mode */ if (!sm_supported(iommu)) return false; /* Only level (either FL or SL) is available, just use it */ if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap)) return ecap_flts(iommu->ecap); return true; } int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) { struct iommu_domain_info *info, *curr; unsigned long ndomains; int num, ret = -ENOSPC; if (domain->domain.type == IOMMU_DOMAIN_SVA) return 0; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; spin_lock(&iommu->lock); curr = xa_load(&domain->iommu_array, iommu->seq_id); if (curr) { curr->refcnt++; spin_unlock(&iommu->lock); kfree(info); return 0; } ndomains = cap_ndoms(iommu->cap); num = find_first_zero_bit(iommu->domain_ids, ndomains); if (num >= ndomains) { pr_err("%s: No free domain ids\n", iommu->name); goto err_unlock; } set_bit(num, iommu->domain_ids); info->refcnt = 1; info->did = num; info->iommu = iommu; curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id, NULL, info, GFP_ATOMIC); if (curr) { ret = xa_err(curr) ? : -EBUSY; goto err_clear; } spin_unlock(&iommu->lock); return 0; err_clear: clear_bit(info->did, iommu->domain_ids); err_unlock: spin_unlock(&iommu->lock); kfree(info); return ret; } void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) { struct iommu_domain_info *info; if (domain->domain.type == IOMMU_DOMAIN_SVA) return; spin_lock(&iommu->lock); info = xa_load(&domain->iommu_array, iommu->seq_id); if (--info->refcnt == 0) { clear_bit(info->did, iommu->domain_ids); xa_erase(&domain->iommu_array, iommu->seq_id); domain->nid = NUMA_NO_NODE; kfree(info); } spin_unlock(&iommu->lock); } static void domain_exit(struct dmar_domain *domain) { if (domain->pgd) { LIST_HEAD(freelist); domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); iommu_put_pages_list(&freelist); } if (WARN_ON(!list_empty(&domain->devices))) return; kfree(domain->qi_batch); kfree(domain); } /* * For kdump cases, old valid entries may be cached due to the * in-flight DMA and copied pgtable, but there is no unmapping * behaviour for them, thus we need an explicit cache flush for * the newly-mapped device. For kdump, at this point, the device * is supposed to finish reset at its driver probe stage, so no * in-flight DMA will exist, and we don't need to worry anymore * hereafter. */ static void copied_context_tear_down(struct intel_iommu *iommu, struct context_entry *context, u8 bus, u8 devfn) { u16 did_old; if (!context_copied(iommu, bus, devfn)) return; assert_spin_locked(&iommu->lock); did_old = context_domain_id(context); context_clear_entry(context); if (did_old < cap_ndoms(iommu->cap)) { iommu->flush.flush_context(iommu, did_old, PCI_DEVID(bus, devfn), DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); iommu->flush.flush_iotlb(iommu, did_old, 0, 0, DMA_TLB_DSI_FLUSH); } clear_context_copied(iommu, bus, devfn); } /* * It's a non-present to present mapping. If hardware doesn't cache * non-present entry we only need to flush the write-buffer. If the * _does_ cache non-present entries, then it does so in the special * domain #0, which we have to flush: */ static void context_present_cache_flush(struct intel_iommu *iommu, u16 did, u8 bus, u8 devfn) { if (cap_caching_mode(iommu->cap)) { iommu->flush.flush_context(iommu, 0, PCI_DEVID(bus, devfn), DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); } else { iommu_flush_write_buffer(iommu); } } static int domain_context_mapping_one(struct dmar_domain *domain, struct intel_iommu *iommu, u8 bus, u8 devfn) { struct device_domain_info *info = domain_lookup_dev_info(domain, iommu, bus, devfn); u16 did = domain_id_iommu(domain, iommu); int translation = CONTEXT_TT_MULTI_LEVEL; struct dma_pte *pgd = domain->pgd; struct context_entry *context; int ret; pr_debug("Set context mapping for %02x:%02x.%d\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); spin_lock(&iommu->lock); ret = -ENOMEM; context = iommu_context_addr(iommu, bus, devfn, 1); if (!context) goto out_unlock; ret = 0; if (context_present(context) && !context_copied(iommu, bus, devfn)) goto out_unlock; copied_context_tear_down(iommu, context, bus, devfn); context_clear_entry(context); context_set_domain_id(context, did); if (info && info->ats_supported) translation = CONTEXT_TT_DEV_IOTLB; else translation = CONTEXT_TT_MULTI_LEVEL; context_set_address_root(context, virt_to_phys(pgd)); context_set_address_width(context, domain->agaw); context_set_translation_type(context, translation); context_set_fault_enable(context); context_set_present(context); if (!ecap_coherent(iommu->ecap)) clflush_cache_range(context, sizeof(*context)); context_present_cache_flush(iommu, did, bus, devfn); ret = 0; out_unlock: spin_unlock(&iommu->lock); return ret; } static int domain_context_mapping_cb(struct pci_dev *pdev, u16 alias, void *opaque) { struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev); struct intel_iommu *iommu = info->iommu; struct dmar_domain *domain = opaque; return domain_context_mapping_one(domain, iommu, PCI_BUS_NUM(alias), alias & 0xff); } static int domain_context_mapping(struct dmar_domain *domain, struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; u8 bus = info->bus, devfn = info->devfn; if (!dev_is_pci(dev)) return domain_context_mapping_one(domain, iommu, bus, devfn); return pci_for_each_dma_alias(to_pci_dev(dev), domain_context_mapping_cb, domain); } /* Return largest possible superpage level for a given mapping */ static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn, unsigned long phy_pfn, unsigned long pages) { int support, level = 1; unsigned long pfnmerge; support = domain->iommu_superpage; /* To use a large page, the virtual *and* physical addresses must be aligned to 2MiB/1GiB/etc. Lower bits set in either of them will mean we have to use smaller pages. So just merge them and check both at once. */ pfnmerge = iov_pfn | phy_pfn; while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { pages >>= VTD_STRIDE_SHIFT; if (!pages) break; pfnmerge >>= VTD_STRIDE_SHIFT; level++; support--; } return level; } /* * Ensure that old small page tables are removed to make room for superpage(s). * We're going to add new large pages, so make sure we don't remove their parent * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. */ static void switch_to_super_page(struct dmar_domain *domain, unsigned long start_pfn, unsigned long end_pfn, int level) { unsigned long lvl_pages = lvl_to_nr_pages(level); struct dma_pte *pte = NULL; while (start_pfn <= end_pfn) { if (!pte) pte = pfn_to_dma_pte(domain, start_pfn, &level, GFP_ATOMIC); if (dma_pte_present(pte)) { dma_pte_free_pagetable(domain, start_pfn, start_pfn + lvl_pages - 1, level + 1); cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT, end_pfn << VTD_PAGE_SHIFT, 0); } pte++; start_pfn += lvl_pages; if (first_pte_in_page(pte)) pte = NULL; } } static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, unsigned long phys_pfn, unsigned long nr_pages, int prot, gfp_t gfp) { struct dma_pte *first_pte = NULL, *pte = NULL; unsigned int largepage_lvl = 0; unsigned long lvl_pages = 0; phys_addr_t pteval; u64 attr; if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1))) return -EINVAL; if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) return -EINVAL; if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) { pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); return -EINVAL; } attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); attr |= DMA_FL_PTE_PRESENT; if (domain->use_first_level) { attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; if (prot & DMA_PTE_WRITE) attr |= DMA_FL_PTE_DIRTY; } domain->has_mappings = true; pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; while (nr_pages > 0) { uint64_t tmp; if (!pte) { largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, nr_pages); pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl, gfp); if (!pte) return -ENOMEM; first_pte = pte; lvl_pages = lvl_to_nr_pages(largepage_lvl); /* It is large page*/ if (largepage_lvl > 1) { unsigned long end_pfn; unsigned long pages_to_remove; pteval |= DMA_PTE_LARGE_PAGE; pages_to_remove = min_t(unsigned long, nr_pages, nr_pte_to_next_page(pte) * lvl_pages); end_pfn = iov_pfn + pages_to_remove - 1; switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); } else { pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; } } /* We don't need lock here, nobody else * touches the iova range */ tmp = 0ULL; if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) { static int dumps = 5; pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", iov_pfn, tmp, (unsigned long long)pteval); if (dumps) { dumps--; debug_dma_dump_mappings(NULL); } WARN_ON(1); } nr_pages -= lvl_pages; iov_pfn += lvl_pages; phys_pfn += lvl_pages; pteval += lvl_pages * VTD_PAGE_SIZE; /* If the next PTE would be the first in a new page, then we * need to flush the cache on the entries we've just written. * And then we'll need to recalculate 'pte', so clear it and * let it get set again in the if (!pte) block above. * * If we're done (!nr_pages) we need to flush the cache too. * * Also if we've been setting superpages, we may need to * recalculate 'pte' and switch back to smaller pages for the * end of the mapping, if the trailing size is not enough to * use another superpage (i.e. nr_pages < lvl_pages). */ pte++; if (!nr_pages || first_pte_in_page(pte) || (largepage_lvl > 1 && nr_pages < lvl_pages)) { domain_flush_cache(domain, first_pte, (void *)pte - (void *)first_pte); pte = NULL; } } return 0; } static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) { struct intel_iommu *iommu = info->iommu; struct context_entry *context; u16 did; spin_lock(&iommu->lock); context = iommu_context_addr(iommu, bus, devfn, 0); if (!context) { spin_unlock(&iommu->lock); return; } did = context_domain_id(context); context_clear_entry(context); __iommu_flush_cache(iommu, context, sizeof(*context)); spin_unlock(&iommu->lock); intel_context_flush_present(info, context, did, true); } int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev, ioasid_t pasid, u16 did, pgd_t *pgd, int flags, struct iommu_domain *old) { if (!old) return intel_pasid_setup_first_level(iommu, dev, pgd, pasid, did, flags); return intel_pasid_replace_first_level(iommu, dev, pgd, pasid, did, iommu_domain_did(old, iommu), flags); } static int domain_setup_second_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, ioasid_t pasid, struct iommu_domain *old) { if (!old) return intel_pasid_setup_second_level(iommu, domain, dev, pasid); return intel_pasid_replace_second_level(iommu, domain, dev, iommu_domain_did(old, iommu), pasid); } static int domain_setup_passthrough(struct intel_iommu *iommu, struct device *dev, ioasid_t pasid, struct iommu_domain *old) { if (!old) return intel_pasid_setup_pass_through(iommu, dev, pasid); return intel_pasid_replace_pass_through(iommu, dev, iommu_domain_did(old, iommu), pasid); } static int domain_setup_first_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, u32 pasid, struct iommu_domain *old) { struct dma_pte *pgd = domain->pgd; int level, flags = 0; level = agaw_to_level(domain->agaw); if (level != 4 && level != 5) return -EINVAL; if (level == 5) flags |= PASID_FLAG_FL5LP; if (domain->force_snooping) flags |= PASID_FLAG_PAGE_SNOOP; return __domain_setup_first_level(iommu, dev, pasid, domain_id_iommu(domain, iommu), (pgd_t *)pgd, flags, old); } static int dmar_domain_attach_device(struct dmar_domain *domain, struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; unsigned long flags; int ret; ret = domain_attach_iommu(domain, iommu); if (ret) return ret; info->domain = domain; spin_lock_irqsave(&domain->lock, flags); list_add(&info->link, &domain->devices); spin_unlock_irqrestore(&domain->lock, flags); if (dev_is_real_dma_subdevice(dev)) return 0; if (!sm_supported(iommu)) ret = domain_context_mapping(domain, dev); else if (domain->use_first_level) ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID, NULL); else ret = domain_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID, NULL); if (ret) goto out_block_translation; iommu_enable_pci_caps(info); ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID); if (ret) goto out_block_translation; return 0; out_block_translation: device_block_translation(dev); return ret; } /** * device_rmrr_is_relaxable - Test whether the RMRR of this device * is relaxable (ie. is allowed to be not enforced under some conditions) * @dev: device handle * * We assume that PCI USB devices with RMRRs have them largely * for historical reasons and that the RMRR space is not actively used post * boot. This exclusion may change if vendors begin to abuse it. * * The same exception is made for graphics devices, with the requirement that * any use of the RMRR regions will be torn down before assigning the device * to a guest. * * Return: true if the RMRR is relaxable, false otherwise */ static bool device_rmrr_is_relaxable(struct device *dev) { struct pci_dev *pdev; if (!dev_is_pci(dev)) return false; pdev = to_pci_dev(dev); if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) return true; else return false; } static int device_def_domain_type(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; /* * Hardware does not support the passthrough translation mode. * Always use a dynamaic mapping domain. */ if (!ecap_pass_through(iommu->ecap)) return IOMMU_DOMAIN_DMA; if (dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(dev); if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) return IOMMU_DOMAIN_IDENTITY; } return 0; } static void intel_iommu_init_qi(struct intel_iommu *iommu) { /* * Start from the sane iommu hardware state. * If the queued invalidation is already initialized by us * (for example, while enabling interrupt-remapping) then * we got the things already rolling from a sane state. */ if (!iommu->qi) { /* * Clear any previous faults. */ dmar_fault(-1, iommu); /* * Disable queued invalidation if supported and already enabled * before OS handover. */ dmar_disable_qi(iommu); } if (dmar_enable_qi(iommu)) { /* * Queued Invalidate not enabled, use Register Based Invalidate */ iommu->flush.flush_context = __iommu_flush_context; iommu->flush.flush_iotlb = __iommu_flush_iotlb; pr_info("%s: Using Register based invalidation\n", iommu->name); } else { iommu->flush.flush_context = qi_flush_context; iommu->flush.flush_iotlb = qi_flush_iotlb; pr_info("%s: Using Queued invalidation\n", iommu->name); } } static int copy_context_table(struct intel_iommu *iommu, struct root_entry *old_re, struct context_entry **tbl, int bus, bool ext) { int tbl_idx, pos = 0, idx, devfn, ret = 0, did; struct context_entry *new_ce = NULL, ce; struct context_entry *old_ce = NULL; struct root_entry re; phys_addr_t old_ce_phys; tbl_idx = ext ? bus * 2 : bus; memcpy(&re, old_re, sizeof(re)); for (devfn = 0; devfn < 256; devfn++) { /* First calculate the correct index */ idx = (ext ? devfn * 2 : devfn) % 256; if (idx == 0) { /* First save what we may have and clean up */ if (new_ce) { tbl[tbl_idx] = new_ce; __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); pos = 1; } if (old_ce) memunmap(old_ce); ret = 0; if (devfn < 0x80) old_ce_phys = root_entry_lctp(&re); else old_ce_phys = root_entry_uctp(&re); if (!old_ce_phys) { if (ext && devfn == 0) { /* No LCTP, try UCTP */ devfn = 0x7f; continue; } else { goto out; } } ret = -ENOMEM; old_ce = memremap(old_ce_phys, PAGE_SIZE, MEMREMAP_WB); if (!old_ce) goto out; new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL); if (!new_ce) goto out_unmap; ret = 0; } /* Now copy the context entry */ memcpy(&ce, old_ce + idx, sizeof(ce)); if (!context_present(&ce)) continue; did = context_domain_id(&ce); if (did >= 0 && did < cap_ndoms(iommu->cap)) set_bit(did, iommu->domain_ids); set_context_copied(iommu, bus, devfn); new_ce[idx] = ce; } tbl[tbl_idx + pos] = new_ce; __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); out_unmap: memunmap(old_ce); out: return ret; } static int copy_translation_tables(struct intel_iommu *iommu) { struct context_entry **ctxt_tbls; struct root_entry *old_rt; phys_addr_t old_rt_phys; int ctxt_table_entries; u64 rtaddr_reg; int bus, ret; bool new_ext, ext; rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); ext = !!(rtaddr_reg & DMA_RTADDR_SMT); new_ext = !!sm_supported(iommu); /* * The RTT bit can only be changed when translation is disabled, * but disabling translation means to open a window for data * corruption. So bail out and don't copy anything if we would * have to change the bit. */ if (new_ext != ext) return -EINVAL; iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL); if (!iommu->copied_tables) return -ENOMEM; old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; if (!old_rt_phys) return -EINVAL; old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); if (!old_rt) return -ENOMEM; /* This is too big for the stack - allocate it from slab */ ctxt_table_entries = ext ? 512 : 256; ret = -ENOMEM; ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); if (!ctxt_tbls) goto out_unmap; for (bus = 0; bus < 256; bus++) { ret = copy_context_table(iommu, &old_rt[bus], ctxt_tbls, bus, ext); if (ret) { pr_err("%s: Failed to copy context table for bus %d\n", iommu->name, bus); continue; } } spin_lock(&iommu->lock); /* Context tables are copied, now write them to the root_entry table */ for (bus = 0; bus < 256; bus++) { int idx = ext ? bus * 2 : bus; u64 val; if (ctxt_tbls[idx]) { val = virt_to_phys(ctxt_tbls[idx]) | 1; iommu->root_entry[bus].lo = val; } if (!ext || !ctxt_tbls[idx + 1]) continue; val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; iommu->root_entry[bus].hi = val; } spin_unlock(&iommu->lock); kfree(ctxt_tbls); __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); ret = 0; out_unmap: memunmap(old_rt); return ret; } static int __init init_dmars(void) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; int ret; ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); if (ret) goto free_iommu; for_each_iommu(iommu, drhd) { if (drhd->ignored) { iommu_disable_translation(iommu); continue; } /* * Find the max pasid size of all IOMMU's in the system. * We need to ensure the system pasid table is no bigger * than the smallest supported. */ if (pasid_supported(iommu)) { u32 temp = 2 << ecap_pss(iommu->ecap); intel_pasid_max_id = min_t(u32, temp, intel_pasid_max_id); } intel_iommu_init_qi(iommu); ret = iommu_init_domains(iommu); if (ret) goto free_iommu; init_translation_status(iommu); if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { iommu_disable_translation(iommu); clear_translation_pre_enabled(iommu); pr_warn("Translation was enabled for %s but we are not in kdump mode\n", iommu->name); } /* * TBD: * we could share the same root & context tables * among all IOMMU's. Need to Split it later. */ ret = iommu_alloc_root_entry(iommu); if (ret) goto free_iommu; if (translation_pre_enabled(iommu)) { pr_info("Translation already enabled - trying to copy translation structures\n"); ret = copy_translation_tables(iommu); if (ret) { /* * We found the IOMMU with translation * enabled - but failed to copy over the * old root-entry table. Try to proceed * by disabling translation now and * allocating a clean root-entry table. * This might cause DMAR faults, but * probably the dump will still succeed. */ pr_err("Failed to copy translation tables from previous kernel for %s\n", iommu->name); iommu_disable_translation(iommu); clear_translation_pre_enabled(iommu); } else { pr_info("Copied translation tables from previous kernel for %s\n", iommu->name); } } intel_svm_check(iommu); } /* * Now that qi is enabled on all iommus, set the root entry and flush * caches. This is required on some Intel X58 chipsets, otherwise the * flush_context function will loop forever and the boot hangs. */ for_each_active_iommu(iommu, drhd) { iommu_flush_write_buffer(iommu); iommu_set_root_entry(iommu); } check_tylersburg_isoch(); /* * for each drhd * enable fault log * global invalidate context cache * global invalidate iotlb * enable translation */ for_each_iommu(iommu, drhd) { if (drhd->ignored) { /* * we always have to disable PMRs or DMA may fail on * this device */ if (force_on) iommu_disable_protect_mem_regions(iommu); continue; } iommu_flush_write_buffer(iommu); if (ecap_prs(iommu->ecap)) { /* * Call dmar_alloc_hwirq() with dmar_global_lock held, * could cause possible lock race condition. */ up_write(&dmar_global_lock); ret = intel_iommu_enable_prq(iommu); down_write(&dmar_global_lock); if (ret) goto free_iommu; } ret = dmar_set_interrupt(iommu); if (ret) goto free_iommu; } return 0; free_iommu: for_each_active_iommu(iommu, drhd) { disable_dmar_iommu(iommu); free_dmar_iommu(iommu); } return ret; } static void __init init_no_remapping_devices(void) { struct dmar_drhd_unit *drhd; struct device *dev; int i; for_each_drhd_unit(drhd) { if (!drhd->include_all) { for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) break; /* ignore DMAR unit if no devices exist */ if (i == drhd->devices_cnt) drhd->ignored = 1; } } for_each_active_drhd_unit(drhd) { if (drhd->include_all) continue; for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) break; if (i < drhd->devices_cnt) continue; /* This IOMMU has *only* gfx devices. Either bypass it or set the gfx_mapped flag, as appropriate */ drhd->gfx_dedicated = 1; if (disable_igfx_iommu) drhd->ignored = 1; } } #ifdef CONFIG_SUSPEND static int init_iommu_hw(void) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu = NULL; int ret; for_each_active_iommu(iommu, drhd) { if (iommu->qi) { ret = dmar_reenable_qi(iommu); if (ret) return ret; } } for_each_iommu(iommu, drhd) { if (drhd->ignored) { /* * we always have to disable PMRs or DMA may fail on * this device */ if (force_on) iommu_disable_protect_mem_regions(iommu); continue; } iommu_flush_write_buffer(iommu); iommu_set_root_entry(iommu); iommu_enable_translation(iommu); iommu_disable_protect_mem_regions(iommu); } return 0; } static void iommu_flush_all(void) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; for_each_active_iommu(iommu, drhd) { iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); } } static int iommu_suspend(void) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu = NULL; unsigned long flag; iommu_flush_all(); for_each_active_iommu(iommu, drhd) { iommu_disable_translation(iommu); raw_spin_lock_irqsave(&iommu->register_lock, flag); iommu->iommu_state[SR_DMAR_FECTL_REG] = readl(iommu->reg + DMAR_FECTL_REG); iommu->iommu_state[SR_DMAR_FEDATA_REG] = readl(iommu->reg + DMAR_FEDATA_REG); iommu->iommu_state[SR_DMAR_FEADDR_REG] = readl(iommu->reg + DMAR_FEADDR_REG); iommu->iommu_state[SR_DMAR_FEUADDR_REG] = readl(iommu->reg + DMAR_FEUADDR_REG); raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } return 0; } static void iommu_resume(void) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu = NULL; unsigned long flag; if (init_iommu_hw()) { if (force_on) panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); else WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); return; } for_each_active_iommu(iommu, drhd) { raw_spin_lock_irqsave(&iommu->register_lock, flag); writel(iommu->iommu_state[SR_DMAR_FECTL_REG], iommu->reg + DMAR_FECTL_REG); writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], iommu->reg + DMAR_FEDATA_REG); writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], iommu->reg + DMAR_FEADDR_REG); writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], iommu->reg + DMAR_FEUADDR_REG); raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } } static struct syscore_ops iommu_syscore_ops = { .resume = iommu_resume, .suspend = iommu_suspend, }; static void __init init_iommu_pm_ops(void) { register_syscore_ops(&iommu_syscore_ops); } #else static inline void init_iommu_pm_ops(void) {} #endif /* CONFIG_PM */ static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) { if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || rmrr->end_address <= rmrr->base_address || arch_rmrr_sanity_check(rmrr)) return -EINVAL; return 0; } int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) { struct acpi_dmar_reserved_memory *rmrr; struct dmar_rmrr_unit *rmrru; rmrr = (struct acpi_dmar_reserved_memory *)header; if (rmrr_sanity_check(rmrr)) { pr_warn(FW_BUG "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" "BIOS vendor: %s; Ver: %s; Product Version: %s\n", rmrr->base_address, rmrr->end_address, dmi_get_system_info(DMI_BIOS_VENDOR), dmi_get_system_info(DMI_BIOS_VERSION), dmi_get_system_info(DMI_PRODUCT_VERSION)); add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); } rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); if (!rmrru) goto out; rmrru->hdr = header; rmrru->base_address = rmrr->base_address; rmrru->end_address = rmrr->end_address; rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), ((void *)rmrr) + rmrr->header.length, &rmrru->devices_cnt); if (rmrru->devices_cnt && rmrru->devices == NULL) goto free_rmrru; list_add(&rmrru->list, &dmar_rmrr_units); return 0; free_rmrru: kfree(rmrru); out: return -ENOMEM; } static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) { struct dmar_atsr_unit *atsru; struct acpi_dmar_atsr *tmp; list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, dmar_rcu_check()) { tmp = (struct acpi_dmar_atsr *)atsru->hdr; if (atsr->segment != tmp->segment) continue; if (atsr->header.length != tmp->header.length) continue; if (memcmp(atsr, tmp, atsr->header.length) == 0) return atsru; } return NULL; } int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) { struct acpi_dmar_atsr *atsr; struct dmar_atsr_unit *atsru; if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) return 0; atsr = container_of(hdr, struct acpi_dmar_atsr, header); atsru = dmar_find_atsr(atsr); if (atsru) return 0; atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); if (!atsru) return -ENOMEM; /* * If memory is allocated from slab by ACPI _DSM method, we need to * copy the memory content because the memory buffer will be freed * on return. */ atsru->hdr = (void *)(atsru + 1); memcpy(atsru->hdr, hdr, hdr->length); atsru->include_all = atsr->flags & 0x1; if (!atsru->include_all) { atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), (void *)atsr + atsr->header.length, &atsru->devices_cnt); if (atsru->devices_cnt && atsru->devices == NULL) { kfree(atsru); return -ENOMEM; } } list_add_rcu(&atsru->list, &dmar_atsr_units); return 0; } static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) { dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); kfree(atsru); } int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) { struct acpi_dmar_atsr *atsr; struct dmar_atsr_unit *atsru; atsr = container_of(hdr, struct acpi_dmar_atsr, header); atsru = dmar_find_atsr(atsr); if (atsru) { list_del_rcu(&atsru->list); synchronize_rcu(); intel_iommu_free_atsr(atsru); } return 0; } int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) { int i; struct device *dev; struct acpi_dmar_atsr *atsr; struct dmar_atsr_unit *atsru; atsr = container_of(hdr, struct acpi_dmar_atsr, header); atsru = dmar_find_atsr(atsr); if (!atsru) return 0; if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, i, dev) return -EBUSY; } return 0; } static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) { struct dmar_satc_unit *satcu; struct acpi_dmar_satc *tmp; list_for_each_entry_rcu(satcu, &dmar_satc_units, list, dmar_rcu_check()) { tmp = (struct acpi_dmar_satc *)satcu->hdr; if (satc->segment != tmp->segment) continue; if (satc->header.length != tmp->header.length) continue; if (memcmp(satc, tmp, satc->header.length) == 0) return satcu; } return NULL; } int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) { struct acpi_dmar_satc *satc; struct dmar_satc_unit *satcu; if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) return 0; satc = container_of(hdr, struct acpi_dmar_satc, header); satcu = dmar_find_satc(satc); if (satcu) return 0; satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); if (!satcu) return -ENOMEM; satcu->hdr = (void *)(satcu + 1); memcpy(satcu->hdr, hdr, hdr->length); satcu->atc_required = satc->flags & 0x1; satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), (void *)satc + satc->header.length, &satcu->devices_cnt); if (satcu->devices_cnt && !satcu->devices) { kfree(satcu); return -ENOMEM; } list_add_rcu(&satcu->list, &dmar_satc_units); return 0; } static int intel_iommu_add(struct dmar_drhd_unit *dmaru) { struct intel_iommu *iommu = dmaru->iommu; int ret; ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); if (ret) goto out; /* * Disable translation if already enabled prior to OS handover. */ if (iommu->gcmd & DMA_GCMD_TE) iommu_disable_translation(iommu); ret = iommu_init_domains(iommu); if (ret == 0) ret = iommu_alloc_root_entry(iommu); if (ret) goto out; intel_svm_check(iommu); if (dmaru->ignored) { /* * we always have to disable PMRs or DMA may fail on this device */ if (force_on) iommu_disable_protect_mem_regions(iommu); return 0; } intel_iommu_init_qi(iommu); iommu_flush_write_buffer(iommu); if (ecap_prs(iommu->ecap)) { ret = intel_iommu_enable_prq(iommu); if (ret) goto disable_iommu; } ret = dmar_set_interrupt(iommu); if (ret) goto disable_iommu; iommu_set_root_entry(iommu); iommu_enable_translation(iommu); iommu_disable_protect_mem_regions(iommu); return 0; disable_iommu: disable_dmar_iommu(iommu); out: free_dmar_iommu(iommu); return ret; } int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) { int ret = 0; struct intel_iommu *iommu = dmaru->iommu; if (!intel_iommu_enabled) return 0; if (iommu == NULL) return -EINVAL; if (insert) { ret = intel_iommu_add(dmaru); } else { disable_dmar_iommu(iommu); free_dmar_iommu(iommu); } return ret; } static void intel_iommu_free_dmars(void) { struct dmar_rmrr_unit *rmrru, *rmrr_n; struct dmar_atsr_unit *atsru, *atsr_n; struct dmar_satc_unit *satcu, *satc_n; list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { list_del(&rmrru->list); dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); kfree(rmrru); } list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { list_del(&atsru->list); intel_iommu_free_atsr(atsru); } list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { list_del(&satcu->list); dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); kfree(satcu); } } static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev) { struct dmar_satc_unit *satcu; struct acpi_dmar_satc *satc; struct device *tmp; int i; dev = pci_physfn(dev); rcu_read_lock(); list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); if (satc->segment != pci_domain_nr(dev->bus)) continue; for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp) if (to_pci_dev(tmp) == dev) goto out; } satcu = NULL; out: rcu_read_unlock(); return satcu; } static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) { int i, ret = 1; struct pci_bus *bus; struct pci_dev *bridge = NULL; struct device *tmp; struct acpi_dmar_atsr *atsr; struct dmar_atsr_unit *atsru; struct dmar_satc_unit *satcu; dev = pci_physfn(dev); satcu = dmar_find_matched_satc_unit(dev); if (satcu) /* * This device supports ATS as it is in SATC table. * When IOMMU is in legacy mode, enabling ATS is done * automatically by HW for the device that requires * ATS, hence OS should not enable this device ATS * to avoid duplicated TLB invalidation. */ return !(satcu->atc_required && !sm_supported(iommu)); for (bus = dev->bus; bus; bus = bus->parent) { bridge = bus->self; /* If it's an integrated device, allow ATS */ if (!bridge) return 1; /* Connected via non-PCIe: no ATS */ if (!pci_is_pcie(bridge) || pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) return 0; /* If we found the root port, look it up in the ATSR */ if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) break; } rcu_read_lock(); list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); if (atsr->segment != pci_domain_nr(dev->bus)) continue; for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) if (tmp == &bridge->dev) goto out; if (atsru->include_all) goto out; } ret = 0; out: rcu_read_unlock(); return ret; } int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) { int ret; struct dmar_rmrr_unit *rmrru; struct dmar_atsr_unit *atsru; struct dmar_satc_unit *satcu; struct acpi_dmar_atsr *atsr; struct acpi_dmar_reserved_memory *rmrr; struct acpi_dmar_satc *satc; if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) return 0; list_for_each_entry(rmrru, &dmar_rmrr_units, list) { rmrr = container_of(rmrru->hdr, struct acpi_dmar_reserved_memory, header); if (info->event == BUS_NOTIFY_ADD_DEVICE) { ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), ((void *)rmrr) + rmrr->header.length, rmrr->segment, rmrru->devices, rmrru->devices_cnt); if (ret < 0) return ret; } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { dmar_remove_dev_scope(info, rmrr->segment, rmrru->devices, rmrru->devices_cnt); } } list_for_each_entry(atsru, &dmar_atsr_units, list) { if (atsru->include_all) continue; atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); if (info->event == BUS_NOTIFY_ADD_DEVICE) { ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), (void *)atsr + atsr->header.length, atsr->segment, atsru->devices, atsru->devices_cnt); if (ret > 0) break; else if (ret < 0) return ret; } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { if (dmar_remove_dev_scope(info, atsr->segment, atsru->devices, atsru->devices_cnt)) break; } } list_for_each_entry(satcu, &dmar_satc_units, list) { satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); if (info->event == BUS_NOTIFY_ADD_DEVICE) { ret = dmar_insert_dev_scope(info, (void *)(satc + 1), (void *)satc + satc->header.length, satc->segment, satcu->devices, satcu->devices_cnt); if (ret > 0) break; else if (ret < 0) return ret; } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { if (dmar_remove_dev_scope(info, satc->segment, satcu->devices, satcu->devices_cnt)) break; } } return 0; } static void intel_disable_iommus(void) { struct intel_iommu *iommu = NULL; struct dmar_drhd_unit *drhd; for_each_iommu(iommu, drhd) iommu_disable_translation(iommu); } void intel_iommu_shutdown(void) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu = NULL; if (no_iommu || dmar_disabled) return; down_write(&dmar_global_lock); /* Disable PMRs explicitly here. */ for_each_iommu(iommu, drhd) iommu_disable_protect_mem_regions(iommu); /* Make sure the IOMMUs are switched off */ intel_disable_iommus(); up_write(&dmar_global_lock); } static struct intel_iommu *dev_to_intel_iommu(struct device *dev) { struct iommu_device *iommu_dev = dev_to_iommu_device(dev); return container_of(iommu_dev, struct intel_iommu, iommu); } static ssize_t version_show(struct device *dev, struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); u32 ver = readl(iommu->reg + DMAR_VER_REG); return sysfs_emit(buf, "%d:%d\n", DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); } static DEVICE_ATTR_RO(version); static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sysfs_emit(buf, "%llx\n", iommu->reg_phys); } static DEVICE_ATTR_RO(address); static ssize_t cap_show(struct device *dev, struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sysfs_emit(buf, "%llx\n", iommu->cap); } static DEVICE_ATTR_RO(cap); static ssize_t ecap_show(struct device *dev, struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sysfs_emit(buf, "%llx\n", iommu->ecap); } static DEVICE_ATTR_RO(ecap); static ssize_t domains_supported_show(struct device *dev, struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap)); } static DEVICE_ATTR_RO(domains_supported); static ssize_t domains_used_show(struct device *dev, struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sysfs_emit(buf, "%d\n", bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))); } static DEVICE_ATTR_RO(domains_used); static struct attribute *intel_iommu_attrs[] = { &dev_attr_version.attr, &dev_attr_address.attr, &dev_attr_cap.attr, &dev_attr_ecap.attr, &dev_attr_domains_supported.attr, &dev_attr_domains_used.attr, NULL, }; static struct attribute_group intel_iommu_group = { .name = "intel-iommu", .attrs = intel_iommu_attrs, }; const struct attribute_group *intel_iommu_groups[] = { &intel_iommu_group, NULL, }; static bool has_external_pci(void) { struct pci_dev *pdev = NULL; for_each_pci_dev(pdev) if (pdev->external_facing) { pci_dev_put(pdev); return true; } return false; } static int __init platform_optin_force_iommu(void) { if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) return 0; if (no_iommu || dmar_disabled) pr_info("Intel-IOMMU force enabled due to platform opt in\n"); /* * If Intel-IOMMU is disabled by default, we will apply identity * map for all devices except those marked as being untrusted. */ if (dmar_disabled) iommu_set_default_passthrough(false); dmar_disabled = 0; no_iommu = 0; return 1; } static int __init probe_acpi_namespace_devices(void) { struct dmar_drhd_unit *drhd; /* To avoid a -Wunused-but-set-variable warning. */ struct intel_iommu *iommu __maybe_unused; struct device *dev; int i, ret = 0; for_each_active_iommu(iommu, drhd) { for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) { struct acpi_device_physical_node *pn; struct acpi_device *adev; if (dev->bus != &acpi_bus_type) continue; adev = to_acpi_device(dev); mutex_lock(&adev->physical_node_lock); list_for_each_entry(pn, &adev->physical_node_list, node) { ret = iommu_probe_device(pn->dev); if (ret) break; } mutex_unlock(&adev->physical_node_lock); if (ret) return ret; } } return 0; } static __init int tboot_force_iommu(void) { if (!tboot_enabled()) return 0; if (no_iommu || dmar_disabled) pr_warn("Forcing Intel-IOMMU to enabled\n"); dmar_disabled = 0; no_iommu = 0; return 1; } int __init intel_iommu_init(void) { int ret = -ENODEV; struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; /* * Intel IOMMU is required for a TXT/tboot launch or platform * opt in, so enforce that. */ force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || platform_optin_force_iommu(); down_write(&dmar_global_lock); if (dmar_table_init()) { if (force_on) panic("tboot: Failed to initialize DMAR table\n"); goto out_free_dmar; } if (dmar_dev_scope_init() < 0) { if (force_on) panic("tboot: Failed to initialize DMAR device scope\n"); goto out_free_dmar; } up_write(&dmar_global_lock); /* * The bus notifier takes the dmar_global_lock, so lockdep will * complain later when we register it under the lock. */ dmar_register_bus_notifier(); down_write(&dmar_global_lock); if (!no_iommu) intel_iommu_debugfs_init(); if (no_iommu || dmar_disabled) { /* * We exit the function here to ensure IOMMU's remapping and * mempool aren't setup, which means that the IOMMU's PMRs * won't be disabled via the call to init_dmars(). So disable * it explicitly here. The PMRs were setup by tboot prior to * calling SENTER, but the kernel is expected to reset/tear * down the PMRs. */ if (intel_iommu_tboot_noforce) { for_each_iommu(iommu, drhd) iommu_disable_protect_mem_regions(iommu); } /* * Make sure the IOMMUs are switched off, even when we * boot into a kexec kernel and the previous kernel left * them enabled */ intel_disable_iommus(); goto out_free_dmar; } if (list_empty(&dmar_rmrr_units)) pr_info("No RMRR found\n"); if (list_empty(&dmar_atsr_units)) pr_info("No ATSR found\n"); if (list_empty(&dmar_satc_units)) pr_info("No SATC found\n"); init_no_remapping_devices(); ret = init_dmars(); if (ret) { if (force_on) panic("tboot: Failed to initialize DMARs\n"); pr_err("Initialization failed\n"); goto out_free_dmar; } up_write(&dmar_global_lock); init_iommu_pm_ops(); down_read(&dmar_global_lock); for_each_active_iommu(iommu, drhd) { /* * The flush queue implementation does not perform * page-selective invalidations that are required for efficient * TLB flushes in virtual environments. The benefit of batching * is likely to be much lower than the overhead of synchronizing * the virtual and physical IOMMU page-tables. */ if (cap_caching_mode(iommu->cap) && !first_level_by_default(iommu)) { pr_info_once("IOMMU batching disallowed due to virtualization\n"); iommu_set_dma_strict(); } iommu_device_sysfs_add(&iommu->iommu, NULL, intel_iommu_groups, "%s", iommu->name); iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); iommu_pmu_register(iommu); } if (probe_acpi_namespace_devices()) pr_warn("ACPI name space devices didn't probe correctly\n"); /* Finally, we enable the DMA remapping hardware. */ for_each_iommu(iommu, drhd) { if (!drhd->ignored && !translation_pre_enabled(iommu)) iommu_enable_translation(iommu); iommu_disable_protect_mem_regions(iommu); } up_read(&dmar_global_lock); pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); intel_iommu_enabled = 1; return 0; out_free_dmar: intel_iommu_free_dmars(); up_write(&dmar_global_lock); return ret; } static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) { struct device_domain_info *info = opaque; domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); return 0; } /* * NB - intel-iommu lacks any sort of reference counting for the users of * dependent devices. If multiple endpoints have intersecting dependent * devices, unbinding the driver from any one of them will possibly leave * the others unable to operate. */ static void domain_context_clear(struct device_domain_info *info) { if (!dev_is_pci(info->dev)) { domain_context_clear_one(info, info->bus, info->devfn); return; } pci_for_each_dma_alias(to_pci_dev(info->dev), &domain_context_clear_one_cb, info); } /* * Clear the page table pointer in context or pasid table entries so that * all DMA requests without PASID from the device are blocked. If the page * table has been set, clean up the data structures. */ void device_block_translation(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; unsigned long flags; if (info->domain) cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID); iommu_disable_pci_caps(info); if (!dev_is_real_dma_subdevice(dev)) { if (sm_supported(iommu)) intel_pasid_tear_down_entry(iommu, dev, IOMMU_NO_PASID, false); else domain_context_clear(info); } if (!info->domain) return; spin_lock_irqsave(&info->domain->lock, flags); list_del(&info->link); spin_unlock_irqrestore(&info->domain->lock, flags); domain_detach_iommu(info->domain, iommu); info->domain = NULL; } static int blocking_domain_attach_dev(struct iommu_domain *domain, struct device *dev) { device_block_translation(dev); return 0; } static struct iommu_domain blocking_domain = { .type = IOMMU_DOMAIN_BLOCKED, .ops = &(const struct iommu_domain_ops) { .attach_dev = blocking_domain_attach_dev, } }; static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage) { if (!intel_iommu_superpage) return 0; if (first_stage) return cap_fl1gp_support(iommu->cap) ? 2 : 1; return fls(cap_super_page_val(iommu->cap)); } static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; struct dmar_domain *domain; int addr_width; domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&domain->devices); INIT_LIST_HEAD(&domain->dev_pasids); INIT_LIST_HEAD(&domain->cache_tags); spin_lock_init(&domain->lock); spin_lock_init(&domain->cache_lock); xa_init(&domain->iommu_array); domain->nid = dev_to_node(dev); domain->use_first_level = first_stage; /* calculate the address width */ addr_width = agaw_to_width(iommu->agaw); if (addr_width > cap_mgaw(iommu->cap)) addr_width = cap_mgaw(iommu->cap); domain->gaw = addr_width; domain->agaw = iommu->agaw; domain->max_addr = __DOMAIN_MAX_ADDR(addr_width); /* iommu memory access coherency */ domain->iommu_coherency = iommu_paging_structure_coherency(iommu); /* pagesize bitmap */ domain->domain.pgsize_bitmap = SZ_4K; domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage); domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); /* * IOVA aperture: First-level translation restricts the input-address * to a canonical address (i.e., address bits 63:N have the same value * as address bit [N-1], where N is 48-bits with 4-level paging and * 57-bits with 5-level paging). Hence, skip bit [N-1]. */ domain->domain.geometry.force_aperture = true; domain->domain.geometry.aperture_start = 0; if (first_stage) domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); else domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); /* always allocate the top pgd */ domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL); if (!domain->pgd) { kfree(domain); return ERR_PTR(-ENOMEM); } domain_flush_cache(domain, domain->pgd, PAGE_SIZE); return domain; } static struct iommu_domain * intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, const struct iommu_user_data *user_data) { struct device_domain_info *info = dev_iommu_priv_get(dev); bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; struct intel_iommu *iommu = info->iommu; struct dmar_domain *dmar_domain; struct iommu_domain *domain; bool first_stage; if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_FAULT_ID_VALID))) return ERR_PTR(-EOPNOTSUPP); if (nested_parent && !nested_supported(iommu)) return ERR_PTR(-EOPNOTSUPP); if (user_data || (dirty_tracking && !ssads_supported(iommu))) return ERR_PTR(-EOPNOTSUPP); /* * Always allocate the guest compatible page table unless * IOMMU_HWPT_ALLOC_NEST_PARENT or IOMMU_HWPT_ALLOC_DIRTY_TRACKING * is specified. */ if (nested_parent || dirty_tracking) { if (!sm_supported(iommu) || !ecap_slts(iommu->ecap)) return ERR_PTR(-EOPNOTSUPP); first_stage = false; } else { first_stage = first_level_by_default(iommu); } dmar_domain = paging_domain_alloc(dev, first_stage); if (IS_ERR(dmar_domain)) return ERR_CAST(dmar_domain); domain = &dmar_domain->domain; domain->type = IOMMU_DOMAIN_UNMANAGED; domain->owner = &intel_iommu_ops; domain->ops = intel_iommu_ops.default_domain_ops; if (nested_parent) { dmar_domain->nested_parent = true; INIT_LIST_HEAD(&dmar_domain->s1_domains); spin_lock_init(&dmar_domain->s1_lock); } if (dirty_tracking) { if (dmar_domain->use_first_level) { iommu_domain_free(domain); return ERR_PTR(-EOPNOTSUPP); } domain->dirty_ops = &intel_dirty_ops; } return domain; } static void intel_iommu_domain_free(struct iommu_domain *domain) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); WARN_ON(dmar_domain->nested_parent && !list_empty(&dmar_domain->s1_domains)); domain_exit(dmar_domain); } int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu = info->iommu; int addr_width; if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING))) return -EPERM; if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) return -EINVAL; if (domain->dirty_ops && !ssads_supported(iommu)) return -EINVAL; if (dmar_domain->iommu_coherency != iommu_paging_structure_coherency(iommu)) return -EINVAL; if (dmar_domain->iommu_superpage != iommu_superpage_capability(iommu, dmar_domain->use_first_level)) return -EINVAL; if (dmar_domain->use_first_level && (!sm_supported(iommu) || !ecap_flts(iommu->ecap))) return -EINVAL; /* check if this iommu agaw is sufficient for max mapped address */ addr_width = agaw_to_width(iommu->agaw); if (addr_width > cap_mgaw(iommu->cap)) addr_width = cap_mgaw(iommu->cap); if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw) return -EINVAL; if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && context_copied(iommu, info->bus, info->devfn)) return intel_pasid_setup_sm_context(dev); return 0; } static int intel_iommu_attach_device(struct iommu_domain *domain, struct device *dev) { int ret; device_block_translation(dev); ret = paging_domain_compatible(domain, dev); if (ret) return ret; return dmar_domain_attach_device(to_dmar_domain(domain), dev); } static int intel_iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t hpa, size_t size, int iommu_prot, gfp_t gfp) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); u64 max_addr; int prot = 0; if (iommu_prot & IOMMU_READ) prot |= DMA_PTE_READ; if (iommu_prot & IOMMU_WRITE) prot |= DMA_PTE_WRITE; if (dmar_domain->set_pte_snp) prot |= DMA_PTE_SNP; max_addr = iova + size; if (dmar_domain->max_addr < max_addr) { u64 end; /* check if minimum agaw is sufficient for mapped address */ end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; if (end < max_addr) { pr_err("%s: iommu width (%d) is not " "sufficient for the mapped address (%llx)\n", __func__, dmar_domain->gaw, max_addr); return -EFAULT; } dmar_domain->max_addr = max_addr; } /* Round up size to next multiple of PAGE_SIZE, if it and the low bits of hpa would take us onto the next page */ size = aligned_nrpages(hpa, size); return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, hpa >> VTD_PAGE_SHIFT, size, prot, gfp); } static int intel_iommu_map_pages(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) { unsigned long pgshift = __ffs(pgsize); size_t size = pgcount << pgshift; int ret; if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) return -EINVAL; if (!IS_ALIGNED(iova | paddr, pgsize)) return -EINVAL; ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); if (!ret && mapped) *mapped = size; return ret; } static size_t intel_iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *gather) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); unsigned long start_pfn, last_pfn; int level = 0; /* Cope with horrid API which requires us to unmap more than the size argument if it happens to be a large-page mapping. */ if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level, GFP_ATOMIC))) return 0; if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) size = VTD_PAGE_SIZE << level_to_offset_bits(level); start_pfn = iova >> VTD_PAGE_SHIFT; last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist); if (dmar_domain->max_addr == iova + size) dmar_domain->max_addr = iova; /* * We do not use page-selective IOTLB invalidation in flush queue, * so there is no need to track page and sync iotlb. */ if (!iommu_iotlb_gather_queued(gather)) iommu_iotlb_gather_add_page(domain, gather, iova, size); return size; } static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, unsigned long iova, size_t pgsize, size_t pgcount, struct iommu_iotlb_gather *gather) { unsigned long pgshift = __ffs(pgsize); size_t size = pgcount << pgshift; return intel_iommu_unmap(domain, iova, size, gather); } static void intel_iommu_tlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *gather) { cache_tag_flush_range(to_dmar_domain(domain), gather->start, gather->end, list_empty(&gather->freelist)); iommu_put_pages_list(&gather->freelist); } static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct dma_pte *pte; int level = 0; u64 phys = 0; pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level, GFP_ATOMIC); if (pte && dma_pte_present(pte)) phys = dma_pte_addr(pte) + (iova & (BIT_MASK(level_to_offset_bits(level) + VTD_PAGE_SHIFT) - 1)); return phys; } static bool domain_support_force_snooping(struct dmar_domain *domain) { struct device_domain_info *info; bool support = true; assert_spin_locked(&domain->lock); list_for_each_entry(info, &domain->devices, link) { if (!ecap_sc_support(info->iommu->ecap)) { support = false; break; } } return support; } static void domain_set_force_snooping(struct dmar_domain *domain) { struct device_domain_info *info; assert_spin_locked(&domain->lock); /* * Second level page table supports per-PTE snoop control. The * iommu_map() interface will handle this by setting SNP bit. */ if (!domain->use_first_level) { domain->set_pte_snp = true; return; } list_for_each_entry(info, &domain->devices, link) intel_pasid_setup_page_snoop_control(info->iommu, info->dev, IOMMU_NO_PASID); } static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); unsigned long flags; if (dmar_domain->force_snooping) return true; spin_lock_irqsave(&dmar_domain->lock, flags); if (!domain_support_force_snooping(dmar_domain) || (!dmar_domain->use_first_level && dmar_domain->has_mappings)) { spin_unlock_irqrestore(&dmar_domain->lock, flags); return false; } domain_set_force_snooping(dmar_domain); dmar_domain->force_snooping = true; spin_unlock_irqrestore(&dmar_domain->lock, flags); return true; } static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) { struct device_domain_info *info = dev_iommu_priv_get(dev); switch (cap) { case IOMMU_CAP_CACHE_COHERENCY: case IOMMU_CAP_DEFERRED_FLUSH: return true; case IOMMU_CAP_PRE_BOOT_PROTECTION: return dmar_platform_optin(); case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: return ecap_sc_support(info->iommu->ecap); case IOMMU_CAP_DIRTY_TRACKING: return ssads_supported(info->iommu); default: return false; } } static struct iommu_device *intel_iommu_probe_device(struct device *dev) { struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; struct device_domain_info *info; struct intel_iommu *iommu; u8 bus, devfn; int ret; iommu = device_lookup_iommu(dev, &bus, &devfn); if (!iommu || !iommu->iommu.ops) return ERR_PTR(-ENODEV); info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); if (dev_is_real_dma_subdevice(dev)) { info->bus = pdev->bus->number; info->devfn = pdev->devfn; info->segment = pci_domain_nr(pdev->bus); } else { info->bus = bus; info->devfn = devfn; info->segment = iommu->segment; } info->dev = dev; info->iommu = iommu; if (dev_is_pci(dev)) { if (ecap_dev_iotlb_support(iommu->ecap) && pci_ats_supported(pdev) && dmar_ats_supported(pdev, iommu)) { info->ats_supported = 1; info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev); /* * For IOMMU that supports device IOTLB throttling * (DIT), we assign PFSID to the invalidation desc * of a VF such that IOMMU HW can gauge queue depth * at PF level. If DIT is not set, PFSID will be * treated as reserved, which should be set to 0. */ if (ecap_dit(iommu->ecap)) info->pfsid = pci_dev_id(pci_physfn(pdev)); info->ats_qdep = pci_ats_queue_depth(pdev); } if (sm_supported(iommu)) { if (pasid_supported(iommu)) { int features = pci_pasid_features(pdev); if (features >= 0) info->pasid_supported = features | 1; } if (info->ats_supported && ecap_prs(iommu->ecap) && pci_pri_supported(pdev)) info->pri_supported = 1; } } dev_iommu_priv_set(dev, info); if (pdev && pci_ats_supported(pdev)) { pci_prepare_ats(pdev, VTD_PAGE_SHIFT); ret = device_rbtree_insert(iommu, info); if (ret) goto free; } if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { ret = intel_pasid_alloc_table(dev); if (ret) { dev_err(dev, "PASID table allocation failed\n"); goto clear_rbtree; } if (!context_copied(iommu, info->bus, info->devfn)) { ret = intel_pasid_setup_sm_context(dev); if (ret) goto free_table; } } intel_iommu_debugfs_create_dev(info); /* * The PCIe spec, in its wisdom, declares that the behaviour of the * device is undefined if you enable PASID support after ATS support. * So always enable PASID support on devices which have it, even if * we can't yet know if we're ever going to use it. */ if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) info->pasid_enabled = 1; return &iommu->iommu; free_table: intel_pasid_free_table(dev); clear_rbtree: device_rbtree_remove(info); free: kfree(info); return ERR_PTR(ret); } static void intel_iommu_release_device(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; if (info->pasid_enabled) { pci_disable_pasid(to_pci_dev(dev)); info->pasid_enabled = 0; } mutex_lock(&iommu->iopf_lock); if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev))) device_rbtree_remove(info); mutex_unlock(&iommu->iopf_lock); if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && !context_copied(iommu, info->bus, info->devfn)) intel_pasid_teardown_sm_context(dev); intel_pasid_free_table(dev); intel_iommu_debugfs_remove_dev(info); kfree(info); set_dma_ops(dev, NULL); } static void intel_iommu_get_resv_regions(struct device *device, struct list_head *head) { int prot = DMA_PTE_READ | DMA_PTE_WRITE; struct iommu_resv_region *reg; struct dmar_rmrr_unit *rmrr; struct device *i_dev; int i; rcu_read_lock(); for_each_rmrr_units(rmrr) { for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, i, i_dev) { struct iommu_resv_region *resv; enum iommu_resv_type type; size_t length; if (i_dev != device && !is_downstream_to_pci_bridge(device, i_dev)) continue; length = rmrr->end_address - rmrr->base_address + 1; type = device_rmrr_is_relaxable(device) ? IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; resv = iommu_alloc_resv_region(rmrr->base_address, length, prot, type, GFP_ATOMIC); if (!resv) break; list_add_tail(&resv->list, head); } } rcu_read_unlock(); #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA if (dev_is_pci(device)) { struct pci_dev *pdev = to_pci_dev(device); if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { reg = iommu_alloc_resv_region(0, 1UL << 24, prot, IOMMU_RESV_DIRECT_RELAXABLE, GFP_KERNEL); if (reg) list_add_tail(&reg->list, head); } } #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 0, IOMMU_RESV_MSI, GFP_KERNEL); if (!reg) return; list_add_tail(&reg->list, head); } static struct iommu_group *intel_iommu_device_group(struct device *dev) { if (dev_is_pci(dev)) return pci_device_group(dev); return generic_device_group(dev); } static int intel_iommu_enable_sva(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu; if (!info || dmar_disabled) return -EINVAL; iommu = info->iommu; if (!iommu) return -EINVAL; if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) return -ENODEV; if (!info->pasid_enabled || !info->ats_enabled) return -EINVAL; /* * Devices having device-specific I/O fault handling should not * support PCI/PRI. The IOMMU side has no means to check the * capability of device-specific IOPF. Therefore, IOMMU can only * default that if the device driver enables SVA on a non-PRI * device, it will handle IOPF in its own way. */ if (!info->pri_supported) return 0; /* Devices supporting PRI should have it enabled. */ if (!info->pri_enabled) return -EINVAL; return 0; } static int context_flip_pri(struct device_domain_info *info, bool enable) { struct intel_iommu *iommu = info->iommu; u8 bus = info->bus, devfn = info->devfn; struct context_entry *context; u16 did; spin_lock(&iommu->lock); if (context_copied(iommu, bus, devfn)) { spin_unlock(&iommu->lock); return -EINVAL; } context = iommu_context_addr(iommu, bus, devfn, false); if (!context || !context_present(context)) { spin_unlock(&iommu->lock); return -ENODEV; } did = context_domain_id(context); if (enable) context_set_sm_pre(context); else context_clear_sm_pre(context); if (!ecap_coherent(iommu->ecap)) clflush_cache_range(context, sizeof(*context)); intel_context_flush_present(info, context, did, true); spin_unlock(&iommu->lock); return 0; } static int intel_iommu_enable_iopf(struct device *dev) { struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu; int ret; if (!pdev || !info || !info->ats_enabled || !info->pri_supported) return -ENODEV; if (info->pri_enabled) return -EBUSY; iommu = info->iommu; if (!iommu) return -EINVAL; /* PASID is required in PRG Response Message. */ if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev)) return -EINVAL; ret = pci_reset_pri(pdev); if (ret) return ret; ret = iopf_queue_add_device(iommu->iopf_queue, dev); if (ret) return ret; ret = context_flip_pri(info, true); if (ret) goto err_remove_device; ret = pci_enable_pri(pdev, PRQ_DEPTH); if (ret) goto err_clear_pri; info->pri_enabled = 1; return 0; err_clear_pri: context_flip_pri(info, false); err_remove_device: iopf_queue_remove_device(iommu->iopf_queue, dev); return ret; } static int intel_iommu_disable_iopf(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; if (!info->pri_enabled) return -EINVAL; /* Disable new PRI reception: */ context_flip_pri(info, false); /* * Remove device from fault queue and acknowledge all outstanding * PRQs to the device: */ iopf_queue_remove_device(iommu->iopf_queue, dev); /* * PCIe spec states that by clearing PRI enable bit, the Page * Request Interface will not issue new page requests, but has * outstanding page requests that have been transmitted or are * queued for transmission. This is supposed to be called after * the device driver has stopped DMA, all PASIDs have been * unbound and the outstanding PRQs have been drained. */ pci_disable_pri(to_pci_dev(dev)); info->pri_enabled = 0; return 0; } static int intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) { switch (feat) { case IOMMU_DEV_FEAT_IOPF: return intel_iommu_enable_iopf(dev); case IOMMU_DEV_FEAT_SVA: return intel_iommu_enable_sva(dev); default: return -ENODEV; } } static int intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) { switch (feat) { case IOMMU_DEV_FEAT_IOPF: return intel_iommu_disable_iopf(dev); case IOMMU_DEV_FEAT_SVA: return 0; default: return -ENODEV; } } static bool intel_iommu_is_attach_deferred(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); return translation_pre_enabled(info->iommu) && !info->domain; } /* * Check that the device does not live on an external facing PCI port that is * marked as untrusted. Such devices should not be able to apply quirks and * thus not be able to bypass the IOMMU restrictions. */ static bool risky_device(struct pci_dev *pdev) { if (pdev->untrusted) { pci_info(pdev, "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", pdev->vendor, pdev->device); pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); return true; } return false; } static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size) { cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1); return 0; } void domain_remove_dev_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dev_pasid_info *curr, *dev_pasid = NULL; struct intel_iommu *iommu = info->iommu; struct dmar_domain *dmar_domain; unsigned long flags; if (!domain) return; /* Identity domain has no meta data for pasid. */ if (domain->type == IOMMU_DOMAIN_IDENTITY) return; dmar_domain = to_dmar_domain(domain); spin_lock_irqsave(&dmar_domain->lock, flags); list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) { if (curr->dev == dev && curr->pasid == pasid) { list_del(&curr->link_domain); dev_pasid = curr; break; } } WARN_ON_ONCE(!dev_pasid); spin_unlock_irqrestore(&dmar_domain->lock, flags); cache_tag_unassign_domain(dmar_domain, dev, pasid); domain_detach_iommu(dmar_domain, iommu); intel_iommu_debugfs_remove_dev_pasid(dev_pasid); kfree(dev_pasid); } static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, struct iommu_domain *domain) { struct device_domain_info *info = dev_iommu_priv_get(dev); intel_pasid_tear_down_entry(info->iommu, dev, pasid, false); domain_remove_dev_pasid(domain, dev, pasid); } struct dev_pasid_info * domain_add_dev_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu = info->iommu; struct dev_pasid_info *dev_pasid; unsigned long flags; int ret; dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); if (!dev_pasid) return ERR_PTR(-ENOMEM); ret = domain_attach_iommu(dmar_domain, iommu); if (ret) goto out_free; ret = cache_tag_assign_domain(dmar_domain, dev, pasid); if (ret) goto out_detach_iommu; dev_pasid->dev = dev; dev_pasid->pasid = pasid; spin_lock_irqsave(&dmar_domain->lock, flags); list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); spin_unlock_irqrestore(&dmar_domain->lock, flags); return dev_pasid; out_detach_iommu: domain_detach_iommu(dmar_domain, iommu); out_free: kfree(dev_pasid); return ERR_PTR(ret); } static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid, struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu = info->iommu; struct dev_pasid_info *dev_pasid; int ret; if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING))) return -EINVAL; if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) return -EOPNOTSUPP; if (domain->dirty_ops) return -EINVAL; if (context_copied(iommu, info->bus, info->devfn)) return -EBUSY; ret = paging_domain_compatible(domain, dev); if (ret) return ret; dev_pasid = domain_add_dev_pasid(domain, dev, pasid); if (IS_ERR(dev_pasid)) return PTR_ERR(dev_pasid); if (dmar_domain->use_first_level) ret = domain_setup_first_level(iommu, dmar_domain, dev, pasid, old); else ret = domain_setup_second_level(iommu, dmar_domain, dev, pasid, old); if (ret) goto out_remove_dev_pasid; domain_remove_dev_pasid(old, dev, pasid); intel_iommu_debugfs_create_dev_pasid(dev_pasid); return 0; out_remove_dev_pasid: domain_remove_dev_pasid(domain, dev, pasid); return ret; } static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; struct iommu_hw_info_vtd *vtd; vtd = kzalloc(sizeof(*vtd), GFP_KERNEL); if (!vtd) return ERR_PTR(-ENOMEM); vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17; vtd->cap_reg = iommu->cap; vtd->ecap_reg = iommu->ecap; *length = sizeof(*vtd); *type = IOMMU_HW_INFO_TYPE_INTEL_VTD; return vtd; } /* * Set dirty tracking for the device list of a domain. The caller must * hold the domain->lock when calling it. */ static int device_set_dirty_tracking(struct list_head *devices, bool enable) { struct device_domain_info *info; int ret = 0; list_for_each_entry(info, devices, link) { ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev, IOMMU_NO_PASID, enable); if (ret) break; } return ret; } static int parent_domain_set_dirty_tracking(struct dmar_domain *domain, bool enable) { struct dmar_domain *s1_domain; unsigned long flags; int ret; spin_lock(&domain->s1_lock); list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { spin_lock_irqsave(&s1_domain->lock, flags); ret = device_set_dirty_tracking(&s1_domain->devices, enable); spin_unlock_irqrestore(&s1_domain->lock, flags); if (ret) goto err_unwind; } spin_unlock(&domain->s1_lock); return 0; err_unwind: list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { spin_lock_irqsave(&s1_domain->lock, flags); device_set_dirty_tracking(&s1_domain->devices, domain->dirty_tracking); spin_unlock_irqrestore(&s1_domain->lock, flags); } spin_unlock(&domain->s1_lock); return ret; } static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, bool enable) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); int ret; spin_lock(&dmar_domain->lock); if (dmar_domain->dirty_tracking == enable) goto out_unlock; ret = device_set_dirty_tracking(&dmar_domain->devices, enable); if (ret) goto err_unwind; if (dmar_domain->nested_parent) { ret = parent_domain_set_dirty_tracking(dmar_domain, enable); if (ret) goto err_unwind; } dmar_domain->dirty_tracking = enable; out_unlock: spin_unlock(&dmar_domain->lock); return 0; err_unwind: device_set_dirty_tracking(&dmar_domain->devices, dmar_domain->dirty_tracking); spin_unlock(&dmar_domain->lock); return ret; } static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain, unsigned long iova, size_t size, unsigned long flags, struct iommu_dirty_bitmap *dirty) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); unsigned long end = iova + size - 1; unsigned long pgsize; /* * IOMMUFD core calls into a dirty tracking disabled domain without an * IOVA bitmap set in order to clean dirty bits in all PTEs that might * have occurred when we stopped dirty tracking. This ensures that we * never inherit dirtied bits from a previous cycle. */ if (!dmar_domain->dirty_tracking && dirty->bitmap) return -EINVAL; do { struct dma_pte *pte; int lvl = 0; pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl, GFP_ATOMIC); pgsize = level_size(lvl) << VTD_PAGE_SHIFT; if (!pte || !dma_pte_present(pte)) { iova += pgsize; continue; } if (dma_sl_pte_test_and_clear_dirty(pte, flags)) iommu_dirty_bitmap_record(dirty, iova, pgsize); iova += pgsize; } while (iova < end); return 0; } static const struct iommu_dirty_ops intel_dirty_ops = { .set_dirty_tracking = intel_iommu_set_dirty_tracking, .read_and_clear_dirty = intel_iommu_read_and_clear_dirty, }; static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; struct context_entry *context; spin_lock(&iommu->lock); context = iommu_context_addr(iommu, bus, devfn, 1); if (!context) { spin_unlock(&iommu->lock); return -ENOMEM; } if (context_present(context) && !context_copied(iommu, bus, devfn)) { spin_unlock(&iommu->lock); return 0; } copied_context_tear_down(iommu, context, bus, devfn); context_clear_entry(context); context_set_domain_id(context, FLPT_DEFAULT_DID); /* * In pass through mode, AW must be programmed to indicate the largest * AGAW value supported by hardware. And ASR is ignored by hardware. */ context_set_address_width(context, iommu->msagaw); context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH); context_set_fault_enable(context); context_set_present(context); if (!ecap_coherent(iommu->ecap)) clflush_cache_range(context, sizeof(*context)); context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn); spin_unlock(&iommu->lock); return 0; } static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data) { struct device *dev = data; if (dev != &pdev->dev) return 0; return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff); } static int device_setup_pass_through(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); if (!dev_is_pci(dev)) return context_setup_pass_through(dev, info->bus, info->devfn); return pci_for_each_dma_alias(to_pci_dev(dev), context_setup_pass_through_cb, dev); } static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; int ret; device_block_translation(dev); if (dev_is_real_dma_subdevice(dev)) return 0; if (sm_supported(iommu)) { ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); if (!ret) iommu_enable_pci_caps(info); } else { ret = device_setup_pass_through(dev); } return ret; } static int identity_domain_set_dev_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid, struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; int ret; if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) return -EOPNOTSUPP; ret = domain_setup_passthrough(iommu, dev, pasid, old); if (ret) return ret; domain_remove_dev_pasid(old, dev, pasid); return 0; } static struct iommu_domain identity_domain = { .type = IOMMU_DOMAIN_IDENTITY, .ops = &(const struct iommu_domain_ops) { .attach_dev = identity_domain_attach_dev, .set_dev_pasid = identity_domain_set_dev_pasid, }, }; static struct iommu_domain *intel_iommu_domain_alloc_paging(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; struct dmar_domain *dmar_domain; bool first_stage; first_stage = first_level_by_default(iommu); dmar_domain = paging_domain_alloc(dev, first_stage); if (IS_ERR(dmar_domain)) return ERR_CAST(dmar_domain); return &dmar_domain->domain; } const struct iommu_ops intel_iommu_ops = { .blocked_domain = &blocking_domain, .release_domain = &blocking_domain, .identity_domain = &identity_domain, .capable = intel_iommu_capable, .hw_info = intel_iommu_hw_info, .domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags, .domain_alloc_sva = intel_svm_domain_alloc, .domain_alloc_paging = intel_iommu_domain_alloc_paging, .domain_alloc_nested = intel_iommu_domain_alloc_nested, .probe_device = intel_iommu_probe_device, .release_device = intel_iommu_release_device, .get_resv_regions = intel_iommu_get_resv_regions, .device_group = intel_iommu_device_group, .dev_enable_feat = intel_iommu_dev_enable_feat, .dev_disable_feat = intel_iommu_dev_disable_feat, .is_attach_deferred = intel_iommu_is_attach_deferred, .def_domain_type = device_def_domain_type, .remove_dev_pasid = intel_iommu_remove_dev_pasid, .pgsize_bitmap = SZ_4K, .page_response = intel_iommu_page_response, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = intel_iommu_attach_device, .set_dev_pasid = intel_iommu_set_dev_pasid, .map_pages = intel_iommu_map_pages, .unmap_pages = intel_iommu_unmap_pages, .iotlb_sync_map = intel_iommu_iotlb_sync_map, .flush_iotlb_all = intel_flush_iotlb_all, .iotlb_sync = intel_iommu_tlb_sync, .iova_to_phys = intel_iommu_iova_to_phys, .free = intel_iommu_domain_free, .enforce_cache_coherency = intel_iommu_enforce_cache_coherency, } }; static void quirk_iommu_igfx(struct pci_dev *dev) { if (risky_device(dev)) return; pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); disable_igfx_iommu = 1; } /* G4x/GM45 integrated gfx dmar support is totally busted. */ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); /* Broadwell igfx malfunctions with dmar */ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); static void quirk_iommu_rwbf(struct pci_dev *dev) { if (risky_device(dev)) return; /* * Mobile 4 Series Chipset neglects to set RWBF capability, * but needs it. Same seems to hold for the desktop versions. */ pci_info(dev, "Forcing write-buffer flush capability\n"); rwbf_quirk = 1; } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); #define GGC 0x52 #define GGC_MEMORY_SIZE_MASK (0xf << 8) #define GGC_MEMORY_SIZE_NONE (0x0 << 8) #define GGC_MEMORY_SIZE_1M (0x1 << 8) #define GGC_MEMORY_SIZE_2M (0x3 << 8) #define GGC_MEMORY_VT_ENABLED (0x8 << 8) #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) { unsigned short ggc; if (risky_device(dev)) return; if (pci_read_config_word(dev, GGC, &ggc)) return; if (!(ggc & GGC_MEMORY_VT_ENABLED)) { pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); disable_igfx_iommu = 1; } else if (!disable_igfx_iommu) { /* we have to ensure the gfx device is idle before we flush */ pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); iommu_set_dma_strict(); } } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); static void quirk_igfx_skip_te_disable(struct pci_dev *dev) { unsigned short ver; if (!IS_GFX_DEVICE(dev)) return; ver = (dev->device >> 8) & 0xff; if (ver != 0x45 && ver != 0x46 && ver != 0x4c && ver != 0x4e && ver != 0x8a && ver != 0x98 && ver != 0x9a && ver != 0xa7 && ver != 0x7d) return; if (risky_device(dev)) return; pci_info(dev, "Skip IOMMU disabling for graphics\n"); iommu_skip_te_disable = 1; } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); /* On Tylersburg chipsets, some BIOSes have been known to enable the ISOCH DMAR unit for the Azalia sound device, but not give it any TLB entries, which causes it to deadlock. Check for that. We do this in a function called from init_dmars(), instead of in a PCI quirk, because we don't want to print the obnoxious "BIOS broken" message if VT-d is actually disabled. */ static void __init check_tylersburg_isoch(void) { struct pci_dev *pdev; uint32_t vtisochctrl; /* If there's no Azalia in the system anyway, forget it. */ pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); if (!pdev) return; if (risky_device(pdev)) { pci_dev_put(pdev); return; } pci_dev_put(pdev); /* System Management Registers. Might be hidden, in which case we can't do the sanity check. But that's OK, because the known-broken BIOSes _don't_ actually hide it, so far. */ pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); if (!pdev) return; if (risky_device(pdev)) { pci_dev_put(pdev); return; } if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { pci_dev_put(pdev); return; } pci_dev_put(pdev); /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ if (vtisochctrl & 1) return; /* Drop all bits other than the number of TLB entries */ vtisochctrl &= 0x1c; /* If we have the recommended number of TLB entries (16), fine. */ if (vtisochctrl == 0x10) return; /* Zero TLB entries? You get to ride the short bus to school. */ if (!vtisochctrl) { WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" "BIOS vendor: %s; Ver: %s; Product Version: %s\n", dmi_get_system_info(DMI_BIOS_VENDOR), dmi_get_system_info(DMI_BIOS_VERSION), dmi_get_system_info(DMI_PRODUCT_VERSION)); iommu_identity_mapping |= IDENTMAP_AZALIA; return; } pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", vtisochctrl); } /* * Here we deal with a device TLB defect where device may inadvertently issue ATS * invalidation completion before posted writes initiated with translated address * that utilized translations matching the invalidation address range, violating * the invalidation completion ordering. * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is * vulnerable to this defect. In other words, any dTLB invalidation initiated not * under the control of the trusted/privileged host device driver must use this * quirk. * Device TLBs are invalidated under the following six conditions: * 1. Device driver does DMA API unmap IOVA * 2. Device driver unbind a PASID from a process, sva_unbind_device() * 3. PASID is torn down, after PASID cache is flushed. e.g. process * exit_mmap() due to crash * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where * VM has to free pages that were unmapped * 5. Userspace driver unmaps a DMA buffer * 6. Cache invalidation in vSVA usage (upcoming) * * For #1 and #2, device drivers are responsible for stopping DMA traffic * before unmap/unbind. For #3, iommu driver gets mmu_notifier to * invalidate TLB the same way as normal user unmap which will use this quirk. * The dTLB invalidation after PASID cache flush does not need this quirk. * * As a reminder, #6 will *NEED* this quirk as we enable nested translation. */ void quirk_extra_dev_tlb_flush(struct device_domain_info *info, unsigned long address, unsigned long mask, u32 pasid, u16 qdep) { u16 sid; if (likely(!info->dtlb_extra_inval)) return; sid = PCI_DEVID(info->bus, info->devfn); if (pasid == IOMMU_NO_PASID) { qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, qdep, address, mask); } else { qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid, pasid, qdep, address, mask); } } #define ecmd_get_status_code(res) (((res) & 0xff) >> 1) /* * Function to submit a command to the enhanced command interface. The * valid enhanced command descriptions are defined in Table 47 of the * VT-d spec. The VT-d hardware implementation may support some but not * all commands, which can be determined by checking the Enhanced * Command Capability Register. * * Return values: * - 0: Command successful without any error; * - Negative: software error value; * - Nonzero positive: failure status code defined in Table 48. */ int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob) { unsigned long flags; u64 res; int ret; if (!cap_ecmds(iommu->cap)) return -ENODEV; raw_spin_lock_irqsave(&iommu->register_lock, flags); res = dmar_readq(iommu->reg + DMAR_ECRSP_REG); if (res & DMA_ECMD_ECRSP_IP) { ret = -EBUSY; goto err; } /* * Unconditionally write the operand B, because * - There is no side effect if an ecmd doesn't require an * operand B, but we set the register to some value. * - It's not invoked in any critical path. The extra MMIO * write doesn't bring any performance concerns. */ dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob); dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT)); IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq, !(res & DMA_ECMD_ECRSP_IP), res); if (res & DMA_ECMD_ECRSP_IP) { ret = -ETIMEDOUT; goto err; } ret = ecmd_get_status_code(res); err: raw_spin_unlock_irqrestore(&iommu->register_lock, flags); return ret; }
26 25 26 14 4 5 4 4 2 4 4 2 2 1 3 3 1 6 5 4 3 1 6 1 1 4 3 3 1 1 1 1 1 1 2 2 2 9 8 8 4 2 1 1 8 8 5 4 1 1 4 4 4 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 /* * Copyright (c) 2007, 2020 Oracle and/or its affiliates. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/dma-mapping.h> /* for DMA_*_DEVICE */ #include "rds.h" /* * XXX * - build with sparse * - should we detect duplicate keys on a socket? hmm. * - an rdma is an mlock, apply rlimit? */ /* * get the number of pages by looking at the page indices that the start and * end addresses fall in. * * Returns 0 if the vec is invalid. It is invalid if the number of bytes * causes the address to wrap or overflows an unsigned int. This comes * from being stored in the 'length' member of 'struct scatterlist'. */ static unsigned int rds_pages_in_vec(struct rds_iovec *vec) { if ((vec->addr + vec->bytes <= vec->addr) || (vec->bytes > (u64)UINT_MAX)) return 0; return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) - (vec->addr >> PAGE_SHIFT); } static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key, struct rds_mr *insert) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct rds_mr *mr; while (*p) { parent = *p; mr = rb_entry(parent, struct rds_mr, r_rb_node); if (key < mr->r_key) p = &(*p)->rb_left; else if (key > mr->r_key) p = &(*p)->rb_right; else return mr; } if (insert) { rb_link_node(&insert->r_rb_node, parent, p); rb_insert_color(&insert->r_rb_node, root); kref_get(&insert->r_kref); } return NULL; } /* * Destroy the transport-specific part of a MR. */ static void rds_destroy_mr(struct rds_mr *mr) { struct rds_sock *rs = mr->r_sock; void *trans_private = NULL; unsigned long flags; rdsdebug("RDS: destroy mr key is %x refcnt %u\n", mr->r_key, kref_read(&mr->r_kref)); spin_lock_irqsave(&rs->rs_rdma_lock, flags); if (!RB_EMPTY_NODE(&mr->r_rb_node)) rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); trans_private = mr->r_trans_private; mr->r_trans_private = NULL; spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (trans_private) mr->r_trans->free_mr(trans_private, mr->r_invalidate); } void __rds_put_mr_final(struct kref *kref) { struct rds_mr *mr = container_of(kref, struct rds_mr, r_kref); rds_destroy_mr(mr); kfree(mr); } /* * By the time this is called we can't have any more ioctls called on * the socket so we don't need to worry about racing with others. */ void rds_rdma_drop_keys(struct rds_sock *rs) { struct rds_mr *mr; struct rb_node *node; unsigned long flags; /* Release any MRs associated with this socket */ spin_lock_irqsave(&rs->rs_rdma_lock, flags); while ((node = rb_first(&rs->rs_rdma_keys))) { mr = rb_entry(node, struct rds_mr, r_rb_node); if (mr->r_trans == rs->rs_transport) mr->r_invalidate = 0; rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); kref_put(&mr->r_kref, __rds_put_mr_final); spin_lock_irqsave(&rs->rs_rdma_lock, flags); } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (rs->rs_transport && rs->rs_transport->flush_mrs) rs->rs_transport->flush_mrs(); } /* * Helper function to pin user pages. */ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, struct page **pages, int write) { unsigned int gup_flags = FOLL_LONGTERM; int ret; if (write) gup_flags |= FOLL_WRITE; ret = pin_user_pages_fast(user_addr, nr_pages, gup_flags, pages); if (ret >= 0 && ret < nr_pages) { unpin_user_pages(pages, ret); ret = -EFAULT; } return ret; } static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, u64 *cookie_ret, struct rds_mr **mr_ret, struct rds_conn_path *cp) { struct rds_mr *mr = NULL, *found; struct scatterlist *sg = NULL; unsigned int nr_pages; struct page **pages = NULL; void *trans_private; unsigned long flags; rds_rdma_cookie_t cookie; unsigned int nents = 0; int need_odp = 0; long i; int ret; if (ipv6_addr_any(&rs->rs_bound_addr) || !rs->rs_transport) { ret = -ENOTCONN; /* XXX not a great errno */ goto out; } if (!rs->rs_transport->get_mr) { ret = -EOPNOTSUPP; goto out; } /* If the combination of the addr and size requested for this memory * region causes an integer overflow, return error. */ if (((args->vec.addr + args->vec.bytes) < args->vec.addr) || PAGE_ALIGN(args->vec.addr + args->vec.bytes) < (args->vec.addr + args->vec.bytes)) { ret = -EINVAL; goto out; } if (!can_do_mlock()) { ret = -EPERM; goto out; } nr_pages = rds_pages_in_vec(&args->vec); if (nr_pages == 0) { ret = -EINVAL; goto out; } /* Restrict the size of mr irrespective of underlying transport * To account for unaligned mr regions, subtract one from nr_pages */ if ((nr_pages - 1) > (RDS_MAX_MSG_SIZE >> PAGE_SHIFT)) { ret = -EMSGSIZE; goto out; } rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n", args->vec.addr, args->vec.bytes, nr_pages); /* XXX clamp nr_pages to limit the size of this alloc? */ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) { ret = -ENOMEM; goto out; } mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); if (!mr) { ret = -ENOMEM; goto out; } kref_init(&mr->r_kref); RB_CLEAR_NODE(&mr->r_rb_node); mr->r_trans = rs->rs_transport; mr->r_sock = rs; if (args->flags & RDS_RDMA_USE_ONCE) mr->r_use_once = 1; if (args->flags & RDS_RDMA_INVALIDATE) mr->r_invalidate = 1; if (args->flags & RDS_RDMA_READWRITE) mr->r_write = 1; /* * Pin the pages that make up the user buffer and transfer the page * pointers to the mr's sg array. We check to see if we've mapped * the whole region after transferring the partial page references * to the sg array so that we can have one page ref cleanup path. * * For now we have no flag that tells us whether the mapping is * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to * the zero page. */ ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1); if (ret == -EOPNOTSUPP) { need_odp = 1; } else if (ret <= 0) { goto out; } else { nents = ret; sg = kmalloc_array(nents, sizeof(*sg), GFP_KERNEL); if (!sg) { ret = -ENOMEM; goto out; } WARN_ON(!nents); sg_init_table(sg, nents); /* Stick all pages into the scatterlist */ for (i = 0 ; i < nents; i++) sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); rdsdebug("RDS: trans_private nents is %u\n", nents); } /* Obtain a transport specific MR. If this succeeds, the * s/g list is now owned by the MR. * Note that dma_map() implies that pending writes are * flushed to RAM, so no dma_sync is needed here. */ trans_private = rs->rs_transport->get_mr( sg, nents, rs, &mr->r_key, cp ? cp->cp_conn : NULL, args->vec.addr, args->vec.bytes, need_odp ? ODP_ZEROBASED : ODP_NOT_NEEDED); if (IS_ERR(trans_private)) { /* In ODP case, we don't GUP pages, so don't need * to release anything. */ if (!need_odp) { unpin_user_pages(pages, nr_pages); kfree(sg); } ret = PTR_ERR(trans_private); /* Trigger connection so that its ready for the next retry */ if (ret == -ENODEV && cp) rds_conn_connect_if_down(cp->cp_conn); goto out; } mr->r_trans_private = trans_private; rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n", mr->r_key, (void *)(unsigned long) args->cookie_addr); /* The user may pass us an unaligned address, but we can only * map page aligned regions. So we keep the offset, and build * a 64bit cookie containing <R_Key, offset> and pass that * around. */ if (need_odp) cookie = rds_rdma_make_cookie(mr->r_key, 0); else cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK); if (cookie_ret) *cookie_ret = cookie; if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long)args->cookie_addr)) { if (!need_odp) { unpin_user_pages(pages, nr_pages); kfree(sg); } ret = -EFAULT; goto out; } /* Inserting the new MR into the rbtree bumps its * reference count. */ spin_lock_irqsave(&rs->rs_rdma_lock, flags); found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); BUG_ON(found && found != mr); rdsdebug("RDS: get_mr key is %x\n", mr->r_key); if (mr_ret) { kref_get(&mr->r_kref); *mr_ret = mr; } ret = 0; out: kfree(pages); if (mr) kref_put(&mr->r_kref, __rds_put_mr_final); return ret; } int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_get_mr_args args; if (optlen != sizeof(struct rds_get_mr_args)) return -EINVAL; if (copy_from_sockptr(&args, optval, sizeof(struct rds_get_mr_args))) return -EFAULT; return __rds_rdma_map(rs, &args, NULL, NULL, NULL); } int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_get_mr_for_dest_args args; struct rds_get_mr_args new_args; if (optlen != sizeof(struct rds_get_mr_for_dest_args)) return -EINVAL; if (copy_from_sockptr(&args, optval, sizeof(struct rds_get_mr_for_dest_args))) return -EFAULT; /* * Initially, just behave like get_mr(). * TODO: Implement get_mr as wrapper around this * and deprecate it. */ new_args.vec = args.vec; new_args.cookie_addr = args.cookie_addr; new_args.flags = args.flags; return __rds_rdma_map(rs, &new_args, NULL, NULL, NULL); } /* * Free the MR indicated by the given R_Key */ int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_free_mr_args args; struct rds_mr *mr; unsigned long flags; if (optlen != sizeof(struct rds_free_mr_args)) return -EINVAL; if (copy_from_sockptr(&args, optval, sizeof(struct rds_free_mr_args))) return -EFAULT; /* Special case - a null cookie means flush all unused MRs */ if (args.cookie == 0) { if (!rs->rs_transport || !rs->rs_transport->flush_mrs) return -EINVAL; rs->rs_transport->flush_mrs(); return 0; } /* Look up the MR given its R_key and remove it from the rbtree * so nobody else finds it. * This should also prevent races with rds_rdma_unuse. */ spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL); if (mr) { rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); if (args.flags & RDS_RDMA_INVALIDATE) mr->r_invalidate = 1; } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (!mr) return -EINVAL; kref_put(&mr->r_kref, __rds_put_mr_final); return 0; } /* * This is called when we receive an extension header that * tells us this MR was used. It allows us to implement * use_once semantics */ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) { struct rds_mr *mr; unsigned long flags; int zot_me = 0; spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); if (!mr) { pr_debug("rds: trying to unuse MR with unknown r_key %u!\n", r_key); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); return; } /* Get a reference so that the MR won't go away before calling * sync_mr() below. */ kref_get(&mr->r_kref); /* If it is going to be freed, remove it from the tree now so * that no other thread can find it and free it. */ if (mr->r_use_once || force) { rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); zot_me = 1; } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); /* May have to issue a dma_sync on this memory region. * Note we could avoid this if the operation was a RDMA READ, * but at this point we can't tell. */ if (mr->r_trans->sync_mr) mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); /* Release the reference held above. */ kref_put(&mr->r_kref, __rds_put_mr_final); /* If the MR was marked as invalidate, this will * trigger an async flush. */ if (zot_me) kref_put(&mr->r_kref, __rds_put_mr_final); } void rds_rdma_free_op(struct rm_rdma_op *ro) { unsigned int i; if (ro->op_odp_mr) { kref_put(&ro->op_odp_mr->r_kref, __rds_put_mr_final); } else { for (i = 0; i < ro->op_nents; i++) { struct page *page = sg_page(&ro->op_sg[i]); /* Mark page dirty if it was possibly modified, which * is the case for a RDMA_READ which copies from remote * to local memory */ unpin_user_pages_dirty_lock(&page, 1, !ro->op_write); } } kfree(ro->op_notifier); ro->op_notifier = NULL; ro->op_active = 0; ro->op_odp_mr = NULL; } void rds_atomic_free_op(struct rm_atomic_op *ao) { struct page *page = sg_page(ao->op_sg); /* Mark page dirty if it was possibly modified, which * is the case for a RDMA_READ which copies from remote * to local memory */ unpin_user_pages_dirty_lock(&page, 1, true); kfree(ao->op_notifier); ao->op_notifier = NULL; ao->op_active = 0; } /* * Count the number of pages needed to describe an incoming iovec array. */ static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs) { int tot_pages = 0; unsigned int nr_pages; unsigned int i; /* figure out the number of pages in the vector */ for (i = 0; i < nr_iovecs; i++) { nr_pages = rds_pages_in_vec(&iov[i]); if (nr_pages == 0) return -EINVAL; tot_pages += nr_pages; /* * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, * so tot_pages cannot overflow without first going negative. */ if (tot_pages < 0) return -EINVAL; } return tot_pages; } int rds_rdma_extra_size(struct rds_rdma_args *args, struct rds_iov_vector *iov) { struct rds_iovec *vec; struct rds_iovec __user *local_vec; int tot_pages = 0; unsigned int nr_pages; unsigned int i; local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; if (args->nr_local == 0) return -EINVAL; if (args->nr_local > UIO_MAXIOV) return -EMSGSIZE; iov->iov = kcalloc(args->nr_local, sizeof(struct rds_iovec), GFP_KERNEL); if (!iov->iov) return -ENOMEM; vec = &iov->iov[0]; if (copy_from_user(vec, local_vec, args->nr_local * sizeof(struct rds_iovec))) return -EFAULT; iov->len = args->nr_local; /* figure out the number of pages in the vector */ for (i = 0; i < args->nr_local; i++, vec++) { nr_pages = rds_pages_in_vec(vec); if (nr_pages == 0) return -EINVAL; tot_pages += nr_pages; /* * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, * so tot_pages cannot overflow without first going negative. */ if (tot_pages < 0) return -EINVAL; } return tot_pages * sizeof(struct scatterlist); } /* * The application asks for a RDMA transfer. * Extract all arguments and set up the rdma_op */ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg, struct rds_iov_vector *vec) { struct rds_rdma_args *args; struct rm_rdma_op *op = &rm->rdma; int nr_pages; unsigned int nr_bytes; struct page **pages = NULL; struct rds_iovec *iovs; unsigned int i, j; int ret = 0; bool odp_supported = true; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) || rm->rdma.op_active) return -EINVAL; args = CMSG_DATA(cmsg); if (ipv6_addr_any(&rs->rs_bound_addr)) { ret = -ENOTCONN; /* XXX not a great errno */ goto out_ret; } if (args->nr_local > UIO_MAXIOV) { ret = -EMSGSIZE; goto out_ret; } if (vec->len != args->nr_local) { ret = -EINVAL; goto out_ret; } /* odp-mr is not supported for multiple requests within one message */ if (args->nr_local != 1) odp_supported = false; iovs = vec->iov; nr_pages = rds_rdma_pages(iovs, args->nr_local); if (nr_pages < 0) { ret = -EINVAL; goto out_ret; } pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) { ret = -ENOMEM; goto out_ret; } op->op_write = !!(args->flags & RDS_RDMA_READWRITE); op->op_fence = !!(args->flags & RDS_RDMA_FENCE); op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); op->op_silent = !!(args->flags & RDS_RDMA_SILENT); op->op_active = 1; op->op_recverr = rs->rs_recverr; op->op_odp_mr = NULL; WARN_ON(!nr_pages); op->op_sg = rds_message_alloc_sgs(rm, nr_pages); if (IS_ERR(op->op_sg)) { ret = PTR_ERR(op->op_sg); goto out_pages; } if (op->op_notify || op->op_recverr) { /* We allocate an uninitialized notifier here, because * we don't want to do that in the completion handler. We * would have to use GFP_ATOMIC there, and don't want to deal * with failed allocations. */ op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); if (!op->op_notifier) { ret = -ENOMEM; goto out_pages; } op->op_notifier->n_user_token = args->user_token; op->op_notifier->n_status = RDS_RDMA_SUCCESS; } /* The cookie contains the R_Key of the remote memory region, and * optionally an offset into it. This is how we implement RDMA into * unaligned memory. * When setting up the RDMA, we need to add that offset to the * destination address (which is really an offset into the MR) * FIXME: We may want to move this into ib_rdma.c */ op->op_rkey = rds_rdma_cookie_key(args->cookie); op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); nr_bytes = 0; rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", (unsigned long long)args->nr_local, (unsigned long long)args->remote_vec.addr, op->op_rkey); for (i = 0; i < args->nr_local; i++) { struct rds_iovec *iov = &iovs[i]; /* don't need to check, rds_rdma_pages() verified nr will be +nonzero */ unsigned int nr = rds_pages_in_vec(iov); rs->rs_user_addr = iov->addr; rs->rs_user_bytes = iov->bytes; /* If it's a WRITE operation, we want to pin the pages for reading. * If it's a READ operation, we need to pin the pages for writing. */ ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); if ((!odp_supported && ret <= 0) || (odp_supported && ret <= 0 && ret != -EOPNOTSUPP)) goto out_pages; if (ret == -EOPNOTSUPP) { struct rds_mr *local_odp_mr; if (!rs->rs_transport->get_mr) { ret = -EOPNOTSUPP; goto out_pages; } local_odp_mr = kzalloc(sizeof(*local_odp_mr), GFP_KERNEL); if (!local_odp_mr) { ret = -ENOMEM; goto out_pages; } RB_CLEAR_NODE(&local_odp_mr->r_rb_node); kref_init(&local_odp_mr->r_kref); local_odp_mr->r_trans = rs->rs_transport; local_odp_mr->r_sock = rs; local_odp_mr->r_trans_private = rs->rs_transport->get_mr( NULL, 0, rs, &local_odp_mr->r_key, NULL, iov->addr, iov->bytes, ODP_VIRTUAL); if (IS_ERR(local_odp_mr->r_trans_private)) { ret = PTR_ERR(local_odp_mr->r_trans_private); rdsdebug("get_mr ret %d %p\"", ret, local_odp_mr->r_trans_private); kfree(local_odp_mr); ret = -EOPNOTSUPP; goto out_pages; } rdsdebug("Need odp; local_odp_mr %p trans_private %p\n", local_odp_mr, local_odp_mr->r_trans_private); op->op_odp_mr = local_odp_mr; op->op_odp_addr = iov->addr; } rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", nr_bytes, nr, iov->bytes, iov->addr); nr_bytes += iov->bytes; for (j = 0; j < nr; j++) { unsigned int offset = iov->addr & ~PAGE_MASK; struct scatterlist *sg; sg = &op->op_sg[op->op_nents + j]; sg_set_page(sg, pages[j], min_t(unsigned int, iov->bytes, PAGE_SIZE - offset), offset); sg_dma_len(sg) = sg->length; rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n", sg->offset, sg->length, iov->addr, iov->bytes); iov->addr += sg->length; iov->bytes -= sg->length; } op->op_nents += nr; } if (nr_bytes > args->remote_vec.bytes) { rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", nr_bytes, (unsigned int) args->remote_vec.bytes); ret = -EINVAL; goto out_pages; } op->op_bytes = nr_bytes; ret = 0; out_pages: kfree(pages); out_ret: if (ret) rds_rdma_free_op(op); else rds_stats_inc(s_send_rdma); return ret; } /* * The application wants us to pass an RDMA destination (aka MR) * to the remote */ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { unsigned long flags; struct rds_mr *mr; u32 r_key; int err = 0; if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) || rm->m_rdma_cookie != 0) return -EINVAL; memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie)); /* We are reusing a previously mapped MR here. Most likely, the * application has written to the buffer, so we need to explicitly * flush those writes to RAM. Otherwise the HCA may not see them * when doing a DMA from that buffer. */ r_key = rds_rdma_cookie_key(rm->m_rdma_cookie); spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); if (!mr) err = -EINVAL; /* invalid r_key */ else kref_get(&mr->r_kref); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (mr) { mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); rm->rdma.op_rdma_mr = mr; } return err; } /* * The application passes us an address range it wants to enable RDMA * to/from. We map the area, and save the <R_Key,offset> pair * in rm->m_rdma_cookie. This causes it to be sent along to the peer * in an extension header. */ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) || rm->m_rdma_cookie != 0) return -EINVAL; return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr, rm->m_conn_path); } /* * Fill in rds_message for an atomic request. */ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { struct page *page = NULL; struct rds_atomic_args *args; int ret = 0; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args)) || rm->atomic.op_active) return -EINVAL; args = CMSG_DATA(cmsg); /* Nonmasked & masked cmsg ops converted to masked hw ops */ switch (cmsg->cmsg_type) { case RDS_CMSG_ATOMIC_FADD: rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; rm->atomic.op_m_fadd.add = args->fadd.add; rm->atomic.op_m_fadd.nocarry_mask = 0; break; case RDS_CMSG_MASKED_ATOMIC_FADD: rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; rm->atomic.op_m_fadd.add = args->m_fadd.add; rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask; break; case RDS_CMSG_ATOMIC_CSWP: rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; rm->atomic.op_m_cswp.compare = args->cswp.compare; rm->atomic.op_m_cswp.swap = args->cswp.swap; rm->atomic.op_m_cswp.compare_mask = ~0; rm->atomic.op_m_cswp.swap_mask = ~0; break; case RDS_CMSG_MASKED_ATOMIC_CSWP: rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; rm->atomic.op_m_cswp.compare = args->m_cswp.compare; rm->atomic.op_m_cswp.swap = args->m_cswp.swap; rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask; rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask; break; default: BUG(); /* should never happen */ } rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT); rm->atomic.op_active = 1; rm->atomic.op_recverr = rs->rs_recverr; rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); if (IS_ERR(rm->atomic.op_sg)) { ret = PTR_ERR(rm->atomic.op_sg); goto err; } /* verify 8 byte-aligned */ if (args->local_addr & 0x7) { ret = -EFAULT; goto err; } ret = rds_pin_pages(args->local_addr, 1, &page, 1); if (ret != 1) goto err; ret = 0; sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr)); if (rm->atomic.op_notify || rm->atomic.op_recverr) { /* We allocate an uninitialized notifier here, because * we don't want to do that in the completion handler. We * would have to use GFP_ATOMIC there, and don't want to deal * with failed allocations. */ rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL); if (!rm->atomic.op_notifier) { ret = -ENOMEM; goto err; } rm->atomic.op_notifier->n_user_token = args->user_token; rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS; } rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie); rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie); return ret; err: if (page) unpin_user_page(page); rm->atomic.op_active = 0; kfree(rm->atomic.op_notifier); return ret; }
24181 18752 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 /* * include/linux/topology.h * * Written by: Matthew Dobson, IBM Corporation * * Copyright (C) 2002, IBM Corp. * * All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or * NON INFRINGEMENT. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * Send feedback to <colpatch@us.ibm.com> */ #ifndef _LINUX_TOPOLOGY_H #define _LINUX_TOPOLOGY_H #include <linux/arch_topology.h> #include <linux/cpumask.h> #include <linux/bitops.h> #include <linux/mmzone.h> #include <linux/smp.h> #include <linux/percpu.h> #include <asm/topology.h> #ifndef nr_cpus_node #define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node)) #endif #define for_each_node_with_cpus(node) \ for_each_online_node(node) \ if (nr_cpus_node(node)) int arch_update_cpu_topology(void); /* Conform to ACPI 2.0 SLIT distance definitions */ #define LOCAL_DISTANCE 10 #define REMOTE_DISTANCE 20 #define DISTANCE_BITS 8 #ifndef node_distance #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE) #endif #ifndef RECLAIM_DISTANCE /* * If the distance between nodes in a system is larger than RECLAIM_DISTANCE * (in whatever arch specific measurement units returned by node_distance()) * and node_reclaim_mode is enabled then the VM will only call node_reclaim() * on nodes within this distance. */ #define RECLAIM_DISTANCE 30 #endif /* * The following tunable allows platforms to override the default node * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are * sufficiently fast that the default value actually hurts * performance. * * AMD EPYC machines use this because even though the 2-hop distance * is 32 (3.2x slower than a local memory access) performance actually * *improves* if allowed to reclaim memory and load balance tasks * between NUMA nodes 2-hops apart. */ extern int __read_mostly node_reclaim_distance; #ifndef PENALTY_FOR_NODE_WITH_CPUS #define PENALTY_FOR_NODE_WITH_CPUS (1) #endif #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DECLARE_PER_CPU(int, numa_node); #ifndef numa_node_id /* Returns the number of the current Node. */ static inline int numa_node_id(void) { return raw_cpu_read(numa_node); } #endif #ifndef cpu_to_node static inline int cpu_to_node(int cpu) { return per_cpu(numa_node, cpu); } #endif #ifndef set_numa_node static inline void set_numa_node(int node) { this_cpu_write(numa_node, node); } #endif #ifndef set_cpu_numa_node static inline void set_cpu_numa_node(int cpu, int node) { per_cpu(numa_node, cpu) = node; } #endif #else /* !CONFIG_USE_PERCPU_NUMA_NODE_ID */ /* Returns the number of the current Node. */ #ifndef numa_node_id static inline int numa_node_id(void) { return cpu_to_node(raw_smp_processor_id()); } #endif #endif /* [!]CONFIG_USE_PERCPU_NUMA_NODE_ID */ #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem(). */ DECLARE_PER_CPU(int, _numa_mem_); #ifndef set_numa_mem static inline void set_numa_mem(int node) { this_cpu_write(_numa_mem_, node); } #endif #ifndef numa_mem_id /* Returns the number of the nearest Node with memory */ static inline int numa_mem_id(void) { return raw_cpu_read(_numa_mem_); } #endif #ifndef cpu_to_mem static inline int cpu_to_mem(int cpu) { return per_cpu(_numa_mem_, cpu); } #endif #ifndef set_cpu_numa_mem static inline void set_cpu_numa_mem(int cpu, int node) { per_cpu(_numa_mem_, cpu) = node; } #endif #else /* !CONFIG_HAVE_MEMORYLESS_NODES */ #ifndef numa_mem_id /* Returns the number of the nearest Node with memory */ static inline int numa_mem_id(void) { return numa_node_id(); } #endif #ifndef cpu_to_mem static inline int cpu_to_mem(int cpu) { return cpu_to_node(cpu); } #endif #endif /* [!]CONFIG_HAVE_MEMORYLESS_NODES */ #if defined(topology_die_id) && defined(topology_die_cpumask) #define TOPOLOGY_DIE_SYSFS #endif #if defined(topology_cluster_id) && defined(topology_cluster_cpumask) #define TOPOLOGY_CLUSTER_SYSFS #endif #if defined(topology_book_id) && defined(topology_book_cpumask) #define TOPOLOGY_BOOK_SYSFS #endif #if defined(topology_drawer_id) && defined(topology_drawer_cpumask) #define TOPOLOGY_DRAWER_SYSFS #endif #ifndef topology_physical_package_id #define topology_physical_package_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_die_id #define topology_die_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_cluster_id #define topology_cluster_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_core_id #define topology_core_id(cpu) ((void)(cpu), 0) #endif #ifndef topology_book_id #define topology_book_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_drawer_id #define topology_drawer_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_ppin #define topology_ppin(cpu) ((void)(cpu), 0ull) #endif #ifndef topology_sibling_cpumask #define topology_sibling_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_core_cpumask #define topology_core_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_cluster_cpumask #define topology_cluster_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_die_cpumask #define topology_die_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_book_cpumask #define topology_book_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_drawer_cpumask #define topology_drawer_cpumask(cpu) cpumask_of(cpu) #endif #if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask) static inline const struct cpumask *cpu_smt_mask(int cpu) { return topology_sibling_cpumask(cpu); } #endif static inline const struct cpumask *cpu_cpu_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); } #ifdef CONFIG_NUMA int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node); extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops); #else static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) { return cpumask_nth_and(cpu, cpus, cpu_online_mask); } static inline const struct cpumask * sched_numa_hop_mask(unsigned int node, unsigned int hops) { return ERR_PTR(-EOPNOTSUPP); } #endif /* CONFIG_NUMA */ /** * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance * from a given node. * @mask: the iteration variable. * @node: the NUMA node to start the search from. * * Requires rcu_lock to be held. * * Yields cpu_online_mask for @node == NUMA_NO_NODE. */ #define for_each_numa_hop_mask(mask, node) \ for (unsigned int __hops = 0; \ mask = (node != NUMA_NO_NODE || __hops) ? \ sched_numa_hop_mask(node, __hops) : \ cpu_online_mask, \ !IS_ERR_OR_NULL(mask); \ __hops++) #endif /* _LINUX_TOPOLOGY_H */
6 4 4 2 6 6 6 6 6 6 3 2 2 1 3 3 2 2 1 3 4 4 4 3 3 3 11 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 // SPDX-License-Identifier: GPL-2.0-only /* tunnel4.c: Generic IP tunnel transformer. * * Copyright (C) 2003 David S. Miller (davem@redhat.com) */ #include <linux/init.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/mpls.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/icmp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/xfrm.h> static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly; static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly; static struct xfrm_tunnel __rcu *tunnelmpls4_handlers __read_mostly; static DEFINE_MUTEX(tunnel4_mutex); static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family) { return (family == AF_INET) ? &tunnel4_handlers : (family == AF_INET6) ? &tunnel64_handlers : &tunnelmpls4_handlers; } int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) { struct xfrm_tunnel __rcu **pprev; struct xfrm_tunnel *t; int ret = -EEXIST; int priority = handler->priority; mutex_lock(&tunnel4_mutex); for (pprev = fam_handlers(family); (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel4_mutex))) != NULL; pprev = &t->next) { if (t->priority > priority) break; if (t->priority == priority) goto err; } handler->next = *pprev; rcu_assign_pointer(*pprev, handler); ret = 0; err: mutex_unlock(&tunnel4_mutex); return ret; } EXPORT_SYMBOL(xfrm4_tunnel_register); int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) { struct xfrm_tunnel __rcu **pprev; struct xfrm_tunnel *t; int ret = -ENOENT; mutex_lock(&tunnel4_mutex); for (pprev = fam_handlers(family); (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel4_mutex))) != NULL; pprev = &t->next) { if (t == handler) { *pprev = handler->next; ret = 0; break; } } mutex_unlock(&tunnel4_mutex); synchronize_net(); return ret; } EXPORT_SYMBOL(xfrm4_tunnel_deregister); #define for_each_tunnel_rcu(head, handler) \ for (handler = rcu_dereference(head); \ handler != NULL; \ handler = rcu_dereference(handler->next)) \ static int tunnel4_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->handler(skb)) return 0; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) static int tunnel4_rcv_cb(struct sk_buff *skb, u8 proto, int err) { struct xfrm_tunnel __rcu *head; struct xfrm_tunnel *handler; int ret; head = (proto == IPPROTO_IPIP) ? tunnel4_handlers : tunnel64_handlers; for_each_tunnel_rcu(head, handler) { if (handler->cb_handler) { ret = handler->cb_handler(skb, err); if (ret <= 0) return ret; } } return 0; } static const struct xfrm_input_afinfo tunnel4_input_afinfo = { .family = AF_INET, .is_ipip = true, .callback = tunnel4_rcv_cb, }; #endif #if IS_ENABLED(CONFIG_IPV6) static int tunnel64_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->handler(skb)) return 0; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } #endif #if IS_ENABLED(CONFIG_MPLS) static int tunnelmpls4_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct mpls_label))) goto drop; for_each_tunnel_rcu(tunnelmpls4_handlers, handler) if (!handler->handler(skb)) return 0; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } #endif static int tunnel4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } #if IS_ENABLED(CONFIG_IPV6) static int tunnel64_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } #endif #if IS_ENABLED(CONFIG_MPLS) static int tunnelmpls4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnelmpls4_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } #endif static const struct net_protocol tunnel4_protocol = { .handler = tunnel4_rcv, .err_handler = tunnel4_err, .no_policy = 1, }; #if IS_ENABLED(CONFIG_IPV6) static const struct net_protocol tunnel64_protocol = { .handler = tunnel64_rcv, .err_handler = tunnel64_err, .no_policy = 1, }; #endif #if IS_ENABLED(CONFIG_MPLS) static const struct net_protocol tunnelmpls4_protocol = { .handler = tunnelmpls4_rcv, .err_handler = tunnelmpls4_err, .no_policy = 1, }; #endif static int __init tunnel4_init(void) { if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) goto err; #if IS_ENABLED(CONFIG_IPV6) if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) { inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); goto err; } #endif #if IS_ENABLED(CONFIG_MPLS) if (inet_add_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) { inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); #if IS_ENABLED(CONFIG_IPV6) inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6); #endif goto err; } #endif #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) if (xfrm_input_register_afinfo(&tunnel4_input_afinfo)) { inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); #if IS_ENABLED(CONFIG_IPV6) inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6); #endif #if IS_ENABLED(CONFIG_MPLS) inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS); #endif goto err; } #endif return 0; err: pr_err("%s: can't add protocol\n", __func__); return -EAGAIN; } static void __exit tunnel4_fini(void) { #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) if (xfrm_input_unregister_afinfo(&tunnel4_input_afinfo)) pr_err("tunnel4 close: can't remove input afinfo\n"); #endif #if IS_ENABLED(CONFIG_MPLS) if (inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) pr_err("tunnelmpls4 close: can't remove protocol\n"); #endif #if IS_ENABLED(CONFIG_IPV6) if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6)) pr_err("tunnel64 close: can't remove protocol\n"); #endif if (inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP)) pr_err("tunnel4 close: can't remove protocol\n"); } module_init(tunnel4_init); module_exit(tunnel4_fini); MODULE_DESCRIPTION("IPv4 XFRM tunnel library"); MODULE_LICENSE("GPL");
4 1200 1195 1198 1198 198 198 174 175 168 1198 699 1197 1200 1195 1199 1195 1199 1199 1205 1202 1199 1200 1197 1202 1198 1196 1202 1198 1199 1199 176 175 173 168 174 174 172 174 172 164 2 2 4 148 3 168 167 111 68 111 147 148 70 175 175 148 112 132 132 87 153 125 73 156 175 175 172 174 182 182 182 182 182 180 180 179 174 179 175 173 181 88 58 88 47 88 182 178 182 182 182 182 181 181 6 180 180 176 175 175 91 175 156 156 72 7 182 178 3 179 7 7 7 7 7 7 7 7 7 7 7 7 6 7 7 9 8 7 7 7 6 6 5 6 6 1 7 9 16 1 16 16 16 16 16 17 1 3 1 2 1 1 1 1 2 3 4 166 2 166 7 3 7 197 197 197 196 196 192 187 167 166 187 1 185 1 3 3 2 178 179 161 179 179 179 178 179 178 199 197 197 3 186 181 179 179 179 178 178 179 179 177 178 17 163 20 199 18 7 205 204 203 201 200 199 200 200 199 197 17 17 199 205 5 4 3 2 1 1 1 1 2 5 208 210 204 205 4 19 18 9 19 5 5 4 1 4 3 1 2 1 1 4 4 4 4 3 3 3 3 4 104 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 // SPDX-License-Identifier: GPL-2.0-only /* * Packet matching code. * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org> * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/capability.h> #include <linux/in.h> #include <linux/skbuff.h> #include <linux/kmod.h> #include <linux/vmalloc.h> #include <linux/netdevice.h> #include <linux/module.h> #include <linux/poison.h> #include <net/ipv6.h> #include <net/compat.h> #include <linux/uaccess.h> #include <linux/mutex.h> #include <linux/proc_fs.h> #include <linux/err.h> #include <linux/cpumask.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_log.h> #include "../../netfilter/xt_repldata.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("IPv6 packet filter"); void *ip6t_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(ip6t, IP6T); } EXPORT_SYMBOL_GPL(ip6t_alloc_initial_table); /* Returns whether matches rule or not. */ /* Performance critical - called for every packet */ static inline bool ip6_packet_match(const struct sk_buff *skb, const char *indev, const char *outdev, const struct ip6t_ip6 *ip6info, unsigned int *protoff, u16 *fragoff, bool *hotdrop) { unsigned long ret; const struct ipv6hdr *ipv6 = ipv6_hdr(skb); if (NF_INVF(ip6info, IP6T_INV_SRCIP, ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk, &ip6info->src)) || NF_INVF(ip6info, IP6T_INV_DSTIP, ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk, &ip6info->dst))) return false; ret = ifname_compare_aligned(indev, ip6info->iniface, ip6info->iniface_mask); if (NF_INVF(ip6info, IP6T_INV_VIA_IN, ret != 0)) return false; ret = ifname_compare_aligned(outdev, ip6info->outiface, ip6info->outiface_mask); if (NF_INVF(ip6info, IP6T_INV_VIA_OUT, ret != 0)) return false; /* ... might want to do something with class and flowlabel here ... */ /* look for the desired protocol header */ if (ip6info->flags & IP6T_F_PROTO) { int protohdr; unsigned short _frag_off; protohdr = ipv6_find_hdr(skb, protoff, -1, &_frag_off, NULL); if (protohdr < 0) { if (_frag_off == 0) *hotdrop = true; return false; } *fragoff = _frag_off; if (ip6info->proto == protohdr) { if (ip6info->invflags & IP6T_INV_PROTO) return false; return true; } /* We need match for the '-p all', too! */ if ((ip6info->proto != 0) && !(ip6info->invflags & IP6T_INV_PROTO)) return false; } return true; } /* should be ip6 safe */ static bool ip6_checkentry(const struct ip6t_ip6 *ipv6) { if (ipv6->flags & ~IP6T_F_MASK) return false; if (ipv6->invflags & ~IP6T_INV_MASK) return false; return true; } static unsigned int ip6t_error(struct sk_buff *skb, const struct xt_action_param *par) { net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo); return NF_DROP; } static inline struct ip6t_entry * get_entry(const void *base, unsigned int offset) { return (struct ip6t_entry *)(base + offset); } /* All zeroes == unconditional rule. */ /* Mildly perf critical (only if packet tracing is on) */ static inline bool unconditional(const struct ip6t_entry *e) { static const struct ip6t_ip6 uncond; return e->target_offset == sizeof(struct ip6t_entry) && memcmp(&e->ipv6, &uncond, sizeof(uncond)) == 0; } static inline const struct xt_entry_target * ip6t_get_target_c(const struct ip6t_entry *e) { return ip6t_get_target((struct ip6t_entry *)e); } #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* This cries for unification! */ static const char *const hooknames[] = { [NF_INET_PRE_ROUTING] = "PREROUTING", [NF_INET_LOCAL_IN] = "INPUT", [NF_INET_FORWARD] = "FORWARD", [NF_INET_LOCAL_OUT] = "OUTPUT", [NF_INET_POST_ROUTING] = "POSTROUTING", }; enum nf_ip_trace_comments { NF_IP6_TRACE_COMMENT_RULE, NF_IP6_TRACE_COMMENT_RETURN, NF_IP6_TRACE_COMMENT_POLICY, }; static const char *const comments[] = { [NF_IP6_TRACE_COMMENT_RULE] = "rule", [NF_IP6_TRACE_COMMENT_RETURN] = "return", [NF_IP6_TRACE_COMMENT_POLICY] = "policy", }; static const struct nf_loginfo trace_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { .level = LOGLEVEL_WARNING, .logflags = NF_LOG_DEFAULT_MASK, }, }, }; /* Mildly perf critical (only if packet tracing is on) */ static inline int get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e, const char *hookname, const char **chainname, const char **comment, unsigned int *rulenum) { const struct xt_standard_target *t = (void *)ip6t_get_target_c(s); if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) { /* Head of user chain: ERROR target with chainname */ *chainname = t->target.data; (*rulenum) = 0; } else if (s == e) { (*rulenum)++; if (unconditional(s) && strcmp(t->target.u.kernel.target->name, XT_STANDARD_TARGET) == 0 && t->verdict < 0) { /* Tail of chains: STANDARD target (return/policy) */ *comment = *chainname == hookname ? comments[NF_IP6_TRACE_COMMENT_POLICY] : comments[NF_IP6_TRACE_COMMENT_RETURN]; } return 1; } else (*rulenum)++; return 0; } static void trace_packet(struct net *net, const struct sk_buff *skb, unsigned int hook, const struct net_device *in, const struct net_device *out, const char *tablename, const struct xt_table_info *private, const struct ip6t_entry *e) { const struct ip6t_entry *root; const char *hookname, *chainname, *comment; const struct ip6t_entry *iter; unsigned int rulenum = 0; root = get_entry(private->entries, private->hook_entry[hook]); hookname = chainname = hooknames[hook]; comment = comments[NF_IP6_TRACE_COMMENT_RULE]; xt_entry_foreach(iter, root, private->size - private->hook_entry[hook]) if (get_chainname_rulenum(iter, e, hookname, &chainname, &comment, &rulenum) != 0) break; nf_log_trace(net, AF_INET6, hook, skb, in, out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", tablename, chainname, comment, rulenum); } #endif static inline struct ip6t_entry * ip6t_next_entry(const struct ip6t_entry *entry) { return (void *)entry + entry->next_offset; } /* Returns one of the generic firewall policies, like NF_ACCEPT. */ unsigned int ip6t_do_table(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { const struct xt_table *table = priv; unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); /* Initializing verdict to NF_DROP keeps gcc happy. */ unsigned int verdict = NF_DROP; const char *indev, *outdev; const void *table_base; struct ip6t_entry *e, **jumpstack; unsigned int stackidx, cpu; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; /* Initialization */ stackidx = 0; indev = state->in ? state->in->name : nulldevname; outdev = state->out ? state->out->name : nulldevname; /* We handle fragments by dealing with the first fragment as * if it was a normal packet. All other fragments are treated * normally, except that they will NEVER match rules that ask * things we don't know, ie. tcp syn flag or ports). If the * rule is also a fragment-specific rule, non-fragments won't * match it. */ acpar.fragoff = 0; acpar.hotdrop = false; acpar.state = state; WARN_ON(!(table->valid_hooks & (1 << hook))); local_bh_disable(); addend = xt_write_recseq_begin(); private = READ_ONCE(table->private); /* Address dependency. */ cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; /* Switch to alternate jumpstack if we're being invoked via TEE. * TEE issues XT_CONTINUE verdict on original skb so we must not * clobber the jumpstack. * * For recursion via REJECT or SYNPROXY the stack will be clobbered * but it is no problem since absolute verdict is issued by these. */ if (static_key_false(&xt_tee_enabled)) jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); e = get_entry(table_base, private->hook_entry[hook]); do { const struct xt_entry_target *t; const struct xt_entry_match *ematch; struct xt_counters *counter; WARN_ON(!e); acpar.thoff = 0; if (!ip6_packet_match(skb, indev, outdev, &e->ipv6, &acpar.thoff, &acpar.fragoff, &acpar.hotdrop)) { no_match: e = ip6t_next_entry(e); continue; } xt_ematch_foreach(ematch, e) { acpar.match = ematch->u.kernel.match; acpar.matchinfo = ematch->data; if (!acpar.match->match(skb, &acpar)) goto no_match; } counter = xt_get_this_cpu_counter(&e->counters); ADD_COUNTER(*counter, skb->len, 1); t = ip6t_get_target_c(e); WARN_ON(!t->u.kernel.target); #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* The packet is traced: log it */ if (unlikely(skb->nf_trace)) trace_packet(state->net, skb, hook, state->in, state->out, table->name, private, e); #endif /* Standard target? */ if (!t->u.kernel.target->target) { int v; v = ((struct xt_standard_target *)t)->verdict; if (v < 0) { /* Pop from stack? */ if (v != XT_RETURN) { verdict = (unsigned int)(-v) - 1; break; } if (stackidx == 0) e = get_entry(table_base, private->underflow[hook]); else e = ip6t_next_entry(jumpstack[--stackidx]); continue; } if (table_base + v != ip6t_next_entry(e) && !(e->ipv6.flags & IP6T_F_GOTO)) { if (unlikely(stackidx >= private->stacksize)) { verdict = NF_DROP; break; } jumpstack[stackidx++] = e; } e = get_entry(table_base, v); continue; } acpar.target = t->u.kernel.target; acpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, &acpar); if (verdict == XT_CONTINUE) e = ip6t_next_entry(e); else /* Verdict */ break; } while (!acpar.hotdrop); xt_write_recseq_end(addend); local_bh_enable(); if (acpar.hotdrop) return NF_DROP; else return verdict; } /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int mark_source_chains(const struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0, unsigned int *offsets) { unsigned int hook; /* No recursion; use packet counter to save back ptrs (reset to 0 as we leave), and comefrom to save source hook bitmask */ for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct ip6t_entry *e = entry0 + pos; if (!(valid_hooks & (1 << hook))) continue; /* Set initial back pointer. */ e->counters.pcnt = pos; for (;;) { const struct xt_standard_target *t = (void *)ip6t_get_target_c(e); int visited = e->comefrom & (1 << hook); if (e->comefrom & (1 << NF_INET_NUMHOOKS)) return 0; e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && t->verdict < 0) || visited) { unsigned int oldpos, size; /* Return: backtrack through the last big jump. */ do { e->comefrom ^= (1<<NF_INET_NUMHOOKS); oldpos = pos; pos = e->counters.pcnt; e->counters.pcnt = 0; /* We're at the start. */ if (pos == oldpos) goto next; e = entry0 + pos; } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = entry0 + pos + size; if (pos + size >= newinfo->size) return 0; e->counters.pcnt = pos; pos += size; } else { int newpos = t->verdict; if (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. */ if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; if (newpos >= newinfo->size) return 0; } e = entry0 + newpos; e->counters.pcnt = pos; pos = newpos; } } next: ; } return 1; } static void cleanup_match(struct xt_entry_match *m, struct net *net) { struct xt_mtdtor_param par; par.net = net; par.match = m->u.kernel.match; par.matchinfo = m->data; par.family = NFPROTO_IPV6; if (par.match->destroy != NULL) par.match->destroy(&par); module_put(par.match->me); } static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { const struct ip6t_ip6 *ipv6 = par->entryinfo; par->match = m->u.kernel.match; par->matchinfo = m->data; return xt_check_match(par, m->u.match_size - sizeof(*m), ipv6->proto, ipv6->invflags & IP6T_INV_PROTO); } static int find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { struct xt_match *match; int ret; match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, m->u.user.revision); if (IS_ERR(match)) return PTR_ERR(match); m->u.kernel.match = match; ret = check_match(m, par); if (ret) goto err; return 0; err: module_put(m->u.kernel.match->me); return ret; } static int check_target(struct ip6t_entry *e, struct net *net, const char *name) { struct xt_entry_target *t = ip6t_get_target(e); struct xt_tgchk_param par = { .net = net, .table = name, .entryinfo = e, .target = t->u.kernel.target, .targinfo = t->data, .hook_mask = e->comefrom, .family = NFPROTO_IPV6, }; return xt_check_target(&par, t->u.target_size - sizeof(*t), e->ipv6.proto, e->ipv6.invflags & IP6T_INV_PROTO); } static int find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, unsigned int size, struct xt_percpu_counter_alloc_state *alloc_state) { struct xt_entry_target *t; struct xt_target *target; int ret; unsigned int j; struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; if (!xt_percpu_counter_alloc(alloc_state, &e->counters)) return -ENOMEM; j = 0; memset(&mtpar, 0, sizeof(mtpar)); mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ipv6; mtpar.hook_mask = e->comefrom; mtpar.family = NFPROTO_IPV6; xt_ematch_foreach(ematch, e) { ret = find_check_match(ematch, &mtpar); if (ret != 0) goto cleanup_matches; ++j; } t = ip6t_get_target(e); target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { ret = PTR_ERR(target); goto cleanup_matches; } t->u.kernel.target = target; ret = check_target(e, net, name); if (ret) goto err; return 0; err: module_put(t->u.kernel.target->me); cleanup_matches: xt_ematch_foreach(ematch, e) { if (j-- == 0) break; cleanup_match(ematch, net); } xt_percpu_counter_free(&e->counters); return ret; } static bool check_underflow(const struct ip6t_entry *e) { const struct xt_entry_target *t; unsigned int verdict; if (!unconditional(e)) return false; t = ip6t_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) return false; verdict = ((struct xt_standard_target *)t)->verdict; verdict = -verdict - 1; return verdict == NF_DROP || verdict == NF_ACCEPT; } static int check_entry_size_and_hooks(struct ip6t_entry *e, struct xt_table_info *newinfo, const unsigned char *base, const unsigned char *limit, const unsigned int *hook_entries, const unsigned int *underflows, unsigned int valid_hooks) { unsigned int h; int err; if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 || (unsigned char *)e + sizeof(struct ip6t_entry) >= limit || (unsigned char *)e + e->next_offset > limit) return -EINVAL; if (e->next_offset < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) return -EINVAL; if (!ip6_checkentry(&e->ipv6)) return -EINVAL; err = xt_check_entry_offsets(e, e->elems, e->target_offset, e->next_offset); if (err) return err; /* Check hooks & underflows */ for (h = 0; h < NF_INET_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) continue; if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) return -EINVAL; newinfo->underflow[h] = underflows[h]; } } /* Clear counters and comefrom */ e->counters = ((struct xt_counters) { 0, 0 }); e->comefrom = 0; return 0; } static void cleanup_entry(struct ip6t_entry *e, struct net *net) { struct xt_tgdtor_param par; struct xt_entry_target *t; struct xt_entry_match *ematch; /* Cleanup all matches */ xt_ematch_foreach(ematch, e) cleanup_match(ematch, net); t = ip6t_get_target(e); par.net = net; par.target = t->u.kernel.target; par.targinfo = t->data; par.family = NFPROTO_IPV6; if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); xt_percpu_counter_free(&e->counters); } /* Checks and translates the user-supplied table segment (held in newinfo) */ static int translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, const struct ip6t_replace *repl) { struct xt_percpu_counter_alloc_state alloc_state = { 0 }; struct ip6t_entry *iter; unsigned int *offsets; unsigned int i; int ret = 0; newinfo->size = repl->size; newinfo->number = repl->num_entries; /* Init all hooks to impossible value. */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = 0xFFFFFFFF; newinfo->underflow[i] = 0xFFFFFFFF; } offsets = xt_alloc_entry_offsets(newinfo->number); if (!offsets) return -ENOMEM; i = 0; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter, entry0, newinfo->size) { ret = check_entry_size_and_hooks(iter, newinfo, entry0, entry0 + repl->size, repl->hook_entry, repl->underflow, repl->valid_hooks); if (ret != 0) goto out_free; if (i < repl->num_entries) offsets[i] = (void *)iter - entry0; ++i; if (strcmp(ip6t_get_target(iter)->u.user.name, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } ret = -EINVAL; if (i != repl->num_entries) goto out_free; ret = xt_check_table_hooks(newinfo, repl->valid_hooks); if (ret) goto out_free; if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { ret = -ELOOP; goto out_free; } kvfree(offsets); /* Finally, each sanity check must pass */ i = 0; xt_entry_foreach(iter, entry0, newinfo->size) { ret = find_check_entry(iter, net, repl->name, repl->size, &alloc_state); if (ret != 0) break; ++i; } if (ret != 0) { xt_entry_foreach(iter, entry0, newinfo->size) { if (i-- == 0) break; cleanup_entry(iter, net); } return ret; } return ret; out_free: kvfree(offsets); return ret; } static void get_counters(const struct xt_table_info *t, struct xt_counters counters[]) { struct ip6t_entry *iter; unsigned int cpu; unsigned int i; for_each_possible_cpu(cpu) { seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; xt_entry_foreach(iter, t->entries, t->size) { struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; tmp = xt_get_per_cpu_counter(&iter->counters, cpu); do { start = read_seqcount_begin(s); bcnt = tmp->bcnt; pcnt = tmp->pcnt; } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); ++i; cond_resched(); } } } static void get_old_counters(const struct xt_table_info *t, struct xt_counters counters[]) { struct ip6t_entry *iter; unsigned int cpu, i; for_each_possible_cpu(cpu) { i = 0; xt_entry_foreach(iter, t->entries, t->size) { const struct xt_counters *tmp; tmp = xt_get_per_cpu_counter(&iter->counters, cpu); ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt); ++i; } cond_resched(); } } static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; const struct xt_table_info *private = table->private; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care about). */ countersize = sizeof(struct xt_counters) * private->number; counters = vzalloc(countersize); if (counters == NULL) return ERR_PTR(-ENOMEM); get_counters(private, counters); return counters; } static int copy_entries_to_user(unsigned int total_size, const struct xt_table *table, void __user *userptr) { unsigned int off, num; const struct ip6t_entry *e; struct xt_counters *counters; const struct xt_table_info *private = table->private; int ret = 0; const void *loc_cpu_entry; counters = alloc_counters(table); if (IS_ERR(counters)) return PTR_ERR(counters); loc_cpu_entry = private->entries; /* FIXME: use iterator macros --RR */ /* ... then go back and fix counters and names */ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ unsigned int i; const struct xt_entry_match *m; const struct xt_entry_target *t; e = loc_cpu_entry + off; if (copy_to_user(userptr + off, e, sizeof(*e))) { ret = -EFAULT; goto free_counters; } if (copy_to_user(userptr + off + offsetof(struct ip6t_entry, counters), &counters[num], sizeof(counters[num])) != 0) { ret = -EFAULT; goto free_counters; } for (i = sizeof(struct ip6t_entry); i < e->target_offset; i += m->u.match_size) { m = (void *)e + i; if (xt_match_to_user(m, userptr + off + i)) { ret = -EFAULT; goto free_counters; } } t = ip6t_get_target_c(e); if (xt_target_to_user(t, userptr + off + e->target_offset)) { ret = -EFAULT; goto free_counters; } } free_counters: vfree(counters); return ret; } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT static void compat_standard_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; if (v > 0) v += xt_compat_calc_jump(AF_INET6, v); memcpy(dst, &v, sizeof(v)); } static int compat_standard_to_user(void __user *dst, const void *src) { compat_int_t cv = *(int *)src; if (cv > 0) cv -= xt_compat_calc_jump(AF_INET6, cv); return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; } static int compat_calc_entry(const struct ip6t_entry *e, const struct xt_table_info *info, const void *base, struct xt_table_info *newinfo) { const struct xt_entry_match *ematch; const struct xt_entry_target *t; unsigned int entry_offset; int off, i, ret; off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); entry_offset = (void *)e - base; xt_ematch_foreach(ematch, e) off += xt_compat_match_offset(ematch->u.kernel.match); t = ip6t_get_target_c(e); off += xt_compat_target_offset(t->u.kernel.target); newinfo->size -= off; ret = xt_compat_add_offset(AF_INET6, entry_offset, off); if (ret) return ret; for (i = 0; i < NF_INET_NUMHOOKS; i++) { if (info->hook_entry[i] && (e < (struct ip6t_entry *)(base + info->hook_entry[i]))) newinfo->hook_entry[i] -= off; if (info->underflow[i] && (e < (struct ip6t_entry *)(base + info->underflow[i]))) newinfo->underflow[i] -= off; } return 0; } static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { struct ip6t_entry *iter; const void *loc_cpu_entry; int ret; if (!newinfo || !info) return -EINVAL; /* we dont care about newinfo->entries */ memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries; ret = xt_compat_init_offsets(AF_INET6, info->number); if (ret) return ret; xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) return ret; } return 0; } #endif static int get_info(struct net *net, void __user *user, const int *len) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; int ret; if (*len != sizeof(struct ip6t_getinfo)) return -EINVAL; if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_lock(AF_INET6); #endif t = xt_request_find_table_lock(net, AF_INET6, name); if (!IS_ERR(t)) { struct ip6t_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct xt_table_info tmp; if (in_compat_syscall()) { ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(AF_INET6); private = &tmp; } #endif memset(&info, 0, sizeof(info)); info.valid_hooks = t->valid_hooks; memcpy(info.hook_entry, private->hook_entry, sizeof(info.hook_entry)); memcpy(info.underflow, private->underflow, sizeof(info.underflow)); info.num_entries = private->number; info.size = private->size; strcpy(info.name, name); if (copy_to_user(user, &info, *len) != 0) ret = -EFAULT; else ret = 0; xt_table_unlock(t); module_put(t->me); } else ret = PTR_ERR(t); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_unlock(AF_INET6); #endif return ret; } static int get_entries(struct net *net, struct ip6t_get_entries __user *uptr, const int *len) { int ret; struct ip6t_get_entries get; struct xt_table *t; if (*len < sizeof(get)) return -EINVAL; if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; if (*len != sizeof(struct ip6t_get_entries) + get.size) return -EINVAL; get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR(t)) { struct xt_table_info *private = t->private; if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); else ret = -EAGAIN; module_put(t->me); xt_table_unlock(t); } else ret = PTR_ERR(t); return ret; } static int __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_table_info *newinfo, unsigned int num_counters, void __user *counters_ptr) { int ret; struct xt_table *t; struct xt_table_info *oldinfo; struct xt_counters *counters; struct ip6t_entry *iter; counters = xt_counters_alloc(num_counters); if (!counters) { ret = -ENOMEM; goto out; } t = xt_request_find_table_lock(net, AF_INET6, name); if (IS_ERR(t)) { ret = PTR_ERR(t); goto free_newinfo_counters_untrans; } /* You lied! */ if (valid_hooks != t->valid_hooks) { ret = -EINVAL; goto put_module; } oldinfo = xt_replace_table(t, num_counters, newinfo, &ret); if (!oldinfo) goto put_module; /* Update module usage count based on number of rules */ if ((oldinfo->number > oldinfo->initial_entries) || (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); if ((oldinfo->number > oldinfo->initial_entries) && (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); xt_table_unlock(t); get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) cleanup_entry(iter, net); xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, sizeof(struct xt_counters) * num_counters) != 0) { /* Silent error, can't fail, new table is already in place */ net_warn_ratelimited("ip6tables: counters copy to user failed while replacing table\n"); } vfree(counters); return 0; put_module: module_put(t->me); xt_table_unlock(t); free_newinfo_counters_untrans: vfree(counters); out: return ret; } static int do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct ip6t_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; struct ip6t_entry *iter; if (len < sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; if ((u64)len < (u64)tmp.size + sizeof(tmp)) return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } ret = translate_table(net, newinfo, loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, tmp.counters); if (ret) goto free_newinfo_untrans; return 0; free_newinfo_untrans: xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; } static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len) { unsigned int i; struct xt_counters_info tmp; struct xt_counters *paddc; struct xt_table *t; const struct xt_table_info *private; int ret = 0; struct ip6t_entry *iter; unsigned int addend; paddc = xt_copy_counters(arg, len, &tmp); if (IS_ERR(paddc)) return PTR_ERR(paddc); t = xt_find_table_lock(net, AF_INET6, tmp.name); if (IS_ERR(t)) { ret = PTR_ERR(t); goto free; } local_bh_disable(); private = t->private; if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; } i = 0; addend = xt_write_recseq_begin(); xt_entry_foreach(iter, private->entries, private->size) { struct xt_counters *tmp; tmp = xt_get_this_cpu_counter(&iter->counters); ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt); ++i; } xt_write_recseq_end(addend); unlock_up_free: local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: vfree(paddc); return ret; } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_ip6t_replace { char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; u32 num_entries; u32 size; u32 hook_entry[NF_INET_NUMHOOKS]; u32 underflow[NF_INET_NUMHOOKS]; u32 num_counters; compat_uptr_t counters; /* struct xt_counters * */ struct compat_ip6t_entry entries[]; }; static int compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr, unsigned int *size, struct xt_counters *counters, unsigned int i) { struct xt_entry_target *t; struct compat_ip6t_entry __user *ce; u_int16_t target_offset, next_offset; compat_uint_t origsize; const struct xt_entry_match *ematch; int ret = 0; origsize = *size; ce = *dstptr; if (copy_to_user(ce, e, sizeof(struct ip6t_entry)) != 0 || copy_to_user(&ce->counters, &counters[i], sizeof(counters[i])) != 0) return -EFAULT; *dstptr += sizeof(struct compat_ip6t_entry); *size -= sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); xt_ematch_foreach(ematch, e) { ret = xt_compat_match_to_user(ematch, dstptr, size); if (ret != 0) return ret; } target_offset = e->target_offset - (origsize - *size); t = ip6t_get_target(e); ret = xt_compat_target_to_user(t, dstptr, size); if (ret) return ret; next_offset = e->next_offset - (origsize - *size); if (put_user(target_offset, &ce->target_offset) != 0 || put_user(next_offset, &ce->next_offset) != 0) return -EFAULT; return 0; } static int compat_find_calc_match(struct xt_entry_match *m, const struct ip6t_ip6 *ipv6, int *size) { struct xt_match *match; match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, m->u.user.revision); if (IS_ERR(match)) return PTR_ERR(match); m->u.kernel.match = match; *size += xt_compat_match_offset(match); return 0; } static void compat_release_entry(struct compat_ip6t_entry *e) { struct xt_entry_target *t; struct xt_entry_match *ematch; /* Cleanup all matches */ xt_ematch_foreach(ematch, e) module_put(ematch->u.kernel.match->me); t = compat_ip6t_get_target(e); module_put(t->u.kernel.target->me); } static int check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, struct xt_table_info *newinfo, unsigned int *size, const unsigned char *base, const unsigned char *limit) { struct xt_entry_match *ematch; struct xt_entry_target *t; struct xt_target *target; unsigned int entry_offset; unsigned int j; int ret, off; if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 || (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit || (unsigned char *)e + e->next_offset > limit) return -EINVAL; if (e->next_offset < sizeof(struct compat_ip6t_entry) + sizeof(struct compat_xt_entry_target)) return -EINVAL; if (!ip6_checkentry(&e->ipv6)) return -EINVAL; ret = xt_compat_check_entry_offsets(e, e->elems, e->target_offset, e->next_offset); if (ret) return ret; off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); entry_offset = (void *)e - (void *)base; j = 0; xt_ematch_foreach(ematch, e) { ret = compat_find_calc_match(ematch, &e->ipv6, &off); if (ret != 0) goto release_matches; ++j; } t = compat_ip6t_get_target(e); target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { ret = PTR_ERR(target); goto release_matches; } t->u.kernel.target = target; off += xt_compat_target_offset(target); *size += off; ret = xt_compat_add_offset(AF_INET6, entry_offset, off); if (ret) goto out; return 0; out: module_put(t->u.kernel.target->me); release_matches: xt_ematch_foreach(ematch, e) { if (j-- == 0) break; module_put(ematch->u.kernel.match->me); } return ret; } static void compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr, unsigned int *size, struct xt_table_info *newinfo, unsigned char *base) { struct xt_entry_target *t; struct ip6t_entry *de; unsigned int origsize; int h; struct xt_entry_match *ematch; origsize = *size; de = *dstptr; memcpy(de, e, sizeof(struct ip6t_entry)); memcpy(&de->counters, &e->counters, sizeof(e->counters)); *dstptr += sizeof(struct ip6t_entry); *size += sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); xt_ematch_foreach(ematch, e) xt_compat_match_from_user(ematch, dstptr, size); de->target_offset = e->target_offset - (origsize - *size); t = compat_ip6t_get_target(e); xt_compat_target_from_user(t, dstptr, size); de->next_offset = e->next_offset - (origsize - *size); for (h = 0; h < NF_INET_NUMHOOKS; h++) { if ((unsigned char *)de - base < newinfo->hook_entry[h]) newinfo->hook_entry[h] -= origsize - *size; if ((unsigned char *)de - base < newinfo->underflow[h]) newinfo->underflow[h] -= origsize - *size; } } static int translate_compat_table(struct net *net, struct xt_table_info **pinfo, void **pentry0, const struct compat_ip6t_replace *compatr) { unsigned int i, j; struct xt_table_info *newinfo, *info; void *pos, *entry0, *entry1; struct compat_ip6t_entry *iter0; struct ip6t_replace repl; unsigned int size; int ret; info = *pinfo; entry0 = *pentry0; size = compatr->size; info->number = compatr->num_entries; j = 0; xt_compat_lock(AF_INET6); ret = xt_compat_init_offsets(AF_INET6, compatr->num_entries); if (ret) goto out_unlock; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, compatr->size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, entry0, entry0 + compatr->size); if (ret != 0) goto out_unlock; ++j; } ret = -EINVAL; if (j != compatr->num_entries) goto out_unlock; ret = -ENOMEM; newinfo = xt_alloc_table_info(size); if (!newinfo) goto out_unlock; memset(newinfo->entries, 0, size); newinfo->number = compatr->num_entries; for (i = 0; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = compatr->hook_entry[i]; newinfo->underflow[i] = compatr->underflow[i]; } entry1 = newinfo->entries; pos = entry1; size = compatr->size; xt_entry_foreach(iter0, entry0, compatr->size) compat_copy_entry_from_user(iter0, &pos, &size, newinfo, entry1); /* all module references in entry0 are now gone. */ xt_compat_flush_offsets(AF_INET6); xt_compat_unlock(AF_INET6); memcpy(&repl, compatr, sizeof(*compatr)); for (i = 0; i < NF_INET_NUMHOOKS; i++) { repl.hook_entry[i] = newinfo->hook_entry[i]; repl.underflow[i] = newinfo->underflow[i]; } repl.num_counters = 0; repl.counters = NULL; repl.size = newinfo->size; ret = translate_table(net, newinfo, entry1, &repl); if (ret) goto free_newinfo; *pinfo = newinfo; *pentry0 = entry1; xt_free_table_info(info); return 0; free_newinfo: xt_free_table_info(newinfo); return ret; out_unlock: xt_compat_flush_offsets(AF_INET6); xt_compat_unlock(AF_INET6); xt_entry_foreach(iter0, entry0, compatr->size) { if (j-- == 0) break; compat_release_entry(iter0); } return ret; } static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct compat_ip6t_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; struct ip6t_entry *iter; if (len < sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; if ((u64)len < (u64)tmp.size + sizeof(tmp)) return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } ret = translate_compat_table(net, &newinfo, &loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, compat_ptr(tmp.counters)); if (ret) goto free_newinfo_untrans; return 0; free_newinfo_untrans: xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; } struct compat_ip6t_get_entries { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; struct compat_ip6t_entry entrytable[]; }; static int compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { struct xt_counters *counters; const struct xt_table_info *private = table->private; void __user *pos; unsigned int size; int ret = 0; unsigned int i = 0; struct ip6t_entry *iter; counters = alloc_counters(table); if (IS_ERR(counters)) return PTR_ERR(counters); pos = userptr; size = total_size; xt_entry_foreach(iter, private->entries, total_size) { ret = compat_copy_entry_to_user(iter, &pos, &size, counters, i++); if (ret != 0) break; } vfree(counters); return ret; } static int compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, int *len) { int ret; struct compat_ip6t_get_entries get; struct xt_table *t; if (*len < sizeof(get)) return -EINVAL; if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; if (*len != sizeof(struct compat_ip6t_get_entries) + get.size) return -EINVAL; get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(AF_INET6); t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; ret = compat_table_info(private, &info); if (!ret && get.size == info.size) ret = compat_copy_entries_to_user(private->size, t, uptr->entrytable); else if (!ret) ret = -EAGAIN; xt_compat_flush_offsets(AF_INET6); module_put(t->me); xt_table_unlock(t); } else ret = PTR_ERR(t); xt_compat_unlock(AF_INET6); return ret; } #endif static int do_ip6t_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len) { int ret; if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case IP6T_SO_SET_REPLACE: #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_do_replace(sock_net(sk), arg, len); else #endif ret = do_replace(sock_net(sk), arg, len); break; case IP6T_SO_SET_ADD_COUNTERS: ret = do_add_counters(sock_net(sk), arg, len); break; default: ret = -EINVAL; } return ret; } static int do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { int ret; if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case IP6T_SO_GET_INFO: ret = get_info(sock_net(sk), user, len); break; case IP6T_SO_GET_ENTRIES: #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_get_entries(sock_net(sk), user, len); else #endif ret = get_entries(sock_net(sk), user, len); break; case IP6T_SO_GET_REVISION_MATCH: case IP6T_SO_GET_REVISION_TARGET: { struct xt_get_revision rev; int target; if (*len != sizeof(rev)) { ret = -EINVAL; break; } if (copy_from_user(&rev, user, sizeof(rev)) != 0) { ret = -EFAULT; break; } rev.name[sizeof(rev.name)-1] = 0; if (cmd == IP6T_SO_GET_REVISION_TARGET) target = 1; else target = 0; try_then_request_module(xt_find_revision(AF_INET6, rev.name, rev.revision, target, &ret), "ip6t_%s", rev.name); break; } default: ret = -EINVAL; } return ret; } static void __ip6t_unregister_table(struct net *net, struct xt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; struct module *table_owner = table->me; struct ip6t_entry *iter; private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) cleanup_entry(iter, net); if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); } int ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl, const struct nf_hook_ops *template_ops) { struct nf_hook_ops *ops; unsigned int num_ops; int ret, i; struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; void *loc_cpu_entry; struct xt_table *new_table; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); if (ret != 0) { xt_free_table_info(newinfo); return ret; } new_table = xt_register_table(net, table, &bootstrap, newinfo); if (IS_ERR(new_table)) { struct ip6t_entry *iter; xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); xt_free_table_info(newinfo); return PTR_ERR(new_table); } if (!template_ops) return 0; num_ops = hweight32(table->valid_hooks); if (num_ops == 0) { ret = -EINVAL; goto out_free; } ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); if (!ops) { ret = -ENOMEM; goto out_free; } for (i = 0; i < num_ops; i++) ops[i].priv = new_table; new_table->ops = ops; ret = nf_register_net_hooks(net, ops, num_ops); if (ret != 0) goto out_free; return ret; out_free: __ip6t_unregister_table(net, new_table); return ret; } void ip6t_unregister_table_pre_exit(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); if (table) nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } void ip6t_unregister_table_exit(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); if (table) __ip6t_unregister_table(net, table); } /* The built-in targets: standard (NULL) and error. */ static struct xt_target ip6t_builtin_tg[] __read_mostly = { { .name = XT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NFPROTO_IPV6, #ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(compat_int_t), .compat_from_user = compat_standard_from_user, .compat_to_user = compat_standard_to_user, #endif }, { .name = XT_ERROR_TARGET, .target = ip6t_error, .targetsize = XT_FUNCTION_MAXNAMELEN, .family = NFPROTO_IPV6, }, }; static struct nf_sockopt_ops ip6t_sockopts = { .pf = PF_INET6, .set_optmin = IP6T_BASE_CTL, .set_optmax = IP6T_SO_SET_MAX+1, .set = do_ip6t_set_ctl, .get_optmin = IP6T_BASE_CTL, .get_optmax = IP6T_SO_GET_MAX+1, .get = do_ip6t_get_ctl, .owner = THIS_MODULE, }; static int __net_init ip6_tables_net_init(struct net *net) { return xt_proto_init(net, NFPROTO_IPV6); } static void __net_exit ip6_tables_net_exit(struct net *net) { xt_proto_fini(net, NFPROTO_IPV6); } static struct pernet_operations ip6_tables_net_ops = { .init = ip6_tables_net_init, .exit = ip6_tables_net_exit, }; static int __init ip6_tables_init(void) { int ret; ret = register_pernet_subsys(&ip6_tables_net_ops); if (ret < 0) goto err1; /* No one else will be downing sem now, so we won't sleep */ ret = xt_register_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); if (ret < 0) goto err2; /* Register setsockopt */ ret = nf_register_sockopt(&ip6t_sockopts); if (ret < 0) goto err4; return 0; err4: xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); err2: unregister_pernet_subsys(&ip6_tables_net_ops); err1: return ret; } static void __exit ip6_tables_fini(void) { nf_unregister_sockopt(&ip6t_sockopts); xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); unregister_pernet_subsys(&ip6_tables_net_ops); } EXPORT_SYMBOL(ip6t_register_table); EXPORT_SYMBOL(ip6t_unregister_table_pre_exit); EXPORT_SYMBOL(ip6t_unregister_table_exit); EXPORT_SYMBOL(ip6t_do_table); module_init(ip6_tables_init); module_exit(ip6_tables_fini);
8 8 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 // SPDX-License-Identifier: GPL-2.0-or-later /* AFS security handling * * Copyright (C) 2007, 2017 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/init.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/ctype.h> #include <linux/sched.h> #include <linux/hashtable.h> #include <keys/rxrpc-type.h> #include "internal.h" static DEFINE_HASHTABLE(afs_permits_cache, 10); static DEFINE_SPINLOCK(afs_permits_lock); /* * get a key */ struct key *afs_request_key(struct afs_cell *cell) { struct key *key; _enter("{%x}", key_serial(cell->anonymous_key)); _debug("key %s", cell->anonymous_key->description); key = request_key_net(&key_type_rxrpc, cell->anonymous_key->description, cell->net->net, NULL); if (IS_ERR(key)) { if (PTR_ERR(key) != -ENOKEY) { _leave(" = %ld", PTR_ERR(key)); return key; } /* act as anonymous user */ _leave(" = {%x} [anon]", key_serial(cell->anonymous_key)); return key_get(cell->anonymous_key); } else { /* act as authorised user */ _leave(" = {%x} [auth]", key_serial(key)); return key; } } /* * Get a key when pathwalk is in rcuwalk mode. */ struct key *afs_request_key_rcu(struct afs_cell *cell) { struct key *key; _enter("{%x}", key_serial(cell->anonymous_key)); _debug("key %s", cell->anonymous_key->description); key = request_key_net_rcu(&key_type_rxrpc, cell->anonymous_key->description, cell->net->net); if (IS_ERR(key)) { if (PTR_ERR(key) != -ENOKEY) { _leave(" = %ld", PTR_ERR(key)); return key; } /* act as anonymous user */ _leave(" = {%x} [anon]", key_serial(cell->anonymous_key)); return key_get(cell->anonymous_key); } else { /* act as authorised user */ _leave(" = {%x} [auth]", key_serial(key)); return key; } } /* * Dispose of a list of permits. */ static void afs_permits_rcu(struct rcu_head *rcu) { struct afs_permits *permits = container_of(rcu, struct afs_permits, rcu); int i; for (i = 0; i < permits->nr_permits; i++) key_put(permits->permits[i].key); kfree(permits); } /* * Discard a permission cache. */ void afs_put_permits(struct afs_permits *permits) { if (permits && refcount_dec_and_test(&permits->usage)) { spin_lock(&afs_permits_lock); hash_del_rcu(&permits->hash_node); spin_unlock(&afs_permits_lock); call_rcu(&permits->rcu, afs_permits_rcu); } } /* * Clear a permit cache on callback break. */ void afs_clear_permits(struct afs_vnode *vnode) { struct afs_permits *permits; spin_lock(&vnode->lock); permits = rcu_dereference_protected(vnode->permit_cache, lockdep_is_held(&vnode->lock)); RCU_INIT_POINTER(vnode->permit_cache, NULL); spin_unlock(&vnode->lock); afs_put_permits(permits); } /* * Hash a list of permits. Use simple addition to make it easy to add an extra * one at an as-yet indeterminate position in the list. */ static void afs_hash_permits(struct afs_permits *permits) { unsigned long h = permits->nr_permits; int i; for (i = 0; i < permits->nr_permits; i++) { h += (unsigned long)permits->permits[i].key / sizeof(void *); h += permits->permits[i].access; } permits->h = h; } /* * Cache the CallerAccess result obtained from doing a fileserver operation * that returned a vnode status for a particular key. If a callback break * occurs whilst the operation was in progress then we have to ditch the cache * as the ACL *may* have changed. */ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, unsigned int cb_break, struct afs_status_cb *scb) { struct afs_permits *permits, *xpermits, *replacement, *zap, *new = NULL; afs_access_t caller_access = scb->status.caller_access; size_t size = 0; bool changed = false; int i, j; _enter("{%llx:%llu},%x,%x", vnode->fid.vid, vnode->fid.vnode, key_serial(key), caller_access); rcu_read_lock(); /* Check for the common case first: We got back the same access as last * time we tried and already have it recorded. */ permits = rcu_dereference(vnode->permit_cache); if (permits) { if (!permits->invalidated) { for (i = 0; i < permits->nr_permits; i++) { if (permits->permits[i].key < key) continue; if (permits->permits[i].key > key) break; if (permits->permits[i].access != caller_access) { changed = true; break; } if (afs_cb_is_broken(cb_break, vnode)) { changed = true; break; } /* The cache is still good. */ rcu_read_unlock(); return; } } changed |= permits->invalidated; size = permits->nr_permits; /* If this set of permits is now wrong, clear the permits * pointer so that no one tries to use the stale information. */ if (changed) { spin_lock(&vnode->lock); if (permits != rcu_access_pointer(vnode->permit_cache)) goto someone_else_changed_it_unlock; RCU_INIT_POINTER(vnode->permit_cache, NULL); spin_unlock(&vnode->lock); afs_put_permits(permits); permits = NULL; size = 0; } } if (afs_cb_is_broken(cb_break, vnode)) goto someone_else_changed_it; /* We need a ref on any permits list we want to copy as we'll have to * drop the lock to do memory allocation. */ if (permits && !refcount_inc_not_zero(&permits->usage)) goto someone_else_changed_it; rcu_read_unlock(); /* Speculatively create a new list with the revised permission set. We * discard this if we find an extant match already in the hash, but * it's easier to compare with memcmp this way. * * We fill in the key pointers at this time, but we don't get the refs * yet. */ size++; new = kzalloc(struct_size(new, permits, size), GFP_NOFS); if (!new) goto out_put; refcount_set(&new->usage, 1); new->nr_permits = size; i = j = 0; if (permits) { for (i = 0; i < permits->nr_permits; i++) { if (j == i && permits->permits[i].key > key) { new->permits[j].key = key; new->permits[j].access = caller_access; j++; } new->permits[j].key = permits->permits[i].key; new->permits[j].access = permits->permits[i].access; j++; } } if (j == i) { new->permits[j].key = key; new->permits[j].access = caller_access; } afs_hash_permits(new); /* Now see if the permit list we want is actually already available */ spin_lock(&afs_permits_lock); hash_for_each_possible(afs_permits_cache, xpermits, hash_node, new->h) { if (xpermits->h != new->h || xpermits->invalidated || xpermits->nr_permits != new->nr_permits || memcmp(xpermits->permits, new->permits, new->nr_permits * sizeof(struct afs_permit)) != 0) continue; if (refcount_inc_not_zero(&xpermits->usage)) { replacement = xpermits; goto found; } break; } for (i = 0; i < new->nr_permits; i++) key_get(new->permits[i].key); hash_add_rcu(afs_permits_cache, &new->hash_node, new->h); replacement = new; new = NULL; found: spin_unlock(&afs_permits_lock); kfree(new); rcu_read_lock(); spin_lock(&vnode->lock); zap = rcu_access_pointer(vnode->permit_cache); if (!afs_cb_is_broken(cb_break, vnode) && zap == permits) rcu_assign_pointer(vnode->permit_cache, replacement); else zap = replacement; spin_unlock(&vnode->lock); rcu_read_unlock(); afs_put_permits(zap); out_put: afs_put_permits(permits); return; someone_else_changed_it_unlock: spin_unlock(&vnode->lock); someone_else_changed_it: /* Someone else changed the cache under us - don't recheck at this * time. */ rcu_read_unlock(); return; } static bool afs_check_permit_rcu(struct afs_vnode *vnode, struct key *key, afs_access_t *_access) { const struct afs_permits *permits; int i; _enter("{%llx:%llu},%x", vnode->fid.vid, vnode->fid.vnode, key_serial(key)); /* check the permits to see if we've got one yet */ if (key == vnode->volume->cell->anonymous_key) { *_access = vnode->status.anon_access; _leave(" = t [anon %x]", *_access); return true; } permits = rcu_dereference(vnode->permit_cache); if (permits) { for (i = 0; i < permits->nr_permits; i++) { if (permits->permits[i].key < key) continue; if (permits->permits[i].key > key) break; *_access = permits->permits[i].access; _leave(" = %u [perm %x]", !permits->invalidated, *_access); return !permits->invalidated; } } _leave(" = f"); return false; } /* * check with the fileserver to see if the directory or parent directory is * permitted to be accessed with this authorisation, and if so, what access it * is granted */ int afs_check_permit(struct afs_vnode *vnode, struct key *key, afs_access_t *_access) { struct afs_permits *permits; bool valid = false; int i, ret; _enter("{%llx:%llu},%x", vnode->fid.vid, vnode->fid.vnode, key_serial(key)); /* check the permits to see if we've got one yet */ if (key == vnode->volume->cell->anonymous_key) { _debug("anon"); *_access = vnode->status.anon_access; valid = true; } else { rcu_read_lock(); permits = rcu_dereference(vnode->permit_cache); if (permits) { for (i = 0; i < permits->nr_permits; i++) { if (permits->permits[i].key < key) continue; if (permits->permits[i].key > key) break; *_access = permits->permits[i].access; valid = !permits->invalidated; break; } } rcu_read_unlock(); } if (!valid) { /* Check the status on the file we're actually interested in * (the post-processing will cache the result). */ _debug("no valid permit"); ret = afs_fetch_status(vnode, key, false, _access); if (ret < 0) { *_access = 0; _leave(" = %d", ret); return ret; } } _leave(" = 0 [access %x]", *_access); return 0; } /* * check the permissions on an AFS file * - AFS ACLs are attached to directories only, and a file is controlled by its * parent directory's ACL */ int afs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct afs_vnode *vnode = AFS_FS_I(inode); afs_access_t access; struct key *key; int ret = 0; _enter("{{%llx:%llu},%lx},%x,", vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask); if (mask & MAY_NOT_BLOCK) { key = afs_request_key_rcu(vnode->volume->cell); if (IS_ERR(key)) return -ECHILD; ret = -ECHILD; if (!afs_check_validity(vnode) || !afs_check_permit_rcu(vnode, key, &access)) goto error; } else { key = afs_request_key(vnode->volume->cell); if (IS_ERR(key)) { _leave(" = %ld [key]", PTR_ERR(key)); return PTR_ERR(key); } ret = afs_validate(vnode, key); if (ret < 0) goto error; /* check the permits to see if we've got one yet */ ret = afs_check_permit(vnode, key, &access); if (ret < 0) goto error; } /* interpret the access mask */ _debug("REQ %x ACC %x on %s", mask, access, S_ISDIR(inode->i_mode) ? "dir" : "file"); ret = 0; if (S_ISDIR(inode->i_mode)) { if (mask & (MAY_EXEC | MAY_READ | MAY_CHDIR)) { if (!(access & AFS_ACE_LOOKUP)) goto permission_denied; } if (mask & MAY_WRITE) { if (!(access & (AFS_ACE_DELETE | /* rmdir, unlink, rename from */ AFS_ACE_INSERT))) /* create, mkdir, symlink, rename to */ goto permission_denied; } } else { if (!(access & AFS_ACE_LOOKUP)) goto permission_denied; if ((mask & MAY_EXEC) && !(inode->i_mode & S_IXUSR)) goto permission_denied; if (mask & (MAY_EXEC | MAY_READ)) { if (!(access & AFS_ACE_READ)) goto permission_denied; if (!(inode->i_mode & S_IRUSR)) goto permission_denied; } else if (mask & MAY_WRITE) { if (!(access & AFS_ACE_WRITE)) goto permission_denied; if (!(inode->i_mode & S_IWUSR)) goto permission_denied; } } key_put(key); _leave(" = %d", ret); return ret; permission_denied: ret = -EACCES; error: key_put(key); _leave(" = %d", ret); return ret; } void __exit afs_clean_up_permit_cache(void) { int i; for (i = 0; i < HASH_SIZE(afs_permits_cache); i++) WARN_ON_ONCE(!hlist_empty(&afs_permits_cache[i])); }
4 4 4 4 3 1 1 4 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 // SPDX-License-Identifier: GPL-2.0-only #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <asm/string.h> #include <linux/kmod.h> #include <linux/sysctl.h> #include <net/ip_vs.h> /* IPVS pe list */ static LIST_HEAD(ip_vs_pe); /* semaphore for IPVS PEs. */ static DEFINE_MUTEX(ip_vs_pe_mutex); /* Get pe in the pe list by name */ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name) { struct ip_vs_pe *pe; IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__, pe_name); rcu_read_lock(); list_for_each_entry_rcu(pe, &ip_vs_pe, n_list) { /* Test and get the modules atomically */ if (pe->module && !try_module_get(pe->module)) { /* This pe is just deleted */ continue; } if (strcmp(pe_name, pe->name)==0) { /* HIT */ rcu_read_unlock(); return pe; } module_put(pe->module); } rcu_read_unlock(); return NULL; } /* Lookup pe and try to load it if it doesn't exist */ struct ip_vs_pe *ip_vs_pe_getbyname(const char *name) { struct ip_vs_pe *pe; /* Search for the pe by name */ pe = __ip_vs_pe_getbyname(name); /* If pe not found, load the module and search again */ if (!pe) { request_module("ip_vs_pe_%s", name); pe = __ip_vs_pe_getbyname(name); } return pe; } /* Register a pe in the pe list */ int register_ip_vs_pe(struct ip_vs_pe *pe) { struct ip_vs_pe *tmp; /* increase the module use count */ if (!ip_vs_use_count_inc()) return -ENOENT; mutex_lock(&ip_vs_pe_mutex); /* Make sure that the pe with this name doesn't exist * in the pe list. */ list_for_each_entry(tmp, &ip_vs_pe, n_list) { if (strcmp(tmp->name, pe->name) == 0) { mutex_unlock(&ip_vs_pe_mutex); ip_vs_use_count_dec(); pr_err("%s(): [%s] pe already existed " "in the system\n", __func__, pe->name); return -EINVAL; } } /* Add it into the d-linked pe list */ list_add_rcu(&pe->n_list, &ip_vs_pe); mutex_unlock(&ip_vs_pe_mutex); pr_info("[%s] pe registered.\n", pe->name); return 0; } EXPORT_SYMBOL_GPL(register_ip_vs_pe); /* Unregister a pe from the pe list */ int unregister_ip_vs_pe(struct ip_vs_pe *pe) { mutex_lock(&ip_vs_pe_mutex); /* Remove it from the d-linked pe list */ list_del_rcu(&pe->n_list); mutex_unlock(&ip_vs_pe_mutex); /* decrease the module use count */ ip_vs_use_count_dec(); pr_info("[%s] pe unregistered.\n", pe->name); return 0; } EXPORT_SYMBOL_GPL(unregister_ip_vs_pe);
4 48 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli */ #ifndef _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ #define _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ #include "main.h" #include <linux/kref.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/skbuff.h> #include <linux/types.h> int batadv_tt_init(struct batadv_priv *bat_priv); bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, unsigned short vid, int ifindex, u32 mark); u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid, const char *message, bool roaming); int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb); int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb); void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, s32 match_vid, const char *message); struct batadv_tt_global_entry * batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); void batadv_tt_global_entry_release(struct kref *ref); int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, const u8 *src, const u8 *addr, unsigned short vid); void batadv_tt_free(struct batadv_priv *bat_priv); bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst, unsigned short vid); void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv); bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid); bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid); void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface); bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *addr, unsigned short vid); bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); int batadv_tt_cache_init(void); void batadv_tt_cache_destroy(void); /** * batadv_tt_global_entry_put() - decrement the tt_global_entry refcounter and * possibly release it * @tt_global_entry: tt_global_entry to be free'd */ static inline void batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry) { if (!tt_global_entry) return; kref_put(&tt_global_entry->common.refcount, batadv_tt_global_entry_release); } #endif /* _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ */
104 104 104 104 104 104 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * Copyright (c) 2003 International Business Machines, Corp. * * This file is part of the SCTP kernel implementation * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Sridhar Samudrala <sri@us.ibm.com> */ #include <linux/types.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/export.h> #include <net/sctp/sctp.h> #include <net/ip.h> /* for snmp_fold_field */ static const struct snmp_mib sctp_snmp_list[] = { SNMP_MIB_ITEM("SctpCurrEstab", SCTP_MIB_CURRESTAB), SNMP_MIB_ITEM("SctpActiveEstabs", SCTP_MIB_ACTIVEESTABS), SNMP_MIB_ITEM("SctpPassiveEstabs", SCTP_MIB_PASSIVEESTABS), SNMP_MIB_ITEM("SctpAborteds", SCTP_MIB_ABORTEDS), SNMP_MIB_ITEM("SctpShutdowns", SCTP_MIB_SHUTDOWNS), SNMP_MIB_ITEM("SctpOutOfBlues", SCTP_MIB_OUTOFBLUES), SNMP_MIB_ITEM("SctpChecksumErrors", SCTP_MIB_CHECKSUMERRORS), SNMP_MIB_ITEM("SctpOutCtrlChunks", SCTP_MIB_OUTCTRLCHUNKS), SNMP_MIB_ITEM("SctpOutOrderChunks", SCTP_MIB_OUTORDERCHUNKS), SNMP_MIB_ITEM("SctpOutUnorderChunks", SCTP_MIB_OUTUNORDERCHUNKS), SNMP_MIB_ITEM("SctpInCtrlChunks", SCTP_MIB_INCTRLCHUNKS), SNMP_MIB_ITEM("SctpInOrderChunks", SCTP_MIB_INORDERCHUNKS), SNMP_MIB_ITEM("SctpInUnorderChunks", SCTP_MIB_INUNORDERCHUNKS), SNMP_MIB_ITEM("SctpFragUsrMsgs", SCTP_MIB_FRAGUSRMSGS), SNMP_MIB_ITEM("SctpReasmUsrMsgs", SCTP_MIB_REASMUSRMSGS), SNMP_MIB_ITEM("SctpOutSCTPPacks", SCTP_MIB_OUTSCTPPACKS), SNMP_MIB_ITEM("SctpInSCTPPacks", SCTP_MIB_INSCTPPACKS), SNMP_MIB_ITEM("SctpT1InitExpireds", SCTP_MIB_T1_INIT_EXPIREDS), SNMP_MIB_ITEM("SctpT1CookieExpireds", SCTP_MIB_T1_COOKIE_EXPIREDS), SNMP_MIB_ITEM("SctpT2ShutdownExpireds", SCTP_MIB_T2_SHUTDOWN_EXPIREDS), SNMP_MIB_ITEM("SctpT3RtxExpireds", SCTP_MIB_T3_RTX_EXPIREDS), SNMP_MIB_ITEM("SctpT4RtoExpireds", SCTP_MIB_T4_RTO_EXPIREDS), SNMP_MIB_ITEM("SctpT5ShutdownGuardExpireds", SCTP_MIB_T5_SHUTDOWN_GUARD_EXPIREDS), SNMP_MIB_ITEM("SctpDelaySackExpireds", SCTP_MIB_DELAY_SACK_EXPIREDS), SNMP_MIB_ITEM("SctpAutocloseExpireds", SCTP_MIB_AUTOCLOSE_EXPIREDS), SNMP_MIB_ITEM("SctpT3Retransmits", SCTP_MIB_T3_RETRANSMITS), SNMP_MIB_ITEM("SctpPmtudRetransmits", SCTP_MIB_PMTUD_RETRANSMITS), SNMP_MIB_ITEM("SctpFastRetransmits", SCTP_MIB_FAST_RETRANSMITS), SNMP_MIB_ITEM("SctpInPktSoftirq", SCTP_MIB_IN_PKT_SOFTIRQ), SNMP_MIB_ITEM("SctpInPktBacklog", SCTP_MIB_IN_PKT_BACKLOG), SNMP_MIB_ITEM("SctpInPktDiscards", SCTP_MIB_IN_PKT_DISCARDS), SNMP_MIB_ITEM("SctpInDataChunkDiscards", SCTP_MIB_IN_DATA_CHUNK_DISCARDS), SNMP_MIB_SENTINEL }; /* Display sctp snmp mib statistics(/proc/net/sctp/snmp). */ static int sctp_snmp_seq_show(struct seq_file *seq, void *v) { unsigned long buff[SCTP_MIB_MAX]; struct net *net = seq->private; int i; memset(buff, 0, sizeof(unsigned long) * SCTP_MIB_MAX); snmp_get_cpu_field_batch(buff, sctp_snmp_list, net->sctp.sctp_statistics); for (i = 0; sctp_snmp_list[i].name; i++) seq_printf(seq, "%-32s\t%ld\n", sctp_snmp_list[i].name, buff[i]); return 0; } /* Dump local addresses of an association/endpoint. */ static void sctp_seq_dump_local_addrs(struct seq_file *seq, struct sctp_ep_common *epb) { struct sctp_association *asoc; struct sctp_sockaddr_entry *laddr; struct sctp_transport *peer; union sctp_addr *addr, *primary = NULL; struct sctp_af *af; if (epb->type == SCTP_EP_TYPE_ASSOCIATION) { asoc = sctp_assoc(epb); peer = asoc->peer.primary_path; if (unlikely(peer == NULL)) { WARN(1, "Association %p with NULL primary path!\n", asoc); return; } primary = &peer->saddr; } rcu_read_lock(); list_for_each_entry_rcu(laddr, &epb->bind_addr.address_list, list) { if (!laddr->valid) continue; addr = &laddr->a; af = sctp_get_af_specific(addr->sa.sa_family); if (primary && af->cmp_addr(addr, primary)) { seq_printf(seq, "*"); } af->seq_dump_addr(seq, addr); } rcu_read_unlock(); } /* Dump remote addresses of an association. */ static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_association *assoc) { struct sctp_transport *transport; union sctp_addr *addr, *primary; struct sctp_af *af; primary = &assoc->peer.primary_addr; list_for_each_entry_rcu(transport, &assoc->peer.transport_addr_list, transports) { addr = &transport->ipaddr; af = sctp_get_af_specific(addr->sa.sa_family); if (af->cmp_addr(addr, primary)) { seq_printf(seq, "*"); } af->seq_dump_addr(seq, addr); } } static void *sctp_eps_seq_start(struct seq_file *seq, loff_t *pos) { if (*pos >= sctp_ep_hashsize) return NULL; if (*pos < 0) *pos = 0; if (*pos == 0) seq_printf(seq, " ENDPT SOCK STY SST HBKT LPORT UID INODE LADDRS\n"); return (void *)pos; } static void sctp_eps_seq_stop(struct seq_file *seq, void *v) { } static void *sctp_eps_seq_next(struct seq_file *seq, void *v, loff_t *pos) { if (++*pos >= sctp_ep_hashsize) return NULL; return pos; } /* Display sctp endpoints (/proc/net/sctp/eps). */ static int sctp_eps_seq_show(struct seq_file *seq, void *v) { struct sctp_hashbucket *head; struct sctp_endpoint *ep; struct sock *sk; int hash = *(loff_t *)v; if (hash >= sctp_ep_hashsize) return -ENOMEM; head = &sctp_ep_hashtable[hash]; read_lock_bh(&head->lock); sctp_for_each_hentry(ep, &head->chain) { sk = ep->base.sk; if (!net_eq(sock_net(sk), seq_file_net(seq))) continue; seq_printf(seq, "%8pK %8pK %-3d %-3d %-4d %-5d %5u %5lu ", ep, sk, sctp_sk(sk)->type, sk->sk_state, hash, ep->base.bind_addr.port, from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), sock_i_ino(sk)); sctp_seq_dump_local_addrs(seq, &ep->base); seq_printf(seq, "\n"); } read_unlock_bh(&head->lock); return 0; } static const struct seq_operations sctp_eps_ops = { .start = sctp_eps_seq_start, .next = sctp_eps_seq_next, .stop = sctp_eps_seq_stop, .show = sctp_eps_seq_show, }; struct sctp_ht_iter { struct seq_net_private p; struct rhashtable_iter hti; }; static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos) { struct sctp_ht_iter *iter = seq->private; sctp_transport_walk_start(&iter->hti); return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos); } static void sctp_transport_seq_stop(struct seq_file *seq, void *v) { struct sctp_ht_iter *iter = seq->private; if (v && v != SEQ_START_TOKEN) { struct sctp_transport *transport = v; sctp_transport_put(transport); } sctp_transport_walk_stop(&iter->hti); } static void *sctp_transport_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sctp_ht_iter *iter = seq->private; if (v && v != SEQ_START_TOKEN) { struct sctp_transport *transport = v; sctp_transport_put(transport); } ++*pos; return sctp_transport_get_next(seq_file_net(seq), &iter->hti); } /* Display sctp associations (/proc/net/sctp/assocs). */ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) { struct sctp_transport *transport; struct sctp_association *assoc; struct sctp_ep_common *epb; struct sock *sk; if (v == SEQ_START_TOKEN) { seq_printf(seq, " ASSOC SOCK STY SST ST HBKT " "ASSOC-ID TX_QUEUE RX_QUEUE UID INODE LPORT " "RPORT LADDRS <-> RADDRS " "HBINT INS OUTS MAXRT T1X T2X RTXC " "wmema wmemq sndbuf rcvbuf\n"); return 0; } transport = (struct sctp_transport *)v; assoc = transport->asoc; epb = &assoc->base; sk = epb->sk; seq_printf(seq, "%8pK %8pK %-3d %-3d %-2d %-4d " "%4d %8d %8d %7u %5lu %-5d %5d ", assoc, sk, sctp_sk(sk)->type, sk->sk_state, assoc->state, 0, assoc->assoc_id, assoc->sndbuf_used, atomic_read(&assoc->rmem_alloc), from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), sock_i_ino(sk), epb->bind_addr.port, assoc->peer.port); seq_printf(seq, " "); sctp_seq_dump_local_addrs(seq, epb); seq_printf(seq, "<-> "); sctp_seq_dump_remote_addrs(seq, assoc); seq_printf(seq, "\t%8lu %5d %5d %4d %4d %4d %8d " "%8d %8d %8d %8d", assoc->hbinterval, assoc->stream.incnt, assoc->stream.outcnt, assoc->max_retrans, assoc->init_retries, assoc->shutdown_retries, assoc->rtx_data_chunks, refcount_read(&sk->sk_wmem_alloc), READ_ONCE(sk->sk_wmem_queued), sk->sk_sndbuf, sk->sk_rcvbuf); seq_printf(seq, "\n"); return 0; } static const struct seq_operations sctp_assoc_ops = { .start = sctp_transport_seq_start, .next = sctp_transport_seq_next, .stop = sctp_transport_seq_stop, .show = sctp_assocs_seq_show, }; static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) { struct sctp_association *assoc; struct sctp_transport *transport, *tsp; if (v == SEQ_START_TOKEN) { seq_printf(seq, "ADDR ASSOC_ID HB_ACT RTO MAX_PATH_RTX " "REM_ADDR_RTX START STATE\n"); return 0; } transport = (struct sctp_transport *)v; assoc = transport->asoc; list_for_each_entry_rcu(tsp, &assoc->peer.transport_addr_list, transports) { /* * The remote address (ADDR) */ tsp->af_specific->seq_dump_addr(seq, &tsp->ipaddr); seq_printf(seq, " "); /* * The association ID (ASSOC_ID) */ seq_printf(seq, "%d ", tsp->asoc->assoc_id); /* * If the Heartbeat is active (HB_ACT) * Note: 1 = Active, 0 = Inactive */ seq_printf(seq, "%d ", timer_pending(&tsp->hb_timer)); /* * Retransmit time out (RTO) */ seq_printf(seq, "%lu ", tsp->rto); /* * Maximum path retransmit count (PATH_MAX_RTX) */ seq_printf(seq, "%d ", tsp->pathmaxrxt); /* * remote address retransmit count (REM_ADDR_RTX) * Note: We don't have a way to tally this at the moment * so lets just leave it as zero for the moment */ seq_puts(seq, "0 "); /* * remote address start time (START). This is also not * currently implemented, but we can record it with a * jiffies marker in a subsequent patch */ seq_puts(seq, "0 "); /* * The current state of this destination. I.e. * SCTP_ACTIVE, SCTP_INACTIVE, ... */ seq_printf(seq, "%d", tsp->state); seq_printf(seq, "\n"); } return 0; } static const struct seq_operations sctp_remaddr_ops = { .start = sctp_transport_seq_start, .next = sctp_transport_seq_next, .stop = sctp_transport_seq_stop, .show = sctp_remaddr_seq_show, }; /* Set up the proc fs entry for the SCTP protocol. */ int __net_init sctp_proc_init(struct net *net) { net->sctp.proc_net_sctp = proc_net_mkdir(net, "sctp", net->proc_net); if (!net->sctp.proc_net_sctp) return -ENOMEM; if (!proc_create_net_single("snmp", 0444, net->sctp.proc_net_sctp, sctp_snmp_seq_show, NULL)) goto cleanup; if (!proc_create_net("eps", 0444, net->sctp.proc_net_sctp, &sctp_eps_ops, sizeof(struct seq_net_private))) goto cleanup; if (!proc_create_net("assocs", 0444, net->sctp.proc_net_sctp, &sctp_assoc_ops, sizeof(struct sctp_ht_iter))) goto cleanup; if (!proc_create_net("remaddr", 0444, net->sctp.proc_net_sctp, &sctp_remaddr_ops, sizeof(struct sctp_ht_iter))) goto cleanup; return 0; cleanup: remove_proc_subtree("sctp", net->proc_net); net->sctp.proc_net_sctp = NULL; return -ENOMEM; }
36 35 1 36 1 36 27 1 39 1 38 38 1 37 36 39 19 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 // SPDX-License-Identifier: GPL-2.0-only #include <linux/net.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/types.h> #include <net/pkt_sched.h> #include "sch_mqprio_lib.h" /* Returns true if the intervals [a, b) and [c, d) overlap. */ static bool intervals_overlap(int a, int b, int c, int d) { int left = max(a, c), right = min(b, d); return left < right; } static int mqprio_validate_queue_counts(struct net_device *dev, const struct tc_mqprio_qopt *qopt, bool allow_overlapping_txqs, struct netlink_ext_ack *extack) { int i, j; for (i = 0; i < qopt->num_tc; i++) { unsigned int last = qopt->offset[i] + qopt->count[i]; if (!qopt->count[i]) { NL_SET_ERR_MSG_FMT_MOD(extack, "No queues for TC %d", i); return -EINVAL; } /* Verify the queue count is in tx range being equal to the * real_num_tx_queues indicates the last queue is in use. */ if (qopt->offset[i] >= dev->real_num_tx_queues || last > dev->real_num_tx_queues) { NL_SET_ERR_MSG_FMT_MOD(extack, "Queues %d:%d for TC %d exceed the %d TX queues available", qopt->count[i], qopt->offset[i], i, dev->real_num_tx_queues); return -EINVAL; } if (allow_overlapping_txqs) continue; /* Verify that the offset and counts do not overlap */ for (j = i + 1; j < qopt->num_tc; j++) { if (intervals_overlap(qopt->offset[i], last, qopt->offset[j], qopt->offset[j] + qopt->count[j])) { NL_SET_ERR_MSG_FMT_MOD(extack, "TC %d queues %d@%d overlap with TC %d queues %d@%d", i, qopt->count[i], qopt->offset[i], j, qopt->count[j], qopt->offset[j]); return -EINVAL; } } } return 0; } int mqprio_validate_qopt(struct net_device *dev, struct tc_mqprio_qopt *qopt, bool validate_queue_counts, bool allow_overlapping_txqs, struct netlink_ext_ack *extack) { int i, err; /* Verify num_tc is not out of max range */ if (qopt->num_tc > TC_MAX_QUEUE) { NL_SET_ERR_MSG(extack, "Number of traffic classes is outside valid range"); return -EINVAL; } /* Verify priority mapping uses valid tcs */ for (i = 0; i <= TC_BITMASK; i++) { if (qopt->prio_tc_map[i] >= qopt->num_tc) { NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping"); return -EINVAL; } } if (validate_queue_counts) { err = mqprio_validate_queue_counts(dev, qopt, allow_overlapping_txqs, extack); if (err) return err; } return 0; } EXPORT_SYMBOL_GPL(mqprio_validate_qopt); void mqprio_qopt_reconstruct(struct net_device *dev, struct tc_mqprio_qopt *qopt) { int tc, num_tc = netdev_get_num_tc(dev); qopt->num_tc = num_tc; memcpy(qopt->prio_tc_map, dev->prio_tc_map, sizeof(qopt->prio_tc_map)); for (tc = 0; tc < num_tc; tc++) { qopt->count[tc] = dev->tc_to_txq[tc].count; qopt->offset[tc] = dev->tc_to_txq[tc].offset; } } EXPORT_SYMBOL_GPL(mqprio_qopt_reconstruct); void mqprio_fp_to_offload(u32 fp[TC_QOPT_MAX_QUEUE], struct tc_mqprio_qopt_offload *mqprio) { unsigned long preemptible_tcs = 0; int tc; for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) if (fp[tc] == TC_FP_PREEMPTIBLE) preemptible_tcs |= BIT(tc); mqprio->preemptible_tcs = preemptible_tcs; } EXPORT_SYMBOL_GPL(mqprio_fp_to_offload); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Shared mqprio qdisc code currently between taprio and mqprio");
34 34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 // SPDX-License-Identifier: GPL-2.0-or-later /* * History * 03-01-2007 Added forwarding for x.25 Andrew Hendry */ #define pr_fmt(fmt) "X25: " fmt #include <linux/if_arp.h> #include <linux/init.h> #include <linux/slab.h> #include <net/x25.h> LIST_HEAD(x25_forward_list); DEFINE_RWLOCK(x25_forward_list_lock); int x25_forward_call(struct x25_address *dest_addr, struct x25_neigh *from, struct sk_buff *skb, int lci) { struct x25_route *rt; struct x25_neigh *neigh_new = NULL; struct x25_forward *x25_frwd, *new_frwd; struct sk_buff *skbn; short same_lci = 0; int rc = 0; if ((rt = x25_get_route(dest_addr)) == NULL) goto out_no_route; if ((neigh_new = x25_get_neigh(rt->dev)) == NULL) { /* This shouldn't happen, if it occurs somehow * do something sensible */ goto out_put_route; } /* Avoid a loop. This is the normal exit path for a * system with only one x.25 iface and default route */ if (rt->dev == from->dev) { goto out_put_nb; } /* Remote end sending a call request on an already * established LCI? It shouldn't happen, just in case.. */ read_lock_bh(&x25_forward_list_lock); list_for_each_entry(x25_frwd, &x25_forward_list, node) { if (x25_frwd->lci == lci) { pr_warn("call request for lci which is already registered!, transmitting but not registering new pair\n"); same_lci = 1; } } read_unlock_bh(&x25_forward_list_lock); /* Save the forwarding details for future traffic */ if (!same_lci){ if ((new_frwd = kmalloc(sizeof(struct x25_forward), GFP_ATOMIC)) == NULL){ rc = -ENOMEM; goto out_put_nb; } new_frwd->lci = lci; new_frwd->dev1 = rt->dev; new_frwd->dev2 = from->dev; write_lock_bh(&x25_forward_list_lock); list_add(&new_frwd->node, &x25_forward_list); write_unlock_bh(&x25_forward_list_lock); } /* Forward the call request */ if ( (skbn = skb_clone(skb, GFP_ATOMIC)) == NULL){ goto out_put_nb; } x25_transmit_link(skbn, neigh_new); rc = 1; out_put_nb: x25_neigh_put(neigh_new); out_put_route: x25_route_put(rt); out_no_route: return rc; } int x25_forward_data(int lci, struct x25_neigh *from, struct sk_buff *skb) { struct x25_forward *frwd; struct net_device *peer = NULL; struct x25_neigh *nb; struct sk_buff *skbn; int rc = 0; read_lock_bh(&x25_forward_list_lock); list_for_each_entry(frwd, &x25_forward_list, node) { if (frwd->lci == lci) { /* The call is established, either side can send */ if (from->dev == frwd->dev1) { peer = frwd->dev2; } else { peer = frwd->dev1; } break; } } read_unlock_bh(&x25_forward_list_lock); if ( (nb = x25_get_neigh(peer)) == NULL) goto out; if ( (skbn = pskb_copy(skb, GFP_ATOMIC)) == NULL){ goto output; } x25_transmit_link(skbn, nb); rc = 1; output: x25_neigh_put(nb); out: return rc; } void x25_clear_forward_by_lci(unsigned int lci) { struct x25_forward *fwd, *tmp; write_lock_bh(&x25_forward_list_lock); list_for_each_entry_safe(fwd, tmp, &x25_forward_list, node) { if (fwd->lci == lci) { list_del(&fwd->node); kfree(fwd); } } write_unlock_bh(&x25_forward_list_lock); } void x25_clear_forward_by_dev(struct net_device *dev) { struct x25_forward *fwd, *tmp; write_lock_bh(&x25_forward_list_lock); list_for_each_entry_safe(fwd, tmp, &x25_forward_list, node) { if ((fwd->dev1 == dev) || (fwd->dev2 == dev)){ list_del(&fwd->node); kfree(fwd); } } write_unlock_bh(&x25_forward_list_lock); }
104 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 // SPDX-License-Identifier: BSD-3-Clause /* * linux/net/sunrpc/auth_gss/auth_gss.c * * RPCSEC_GSS client authentication. * * Copyright (c) 2000 The Regents of the University of Michigan. * All rights reserved. * * Dug Song <dugsong@monkey.org> * Andy Adamson <andros@umich.edu> */ #include <linux/module.h> #include <linux/init.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/pagemap.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/auth.h> #include <linux/sunrpc/auth_gss.h> #include <linux/sunrpc/gss_krb5.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/gss_err.h> #include <linux/workqueue.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/sunrpc/gss_api.h> #include <linux/uaccess.h> #include <linux/hashtable.h> #include "auth_gss_internal.h" #include "../netns.h" #include <trace/events/rpcgss.h> static const struct rpc_authops authgss_ops; static const struct rpc_credops gss_credops; static const struct rpc_credops gss_nullops; #define GSS_RETRY_EXPIRED 5 static unsigned int gss_expired_cred_retry_delay = GSS_RETRY_EXPIRED; #define GSS_KEY_EXPIRE_TIMEO 240 static unsigned int gss_key_expire_timeo = GSS_KEY_EXPIRE_TIMEO; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif /* * This compile-time check verifies that we will not exceed the * slack space allotted by the client and server auth_gss code * before they call gss_wrap(). */ #define GSS_KRB5_MAX_SLACK_NEEDED \ (GSS_KRB5_TOK_HDR_LEN /* gss token header */ \ + GSS_KRB5_MAX_CKSUM_LEN /* gss token checksum */ \ + GSS_KRB5_MAX_BLOCKSIZE /* confounder */ \ + GSS_KRB5_MAX_BLOCKSIZE /* possible padding */ \ + GSS_KRB5_TOK_HDR_LEN /* encrypted hdr in v2 token */ \ + GSS_KRB5_MAX_CKSUM_LEN /* encryption hmac */ \ + XDR_UNIT * 2 /* RPC verifier */ \ + GSS_KRB5_TOK_HDR_LEN \ + GSS_KRB5_MAX_CKSUM_LEN) #define GSS_CRED_SLACK (RPC_MAX_AUTH_SIZE * 2) /* length of a krb5 verifier (48), plus data added before arguments when * using integrity (two 4-byte integers): */ #define GSS_VERF_SLACK 100 static DEFINE_HASHTABLE(gss_auth_hash_table, 4); static DEFINE_SPINLOCK(gss_auth_hash_lock); struct gss_pipe { struct rpc_pipe_dir_object pdo; struct rpc_pipe *pipe; struct rpc_clnt *clnt; const char *name; struct kref kref; }; struct gss_auth { struct kref kref; struct hlist_node hash; struct rpc_auth rpc_auth; struct gss_api_mech *mech; enum rpc_gss_svc service; struct rpc_clnt *client; struct net *net; netns_tracker ns_tracker; /* * There are two upcall pipes; dentry[1], named "gssd", is used * for the new text-based upcall; dentry[0] is named after the * mechanism (for example, "krb5") and exists for * backwards-compatibility with older gssd's. */ struct gss_pipe *gss_pipe[2]; const char *target_name; }; /* pipe_version >= 0 if and only if someone has a pipe open. */ static DEFINE_SPINLOCK(pipe_version_lock); static struct rpc_wait_queue pipe_version_rpc_waitqueue; static DECLARE_WAIT_QUEUE_HEAD(pipe_version_waitqueue); static void gss_put_auth(struct gss_auth *gss_auth); static void gss_free_ctx(struct gss_cl_ctx *); static const struct rpc_pipe_ops gss_upcall_ops_v0; static const struct rpc_pipe_ops gss_upcall_ops_v1; static inline struct gss_cl_ctx * gss_get_ctx(struct gss_cl_ctx *ctx) { refcount_inc(&ctx->count); return ctx; } static inline void gss_put_ctx(struct gss_cl_ctx *ctx) { if (refcount_dec_and_test(&ctx->count)) gss_free_ctx(ctx); } /* gss_cred_set_ctx: * called by gss_upcall_callback and gss_create_upcall in order * to set the gss context. The actual exchange of an old context * and a new one is protected by the pipe->lock. */ static void gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); if (!test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags)) return; gss_get_ctx(ctx); rcu_assign_pointer(gss_cred->gc_ctx, ctx); set_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); smp_mb__before_atomic(); clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags); } static struct gss_cl_ctx * gss_cred_get_ctx(struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = NULL; rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); if (ctx) gss_get_ctx(ctx); rcu_read_unlock(); return ctx; } static struct gss_cl_ctx * gss_alloc_context(void) { struct gss_cl_ctx *ctx; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (ctx != NULL) { ctx->gc_proc = RPC_GSS_PROC_DATA; ctx->gc_seq = 1; /* NetApp 6.4R1 doesn't accept seq. no. 0 */ spin_lock_init(&ctx->gc_seq_lock); refcount_set(&ctx->count,1); } return ctx; } #define GSSD_MIN_TIMEOUT (60 * 60) static const void * gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct gss_api_mech *gm) { const void *q; unsigned int seclen; unsigned int timeout; unsigned long now = jiffies; u32 window_size; int ret; /* First unsigned int gives the remaining lifetime in seconds of the * credential - e.g. the remaining TGT lifetime for Kerberos or * the -t value passed to GSSD. */ p = simple_get_bytes(p, end, &timeout, sizeof(timeout)); if (IS_ERR(p)) goto err; if (timeout == 0) timeout = GSSD_MIN_TIMEOUT; ctx->gc_expiry = now + ((unsigned long)timeout * HZ); /* Sequence number window. Determines the maximum number of * simultaneous requests */ p = simple_get_bytes(p, end, &window_size, sizeof(window_size)); if (IS_ERR(p)) goto err; ctx->gc_win = window_size; /* gssd signals an error by passing ctx->gc_win = 0: */ if (ctx->gc_win == 0) { /* * in which case, p points to an error code. Anything other * than -EKEYEXPIRED gets converted to -EACCES. */ p = simple_get_bytes(p, end, &ret, sizeof(ret)); if (!IS_ERR(p)) p = (ret == -EKEYEXPIRED) ? ERR_PTR(-EKEYEXPIRED) : ERR_PTR(-EACCES); goto err; } /* copy the opaque wire context */ p = simple_get_netobj(p, end, &ctx->gc_wire_ctx); if (IS_ERR(p)) goto err; /* import the opaque security context */ p = simple_get_bytes(p, end, &seclen, sizeof(seclen)); if (IS_ERR(p)) goto err; q = (const void *)((const char *)p + seclen); if (unlikely(q > end || q < p)) { p = ERR_PTR(-EFAULT); goto err; } ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, NULL, GFP_KERNEL); if (ret < 0) { trace_rpcgss_import_ctx(ret); p = ERR_PTR(ret); goto err; } /* is there any trailing data? */ if (q == end) { p = q; goto done; } /* pull in acceptor name (if there is one) */ p = simple_get_netobj(q, end, &ctx->gc_acceptor); if (IS_ERR(p)) goto err; done: trace_rpcgss_context(window_size, ctx->gc_expiry, now, timeout, ctx->gc_acceptor.len, ctx->gc_acceptor.data); err: return p; } /* XXX: Need some documentation about why UPCALL_BUF_LEN is so small. * Is user space expecting no more than UPCALL_BUF_LEN bytes? * Note that there are now _two_ NI_MAXHOST sized data items * being passed in this string. */ #define UPCALL_BUF_LEN 256 struct gss_upcall_msg { refcount_t count; kuid_t uid; const char *service_name; struct rpc_pipe_msg msg; struct list_head list; struct gss_auth *auth; struct rpc_pipe *pipe; struct rpc_wait_queue rpc_waitqueue; wait_queue_head_t waitqueue; struct gss_cl_ctx *ctx; char databuf[UPCALL_BUF_LEN]; }; static int get_pipe_version(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int ret; spin_lock(&pipe_version_lock); if (sn->pipe_version >= 0) { atomic_inc(&sn->pipe_users); ret = sn->pipe_version; } else ret = -EAGAIN; spin_unlock(&pipe_version_lock); return ret; } static void put_pipe_version(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); if (atomic_dec_and_lock(&sn->pipe_users, &pipe_version_lock)) { sn->pipe_version = -1; spin_unlock(&pipe_version_lock); } } static void gss_release_msg(struct gss_upcall_msg *gss_msg) { struct net *net = gss_msg->auth->net; if (!refcount_dec_and_test(&gss_msg->count)) return; put_pipe_version(net); BUG_ON(!list_empty(&gss_msg->list)); if (gss_msg->ctx != NULL) gss_put_ctx(gss_msg->ctx); rpc_destroy_wait_queue(&gss_msg->rpc_waitqueue); gss_put_auth(gss_msg->auth); kfree_const(gss_msg->service_name); kfree(gss_msg); } static struct gss_upcall_msg * __gss_find_upcall(struct rpc_pipe *pipe, kuid_t uid, const struct gss_auth *auth) { struct gss_upcall_msg *pos; list_for_each_entry(pos, &pipe->in_downcall, list) { if (!uid_eq(pos->uid, uid)) continue; if (pos->auth->service != auth->service) continue; refcount_inc(&pos->count); return pos; } return NULL; } /* Try to add an upcall to the pipefs queue. * If an upcall owned by our uid already exists, then we return a reference * to that upcall instead of adding the new upcall. */ static inline struct gss_upcall_msg * gss_add_msg(struct gss_upcall_msg *gss_msg) { struct rpc_pipe *pipe = gss_msg->pipe; struct gss_upcall_msg *old; spin_lock(&pipe->lock); old = __gss_find_upcall(pipe, gss_msg->uid, gss_msg->auth); if (old == NULL) { refcount_inc(&gss_msg->count); list_add(&gss_msg->list, &pipe->in_downcall); } else gss_msg = old; spin_unlock(&pipe->lock); return gss_msg; } static void __gss_unhash_msg(struct gss_upcall_msg *gss_msg) { list_del_init(&gss_msg->list); rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); wake_up_all(&gss_msg->waitqueue); refcount_dec(&gss_msg->count); } static void gss_unhash_msg(struct gss_upcall_msg *gss_msg) { struct rpc_pipe *pipe = gss_msg->pipe; if (list_empty(&gss_msg->list)) return; spin_lock(&pipe->lock); if (!list_empty(&gss_msg->list)) __gss_unhash_msg(gss_msg); spin_unlock(&pipe->lock); } static void gss_handle_downcall_result(struct gss_cred *gss_cred, struct gss_upcall_msg *gss_msg) { switch (gss_msg->msg.errno) { case 0: if (gss_msg->ctx == NULL) break; clear_bit(RPCAUTH_CRED_NEGATIVE, &gss_cred->gc_base.cr_flags); gss_cred_set_ctx(&gss_cred->gc_base, gss_msg->ctx); break; case -EKEYEXPIRED: set_bit(RPCAUTH_CRED_NEGATIVE, &gss_cred->gc_base.cr_flags); } gss_cred->gc_upcall_timestamp = jiffies; gss_cred->gc_upcall = NULL; rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); } static void gss_upcall_callback(struct rpc_task *task) { struct gss_cred *gss_cred = container_of(task->tk_rqstp->rq_cred, struct gss_cred, gc_base); struct gss_upcall_msg *gss_msg = gss_cred->gc_upcall; struct rpc_pipe *pipe = gss_msg->pipe; spin_lock(&pipe->lock); gss_handle_downcall_result(gss_cred, gss_msg); spin_unlock(&pipe->lock); task->tk_status = gss_msg->msg.errno; gss_release_msg(gss_msg); } static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg, const struct cred *cred) { struct user_namespace *userns = cred->user_ns; uid_t uid = from_kuid_munged(userns, gss_msg->uid); memcpy(gss_msg->databuf, &uid, sizeof(uid)); gss_msg->msg.data = gss_msg->databuf; gss_msg->msg.len = sizeof(uid); BUILD_BUG_ON(sizeof(uid) > sizeof(gss_msg->databuf)); } static ssize_t gss_v0_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *buf, size_t buflen) { struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg); if (msg->copied == 0) gss_encode_v0_msg(gss_msg, file->f_cred); return rpc_pipe_generic_upcall(file, msg, buf, buflen); } static int gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, const char *service_name, const char *target_name, const struct cred *cred) { struct user_namespace *userns = cred->user_ns; struct gss_api_mech *mech = gss_msg->auth->mech; char *p = gss_msg->databuf; size_t buflen = sizeof(gss_msg->databuf); int len; len = scnprintf(p, buflen, "mech=%s uid=%d", mech->gm_name, from_kuid_munged(userns, gss_msg->uid)); buflen -= len; p += len; gss_msg->msg.len = len; /* * target= is a full service principal that names the remote * identity that we are authenticating to. */ if (target_name) { len = scnprintf(p, buflen, " target=%s", target_name); buflen -= len; p += len; gss_msg->msg.len += len; } /* * gssd uses service= and srchost= to select a matching key from * the system's keytab to use as the source principal. * * service= is the service name part of the source principal, * or "*" (meaning choose any). * * srchost= is the hostname part of the source principal. When * not provided, gssd uses the local hostname. */ if (service_name) { char *c = strchr(service_name, '@'); if (!c) len = scnprintf(p, buflen, " service=%s", service_name); else len = scnprintf(p, buflen, " service=%.*s srchost=%s", (int)(c - service_name), service_name, c + 1); buflen -= len; p += len; gss_msg->msg.len += len; } if (mech->gm_upcall_enctypes) { len = scnprintf(p, buflen, " enctypes=%s", mech->gm_upcall_enctypes); buflen -= len; p += len; gss_msg->msg.len += len; } trace_rpcgss_upcall_msg(gss_msg->databuf); len = scnprintf(p, buflen, "\n"); if (len == 0) goto out_overflow; gss_msg->msg.len += len; gss_msg->msg.data = gss_msg->databuf; return 0; out_overflow: WARN_ON_ONCE(1); return -ENOMEM; } static ssize_t gss_v1_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *buf, size_t buflen) { struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg); int err; if (msg->copied == 0) { err = gss_encode_v1_msg(gss_msg, gss_msg->service_name, gss_msg->auth->target_name, file->f_cred); if (err) return err; } return rpc_pipe_generic_upcall(file, msg, buf, buflen); } static struct gss_upcall_msg * gss_alloc_msg(struct gss_auth *gss_auth, kuid_t uid, const char *service_name) { struct gss_upcall_msg *gss_msg; int vers; int err = -ENOMEM; gss_msg = kzalloc(sizeof(*gss_msg), GFP_KERNEL); if (gss_msg == NULL) goto err; vers = get_pipe_version(gss_auth->net); err = vers; if (err < 0) goto err_free_msg; gss_msg->pipe = gss_auth->gss_pipe[vers]->pipe; INIT_LIST_HEAD(&gss_msg->list); rpc_init_wait_queue(&gss_msg->rpc_waitqueue, "RPCSEC_GSS upcall waitq"); init_waitqueue_head(&gss_msg->waitqueue); refcount_set(&gss_msg->count, 1); gss_msg->uid = uid; gss_msg->auth = gss_auth; kref_get(&gss_auth->kref); if (service_name) { gss_msg->service_name = kstrdup_const(service_name, GFP_KERNEL); if (!gss_msg->service_name) { err = -ENOMEM; goto err_put_pipe_version; } } return gss_msg; err_put_pipe_version: put_pipe_version(gss_auth->net); err_free_msg: kfree(gss_msg); err: return ERR_PTR(err); } static struct gss_upcall_msg * gss_setup_upcall(struct gss_auth *gss_auth, struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_upcall_msg *gss_new, *gss_msg; kuid_t uid = cred->cr_cred->fsuid; gss_new = gss_alloc_msg(gss_auth, uid, gss_cred->gc_principal); if (IS_ERR(gss_new)) return gss_new; gss_msg = gss_add_msg(gss_new); if (gss_msg == gss_new) { int res; refcount_inc(&gss_msg->count); res = rpc_queue_upcall(gss_new->pipe, &gss_new->msg); if (res) { gss_unhash_msg(gss_new); refcount_dec(&gss_msg->count); gss_release_msg(gss_new); gss_msg = ERR_PTR(res); } } else gss_release_msg(gss_new); return gss_msg; } static void warn_gssd(void) { dprintk("AUTH_GSS upcall failed. Please check user daemon is running.\n"); } static inline int gss_refresh_upcall(struct rpc_task *task) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_upcall_msg *gss_msg; struct rpc_pipe *pipe; int err = 0; gss_msg = gss_setup_upcall(gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { /* XXX: warning on the first, under the assumption we * shouldn't normally hit this case on a refresh. */ warn_gssd(); rpc_sleep_on_timeout(&pipe_version_rpc_waitqueue, task, NULL, jiffies + (15 * HZ)); err = -EAGAIN; goto out; } if (IS_ERR(gss_msg)) { err = PTR_ERR(gss_msg); goto out; } pipe = gss_msg->pipe; spin_lock(&pipe->lock); if (gss_cred->gc_upcall != NULL) rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL); else if (gss_msg->ctx == NULL && gss_msg->msg.errno >= 0) { gss_cred->gc_upcall = gss_msg; /* gss_upcall_callback will release the reference to gss_upcall_msg */ refcount_inc(&gss_msg->count); rpc_sleep_on(&gss_msg->rpc_waitqueue, task, gss_upcall_callback); } else { gss_handle_downcall_result(gss_cred, gss_msg); err = gss_msg->msg.errno; } spin_unlock(&pipe->lock); gss_release_msg(gss_msg); out: trace_rpcgss_upcall_result(from_kuid(&init_user_ns, cred->cr_cred->fsuid), err); return err; } static inline int gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) { struct net *net = gss_auth->net; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_pipe *pipe; struct rpc_cred *cred = &gss_cred->gc_base; struct gss_upcall_msg *gss_msg; DEFINE_WAIT(wait); int err; retry: err = 0; /* if gssd is down, just skip upcalling altogether */ if (!gssd_running(net)) { warn_gssd(); err = -EACCES; goto out; } gss_msg = gss_setup_upcall(gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { err = wait_event_interruptible_timeout(pipe_version_waitqueue, sn->pipe_version >= 0, 15 * HZ); if (sn->pipe_version < 0) { warn_gssd(); err = -EACCES; } if (err < 0) goto out; goto retry; } if (IS_ERR(gss_msg)) { err = PTR_ERR(gss_msg); goto out; } pipe = gss_msg->pipe; for (;;) { prepare_to_wait(&gss_msg->waitqueue, &wait, TASK_KILLABLE); spin_lock(&pipe->lock); if (gss_msg->ctx != NULL || gss_msg->msg.errno < 0) { break; } spin_unlock(&pipe->lock); if (fatal_signal_pending(current)) { err = -ERESTARTSYS; goto out_intr; } schedule(); } if (gss_msg->ctx) { trace_rpcgss_ctx_init(gss_cred); gss_cred_set_ctx(cred, gss_msg->ctx); } else { err = gss_msg->msg.errno; } spin_unlock(&pipe->lock); out_intr: finish_wait(&gss_msg->waitqueue, &wait); gss_release_msg(gss_msg); out: trace_rpcgss_upcall_result(from_kuid(&init_user_ns, cred->cr_cred->fsuid), err); return err; } static struct gss_upcall_msg * gss_find_downcall(struct rpc_pipe *pipe, kuid_t uid) { struct gss_upcall_msg *pos; list_for_each_entry(pos, &pipe->in_downcall, list) { if (!uid_eq(pos->uid, uid)) continue; if (!rpc_msg_is_inflight(&pos->msg)) continue; refcount_inc(&pos->count); return pos; } return NULL; } #define MSG_BUF_MAXSIZE 1024 static ssize_t gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { const void *p, *end; void *buf; struct gss_upcall_msg *gss_msg; struct rpc_pipe *pipe = RPC_I(file_inode(filp))->pipe; struct gss_cl_ctx *ctx; uid_t id; kuid_t uid; ssize_t err = -EFBIG; if (mlen > MSG_BUF_MAXSIZE) goto out; err = -ENOMEM; buf = kmalloc(mlen, GFP_KERNEL); if (!buf) goto out; err = -EFAULT; if (copy_from_user(buf, src, mlen)) goto err; end = (const void *)((char *)buf + mlen); p = simple_get_bytes(buf, end, &id, sizeof(id)); if (IS_ERR(p)) { err = PTR_ERR(p); goto err; } uid = make_kuid(current_user_ns(), id); if (!uid_valid(uid)) { err = -EINVAL; goto err; } err = -ENOMEM; ctx = gss_alloc_context(); if (ctx == NULL) goto err; err = -ENOENT; /* Find a matching upcall */ spin_lock(&pipe->lock); gss_msg = gss_find_downcall(pipe, uid); if (gss_msg == NULL) { spin_unlock(&pipe->lock); goto err_put_ctx; } list_del_init(&gss_msg->list); spin_unlock(&pipe->lock); p = gss_fill_context(p, end, ctx, gss_msg->auth->mech); if (IS_ERR(p)) { err = PTR_ERR(p); switch (err) { case -EACCES: case -EKEYEXPIRED: gss_msg->msg.errno = err; err = mlen; break; case -EFAULT: case -ENOMEM: case -EINVAL: case -ENOSYS: gss_msg->msg.errno = -EAGAIN; break; default: printk(KERN_CRIT "%s: bad return from " "gss_fill_context: %zd\n", __func__, err); gss_msg->msg.errno = -EIO; } goto err_release_msg; } gss_msg->ctx = gss_get_ctx(ctx); err = mlen; err_release_msg: spin_lock(&pipe->lock); __gss_unhash_msg(gss_msg); spin_unlock(&pipe->lock); gss_release_msg(gss_msg); err_put_ctx: gss_put_ctx(ctx); err: kfree(buf); out: return err; } static int gss_pipe_open(struct inode *inode, int new_version) { struct net *net = inode->i_sb->s_fs_info; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int ret = 0; spin_lock(&pipe_version_lock); if (sn->pipe_version < 0) { /* First open of any gss pipe determines the version: */ sn->pipe_version = new_version; rpc_wake_up(&pipe_version_rpc_waitqueue); wake_up(&pipe_version_waitqueue); } else if (sn->pipe_version != new_version) { /* Trying to open a pipe of a different version */ ret = -EBUSY; goto out; } atomic_inc(&sn->pipe_users); out: spin_unlock(&pipe_version_lock); return ret; } static int gss_pipe_open_v0(struct inode *inode) { return gss_pipe_open(inode, 0); } static int gss_pipe_open_v1(struct inode *inode) { return gss_pipe_open(inode, 1); } static void gss_pipe_release(struct inode *inode) { struct net *net = inode->i_sb->s_fs_info; struct rpc_pipe *pipe = RPC_I(inode)->pipe; struct gss_upcall_msg *gss_msg; restart: spin_lock(&pipe->lock); list_for_each_entry(gss_msg, &pipe->in_downcall, list) { if (!list_empty(&gss_msg->msg.list)) continue; gss_msg->msg.errno = -EPIPE; refcount_inc(&gss_msg->count); __gss_unhash_msg(gss_msg); spin_unlock(&pipe->lock); gss_release_msg(gss_msg); goto restart; } spin_unlock(&pipe->lock); put_pipe_version(net); } static void gss_pipe_destroy_msg(struct rpc_pipe_msg *msg) { struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg); if (msg->errno < 0) { refcount_inc(&gss_msg->count); gss_unhash_msg(gss_msg); if (msg->errno == -ETIMEDOUT) warn_gssd(); gss_release_msg(gss_msg); } gss_release_msg(gss_msg); } static void gss_pipe_dentry_destroy(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct gss_pipe *gss_pipe = pdo->pdo_data; struct rpc_pipe *pipe = gss_pipe->pipe; if (pipe->dentry != NULL) { rpc_unlink(pipe->dentry); pipe->dentry = NULL; } } static int gss_pipe_dentry_create(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct gss_pipe *p = pdo->pdo_data; struct dentry *dentry; dentry = rpc_mkpipe_dentry(dir, p->name, p->clnt, p->pipe); if (IS_ERR(dentry)) return PTR_ERR(dentry); p->pipe->dentry = dentry; return 0; } static const struct rpc_pipe_dir_object_ops gss_pipe_dir_object_ops = { .create = gss_pipe_dentry_create, .destroy = gss_pipe_dentry_destroy, }; static struct gss_pipe *gss_pipe_alloc(struct rpc_clnt *clnt, const char *name, const struct rpc_pipe_ops *upcall_ops) { struct gss_pipe *p; int err = -ENOMEM; p = kmalloc(sizeof(*p), GFP_KERNEL); if (p == NULL) goto err; p->pipe = rpc_mkpipe_data(upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); if (IS_ERR(p->pipe)) { err = PTR_ERR(p->pipe); goto err_free_gss_pipe; } p->name = name; p->clnt = clnt; kref_init(&p->kref); rpc_init_pipe_dir_object(&p->pdo, &gss_pipe_dir_object_ops, p); return p; err_free_gss_pipe: kfree(p); err: return ERR_PTR(err); } struct gss_alloc_pdo { struct rpc_clnt *clnt; const char *name; const struct rpc_pipe_ops *upcall_ops; }; static int gss_pipe_match_pdo(struct rpc_pipe_dir_object *pdo, void *data) { struct gss_pipe *gss_pipe; struct gss_alloc_pdo *args = data; if (pdo->pdo_ops != &gss_pipe_dir_object_ops) return 0; gss_pipe = container_of(pdo, struct gss_pipe, pdo); if (strcmp(gss_pipe->name, args->name) != 0) return 0; if (!kref_get_unless_zero(&gss_pipe->kref)) return 0; return 1; } static struct rpc_pipe_dir_object *gss_pipe_alloc_pdo(void *data) { struct gss_pipe *gss_pipe; struct gss_alloc_pdo *args = data; gss_pipe = gss_pipe_alloc(args->clnt, args->name, args->upcall_ops); if (!IS_ERR(gss_pipe)) return &gss_pipe->pdo; return NULL; } static struct gss_pipe *gss_pipe_get(struct rpc_clnt *clnt, const char *name, const struct rpc_pipe_ops *upcall_ops) { struct net *net = rpc_net_ns(clnt); struct rpc_pipe_dir_object *pdo; struct gss_alloc_pdo args = { .clnt = clnt, .name = name, .upcall_ops = upcall_ops, }; pdo = rpc_find_or_alloc_pipe_dir_object(net, &clnt->cl_pipedir_objects, gss_pipe_match_pdo, gss_pipe_alloc_pdo, &args); if (pdo != NULL) return container_of(pdo, struct gss_pipe, pdo); return ERR_PTR(-ENOMEM); } static void __gss_pipe_free(struct gss_pipe *p) { struct rpc_clnt *clnt = p->clnt; struct net *net = rpc_net_ns(clnt); rpc_remove_pipe_dir_object(net, &clnt->cl_pipedir_objects, &p->pdo); rpc_destroy_pipe_data(p->pipe); kfree(p); } static void __gss_pipe_release(struct kref *kref) { struct gss_pipe *p = container_of(kref, struct gss_pipe, kref); __gss_pipe_free(p); } static void gss_pipe_free(struct gss_pipe *p) { if (p != NULL) kref_put(&p->kref, __gss_pipe_release); } /* * NOTE: we have the opportunity to use different * parameters based on the input flavor (which must be a pseudoflavor) */ static struct gss_auth * gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { rpc_authflavor_t flavor = args->pseudoflavor; struct gss_auth *gss_auth; struct gss_pipe *gss_pipe; struct rpc_auth * auth; int err = -ENOMEM; /* XXX? */ if (!try_module_get(THIS_MODULE)) return ERR_PTR(err); if (!(gss_auth = kmalloc(sizeof(*gss_auth), GFP_KERNEL))) goto out_dec; INIT_HLIST_NODE(&gss_auth->hash); gss_auth->target_name = NULL; if (args->target_name) { gss_auth->target_name = kstrdup(args->target_name, GFP_KERNEL); if (gss_auth->target_name == NULL) goto err_free; } gss_auth->client = clnt; gss_auth->net = get_net_track(rpc_net_ns(clnt), &gss_auth->ns_tracker, GFP_KERNEL); err = -EINVAL; gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor); if (!gss_auth->mech) goto err_put_net; gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor); if (gss_auth->service == 0) goto err_put_mech; if (!gssd_running(gss_auth->net)) goto err_put_mech; auth = &gss_auth->rpc_auth; auth->au_cslack = GSS_CRED_SLACK >> 2; BUILD_BUG_ON(GSS_KRB5_MAX_SLACK_NEEDED > RPC_MAX_AUTH_SIZE); auth->au_rslack = GSS_KRB5_MAX_SLACK_NEEDED >> 2; auth->au_verfsize = GSS_VERF_SLACK >> 2; auth->au_ralign = GSS_VERF_SLACK >> 2; __set_bit(RPCAUTH_AUTH_UPDATE_SLACK, &auth->au_flags); auth->au_ops = &authgss_ops; auth->au_flavor = flavor; if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor)) __set_bit(RPCAUTH_AUTH_DATATOUCH, &auth->au_flags); refcount_set(&auth->au_count, 1); kref_init(&gss_auth->kref); err = rpcauth_init_credcache(auth); if (err) goto err_put_mech; /* * Note: if we created the old pipe first, then someone who * examined the directory at the right moment might conclude * that we supported only the old pipe. So we instead create * the new pipe first. */ gss_pipe = gss_pipe_get(clnt, "gssd", &gss_upcall_ops_v1); if (IS_ERR(gss_pipe)) { err = PTR_ERR(gss_pipe); goto err_destroy_credcache; } gss_auth->gss_pipe[1] = gss_pipe; gss_pipe = gss_pipe_get(clnt, gss_auth->mech->gm_name, &gss_upcall_ops_v0); if (IS_ERR(gss_pipe)) { err = PTR_ERR(gss_pipe); goto err_destroy_pipe_1; } gss_auth->gss_pipe[0] = gss_pipe; return gss_auth; err_destroy_pipe_1: gss_pipe_free(gss_auth->gss_pipe[1]); err_destroy_credcache: rpcauth_destroy_credcache(auth); err_put_mech: gss_mech_put(gss_auth->mech); err_put_net: put_net_track(gss_auth->net, &gss_auth->ns_tracker); err_free: kfree(gss_auth->target_name); kfree(gss_auth); out_dec: module_put(THIS_MODULE); trace_rpcgss_createauth(flavor, err); return ERR_PTR(err); } static void gss_free(struct gss_auth *gss_auth) { gss_pipe_free(gss_auth->gss_pipe[0]); gss_pipe_free(gss_auth->gss_pipe[1]); gss_mech_put(gss_auth->mech); put_net_track(gss_auth->net, &gss_auth->ns_tracker); kfree(gss_auth->target_name); kfree(gss_auth); module_put(THIS_MODULE); } static void gss_free_callback(struct kref *kref) { struct gss_auth *gss_auth = container_of(kref, struct gss_auth, kref); gss_free(gss_auth); } static void gss_put_auth(struct gss_auth *gss_auth) { kref_put(&gss_auth->kref, gss_free_callback); } static void gss_destroy(struct rpc_auth *auth) { struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth); if (hash_hashed(&gss_auth->hash)) { spin_lock(&gss_auth_hash_lock); hash_del(&gss_auth->hash); spin_unlock(&gss_auth_hash_lock); } gss_pipe_free(gss_auth->gss_pipe[0]); gss_auth->gss_pipe[0] = NULL; gss_pipe_free(gss_auth->gss_pipe[1]); gss_auth->gss_pipe[1] = NULL; rpcauth_destroy_credcache(auth); gss_put_auth(gss_auth); } /* * Auths may be shared between rpc clients that were cloned from a * common client with the same xprt, if they also share the flavor and * target_name. * * The auth is looked up from the oldest parent sharing the same * cl_xprt, and the auth itself references only that common parent * (which is guaranteed to last as long as any of its descendants). */ static struct gss_auth * gss_auth_find_or_add_hashed(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt, struct gss_auth *new) { struct gss_auth *gss_auth; unsigned long hashval = (unsigned long)clnt; spin_lock(&gss_auth_hash_lock); hash_for_each_possible(gss_auth_hash_table, gss_auth, hash, hashval) { if (gss_auth->client != clnt) continue; if (gss_auth->rpc_auth.au_flavor != args->pseudoflavor) continue; if (gss_auth->target_name != args->target_name) { if (gss_auth->target_name == NULL) continue; if (args->target_name == NULL) continue; if (strcmp(gss_auth->target_name, args->target_name)) continue; } if (!refcount_inc_not_zero(&gss_auth->rpc_auth.au_count)) continue; goto out; } if (new) hash_add(gss_auth_hash_table, &new->hash, hashval); gss_auth = new; out: spin_unlock(&gss_auth_hash_lock); return gss_auth; } static struct gss_auth * gss_create_hashed(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { struct gss_auth *gss_auth; struct gss_auth *new; gss_auth = gss_auth_find_or_add_hashed(args, clnt, NULL); if (gss_auth != NULL) goto out; new = gss_create_new(args, clnt); if (IS_ERR(new)) return new; gss_auth = gss_auth_find_or_add_hashed(args, clnt, new); if (gss_auth != new) gss_destroy(&new->rpc_auth); out: return gss_auth; } static struct rpc_auth * gss_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { struct gss_auth *gss_auth; struct rpc_xprt_switch *xps = rcu_access_pointer(clnt->cl_xpi.xpi_xpswitch); while (clnt != clnt->cl_parent) { struct rpc_clnt *parent = clnt->cl_parent; /* Find the original parent for this transport */ if (rcu_access_pointer(parent->cl_xpi.xpi_xpswitch) != xps) break; clnt = parent; } gss_auth = gss_create_hashed(args, clnt); if (IS_ERR(gss_auth)) return ERR_CAST(gss_auth); return &gss_auth->rpc_auth; } static struct gss_cred * gss_dup_cred(struct gss_auth *gss_auth, struct gss_cred *gss_cred) { struct gss_cred *new; /* Make a copy of the cred so that we can reference count it */ new = kzalloc(sizeof(*gss_cred), GFP_KERNEL); if (new) { struct auth_cred acred = { .cred = gss_cred->gc_base.cr_cred, }; struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1); rpcauth_init_cred(&new->gc_base, &acred, &gss_auth->rpc_auth, &gss_nullops); new->gc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE; new->gc_service = gss_cred->gc_service; new->gc_principal = gss_cred->gc_principal; kref_get(&gss_auth->kref); rcu_assign_pointer(new->gc_ctx, ctx); gss_get_ctx(ctx); } return new; } /* * gss_send_destroy_context will cause the RPCSEC_GSS to send a NULL RPC call * to the server with the GSS control procedure field set to * RPC_GSS_PROC_DESTROY. This should normally cause the server to release * all RPCSEC_GSS state associated with that context. */ static void gss_send_destroy_context(struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1); struct gss_cred *new; struct rpc_task *task; new = gss_dup_cred(gss_auth, gss_cred); if (new) { ctx->gc_proc = RPC_GSS_PROC_DESTROY; trace_rpcgss_ctx_destroy(gss_cred); task = rpc_call_null(gss_auth->client, &new->gc_base, RPC_TASK_ASYNC); if (!IS_ERR(task)) rpc_put_task(task); put_rpccred(&new->gc_base); } } /* gss_destroy_cred (and gss_free_ctx) are used to clean up after failure * to create a new cred or context, so they check that things have been * allocated before freeing them. */ static void gss_do_free_ctx(struct gss_cl_ctx *ctx) { gss_delete_sec_context(&ctx->gc_gss_ctx); kfree(ctx->gc_wire_ctx.data); kfree(ctx->gc_acceptor.data); kfree(ctx); } static void gss_free_ctx_callback(struct rcu_head *head) { struct gss_cl_ctx *ctx = container_of(head, struct gss_cl_ctx, gc_rcu); gss_do_free_ctx(ctx); } static void gss_free_ctx(struct gss_cl_ctx *ctx) { call_rcu(&ctx->gc_rcu, gss_free_ctx_callback); } static void gss_free_cred(struct gss_cred *gss_cred) { kfree(gss_cred); } static void gss_free_cred_callback(struct rcu_head *head) { struct gss_cred *gss_cred = container_of(head, struct gss_cred, gc_base.cr_rcu); gss_free_cred(gss_cred); } static void gss_destroy_nullcred(struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1); RCU_INIT_POINTER(gss_cred->gc_ctx, NULL); put_cred(cred->cr_cred); call_rcu(&cred->cr_rcu, gss_free_cred_callback); if (ctx) gss_put_ctx(ctx); gss_put_auth(gss_auth); } static void gss_destroy_cred(struct rpc_cred *cred) { if (test_and_clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) gss_send_destroy_context(cred); gss_destroy_nullcred(cred); } static int gss_hash_cred(struct auth_cred *acred, unsigned int hashbits) { return hash_64(from_kuid(&init_user_ns, acred->cred->fsuid), hashbits); } /* * Lookup RPCSEC_GSS cred for the current process */ static struct rpc_cred *gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { return rpcauth_lookup_credcache(auth, acred, flags, rpc_task_gfp_mask()); } static struct rpc_cred * gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp) { struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth); struct gss_cred *cred = NULL; int err = -ENOMEM; if (!(cred = kzalloc(sizeof(*cred), gfp))) goto out_err; rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops); /* * Note: in order to force a call to call_refresh(), we deliberately * fail to flag the credential as RPCAUTH_CRED_UPTODATE. */ cred->gc_base.cr_flags = 1UL << RPCAUTH_CRED_NEW; cred->gc_service = gss_auth->service; cred->gc_principal = acred->principal; kref_get(&gss_auth->kref); return &cred->gc_base; out_err: return ERR_PTR(err); } static int gss_cred_init(struct rpc_auth *auth, struct rpc_cred *cred) { struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth); struct gss_cred *gss_cred = container_of(cred,struct gss_cred, gc_base); int err; do { err = gss_create_upcall(gss_auth, gss_cred); } while (err == -EAGAIN); return err; } static char * gss_stringify_acceptor(struct rpc_cred *cred) { char *string = NULL; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx; unsigned int len; struct xdr_netobj *acceptor; rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); if (!ctx) goto out; len = ctx->gc_acceptor.len; rcu_read_unlock(); /* no point if there's no string */ if (!len) return NULL; realloc: string = kmalloc(len + 1, GFP_KERNEL); if (!string) return NULL; rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); /* did the ctx disappear or was it replaced by one with no acceptor? */ if (!ctx || !ctx->gc_acceptor.len) { kfree(string); string = NULL; goto out; } acceptor = &ctx->gc_acceptor; /* * Did we find a new acceptor that's longer than the original? Allocate * a longer buffer and try again. */ if (len < acceptor->len) { len = acceptor->len; rcu_read_unlock(); kfree(string); goto realloc; } memcpy(string, acceptor->data, acceptor->len); string[acceptor->len] = '\0'; out: rcu_read_unlock(); return string; } /* * Returns -EACCES if GSS context is NULL or will expire within the * timeout (miliseconds) */ static int gss_key_timeout(struct rpc_cred *rc) { struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base); struct gss_cl_ctx *ctx; unsigned long timeout = jiffies + (gss_key_expire_timeo * HZ); int ret = 0; rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); if (!ctx || time_after(timeout, ctx->gc_expiry)) ret = -EACCES; rcu_read_unlock(); return ret; } static int gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags) { struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base); struct gss_cl_ctx *ctx; int ret; if (test_bit(RPCAUTH_CRED_NEW, &rc->cr_flags)) goto out; /* Don't match with creds that have expired. */ rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); if (!ctx || time_after(jiffies, ctx->gc_expiry)) { rcu_read_unlock(); return 0; } rcu_read_unlock(); if (!test_bit(RPCAUTH_CRED_UPTODATE, &rc->cr_flags)) return 0; out: if (acred->principal != NULL) { if (gss_cred->gc_principal == NULL) return 0; ret = strcmp(acred->principal, gss_cred->gc_principal) == 0; } else { if (gss_cred->gc_principal != NULL) return 0; ret = uid_eq(rc->cr_cred->fsuid, acred->cred->fsuid); } return ret; } /* * Marshal credentials. * * The expensive part is computing the verifier. We can't cache a * pre-computed version of the verifier because the seqno, which * is different every time, is included in the MIC. */ static int gss_marshal(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_cred *cred = req->rq_cred; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); __be32 *p, *cred_len; u32 maj_stat = 0; struct xdr_netobj mic; struct kvec iov; struct xdr_buf verf_buf; int status; /* Credential */ p = xdr_reserve_space(xdr, 7 * sizeof(*p) + ctx->gc_wire_ctx.len); if (!p) goto marshal_failed; *p++ = rpc_auth_gss; cred_len = p++; spin_lock(&ctx->gc_seq_lock); req->rq_seqno = (ctx->gc_seq < MAXSEQ) ? ctx->gc_seq++ : MAXSEQ; spin_unlock(&ctx->gc_seq_lock); if (req->rq_seqno == MAXSEQ) goto expired; trace_rpcgss_seqno(task); *p++ = cpu_to_be32(RPC_GSS_VERSION); *p++ = cpu_to_be32(ctx->gc_proc); *p++ = cpu_to_be32(req->rq_seqno); *p++ = cpu_to_be32(gss_cred->gc_service); p = xdr_encode_netobj(p, &ctx->gc_wire_ctx); *cred_len = cpu_to_be32((p - (cred_len + 1)) << 2); /* Verifier */ /* We compute the checksum for the verifier over the xdr-encoded bytes * starting with the xid and ending at the end of the credential: */ iov.iov_base = req->rq_snd_buf.head[0].iov_base; iov.iov_len = (u8 *)p - (u8 *)iov.iov_base; xdr_buf_from_iov(&iov, &verf_buf); p = xdr_reserve_space(xdr, sizeof(*p)); if (!p) goto marshal_failed; *p++ = rpc_auth_gss; mic.data = (u8 *)(p + 1); maj_stat = gss_get_mic(ctx->gc_gss_ctx, &verf_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) goto expired; else if (maj_stat != 0) goto bad_mic; if (xdr_stream_encode_opaque_inline(xdr, (void **)&p, mic.len) < 0) goto marshal_failed; status = 0; out: gss_put_ctx(ctx); return status; expired: clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); status = -EKEYEXPIRED; goto out; marshal_failed: status = -EMSGSIZE; goto out; bad_mic: trace_rpcgss_get_mic(task, maj_stat); status = -EIO; goto out; } static int gss_renew_cred(struct rpc_task *task) { struct rpc_cred *oldcred = task->tk_rqstp->rq_cred; struct gss_cred *gss_cred = container_of(oldcred, struct gss_cred, gc_base); struct rpc_auth *auth = oldcred->cr_auth; struct auth_cred acred = { .cred = oldcred->cr_cred, .principal = gss_cred->gc_principal, }; struct rpc_cred *new; new = gss_lookup_cred(auth, &acred, RPCAUTH_LOOKUP_NEW); if (IS_ERR(new)) return PTR_ERR(new); task->tk_rqstp->rq_cred = new; put_rpccred(oldcred); return 0; } static int gss_cred_is_negative_entry(struct rpc_cred *cred) { if (test_bit(RPCAUTH_CRED_NEGATIVE, &cred->cr_flags)) { unsigned long now = jiffies; unsigned long begin, expire; struct gss_cred *gss_cred; gss_cred = container_of(cred, struct gss_cred, gc_base); begin = gss_cred->gc_upcall_timestamp; expire = begin + gss_expired_cred_retry_delay * HZ; if (time_in_range_open(now, begin, expire)) return 1; } return 0; } /* * Refresh credentials. XXX - finish */ static int gss_refresh(struct rpc_task *task) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; int ret = 0; if (gss_cred_is_negative_entry(cred)) return -EKEYEXPIRED; if (!test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) && !test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags)) { ret = gss_renew_cred(task); if (ret < 0) goto out; cred = task->tk_rqstp->rq_cred; } if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags)) ret = gss_refresh_upcall(task); out: return ret; } /* Dummy refresh routine: used only when destroying the context */ static int gss_refresh_null(struct rpc_task *task) { return 0; } static int gss_validate(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); __be32 *p, *seq = NULL; struct kvec iov; struct xdr_buf verf_buf; struct xdr_netobj mic; u32 len, maj_stat; int status; p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (!p) goto validate_failed; if (*p++ != rpc_auth_gss) goto validate_failed; len = be32_to_cpup(p); if (len > RPC_MAX_AUTH_SIZE) goto validate_failed; p = xdr_inline_decode(xdr, len); if (!p) goto validate_failed; seq = kmalloc(4, GFP_KERNEL); if (!seq) goto validate_failed; *seq = cpu_to_be32(task->tk_rqstp->rq_seqno); iov.iov_base = seq; iov.iov_len = 4; xdr_buf_from_iov(&iov, &verf_buf); mic.data = (u8 *)p; mic.len = len; maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat) goto bad_mic; /* We leave it to unwrap to calculate au_rslack. For now we just * calculate the length of the verifier: */ if (test_bit(RPCAUTH_AUTH_UPDATE_SLACK, &cred->cr_auth->au_flags)) cred->cr_auth->au_verfsize = XDR_QUADLEN(len) + 2; status = 0; out: gss_put_ctx(ctx); kfree(seq); return status; validate_failed: status = -EIO; goto out; bad_mic: trace_rpcgss_verify_mic(task, maj_stat); status = -EACCES; goto out; } static noinline_for_stack int gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_rqst *rqstp = task->tk_rqstp; struct xdr_buf integ_buf, *snd_buf = &rqstp->rq_snd_buf; struct xdr_netobj mic; __be32 *p, *integ_len; u32 offset, maj_stat; p = xdr_reserve_space(xdr, 2 * sizeof(*p)); if (!p) goto wrap_failed; integ_len = p++; *p = cpu_to_be32(rqstp->rq_seqno); if (rpcauth_wrap_req_encode(task, xdr)) goto wrap_failed; offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; if (xdr_buf_subsegment(snd_buf, &integ_buf, offset, snd_buf->len - offset)) goto wrap_failed; *integ_len = cpu_to_be32(integ_buf.len); p = xdr_reserve_space(xdr, 0); if (!p) goto wrap_failed; mic.data = (u8 *)(p + 1); maj_stat = gss_get_mic(ctx->gc_gss_ctx, &integ_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); else if (maj_stat) goto bad_mic; /* Check that the trailing MIC fit in the buffer, after the fact */ if (xdr_stream_encode_opaque_inline(xdr, (void **)&p, mic.len) < 0) goto wrap_failed; return 0; wrap_failed: return -EMSGSIZE; bad_mic: trace_rpcgss_get_mic(task, maj_stat); return -EIO; } static void priv_release_snd_buf(struct rpc_rqst *rqstp) { int i; for (i=0; i < rqstp->rq_enc_pages_num; i++) __free_page(rqstp->rq_enc_pages[i]); kfree(rqstp->rq_enc_pages); rqstp->rq_release_snd_buf = NULL; } static int alloc_enc_pages(struct rpc_rqst *rqstp) { struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; int first, last, i; if (rqstp->rq_release_snd_buf) rqstp->rq_release_snd_buf(rqstp); if (snd_buf->page_len == 0) { rqstp->rq_enc_pages_num = 0; return 0; } first = snd_buf->page_base >> PAGE_SHIFT; last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_SHIFT; rqstp->rq_enc_pages_num = last - first + 1 + 1; rqstp->rq_enc_pages = kmalloc_array(rqstp->rq_enc_pages_num, sizeof(struct page *), GFP_KERNEL); if (!rqstp->rq_enc_pages) goto out; for (i=0; i < rqstp->rq_enc_pages_num; i++) { rqstp->rq_enc_pages[i] = alloc_page(GFP_KERNEL); if (rqstp->rq_enc_pages[i] == NULL) goto out_free; } rqstp->rq_release_snd_buf = priv_release_snd_buf; return 0; out_free: rqstp->rq_enc_pages_num = i; priv_release_snd_buf(rqstp); out: return -EAGAIN; } static noinline_for_stack int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_rqst *rqstp = task->tk_rqstp; struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; u32 pad, offset, maj_stat; int status; __be32 *p, *opaque_len; struct page **inpages; int first; struct kvec *iov; status = -EIO; p = xdr_reserve_space(xdr, 2 * sizeof(*p)); if (!p) goto wrap_failed; opaque_len = p++; *p = cpu_to_be32(rqstp->rq_seqno); if (rpcauth_wrap_req_encode(task, xdr)) goto wrap_failed; status = alloc_enc_pages(rqstp); if (unlikely(status)) goto wrap_failed; first = snd_buf->page_base >> PAGE_SHIFT; inpages = snd_buf->pages + first; snd_buf->pages = rqstp->rq_enc_pages; snd_buf->page_base -= first << PAGE_SHIFT; /* * Move the tail into its own page, in case gss_wrap needs * more space in the head when wrapping. * * Still... Why can't gss_wrap just slide the tail down? */ if (snd_buf->page_len || snd_buf->tail[0].iov_len) { char *tmp; tmp = page_address(rqstp->rq_enc_pages[rqstp->rq_enc_pages_num - 1]); memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len); snd_buf->tail[0].iov_base = tmp; } offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; maj_stat = gss_wrap(ctx->gc_gss_ctx, offset, snd_buf, inpages); /* slack space should prevent this ever happening: */ if (unlikely(snd_buf->len > snd_buf->buflen)) { status = -EIO; goto wrap_failed; } /* We're assuming that when GSS_S_CONTEXT_EXPIRED, the encryption was * done anyway, so it's safe to put the request on the wire: */ if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); else if (maj_stat) goto bad_wrap; *opaque_len = cpu_to_be32(snd_buf->len - offset); /* guess whether the pad goes into the head or the tail: */ if (snd_buf->page_len || snd_buf->tail[0].iov_len) iov = snd_buf->tail; else iov = snd_buf->head; p = iov->iov_base + iov->iov_len; pad = xdr_pad_size(snd_buf->len - offset); memset(p, 0, pad); iov->iov_len += pad; snd_buf->len += pad; return 0; wrap_failed: return status; bad_wrap: trace_rpcgss_wrap(task, maj_stat); return -EIO; } static int gss_wrap_req(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); int status; status = -EIO; if (ctx->gc_proc != RPC_GSS_PROC_DATA) { /* The spec seems a little ambiguous here, but I think that not * wrapping context destruction requests makes the most sense. */ status = rpcauth_wrap_req_encode(task, xdr); goto out; } switch (gss_cred->gc_service) { case RPC_GSS_SVC_NONE: status = rpcauth_wrap_req_encode(task, xdr); break; case RPC_GSS_SVC_INTEGRITY: status = gss_wrap_req_integ(cred, ctx, task, xdr); break; case RPC_GSS_SVC_PRIVACY: status = gss_wrap_req_priv(cred, ctx, task, xdr); break; default: status = -EIO; } out: gss_put_ctx(ctx); return status; } /** * gss_update_rslack - Possibly update RPC receive buffer size estimates * @task: rpc_task for incoming RPC Reply being unwrapped * @cred: controlling rpc_cred for @task * @before: XDR words needed before each RPC Reply message * @after: XDR words needed following each RPC Reply message * */ static void gss_update_rslack(struct rpc_task *task, struct rpc_cred *cred, unsigned int before, unsigned int after) { struct rpc_auth *auth = cred->cr_auth; if (test_and_clear_bit(RPCAUTH_AUTH_UPDATE_SLACK, &auth->au_flags)) { auth->au_ralign = auth->au_verfsize + before; auth->au_rslack = auth->au_verfsize + after; trace_rpcgss_update_slack(task, auth); } } static int gss_unwrap_resp_auth(struct rpc_task *task, struct rpc_cred *cred) { gss_update_rslack(task, cred, 0, 0); return 0; } /* * RFC 2203, Section 5.3.2.2 * * struct rpc_gss_integ_data { * opaque databody_integ<>; * opaque checksum<>; * }; * * struct rpc_gss_data_t { * unsigned int seq_num; * proc_req_arg_t arg; * }; */ static noinline_for_stack int gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred, struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp, struct xdr_stream *xdr) { struct xdr_buf gss_data, *rcv_buf = &rqstp->rq_rcv_buf; u32 len, offset, seqno, maj_stat; struct xdr_netobj mic; int ret; ret = -EIO; mic.data = NULL; /* opaque databody_integ<>; */ if (xdr_stream_decode_u32(xdr, &len)) goto unwrap_failed; if (len & 3) goto unwrap_failed; offset = rcv_buf->len - xdr_stream_remaining(xdr); if (xdr_stream_decode_u32(xdr, &seqno)) goto unwrap_failed; if (seqno != rqstp->rq_seqno) goto bad_seqno; if (xdr_buf_subsegment(rcv_buf, &gss_data, offset, len)) goto unwrap_failed; /* * The xdr_stream now points to the beginning of the * upper layer payload, to be passed below to * rpcauth_unwrap_resp_decode(). The checksum, which * follows the upper layer payload in @rcv_buf, is * located and parsed without updating the xdr_stream. */ /* opaque checksum<>; */ offset += len; if (xdr_decode_word(rcv_buf, offset, &len)) goto unwrap_failed; offset += sizeof(__be32); if (offset + len > rcv_buf->len) goto unwrap_failed; mic.len = len; mic.data = kmalloc(len, GFP_KERNEL); if (ZERO_OR_NULL_PTR(mic.data)) goto unwrap_failed; if (read_bytes_from_xdr_buf(rcv_buf, offset, mic.data, mic.len)) goto unwrap_failed; maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &gss_data, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat != GSS_S_COMPLETE) goto bad_mic; gss_update_rslack(task, cred, 2, 2 + 1 + XDR_QUADLEN(mic.len)); ret = 0; out: kfree(mic.data); return ret; unwrap_failed: trace_rpcgss_unwrap_failed(task); goto out; bad_seqno: trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, seqno); goto out; bad_mic: trace_rpcgss_verify_mic(task, maj_stat); goto out; } static noinline_for_stack int gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred, struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp, struct xdr_stream *xdr) { struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf; struct kvec *head = rqstp->rq_rcv_buf.head; u32 offset, opaque_len, maj_stat; __be32 *p; p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (unlikely(!p)) goto unwrap_failed; opaque_len = be32_to_cpup(p++); offset = (u8 *)(p) - (u8 *)head->iov_base; if (offset + opaque_len > rcv_buf->len) goto unwrap_failed; maj_stat = gss_unwrap(ctx->gc_gss_ctx, offset, offset + opaque_len, rcv_buf); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat != GSS_S_COMPLETE) goto bad_unwrap; /* gss_unwrap decrypted the sequence number */ if (be32_to_cpup(p++) != rqstp->rq_seqno) goto bad_seqno; /* gss_unwrap redacts the opaque blob from the head iovec. * rcv_buf has changed, thus the stream needs to be reset. */ xdr_init_decode(xdr, rcv_buf, p, rqstp); gss_update_rslack(task, cred, 2 + ctx->gc_gss_ctx->align, 2 + ctx->gc_gss_ctx->slack); return 0; unwrap_failed: trace_rpcgss_unwrap_failed(task); return -EIO; bad_seqno: trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, be32_to_cpup(--p)); return -EIO; bad_unwrap: trace_rpcgss_unwrap(task, maj_stat); return -EIO; } static bool gss_seq_is_newer(u32 new, u32 old) { return (s32)(new - old) > 0; } static bool gss_xmit_need_reencode(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_cred *cred = req->rq_cred; struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); u32 win, seq_xmit = 0; bool ret = true; if (!ctx) goto out; if (gss_seq_is_newer(req->rq_seqno, READ_ONCE(ctx->gc_seq))) goto out_ctx; seq_xmit = READ_ONCE(ctx->gc_seq_xmit); while (gss_seq_is_newer(req->rq_seqno, seq_xmit)) { u32 tmp = seq_xmit; seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, req->rq_seqno); if (seq_xmit == tmp) { ret = false; goto out_ctx; } } win = ctx->gc_win; if (win > 0) ret = !gss_seq_is_newer(req->rq_seqno, seq_xmit - win); out_ctx: gss_put_ctx(ctx); out: trace_rpcgss_need_reencode(task, seq_xmit, ret); return ret; } static int gss_unwrap_resp(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_rqst *rqstp = task->tk_rqstp; struct rpc_cred *cred = rqstp->rq_cred; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); int status = -EIO; if (ctx->gc_proc != RPC_GSS_PROC_DATA) goto out_decode; switch (gss_cred->gc_service) { case RPC_GSS_SVC_NONE: status = gss_unwrap_resp_auth(task, cred); break; case RPC_GSS_SVC_INTEGRITY: status = gss_unwrap_resp_integ(task, cred, ctx, rqstp, xdr); break; case RPC_GSS_SVC_PRIVACY: status = gss_unwrap_resp_priv(task, cred, ctx, rqstp, xdr); break; } if (status) goto out; out_decode: status = rpcauth_unwrap_resp_decode(task, xdr); out: gss_put_ctx(ctx); return status; } static const struct rpc_authops authgss_ops = { .owner = THIS_MODULE, .au_flavor = RPC_AUTH_GSS, .au_name = "RPCSEC_GSS", .create = gss_create, .destroy = gss_destroy, .hash_cred = gss_hash_cred, .lookup_cred = gss_lookup_cred, .crcreate = gss_create_cred, .info2flavor = gss_mech_info2flavor, .flavor2info = gss_mech_flavor2info, }; static const struct rpc_credops gss_credops = { .cr_name = "AUTH_GSS", .crdestroy = gss_destroy_cred, .cr_init = gss_cred_init, .crmatch = gss_match, .crmarshal = gss_marshal, .crrefresh = gss_refresh, .crvalidate = gss_validate, .crwrap_req = gss_wrap_req, .crunwrap_resp = gss_unwrap_resp, .crkey_timeout = gss_key_timeout, .crstringify_acceptor = gss_stringify_acceptor, .crneed_reencode = gss_xmit_need_reencode, }; static const struct rpc_credops gss_nullops = { .cr_name = "AUTH_GSS", .crdestroy = gss_destroy_nullcred, .crmatch = gss_match, .crmarshal = gss_marshal, .crrefresh = gss_refresh_null, .crvalidate = gss_validate, .crwrap_req = gss_wrap_req, .crunwrap_resp = gss_unwrap_resp, .crstringify_acceptor = gss_stringify_acceptor, }; static const struct rpc_pipe_ops gss_upcall_ops_v0 = { .upcall = gss_v0_upcall, .downcall = gss_pipe_downcall, .destroy_msg = gss_pipe_destroy_msg, .open_pipe = gss_pipe_open_v0, .release_pipe = gss_pipe_release, }; static const struct rpc_pipe_ops gss_upcall_ops_v1 = { .upcall = gss_v1_upcall, .downcall = gss_pipe_downcall, .destroy_msg = gss_pipe_destroy_msg, .open_pipe = gss_pipe_open_v1, .release_pipe = gss_pipe_release, }; static __net_init int rpcsec_gss_init_net(struct net *net) { return gss_svc_init_net(net); } static __net_exit void rpcsec_gss_exit_net(struct net *net) { gss_svc_shutdown_net(net); } static struct pernet_operations rpcsec_gss_net_ops = { .init = rpcsec_gss_init_net, .exit = rpcsec_gss_exit_net, }; /* * Initialize RPCSEC_GSS module */ static int __init init_rpcsec_gss(void) { int err = 0; err = rpcauth_register(&authgss_ops); if (err) goto out; err = gss_svc_init(); if (err) goto out_unregister; err = register_pernet_subsys(&rpcsec_gss_net_ops); if (err) goto out_svc_exit; rpc_init_wait_queue(&pipe_version_rpc_waitqueue, "gss pipe version"); return 0; out_svc_exit: gss_svc_shutdown(); out_unregister: rpcauth_unregister(&authgss_ops); out: return err; } static void __exit exit_rpcsec_gss(void) { unregister_pernet_subsys(&rpcsec_gss_net_ops); gss_svc_shutdown(); rpcauth_unregister(&authgss_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ } MODULE_ALIAS("rpc-auth-6"); MODULE_DESCRIPTION("Sun RPC Kerberos RPCSEC_GSS client authentication"); MODULE_LICENSE("GPL"); module_param_named(expired_cred_retry_delay, gss_expired_cred_retry_delay, uint, 0644); MODULE_PARM_DESC(expired_cred_retry_delay, "Timeout (in seconds) until " "the RPC engine retries an expired credential"); module_param_named(key_expire_timeo, gss_key_expire_timeo, uint, 0644); MODULE_PARM_DESC(key_expire_timeo, "Time (in seconds) at the end of a " "credential keys lifetime where the NFS layer cleans up " "prior to key expiration"); module_init(init_rpcsec_gss) module_exit(exit_rpcsec_gss)
9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2007-2012 Nicira, Inc. */ #ifndef VPORT_H #define VPORT_H 1 #include <linux/if_tunnel.h> #include <linux/list.h> #include <linux/netlink.h> #include <linux/openvswitch.h> #include <linux/reciprocal_div.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/u64_stats_sync.h> #include "datapath.h" struct vport; struct vport_parms; /* The following definitions are for users of the vport subsystem: */ int ovs_vport_init(void); void ovs_vport_exit(void); struct vport *ovs_vport_add(const struct vport_parms *); void ovs_vport_del(struct vport *); struct vport *ovs_vport_locate(const struct net *net, const char *name); void ovs_vport_get_stats(struct vport *, struct ovs_vport_stats *); int ovs_vport_get_upcall_stats(struct vport *vport, struct sk_buff *skb); int ovs_vport_set_options(struct vport *, struct nlattr *options); int ovs_vport_get_options(const struct vport *, struct sk_buff *); int ovs_vport_set_upcall_portids(struct vport *, const struct nlattr *pids); int ovs_vport_get_upcall_portids(const struct vport *, struct sk_buff *); u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *); /** * struct vport_portids - array of netlink portids of a vport. * must be protected by rcu. * @rn_ids: The reciprocal value of @n_ids. * @rcu: RCU callback head for deferred destruction. * @n_ids: Size of @ids array. * @ids: Array storing the Netlink socket pids to be used for packets received * on this port that miss the flow table. */ struct vport_portids { struct reciprocal_value rn_ids; struct rcu_head rcu; u32 n_ids; u32 ids[]; }; /** * struct vport - one port within a datapath * @dev: Pointer to net_device. * @dev_tracker: refcount tracker for @dev reference * @dp: Datapath to which this port belongs. * @upcall_portids: RCU protected 'struct vport_portids'. * @port_no: Index into @dp's @ports array. * @hash_node: Element in @dev_table hash table in vport.c. * @dp_hash_node: Element in @datapath->ports hash table in datapath.c. * @ops: Class structure. * @upcall_stats: Upcall stats of every ports. * @detach_list: list used for detaching vport in net-exit call. * @rcu: RCU callback head for deferred destruction. */ struct vport { struct net_device *dev; netdevice_tracker dev_tracker; struct datapath *dp; struct vport_portids __rcu *upcall_portids; u16 port_no; struct hlist_node hash_node; struct hlist_node dp_hash_node; const struct vport_ops *ops; struct vport_upcall_stats_percpu __percpu *upcall_stats; struct list_head detach_list; struct rcu_head rcu; }; /** * struct vport_parms - parameters for creating a new vport * * @name: New vport's name. * @type: New vport's type. * @options: %OVS_VPORT_ATTR_OPTIONS attribute from Netlink message, %NULL if * none was supplied. * @desired_ifindex: New vport's ifindex. * @dp: New vport's datapath. * @port_no: New vport's port number. */ struct vport_parms { const char *name; enum ovs_vport_type type; int desired_ifindex; struct nlattr *options; /* For ovs_vport_alloc(). */ struct datapath *dp; u16 port_no; struct nlattr *upcall_portids; }; /** * struct vport_ops - definition of a type of virtual port * * @type: %OVS_VPORT_TYPE_* value for this type of virtual port. * @create: Create a new vport configured as specified. On success returns * a new vport allocated with ovs_vport_alloc(), otherwise an ERR_PTR() value. * @destroy: Destroys a vport. Must call vport_free() on the vport but not * before an RCU grace period has elapsed. * @set_options: Modify the configuration of an existing vport. May be %NULL * if modification is not supported. * @get_options: Appends vport-specific attributes for the configuration of an * existing vport to a &struct sk_buff. May be %NULL for a vport that does not * have any configuration. * @send: Send a packet on the device. * zero for dropped packets or negative for error. */ struct vport_ops { enum ovs_vport_type type; /* Called with ovs_mutex. */ struct vport *(*create)(const struct vport_parms *); void (*destroy)(struct vport *); int (*set_options)(struct vport *, struct nlattr *); int (*get_options)(const struct vport *, struct sk_buff *); int (*send)(struct sk_buff *skb); struct module *owner; struct list_head list; }; /** * struct vport_upcall_stats_percpu - per-cpu packet upcall statistics for * a given vport. * @n_success: Number of packets that upcall to userspace succeed. * @n_fail: Number of packets that upcall to userspace failed. */ struct vport_upcall_stats_percpu { struct u64_stats_sync syncp; u64_stats_t n_success; u64_stats_t n_fail; }; struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *, const struct vport_parms *); void ovs_vport_free(struct vport *); #define VPORT_ALIGN 8 /** * vport_priv - access private data area of vport * * @vport: vport to access * * If a nonzero size was passed in priv_size of vport_alloc() a private data * area was allocated on creation. This allows that area to be accessed and * used for any purpose needed by the vport implementer. */ static inline void *vport_priv(const struct vport *vport) { return (u8 *)(uintptr_t)vport + ALIGN(sizeof(struct vport), VPORT_ALIGN); } /** * vport_from_priv - lookup vport from private data pointer * * @priv: Start of private data area. * * It is sometimes useful to translate from a pointer to the private data * area to the vport, such as in the case where the private data pointer is * the result of a hash table lookup. @priv must point to the start of the * private data area. */ static inline struct vport *vport_from_priv(void *priv) { return (struct vport *)((u8 *)priv - ALIGN(sizeof(struct vport), VPORT_ALIGN)); } int ovs_vport_receive(struct vport *, struct sk_buff *, const struct ip_tunnel_info *); static inline const char *ovs_vport_name(struct vport *vport) { return vport->dev->name; } int __ovs_vport_ops_register(struct vport_ops *ops); #define ovs_vport_ops_register(ops) \ ({ \ (ops)->owner = THIS_MODULE; \ __ovs_vport_ops_register(ops); \ }) void ovs_vport_ops_unregister(struct vport_ops *ops); void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto); #endif /* vport.h */
80 104 115 99 122 16 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 /* SPDX-License-Identifier: GPL-2.0-only */ /* * vivid-core.h - core datastructures * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #ifndef _VIVID_CORE_H_ #define _VIVID_CORE_H_ #include <linux/fb.h> #include <linux/workqueue.h> #include <media/cec.h> #include <media/videobuf2-v4l2.h> #include <media/v4l2-device.h> #include <media/v4l2-dev.h> #include <media/v4l2-ctrls.h> #include <media/tpg/v4l2-tpg.h> #include "vivid-rds-gen.h" #include "vivid-vbi-gen.h" #define dprintk(dev, level, fmt, arg...) \ v4l2_dbg(level, vivid_debug, &dev->v4l2_dev, fmt, ## arg) /* The maximum number of inputs */ #define MAX_INPUTS 16 /* The maximum number of outputs */ #define MAX_OUTPUTS 16 /* The maximum number of video capture buffers */ #define MAX_VID_CAP_BUFFERS 64 /* The maximum up or down scaling factor is 4 */ #define MAX_ZOOM 4 /* The maximum image width/height are set to 4K DMT */ #define MAX_WIDTH 4096 #define MAX_HEIGHT 2160 /* The minimum image width/height */ #define MIN_WIDTH 16 #define MIN_HEIGHT MIN_WIDTH /* Pixel Array control divider */ #define PIXEL_ARRAY_DIV MIN_WIDTH /* The data_offset of plane 0 for the multiplanar formats */ #define PLANE0_DATA_OFFSET 128 /* The supported TV frequency range in MHz */ #define MIN_TV_FREQ (44U * 16U) #define MAX_TV_FREQ (958U * 16U) /* The number of samples returned in every SDR buffer */ #define SDR_CAP_SAMPLES_PER_BUF 0x4000 /* used by the threads to know when to resync internal counters */ #define JIFFIES_PER_DAY (3600U * 24U * HZ) #define JIFFIES_RESYNC (JIFFIES_PER_DAY * (0xf0000000U / JIFFIES_PER_DAY)) /* * Maximum number of HDMI inputs allowed by vivid, due to limitations * of the Physical Address in the EDID and used by CEC we stop at 15 * inputs and outputs. */ #define MAX_HDMI_INPUTS 15 #define MAX_HDMI_OUTPUTS 15 /* Maximum number of S-Video inputs allowed by vivid */ #define MAX_SVID_INPUTS 16 /* The maximum number of items in a menu control */ #define MAX_MENU_ITEMS BITS_PER_LONG_LONG /* Number of fixed menu items in the 'Connected To' menu controls */ #define FIXED_MENU_ITEMS 2 /* The maximum number of vivid devices */ #define VIVID_MAX_DEVS CONFIG_VIDEO_VIVID_MAX_DEVS extern const struct v4l2_rect vivid_min_rect; extern const struct v4l2_rect vivid_max_rect; extern unsigned vivid_debug; /* * NULL-terminated string array for the HDMI 'Connected To' menu controls * with the list of possible HDMI outputs. * * The first two items are fixed ("TPG" and "None"). */ extern char *vivid_ctrl_hdmi_to_output_strings[1 + MAX_MENU_ITEMS]; /* Menu control skip mask of all HDMI outputs that are in use */ extern u64 hdmi_to_output_menu_skip_mask; /* * Bitmask of which vivid instances need to update any connected * HDMI outputs. */ extern u64 hdmi_input_update_outputs_mask; /* * Spinlock for access to hdmi_to_output_menu_skip_mask and * hdmi_input_update_outputs_mask. */ extern spinlock_t hdmi_output_skip_mask_lock; /* * Workqueue that updates the menu controls whenever the HDMI menu skip mask * changes. */ extern struct workqueue_struct *update_hdmi_ctrls_workqueue; /* * The HDMI menu control value (index in the menu list) maps to an HDMI * output that is part of the given vivid_dev instance and has the given * output index (as returned by VIDIOC_G_OUTPUT). * * NULL/0 if not available. */ extern struct vivid_dev *vivid_ctrl_hdmi_to_output_instance[MAX_MENU_ITEMS]; extern unsigned int vivid_ctrl_hdmi_to_output_index[MAX_MENU_ITEMS]; /* * NULL-terminated string array for the S-Video 'Connected To' menu controls * with the list of possible S-Video outputs. * * The first two items are fixed ("TPG" and "None"). */ extern char *vivid_ctrl_svid_to_output_strings[1 + MAX_MENU_ITEMS]; /* Menu control skip mask of all S-Video outputs that are in use */ extern u64 svid_to_output_menu_skip_mask; /* Spinlock for access to svid_to_output_menu_skip_mask */ extern spinlock_t svid_output_skip_mask_lock; /* * Workqueue that updates the menu controls whenever the S-Video menu skip mask * changes. */ extern struct workqueue_struct *update_svid_ctrls_workqueue; /* * The S-Video menu control value (index in the menu list) maps to an S-Video * output that is part of the given vivid_dev instance and has the given * output index (as returned by VIDIOC_G_OUTPUT). * * NULL/0 if not available. */ extern struct vivid_dev *vivid_ctrl_svid_to_output_instance[MAX_MENU_ITEMS]; extern unsigned int vivid_ctrl_svid_to_output_index[MAX_MENU_ITEMS]; extern struct vivid_dev *vivid_devs[VIVID_MAX_DEVS]; extern unsigned int n_devs; struct vivid_fmt { u32 fourcc; /* v4l2 format id */ enum tgp_color_enc color_enc; bool can_do_overlay; u8 vdownsampling[TPG_MAX_PLANES]; u32 alpha_mask; u8 planes; u8 buffers; u32 data_offset[TPG_MAX_PLANES]; u32 bit_depth[TPG_MAX_PLANES]; }; extern struct vivid_fmt vivid_formats[]; /* buffer for one video frame */ struct vivid_buffer { /* common v4l buffer stuff -- must be first */ struct vb2_v4l2_buffer vb; struct list_head list; }; enum vivid_input { WEBCAM, TV, SVID, HDMI, }; enum vivid_signal_mode { CURRENT_DV_TIMINGS, CURRENT_STD = CURRENT_DV_TIMINGS, NO_SIGNAL, NO_LOCK, OUT_OF_RANGE, SELECTED_DV_TIMINGS, SELECTED_STD = SELECTED_DV_TIMINGS, CYCLE_DV_TIMINGS, CYCLE_STD = CYCLE_DV_TIMINGS, CUSTOM_DV_TIMINGS, }; enum vivid_colorspace { VIVID_CS_170M, VIVID_CS_709, VIVID_CS_SRGB, VIVID_CS_OPRGB, VIVID_CS_2020, VIVID_CS_DCI_P3, VIVID_CS_240M, VIVID_CS_SYS_M, VIVID_CS_SYS_BG, }; #define VIVID_INVALID_SIGNAL(mode) \ ((mode) == NO_SIGNAL || (mode) == NO_LOCK || (mode) == OUT_OF_RANGE) struct vivid_cec_xfer { struct cec_adapter *adap; u8 msg[CEC_MAX_MSG_SIZE]; u32 len; u32 sft; }; struct vivid_dev { u8 inst; struct v4l2_device v4l2_dev; #ifdef CONFIG_MEDIA_CONTROLLER struct media_device mdev; struct media_pad vid_cap_pad; struct media_pad vid_out_pad; struct media_pad vbi_cap_pad; struct media_pad vbi_out_pad; struct media_pad sdr_cap_pad; struct media_pad meta_cap_pad; struct media_pad meta_out_pad; struct media_pad touch_cap_pad; #endif struct v4l2_ctrl_handler ctrl_hdl_user_gen; struct v4l2_ctrl_handler ctrl_hdl_user_vid; struct v4l2_ctrl_handler ctrl_hdl_user_aud; struct v4l2_ctrl_handler ctrl_hdl_streaming; struct v4l2_ctrl_handler ctrl_hdl_sdtv_cap; struct v4l2_ctrl_handler ctrl_hdl_loop_cap; struct v4l2_ctrl_handler ctrl_hdl_fb; struct video_device vid_cap_dev; struct v4l2_ctrl_handler ctrl_hdl_vid_cap; struct video_device vid_out_dev; struct v4l2_ctrl_handler ctrl_hdl_vid_out; struct video_device vbi_cap_dev; struct v4l2_ctrl_handler ctrl_hdl_vbi_cap; struct video_device vbi_out_dev; struct v4l2_ctrl_handler ctrl_hdl_vbi_out; struct video_device radio_rx_dev; struct v4l2_ctrl_handler ctrl_hdl_radio_rx; struct video_device radio_tx_dev; struct v4l2_ctrl_handler ctrl_hdl_radio_tx; struct video_device sdr_cap_dev; struct v4l2_ctrl_handler ctrl_hdl_sdr_cap; struct video_device meta_cap_dev; struct v4l2_ctrl_handler ctrl_hdl_meta_cap; struct video_device meta_out_dev; struct v4l2_ctrl_handler ctrl_hdl_meta_out; struct video_device touch_cap_dev; struct v4l2_ctrl_handler ctrl_hdl_touch_cap; spinlock_t slock; struct mutex mutex; struct work_struct update_hdmi_ctrl_work; struct work_struct update_svid_ctrl_work; /* capabilities */ u32 vid_cap_caps; u32 vid_out_caps; u32 vbi_cap_caps; u32 vbi_out_caps; u32 sdr_cap_caps; u32 radio_rx_caps; u32 radio_tx_caps; u32 meta_cap_caps; u32 meta_out_caps; u32 touch_cap_caps; /* supported features */ bool multiplanar; u8 num_inputs; u8 num_hdmi_inputs; u8 num_svid_inputs; u8 input_type[MAX_INPUTS]; u8 input_name_counter[MAX_INPUTS]; u8 num_outputs; u8 num_hdmi_outputs; u8 output_type[MAX_OUTPUTS]; u8 output_name_counter[MAX_OUTPUTS]; bool has_audio_inputs; bool has_audio_outputs; bool has_vid_cap; bool has_vid_out; bool has_vbi_cap; bool has_raw_vbi_cap; bool has_sliced_vbi_cap; bool has_vbi_out; bool has_raw_vbi_out; bool has_sliced_vbi_out; bool has_radio_rx; bool has_radio_tx; bool has_sdr_cap; bool has_fb; bool has_meta_cap; bool has_meta_out; bool has_tv_tuner; bool has_touch_cap; /* Output index (0-MAX_OUTPUTS) to vivid instance of connected input */ struct vivid_dev *output_to_input_instance[MAX_OUTPUTS]; /* Output index (0-MAX_OUTPUTS) to input index (0-MAX_INPUTS) of connected input */ u8 output_to_input_index[MAX_OUTPUTS]; /* Output index (0-MAX_OUTPUTS) to HDMI or S-Video output index (0-MAX_HDMI/SVID_OUTPUTS) */ u8 output_to_iface_index[MAX_OUTPUTS]; /* ctrl_hdmi_to_output or ctrl_svid_to_output control value for each input */ s32 input_is_connected_to_output[MAX_INPUTS]; /* HDMI index (0-MAX_HDMI_OUTPUTS) to output index (0-MAX_OUTPUTS) */ u8 hdmi_index_to_output_index[MAX_HDMI_OUTPUTS]; /* HDMI index (0-MAX_HDMI_INPUTS) to input index (0-MAX_INPUTS) */ u8 hdmi_index_to_input_index[MAX_HDMI_INPUTS]; /* S-Video index (0-MAX_SVID_INPUTS) to input index (0-MAX_INPUTS) */ u8 svid_index_to_input_index[MAX_SVID_INPUTS]; /* controls */ struct v4l2_ctrl *brightness; struct v4l2_ctrl *contrast; struct v4l2_ctrl *saturation; struct v4l2_ctrl *hue; struct { /* autogain/gain cluster */ struct v4l2_ctrl *autogain; struct v4l2_ctrl *gain; }; struct v4l2_ctrl *volume; struct v4l2_ctrl *mute; struct v4l2_ctrl *alpha; struct v4l2_ctrl *button; struct v4l2_ctrl *boolean; struct v4l2_ctrl *int32; struct v4l2_ctrl *int64; struct v4l2_ctrl *menu; struct v4l2_ctrl *string; struct v4l2_ctrl *bitmask; struct v4l2_ctrl *int_menu; struct v4l2_ctrl *ro_int32; struct v4l2_ctrl *pixel_array; struct v4l2_ctrl *test_pattern; struct v4l2_ctrl *colorspace; struct v4l2_ctrl *rgb_range_cap; struct v4l2_ctrl *real_rgb_range_cap; struct { /* std_signal_mode/standard cluster */ struct v4l2_ctrl *ctrl_std_signal_mode; struct v4l2_ctrl *ctrl_standard; }; struct { /* dv_timings_signal_mode/timings cluster */ struct v4l2_ctrl *ctrl_dv_timings_signal_mode; struct v4l2_ctrl *ctrl_dv_timings; }; struct v4l2_ctrl *ctrl_has_crop_cap; struct v4l2_ctrl *ctrl_has_compose_cap; struct v4l2_ctrl *ctrl_has_scaler_cap; struct v4l2_ctrl *ctrl_has_crop_out; struct v4l2_ctrl *ctrl_has_compose_out; struct v4l2_ctrl *ctrl_has_scaler_out; struct v4l2_ctrl *ctrl_tx_mode; struct v4l2_ctrl *ctrl_tx_rgb_range; struct v4l2_ctrl *ctrl_tx_edid_present; struct v4l2_ctrl *ctrl_tx_hotplug; struct v4l2_ctrl *ctrl_tx_rxsense; struct v4l2_ctrl *ctrl_rx_power_present; struct v4l2_ctrl *radio_tx_rds_pi; struct v4l2_ctrl *radio_tx_rds_pty; struct v4l2_ctrl *radio_tx_rds_mono_stereo; struct v4l2_ctrl *radio_tx_rds_art_head; struct v4l2_ctrl *radio_tx_rds_compressed; struct v4l2_ctrl *radio_tx_rds_dyn_pty; struct v4l2_ctrl *radio_tx_rds_ta; struct v4l2_ctrl *radio_tx_rds_tp; struct v4l2_ctrl *radio_tx_rds_ms; struct v4l2_ctrl *radio_tx_rds_psname; struct v4l2_ctrl *radio_tx_rds_radiotext; struct v4l2_ctrl *radio_rx_rds_pty; struct v4l2_ctrl *radio_rx_rds_ta; struct v4l2_ctrl *radio_rx_rds_tp; struct v4l2_ctrl *radio_rx_rds_ms; struct v4l2_ctrl *radio_rx_rds_psname; struct v4l2_ctrl *radio_rx_rds_radiotext; struct v4l2_ctrl *ctrl_hdmi_to_output[MAX_HDMI_INPUTS]; char ctrl_hdmi_to_output_names[MAX_HDMI_INPUTS][32]; struct v4l2_ctrl *ctrl_svid_to_output[MAX_SVID_INPUTS]; char ctrl_svid_to_output_names[MAX_SVID_INPUTS][32]; unsigned input_brightness[MAX_INPUTS]; unsigned osd_mode; unsigned button_pressed; bool sensor_hflip; bool sensor_vflip; bool hflip; bool vflip; bool vbi_cap_interlaced; bool loop_video; bool reduced_fps; /* Framebuffer */ unsigned long video_pbase; void *video_vbase; u32 video_buffer_size; int display_width; int display_height; int display_byte_stride; int bits_per_pixel; int bytes_per_pixel; struct fb_info fb_info; struct fb_var_screeninfo fb_defined; struct fb_fix_screeninfo fb_fix; /* Error injection */ bool disconnect_error; bool queue_setup_error; bool buf_prepare_error; bool start_streaming_error; bool dqbuf_error; bool req_validate_error; bool seq_wrap; u64 time_wrap; u64 time_wrap_offset; unsigned perc_dropped_buffers; enum vivid_signal_mode std_signal_mode[MAX_INPUTS]; unsigned int query_std_last[MAX_INPUTS]; v4l2_std_id query_std[MAX_INPUTS]; enum tpg_video_aspect std_aspect_ratio[MAX_INPUTS]; enum vivid_signal_mode dv_timings_signal_mode[MAX_INPUTS]; char **query_dv_timings_qmenu; char *query_dv_timings_qmenu_strings; unsigned query_dv_timings_size; unsigned int query_dv_timings_last[MAX_INPUTS]; unsigned int query_dv_timings[MAX_INPUTS]; enum tpg_video_aspect dv_timings_aspect_ratio[MAX_INPUTS]; /* Input */ unsigned input; v4l2_std_id std_cap[MAX_INPUTS]; struct v4l2_dv_timings dv_timings_cap[MAX_INPUTS]; int dv_timings_cap_sel[MAX_INPUTS]; u32 service_set_cap; struct vivid_vbi_gen_data vbi_gen; u8 *edid; unsigned edid_blocks; unsigned edid_max_blocks; unsigned webcam_size_idx; unsigned webcam_ival_idx; unsigned tv_freq; unsigned tv_audmode; unsigned tv_field_cap; unsigned tv_audio_input; u32 power_present; /* Output */ unsigned output; v4l2_std_id std_out; struct v4l2_dv_timings dv_timings_out; u32 colorspace_out; u32 ycbcr_enc_out; u32 hsv_enc_out; u32 quantization_out; u32 xfer_func_out; u32 service_set_out; unsigned bytesperline_out[TPG_MAX_PLANES]; unsigned tv_field_out; unsigned tv_audio_output; bool vbi_out_have_wss; u8 vbi_out_wss[2]; bool vbi_out_have_cc[2]; u8 vbi_out_cc[2][2]; bool dvi_d_out; u8 *scaled_line; u8 *blended_line; unsigned cur_scaled_line; /* Output Overlay */ void *fb_vbase_out; bool overlay_out_enabled; int overlay_out_top, overlay_out_left; unsigned fbuf_out_flags; u32 chromakey_out; u8 global_alpha_out; /* video capture */ struct tpg_data tpg; unsigned ms_vid_cap; bool must_blank[MAX_VID_CAP_BUFFERS]; const struct vivid_fmt *fmt_cap; struct v4l2_fract timeperframe_vid_cap; enum v4l2_field field_cap; struct v4l2_rect src_rect; struct v4l2_rect fmt_cap_rect; struct v4l2_rect crop_cap; struct v4l2_rect compose_cap; struct v4l2_rect crop_bounds_cap; struct vb2_queue vb_vid_cap_q; struct list_head vid_cap_active; struct vb2_queue vb_vbi_cap_q; struct list_head vbi_cap_active; struct vb2_queue vb_meta_cap_q; struct list_head meta_cap_active; struct vb2_queue vb_touch_cap_q; struct list_head touch_cap_active; /* thread for generating video capture stream */ struct task_struct *kthread_vid_cap; unsigned long jiffies_vid_cap; u64 cap_stream_start; u64 cap_frame_period; u64 cap_frame_eof_offset; u32 cap_seq_offset; u32 cap_seq_count; bool cap_seq_resync; u32 vid_cap_seq_start; u32 vid_cap_seq_count; bool vid_cap_streaming; u32 vbi_cap_seq_start; u32 vbi_cap_seq_count; bool vbi_cap_streaming; u32 meta_cap_seq_start; u32 meta_cap_seq_count; bool meta_cap_streaming; /* Touch capture */ struct task_struct *kthread_touch_cap; unsigned long jiffies_touch_cap; u64 touch_cap_stream_start; u32 touch_cap_seq_offset; bool touch_cap_seq_resync; u32 touch_cap_seq_start; u32 touch_cap_seq_count; u32 touch_cap_with_seq_wrap_count; bool touch_cap_streaming; struct v4l2_fract timeperframe_tch_cap; struct v4l2_pix_format tch_format; int tch_pat_random; /* video output */ const struct vivid_fmt *fmt_out; struct v4l2_fract timeperframe_vid_out; enum v4l2_field field_out; struct v4l2_rect sink_rect; struct v4l2_rect fmt_out_rect; struct v4l2_rect crop_out; struct v4l2_rect compose_out; struct v4l2_rect compose_bounds_out; struct vb2_queue vb_vid_out_q; struct list_head vid_out_active; struct vb2_queue vb_vbi_out_q; struct list_head vbi_out_active; struct vb2_queue vb_meta_out_q; struct list_head meta_out_active; /* video loop precalculated rectangles */ /* * Intersection between what the output side composes and the capture side * crops. I.e., what actually needs to be copied from the output buffer to * the capture buffer. */ struct v4l2_rect loop_vid_copy; /* The part of the output buffer that (after scaling) corresponds to loop_vid_copy. */ struct v4l2_rect loop_vid_out; /* The part of the capture buffer that (after scaling) corresponds to loop_vid_copy. */ struct v4l2_rect loop_vid_cap; /* * The intersection of the framebuffer, the overlay output window and * loop_vid_copy. I.e., the part of the framebuffer that actually should be * blended with the compose_out rectangle. This uses the framebuffer origin. */ struct v4l2_rect loop_fb_copy; /* The same as loop_fb_copy but with compose_out origin. */ struct v4l2_rect loop_vid_overlay; /* * The part of the capture buffer that (after scaling) corresponds * to loop_vid_overlay. */ struct v4l2_rect loop_vid_overlay_cap; /* thread for generating video output stream */ struct task_struct *kthread_vid_out; unsigned long jiffies_vid_out; u32 out_seq_offset; u32 out_seq_count; bool out_seq_resync; u32 vid_out_seq_start; u32 vid_out_seq_count; bool vid_out_streaming; u32 vbi_out_seq_start; u32 vbi_out_seq_count; bool vbi_out_streaming; bool stream_sliced_vbi_out; u32 meta_out_seq_start; u32 meta_out_seq_count; bool meta_out_streaming; /* SDR capture */ struct vb2_queue vb_sdr_cap_q; struct list_head sdr_cap_active; u32 sdr_pixelformat; /* v4l2 format id */ unsigned sdr_buffersize; unsigned sdr_adc_freq; unsigned sdr_fm_freq; unsigned sdr_fm_deviation; int sdr_fixp_src_phase; int sdr_fixp_mod_phase; bool tstamp_src_is_soe; bool has_crop_cap; bool has_compose_cap; bool has_scaler_cap; bool has_crop_out; bool has_compose_out; bool has_scaler_out; /* thread for generating SDR stream */ struct task_struct *kthread_sdr_cap; unsigned long jiffies_sdr_cap; u32 sdr_cap_seq_offset; u32 sdr_cap_seq_start; u32 sdr_cap_seq_count; u32 sdr_cap_with_seq_wrap_count; bool sdr_cap_seq_resync; /* RDS generator */ struct vivid_rds_gen rds_gen; /* Radio receiver */ unsigned radio_rx_freq; unsigned radio_rx_audmode; int radio_rx_sig_qual; unsigned radio_rx_hw_seek_mode; bool radio_rx_hw_seek_prog_lim; bool radio_rx_rds_controls; bool radio_rx_rds_enabled; unsigned radio_rx_rds_use_alternates; unsigned radio_rx_rds_last_block; struct v4l2_fh *radio_rx_rds_owner; /* Radio transmitter */ unsigned radio_tx_freq; unsigned radio_tx_subchans; bool radio_tx_rds_controls; unsigned radio_tx_rds_last_block; struct v4l2_fh *radio_tx_rds_owner; /* Shared between radio receiver and transmitter */ bool radio_rds_loop; ktime_t radio_rds_init_time; /* CEC */ struct cec_adapter *cec_rx_adap; struct cec_adapter *cec_tx_adap[MAX_HDMI_OUTPUTS]; struct task_struct *kthread_cec; wait_queue_head_t kthread_waitq_cec; struct vivid_cec_xfer xfers[MAX_OUTPUTS]; spinlock_t cec_xfers_slock; /* read and write cec messages */ u32 cec_sft; /* bus signal free time, in bit periods */ u8 last_initiator; /* CEC OSD String */ char osd[14]; unsigned long osd_jiffies; bool meta_pts; bool meta_scr; }; static inline bool vivid_is_webcam(const struct vivid_dev *dev) { return dev->input_type[dev->input] == WEBCAM; } static inline bool vivid_is_tv_cap(const struct vivid_dev *dev) { return dev->input_type[dev->input] == TV; } static inline bool vivid_is_svid_cap(const struct vivid_dev *dev) { return dev->input_type[dev->input] == SVID; } static inline bool vivid_is_hdmi_cap(const struct vivid_dev *dev) { return dev->input_type[dev->input] == HDMI; } static inline bool vivid_is_sdtv_cap(const struct vivid_dev *dev) { return vivid_is_tv_cap(dev) || vivid_is_svid_cap(dev); } static inline bool vivid_is_svid_out(const struct vivid_dev *dev) { return dev->output_type[dev->output] == SVID; } static inline bool vivid_is_hdmi_out(const struct vivid_dev *dev) { return dev->output_type[dev->output] == HDMI; } #endif
2 1 2 2 2 2 2 2 10 21 11 20 19 18 19 21 2 16 23 16 2 2 1 1 2 1 1 1 1 2 2 2 2 2 2 2 2 1 1 1 2 1 1 1 1 8 2 1 1 1 1 1 1 1 1 8 3 3 1 3 8 10 8 22 22 22 21 22 92 1 1 1 1 1 1 1 1 1 1 13 13 1 1 13 13 10 10 10 10 10 10 1 10 10 10 13 1 17 18 17 17 10 10 10 10 1580 6 15 15 15 10 10 10 10 14 14 14 13 14 12 10 5 5 10 144 7 144 7 7 7 7 137 3 2 2 21 21 21 21 10 21 21 20 21 1 1 1 1 1 1 1 147 146 147 146 85 85 92 91 91 92 16 1 18 17 16 15 13 13 16 7 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 // SPDX-License-Identifier: GPL-2.0-only /* * Memory merging support. * * This code enables dynamic sharing of identical pages found in different * memory areas, even if they are not shared by fork() * * Copyright (C) 2008-2009 Red Hat, Inc. * Authors: * Izik Eidus * Andrea Arcangeli * Chris Wright * Hugh Dickins */ #include <linux/errno.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/fs.h> #include <linux/mman.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/cputime.h> #include <linux/rwsem.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/spinlock.h> #include <linux/xxhash.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/wait.h> #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/memory.h> #include <linux/mmu_notifier.h> #include <linux/swap.h> #include <linux/ksm.h> #include <linux/hashtable.h> #include <linux/freezer.h> #include <linux/oom.h> #include <linux/numa.h> #include <linux/pagewalk.h> #include <asm/tlbflush.h> #include "internal.h" #include "mm_slot.h" #define CREATE_TRACE_POINTS #include <trace/events/ksm.h> #ifdef CONFIG_NUMA #define NUMA(x) (x) #define DO_NUMA(x) do { (x); } while (0) #else #define NUMA(x) (0) #define DO_NUMA(x) do { } while (0) #endif typedef u8 rmap_age_t; /** * DOC: Overview * * A few notes about the KSM scanning process, * to make it easier to understand the data structures below: * * In order to reduce excessive scanning, KSM sorts the memory pages by their * contents into a data structure that holds pointers to the pages' locations. * * Since the contents of the pages may change at any moment, KSM cannot just * insert the pages into a normal sorted tree and expect it to find anything. * Therefore KSM uses two data structures - the stable and the unstable tree. * * The stable tree holds pointers to all the merged pages (ksm pages), sorted * by their contents. Because each such page is write-protected, searching on * this tree is fully assured to be working (except when pages are unmapped), * and therefore this tree is called the stable tree. * * The stable tree node includes information required for reverse * mapping from a KSM page to virtual addresses that map this page. * * In order to avoid large latencies of the rmap walks on KSM pages, * KSM maintains two types of nodes in the stable tree: * * * the regular nodes that keep the reverse mapping structures in a * linked list * * the "chains" that link nodes ("dups") that represent the same * write protected memory content, but each "dup" corresponds to a * different KSM page copy of that content * * Internally, the regular nodes, "dups" and "chains" are represented * using the same struct ksm_stable_node structure. * * In addition to the stable tree, KSM uses a second data structure called the * unstable tree: this tree holds pointers to pages which have been found to * be "unchanged for a period of time". The unstable tree sorts these pages * by their contents, but since they are not write-protected, KSM cannot rely * upon the unstable tree to work correctly - the unstable tree is liable to * be corrupted as its contents are modified, and so it is called unstable. * * KSM solves this problem by several techniques: * * 1) The unstable tree is flushed every time KSM completes scanning all * memory areas, and then the tree is rebuilt again from the beginning. * 2) KSM will only insert into the unstable tree, pages whose hash value * has not changed since the previous scan of all memory areas. * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the * colors of the nodes and not on their contents, assuring that even when * the tree gets "corrupted" it won't get out of balance, so scanning time * remains the same (also, searching and inserting nodes in an rbtree uses * the same algorithm, so we have no overhead when we flush and rebuild). * 4) KSM never flushes the stable tree, which means that even if it were to * take 10 attempts to find a page in the unstable tree, once it is found, * it is secured in the stable tree. (When we scan a new page, we first * compare it against the stable tree, and then against the unstable tree.) * * If the merge_across_nodes tunable is unset, then KSM maintains multiple * stable trees and multiple unstable trees: one of each for each NUMA node. */ /** * struct ksm_mm_slot - ksm information per mm that is being scanned * @slot: hash lookup from mm to mm_slot * @rmap_list: head for this mm_slot's singly-linked list of rmap_items */ struct ksm_mm_slot { struct mm_slot slot; struct ksm_rmap_item *rmap_list; }; /** * struct ksm_scan - cursor for scanning * @mm_slot: the current mm_slot we are scanning * @address: the next address inside that to be scanned * @rmap_list: link to the next rmap to be scanned in the rmap_list * @seqnr: count of completed full scans (needed when removing unstable node) * * There is only the one ksm_scan instance of this cursor structure. */ struct ksm_scan { struct ksm_mm_slot *mm_slot; unsigned long address; struct ksm_rmap_item **rmap_list; unsigned long seqnr; }; /** * struct ksm_stable_node - node of the stable rbtree * @node: rb node of this ksm page in the stable tree * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list * @hlist_dup: linked into the stable_node->hlist with a stable_node chain * @list: linked into migrate_nodes, pending placement in the proper node tree * @hlist: hlist head of rmap_items using this ksm page * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) * @chain_prune_time: time of the last full garbage collection * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN * @nid: NUMA node id of stable tree in which linked (may not match kpfn) */ struct ksm_stable_node { union { struct rb_node node; /* when node of stable tree */ struct { /* when listed for migration */ struct list_head *head; struct { struct hlist_node hlist_dup; struct list_head list; }; }; }; struct hlist_head hlist; union { unsigned long kpfn; unsigned long chain_prune_time; }; /* * STABLE_NODE_CHAIN can be any negative number in * rmap_hlist_len negative range, but better not -1 to be able * to reliably detect underflows. */ #define STABLE_NODE_CHAIN -1024 int rmap_hlist_len; #ifdef CONFIG_NUMA int nid; #endif }; /** * struct ksm_rmap_item - reverse mapping item for virtual addresses * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree * @nid: NUMA node id of unstable tree in which linked (may not match page) * @mm: the memory structure this rmap_item is pointing into * @address: the virtual address this rmap_item tracks (+ flags in low bits) * @oldchecksum: previous checksum of the page at that virtual address * @node: rb node of this rmap_item in the unstable tree * @head: pointer to stable_node heading this list in the stable tree * @hlist: link into hlist of rmap_items hanging off that stable_node * @age: number of scan iterations since creation * @remaining_skips: how many scans to skip */ struct ksm_rmap_item { struct ksm_rmap_item *rmap_list; union { struct anon_vma *anon_vma; /* when stable */ #ifdef CONFIG_NUMA int nid; /* when node of unstable tree */ #endif }; struct mm_struct *mm; unsigned long address; /* + low bits used for flags below */ unsigned int oldchecksum; /* when unstable */ rmap_age_t age; rmap_age_t remaining_skips; union { struct rb_node node; /* when node of unstable tree */ struct { /* when listed from stable tree */ struct ksm_stable_node *head; struct hlist_node hlist; }; }; }; #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ #define STABLE_FLAG 0x200 /* is listed from the stable tree */ /* The stable and unstable tree heads */ static struct rb_root one_stable_tree[1] = { RB_ROOT }; static struct rb_root one_unstable_tree[1] = { RB_ROOT }; static struct rb_root *root_stable_tree = one_stable_tree; static struct rb_root *root_unstable_tree = one_unstable_tree; /* Recently migrated nodes of stable tree, pending proper placement */ static LIST_HEAD(migrate_nodes); #define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev) #define MM_SLOTS_HASH_BITS 10 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct ksm_mm_slot ksm_mm_head = { .slot.mm_node = LIST_HEAD_INIT(ksm_mm_head.slot.mm_node), }; static struct ksm_scan ksm_scan = { .mm_slot = &ksm_mm_head, }; static struct kmem_cache *rmap_item_cache; static struct kmem_cache *stable_node_cache; static struct kmem_cache *mm_slot_cache; /* Default number of pages to scan per batch */ #define DEFAULT_PAGES_TO_SCAN 100 /* The number of pages scanned */ static unsigned long ksm_pages_scanned; /* The number of nodes in the stable tree */ static unsigned long ksm_pages_shared; /* The number of page slots additionally sharing those nodes */ static unsigned long ksm_pages_sharing; /* The number of nodes in the unstable tree */ static unsigned long ksm_pages_unshared; /* The number of rmap_items in use: to calculate pages_volatile */ static unsigned long ksm_rmap_items; /* The number of stable_node chains */ static unsigned long ksm_stable_node_chains; /* The number of stable_node dups linked to the stable_node chains */ static unsigned long ksm_stable_node_dups; /* Delay in pruning stale stable_node_dups in the stable_node_chains */ static unsigned int ksm_stable_node_chains_prune_millisecs = 2000; /* Maximum number of page slots sharing a stable node */ static int ksm_max_page_sharing = 256; /* Number of pages ksmd should scan in one batch */ static unsigned int ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN; /* Milliseconds ksmd should sleep between batches */ static unsigned int ksm_thread_sleep_millisecs = 20; /* Checksum of an empty (zeroed) page */ static unsigned int zero_checksum __read_mostly; /* Whether to merge empty (zeroed) pages with actual zero pages */ static bool ksm_use_zero_pages __read_mostly; /* Skip pages that couldn't be de-duplicated previously */ /* Default to true at least temporarily, for testing */ static bool ksm_smart_scan = true; /* The number of zero pages which is placed by KSM */ atomic_long_t ksm_zero_pages = ATOMIC_LONG_INIT(0); /* The number of pages that have been skipped due to "smart scanning" */ static unsigned long ksm_pages_skipped; /* Don't scan more than max pages per batch. */ static unsigned long ksm_advisor_max_pages_to_scan = 30000; /* Min CPU for scanning pages per scan */ #define KSM_ADVISOR_MIN_CPU 10 /* Max CPU for scanning pages per scan */ static unsigned int ksm_advisor_max_cpu = 70; /* Target scan time in seconds to analyze all KSM candidate pages. */ static unsigned long ksm_advisor_target_scan_time = 200; /* Exponentially weighted moving average. */ #define EWMA_WEIGHT 30 /** * struct advisor_ctx - metadata for KSM advisor * @start_scan: start time of the current scan * @scan_time: scan time of previous scan * @change: change in percent to pages_to_scan parameter * @cpu_time: cpu time consumed by the ksmd thread in the previous scan */ struct advisor_ctx { ktime_t start_scan; unsigned long scan_time; unsigned long change; unsigned long long cpu_time; }; static struct advisor_ctx advisor_ctx; /* Define different advisor's */ enum ksm_advisor_type { KSM_ADVISOR_NONE, KSM_ADVISOR_SCAN_TIME, }; static enum ksm_advisor_type ksm_advisor; #ifdef CONFIG_SYSFS /* * Only called through the sysfs control interface: */ /* At least scan this many pages per batch. */ static unsigned long ksm_advisor_min_pages_to_scan = 500; static void set_advisor_defaults(void) { if (ksm_advisor == KSM_ADVISOR_NONE) { ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN; } else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) { advisor_ctx = (const struct advisor_ctx){ 0 }; ksm_thread_pages_to_scan = ksm_advisor_min_pages_to_scan; } } #endif /* CONFIG_SYSFS */ static inline void advisor_start_scan(void) { if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) advisor_ctx.start_scan = ktime_get(); } /* * Use previous scan time if available, otherwise use current scan time as an * approximation for the previous scan time. */ static inline unsigned long prev_scan_time(struct advisor_ctx *ctx, unsigned long scan_time) { return ctx->scan_time ? ctx->scan_time : scan_time; } /* Calculate exponential weighted moving average */ static unsigned long ewma(unsigned long prev, unsigned long curr) { return ((100 - EWMA_WEIGHT) * prev + EWMA_WEIGHT * curr) / 100; } /* * The scan time advisor is based on the current scan rate and the target * scan rate. * * new_pages_to_scan = pages_to_scan * (scan_time / target_scan_time) * * To avoid perturbations it calculates a change factor of previous changes. * A new change factor is calculated for each iteration and it uses an * exponentially weighted moving average. The new pages_to_scan value is * multiplied with that change factor: * * new_pages_to_scan *= change facor * * The new_pages_to_scan value is limited by the cpu min and max values. It * calculates the cpu percent for the last scan and calculates the new * estimated cpu percent cost for the next scan. That value is capped by the * cpu min and max setting. * * In addition the new pages_to_scan value is capped by the max and min * limits. */ static void scan_time_advisor(void) { unsigned int cpu_percent; unsigned long cpu_time; unsigned long cpu_time_diff; unsigned long cpu_time_diff_ms; unsigned long pages; unsigned long per_page_cost; unsigned long factor; unsigned long change; unsigned long last_scan_time; unsigned long scan_time; /* Convert scan time to seconds */ scan_time = div_s64(ktime_ms_delta(ktime_get(), advisor_ctx.start_scan), MSEC_PER_SEC); scan_time = scan_time ? scan_time : 1; /* Calculate CPU consumption of ksmd background thread */ cpu_time = task_sched_runtime(current); cpu_time_diff = cpu_time - advisor_ctx.cpu_time; cpu_time_diff_ms = cpu_time_diff / 1000 / 1000; cpu_percent = (cpu_time_diff_ms * 100) / (scan_time * 1000); cpu_percent = cpu_percent ? cpu_percent : 1; last_scan_time = prev_scan_time(&advisor_ctx, scan_time); /* Calculate scan time as percentage of target scan time */ factor = ksm_advisor_target_scan_time * 100 / scan_time; factor = factor ? factor : 1; /* * Calculate scan time as percentage of last scan time and use * exponentially weighted average to smooth it */ change = scan_time * 100 / last_scan_time; change = change ? change : 1; change = ewma(advisor_ctx.change, change); /* Calculate new scan rate based on target scan rate. */ pages = ksm_thread_pages_to_scan * 100 / factor; /* Update pages_to_scan by weighted change percentage. */ pages = pages * change / 100; /* Cap new pages_to_scan value */ per_page_cost = ksm_thread_pages_to_scan / cpu_percent; per_page_cost = per_page_cost ? per_page_cost : 1; pages = min(pages, per_page_cost * ksm_advisor_max_cpu); pages = max(pages, per_page_cost * KSM_ADVISOR_MIN_CPU); pages = min(pages, ksm_advisor_max_pages_to_scan); /* Update advisor context */ advisor_ctx.change = change; advisor_ctx.scan_time = scan_time; advisor_ctx.cpu_time = cpu_time; ksm_thread_pages_to_scan = pages; trace_ksm_advisor(scan_time, pages, cpu_percent); } static void advisor_stop_scan(void) { if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) scan_time_advisor(); } #ifdef CONFIG_NUMA /* Zeroed when merging across nodes is not allowed */ static unsigned int ksm_merge_across_nodes = 1; static int ksm_nr_node_ids = 1; #else #define ksm_merge_across_nodes 1U #define ksm_nr_node_ids 1 #endif #define KSM_RUN_STOP 0 #define KSM_RUN_MERGE 1 #define KSM_RUN_UNMERGE 2 #define KSM_RUN_OFFLINE 4 static unsigned long ksm_run = KSM_RUN_STOP; static void wait_while_offlining(void); static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait); static DEFINE_MUTEX(ksm_thread_mutex); static DEFINE_SPINLOCK(ksm_mmlist_lock); static int __init ksm_slab_init(void) { rmap_item_cache = KMEM_CACHE(ksm_rmap_item, 0); if (!rmap_item_cache) goto out; stable_node_cache = KMEM_CACHE(ksm_stable_node, 0); if (!stable_node_cache) goto out_free1; mm_slot_cache = KMEM_CACHE(ksm_mm_slot, 0); if (!mm_slot_cache) goto out_free2; return 0; out_free2: kmem_cache_destroy(stable_node_cache); out_free1: kmem_cache_destroy(rmap_item_cache); out: return -ENOMEM; } static void __init ksm_slab_free(void) { kmem_cache_destroy(mm_slot_cache); kmem_cache_destroy(stable_node_cache); kmem_cache_destroy(rmap_item_cache); mm_slot_cache = NULL; } static __always_inline bool is_stable_node_chain(struct ksm_stable_node *chain) { return chain->rmap_hlist_len == STABLE_NODE_CHAIN; } static __always_inline bool is_stable_node_dup(struct ksm_stable_node *dup) { return dup->head == STABLE_NODE_DUP_HEAD; } static inline void stable_node_chain_add_dup(struct ksm_stable_node *dup, struct ksm_stable_node *chain) { VM_BUG_ON(is_stable_node_dup(dup)); dup->head = STABLE_NODE_DUP_HEAD; VM_BUG_ON(!is_stable_node_chain(chain)); hlist_add_head(&dup->hlist_dup, &chain->hlist); ksm_stable_node_dups++; } static inline void __stable_node_dup_del(struct ksm_stable_node *dup) { VM_BUG_ON(!is_stable_node_dup(dup)); hlist_del(&dup->hlist_dup); ksm_stable_node_dups--; } static inline void stable_node_dup_del(struct ksm_stable_node *dup) { VM_BUG_ON(is_stable_node_chain(dup)); if (is_stable_node_dup(dup)) __stable_node_dup_del(dup); else rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid)); #ifdef CONFIG_DEBUG_VM dup->head = NULL; #endif } static inline struct ksm_rmap_item *alloc_rmap_item(void) { struct ksm_rmap_item *rmap_item; rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); if (rmap_item) ksm_rmap_items++; return rmap_item; } static inline void free_rmap_item(struct ksm_rmap_item *rmap_item) { ksm_rmap_items--; rmap_item->mm->ksm_rmap_items--; rmap_item->mm = NULL; /* debug safety */ kmem_cache_free(rmap_item_cache, rmap_item); } static inline struct ksm_stable_node *alloc_stable_node(void) { /* * The allocation can take too long with GFP_KERNEL when memory is under * pressure, which may lead to hung task warnings. Adding __GFP_HIGH * grants access to memory reserves, helping to avoid this problem. */ return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH); } static inline void free_stable_node(struct ksm_stable_node *stable_node) { VM_BUG_ON(stable_node->rmap_hlist_len && !is_stable_node_chain(stable_node)); kmem_cache_free(stable_node_cache, stable_node); } /* * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * page tables after it has passed through ksm_exit() - which, if necessary, * takes mmap_lock briefly to serialize against them. ksm_exit() does not set * a special flag: they can just back out as soon as mm_users goes to zero. * ksm_test_exit() is used throughout to make this test for exit: in some * places for correctness, in some places just to avoid unnecessary work. */ static inline bool ksm_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; } /* * We use break_ksm to break COW on a ksm page by triggering unsharing, * such that the ksm page will get replaced by an exclusive anonymous page. * * We take great care only to touch a ksm page, in a VM_MERGEABLE vma, * in case the application has unmapped and remapped mm,addr meanwhile. * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP * mmap of /dev/mem, where we would not want to touch it. * * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context * of the process that owns 'vma'. We also do not want to enforce * protection keys here anyway. */ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma) { vm_fault_t ret = 0; if (lock_vma) vma_start_write(vma); do { bool ksm_page = false; struct folio_walk fw; struct folio *folio; cond_resched(); folio = folio_walk_start(&fw, vma, addr, FW_MIGRATION | FW_ZEROPAGE); if (folio) { /* Small folio implies FW_LEVEL_PTE. */ if (!folio_test_large(folio) && (folio_test_ksm(folio) || is_ksm_zero_pte(fw.pte))) ksm_page = true; folio_walk_end(&fw, vma); } if (!ksm_page) return 0; ret = handle_mm_fault(vma, addr, FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, NULL); } while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); /* * We must loop until we no longer find a KSM page because * handle_mm_fault() may back out if there's any difficulty e.g. if * pte accessed bit gets updated concurrently. * * VM_FAULT_SIGBUS could occur if we race with truncation of the * backing file, which also invalidates anonymous pages: that's * okay, that truncation will have unmapped the KSM page for us. * * VM_FAULT_OOM: at the time of writing (late July 2009), setting * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the * current task has TIF_MEMDIE set, and will be OOM killed on return * to user; and ksmd, having no mm, would never be chosen for that. * * But if the mm is in a limited mem_cgroup, then the fault may fail * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and * even ksmd can fail in this way - though it's usually breaking ksm * just to undo a merge it made a moment before, so unlikely to oom. * * That's a pity: we might therefore have more kernel pages allocated * than we're counting as nodes in the stable tree; but ksm_do_scan * will retry to break_cow on each pass, so should recover the page * in due course. The important thing is to not let VM_MERGEABLE * be cleared while any such pages might remain in the area. */ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; } static bool vma_ksm_compatible(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | VM_HUGETLB | VM_MIXEDMAP| VM_DROPPABLE)) return false; /* just ignore the advice */ if (vma_is_dax(vma)) return false; #ifdef VM_SAO if (vma->vm_flags & VM_SAO) return false; #endif #ifdef VM_SPARC_ADI if (vma->vm_flags & VM_SPARC_ADI) return false; #endif return true; } static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; if (ksm_test_exit(mm)) return NULL; vma = vma_lookup(mm, addr); if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) return NULL; return vma; } static void break_cow(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; struct vm_area_struct *vma; /* * It is not an accident that whenever we want to break COW * to undo, we also need to drop a reference to the anon_vma. */ put_anon_vma(rmap_item->anon_vma); mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (vma) break_ksm(vma, addr, false); mmap_read_unlock(mm); } static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; struct vm_area_struct *vma; struct page *page = NULL; struct folio_walk fw; struct folio *folio; mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (!vma) goto out; folio = folio_walk_start(&fw, vma, addr, 0); if (folio) { if (!folio_is_zone_device(folio) && folio_test_anon(folio)) { folio_get(folio); page = fw.page; } folio_walk_end(&fw, vma); } out: if (page) { flush_anon_page(vma, page, addr); flush_dcache_page(page); } mmap_read_unlock(mm); return page; } /* * This helper is used for getting right index into array of tree roots. * When merge_across_nodes knob is set to 1, there are only two rb-trees for * stable and unstable pages from all nodes with roots in index 0. Otherwise, * every node has its own stable and unstable tree. */ static inline int get_kpfn_nid(unsigned long kpfn) { return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn)); } static struct ksm_stable_node *alloc_stable_node_chain(struct ksm_stable_node *dup, struct rb_root *root) { struct ksm_stable_node *chain = alloc_stable_node(); VM_BUG_ON(is_stable_node_chain(dup)); if (likely(chain)) { INIT_HLIST_HEAD(&chain->hlist); chain->chain_prune_time = jiffies; chain->rmap_hlist_len = STABLE_NODE_CHAIN; #if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) chain->nid = NUMA_NO_NODE; /* debug */ #endif ksm_stable_node_chains++; /* * Put the stable node chain in the first dimension of * the stable tree and at the same time remove the old * stable node. */ rb_replace_node(&dup->node, &chain->node, root); /* * Move the old stable node to the second dimension * queued in the hlist_dup. The invariant is that all * dup stable_nodes in the chain->hlist point to pages * that are write protected and have the exact same * content. */ stable_node_chain_add_dup(dup, chain); } return chain; } static inline void free_stable_node_chain(struct ksm_stable_node *chain, struct rb_root *root) { rb_erase(&chain->node, root); free_stable_node(chain); ksm_stable_node_chains--; } static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node) { struct ksm_rmap_item *rmap_item; /* check it's not STABLE_NODE_CHAIN or negative */ BUG_ON(stable_node->rmap_hlist_len < 0); hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { if (rmap_item->hlist.next) { ksm_pages_sharing--; trace_ksm_remove_rmap_item(stable_node->kpfn, rmap_item, rmap_item->mm); } else { ksm_pages_shared--; } rmap_item->mm->ksm_merging_pages--; VM_BUG_ON(stable_node->rmap_hlist_len <= 0); stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; cond_resched(); } /* * We need the second aligned pointer of the migrate_nodes * list_head to stay clear from the rb_parent_color union * (aligned and different than any node) and also different * from &migrate_nodes. This will verify that future list.h changes * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it. */ BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes); BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1); trace_ksm_remove_ksm_page(stable_node->kpfn); if (stable_node->head == &migrate_nodes) list_del(&stable_node->list); else stable_node_dup_del(stable_node); free_stable_node(stable_node); } enum ksm_get_folio_flags { KSM_GET_FOLIO_NOLOCK, KSM_GET_FOLIO_LOCK, KSM_GET_FOLIO_TRYLOCK }; /* * ksm_get_folio: checks if the page indicated by the stable node * is still its ksm page, despite having held no reference to it. * In which case we can trust the content of the page, and it * returns the gotten page; but if the page has now been zapped, * remove the stale node from the stable tree and return NULL. * But beware, the stable node's page might be being migrated. * * You would expect the stable_node to hold a reference to the ksm page. * But if it increments the page's count, swapping out has to wait for * ksmd to come around again before it can free the page, which may take * seconds or even minutes: much too unresponsive. So instead we use a * "keyhole reference": access to the ksm page from the stable node peeps * out through its keyhole to see if that page still holds the right key, * pointing back to this stable node. This relies on freeing a PageAnon * page to reset its page->mapping to NULL, and relies on no other use of * a page to put something that might look like our key in page->mapping. * is on its way to being freed; but it is an anomaly to bear in mind. */ static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node, enum ksm_get_folio_flags flags) { struct folio *folio; void *expected_mapping; unsigned long kpfn; expected_mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); again: kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */ folio = pfn_folio(kpfn); if (READ_ONCE(folio->mapping) != expected_mapping) goto stale; /* * We cannot do anything with the page while its refcount is 0. * Usually 0 means free, or tail of a higher-order page: in which * case this node is no longer referenced, and should be freed; * however, it might mean that the page is under page_ref_freeze(). * The __remove_mapping() case is easy, again the node is now stale; * the same is in reuse_ksm_page() case; but if page is swapcache * in folio_migrate_mapping(), it might still be our page, * in which case it's essential to keep the node. */ while (!folio_try_get(folio)) { /* * Another check for folio->mapping != expected_mapping * would work here too. We have chosen to test the * swapcache flag to optimize the common case, when the * folio is or is about to be freed: the swapcache flag * is cleared (under spin_lock_irq) in the ref_freeze * section of __remove_mapping(); but anon folio->mapping * is reset to NULL later, in free_pages_prepare(). */ if (!folio_test_swapcache(folio)) goto stale; cpu_relax(); } if (READ_ONCE(folio->mapping) != expected_mapping) { folio_put(folio); goto stale; } if (flags == KSM_GET_FOLIO_TRYLOCK) { if (!folio_trylock(folio)) { folio_put(folio); return ERR_PTR(-EBUSY); } } else if (flags == KSM_GET_FOLIO_LOCK) folio_lock(folio); if (flags != KSM_GET_FOLIO_NOLOCK) { if (READ_ONCE(folio->mapping) != expected_mapping) { folio_unlock(folio); folio_put(folio); goto stale; } } return folio; stale: /* * We come here from above when folio->mapping or the swapcache flag * suggests that the node is stale; but it might be under migration. * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(), * before checking whether node->kpfn has been changed. */ smp_rmb(); if (READ_ONCE(stable_node->kpfn) != kpfn) goto again; remove_node_from_stable_tree(stable_node); return NULL; } /* * Removing rmap_item from stable or unstable tree. * This function will clean the information from the stable/unstable tree. */ static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item) { if (rmap_item->address & STABLE_FLAG) { struct ksm_stable_node *stable_node; struct folio *folio; stable_node = rmap_item->head; folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK); if (!folio) goto out; hlist_del(&rmap_item->hlist); folio_unlock(folio); folio_put(folio); if (!hlist_empty(&stable_node->hlist)) ksm_pages_sharing--; else ksm_pages_shared--; rmap_item->mm->ksm_merging_pages--; VM_BUG_ON(stable_node->rmap_hlist_len <= 0); stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->head = NULL; rmap_item->address &= PAGE_MASK; } else if (rmap_item->address & UNSTABLE_FLAG) { unsigned char age; /* * Usually ksmd can and must skip the rb_erase, because * root_unstable_tree was already reset to RB_ROOT. * But be careful when an mm is exiting: do the rb_erase * if this rmap_item was inserted by this scan, rather * than left over from before. */ age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); BUG_ON(age > 1); if (!age) rb_erase(&rmap_item->node, root_unstable_tree + NUMA(rmap_item->nid)); ksm_pages_unshared--; rmap_item->address &= PAGE_MASK; } out: cond_resched(); /* we're called from many long loops */ } static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list) { while (*rmap_list) { struct ksm_rmap_item *rmap_item = *rmap_list; *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); free_rmap_item(rmap_item); } } /* * Though it's very tempting to unmerge rmap_items from stable tree rather * than check every pte of a given vma, the locking doesn't quite work for * that - an rmap_item is assigned to the stable tree after inserting ksm * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing * rmap_items from parent to child at fork time (so as not to waste time * if exit comes before the next scan reaches it). * * Similarly, although we'd like to remove rmap_items (so updating counts * and freeing memory) when unmerging an area, it's easier to leave that * to the next pass of ksmd - consider, for example, how ksmd might be * in cmp_and_merge_page on one of the rmap_items we would be removing. */ static int unmerge_ksm_pages(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool lock_vma) { unsigned long addr; int err = 0; for (addr = start; addr < end && !err; addr += PAGE_SIZE) { if (ksm_test_exit(vma->vm_mm)) break; if (signal_pending(current)) err = -ERESTARTSYS; else err = break_ksm(vma, addr, lock_vma); } return err; } static inline struct ksm_stable_node *folio_stable_node(const struct folio *folio) { return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL; } static inline struct ksm_stable_node *page_stable_node(struct page *page) { return folio_stable_node(page_folio(page)); } static inline void folio_set_stable_node(struct folio *folio, struct ksm_stable_node *stable_node) { VM_WARN_ON_FOLIO(folio_test_anon(folio) && PageAnonExclusive(&folio->page), folio); folio->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); } #ifdef CONFIG_SYSFS /* * Only called through the sysfs control interface: */ static int remove_stable_node(struct ksm_stable_node *stable_node) { struct folio *folio; int err; folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK); if (!folio) { /* * ksm_get_folio did remove_node_from_stable_tree itself. */ return 0; } /* * Page could be still mapped if this races with __mmput() running in * between ksm_exit() and exit_mmap(). Just refuse to let * merge_across_nodes/max_page_sharing be switched. */ err = -EBUSY; if (!folio_mapped(folio)) { /* * The stable node did not yet appear stale to ksm_get_folio(), * since that allows for an unmapped ksm folio to be recognized * right up until it is freed; but the node is safe to remove. * This folio might be in an LRU cache waiting to be freed, * or it might be in the swapcache (perhaps under writeback), * or it might have been removed from swapcache a moment ago. */ folio_set_stable_node(folio, NULL); remove_node_from_stable_tree(stable_node); err = 0; } folio_unlock(folio); folio_put(folio); return err; } static int remove_stable_node_chain(struct ksm_stable_node *stable_node, struct rb_root *root) { struct ksm_stable_node *dup; struct hlist_node *hlist_safe; if (!is_stable_node_chain(stable_node)) { VM_BUG_ON(is_stable_node_dup(stable_node)); if (remove_stable_node(stable_node)) return true; else return false; } hlist_for_each_entry_safe(dup, hlist_safe, &stable_node->hlist, hlist_dup) { VM_BUG_ON(!is_stable_node_dup(dup)); if (remove_stable_node(dup)) return true; } BUG_ON(!hlist_empty(&stable_node->hlist)); free_stable_node_chain(stable_node, root); return false; } static int remove_all_stable_nodes(void) { struct ksm_stable_node *stable_node, *next; int nid; int err = 0; for (nid = 0; nid < ksm_nr_node_ids; nid++) { while (root_stable_tree[nid].rb_node) { stable_node = rb_entry(root_stable_tree[nid].rb_node, struct ksm_stable_node, node); if (remove_stable_node_chain(stable_node, root_stable_tree + nid)) { err = -EBUSY; break; /* proceed to next nid */ } cond_resched(); } } list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { if (remove_stable_node(stable_node)) err = -EBUSY; cond_resched(); } return err; } static int unmerge_and_remove_all_rmap_items(void) { struct ksm_mm_slot *mm_slot; struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; int err = 0; spin_lock(&ksm_mmlist_lock); slot = list_entry(ksm_mm_head.slot.mm_node.next, struct mm_slot, mm_node); ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); spin_unlock(&ksm_mmlist_lock); for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { VMA_ITERATOR(vmi, mm_slot->slot.mm, 0); mm = mm_slot->slot.mm; mmap_read_lock(mm); /* * Exit right away if mm is exiting to avoid lockdep issue in * the maple tree */ if (ksm_test_exit(mm)) goto mm_exiting; for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) continue; err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, false); if (err) goto error; } mm_exiting: remove_trailing_rmap_items(&mm_slot->rmap_list); mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); slot = list_entry(mm_slot->slot.mm_node.next, struct mm_slot, mm_node); ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_test_exit(mm)) { hash_del(&mm_slot->slot.hash); list_del(&mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); clear_bit(MMF_VM_MERGE_ANY, &mm->flags); mmdrop(mm); } else spin_unlock(&ksm_mmlist_lock); } /* Clean up stable nodes, but don't worry if some are still busy */ remove_all_stable_nodes(); ksm_scan.seqnr = 0; return 0; error: mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = &ksm_mm_head; spin_unlock(&ksm_mmlist_lock); return err; } #endif /* CONFIG_SYSFS */ static u32 calc_checksum(struct page *page) { u32 checksum; void *addr = kmap_local_page(page); checksum = xxhash(addr, PAGE_SIZE, 0); kunmap_local(addr); return checksum; } static int write_protect_page(struct vm_area_struct *vma, struct folio *folio, pte_t *orig_pte) { struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, 0, 0); int swapped; int err = -EFAULT; struct mmu_notifier_range range; bool anon_exclusive; pte_t entry; if (WARN_ON_ONCE(folio_test_large(folio))) return err; pvmw.address = page_address_in_vma(folio, folio_page(folio, 0), vma); if (pvmw.address == -EFAULT) goto out; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); if (!page_vma_mapped_walk(&pvmw)) goto out_mn; if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?")) goto out_unlock; anon_exclusive = PageAnonExclusive(&folio->page); entry = ptep_get(pvmw.pte); if (pte_write(entry) || pte_dirty(entry) || anon_exclusive || mm_tlb_flush_pending(mm)) { swapped = folio_test_swapcache(folio); flush_cache_page(vma, pvmw.address, folio_pfn(folio)); /* * Ok this is tricky, when get_user_pages_fast() run it doesn't * take any lock, therefore the check that we are going to make * with the pagecount against the mapcount is racy and * O_DIRECT can happen right after the check. * So we clear the pte and flush the tlb before the check * this assure us that no O_DIRECT can happen after the check * or in the middle of the check. * * No need to notify as we are downgrading page table to read * only not changing it to point to a new page. * * See Documentation/mm/mmu_notifier.rst */ entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); /* * Check that no O_DIRECT or similar I/O is in progress on the * page */ if (folio_mapcount(folio) + 1 + swapped != folio_ref_count(folio)) { set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } /* See folio_try_share_anon_rmap_pte(): clear PTE first. */ if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, &folio->page)) { set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } if (pte_dirty(entry)) folio_mark_dirty(folio); entry = pte_mkclean(entry); if (pte_write(entry)) entry = pte_wrprotect(entry); set_pte_at(mm, pvmw.address, pvmw.pte, entry); } *orig_pte = entry; err = 0; out_unlock: page_vma_mapped_walk_done(&pvmw); out_mn: mmu_notifier_invalidate_range_end(&range); out: return err; } /** * replace_page - replace page in vma by new ksm page * @vma: vma that holds the pte pointing to page * @page: the page we are replacing by kpage * @kpage: the ksm page we replace page by * @orig_pte: the original value of the pte * * Returns 0 on success, -EFAULT on failure. */ static int replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage, pte_t orig_pte) { struct folio *kfolio = page_folio(kpage); struct mm_struct *mm = vma->vm_mm; struct folio *folio = page_folio(page); pmd_t *pmd; pmd_t pmde; pte_t *ptep; pte_t newpte; spinlock_t *ptl; unsigned long addr; int err = -EFAULT; struct mmu_notifier_range range; addr = page_address_in_vma(folio, page, vma); if (addr == -EFAULT) goto out; pmd = mm_find_pmd(mm, addr); if (!pmd) goto out; /* * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() * without holding anon_vma lock for write. So when looking for a * genuine pmde (in which to find pte), test present and !THP together. */ pmde = pmdp_get_lockless(pmd); if (!pmd_present(pmde) || pmd_trans_huge(pmde)) goto out; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!ptep) goto out_mn; if (!pte_same(ptep_get(ptep), orig_pte)) { pte_unmap_unlock(ptep, ptl); goto out_mn; } VM_BUG_ON_PAGE(PageAnonExclusive(page), page); VM_BUG_ON_FOLIO(folio_test_anon(kfolio) && PageAnonExclusive(kpage), kfolio); /* * No need to check ksm_use_zero_pages here: we can only have a * zero_page here if ksm_use_zero_pages was enabled already. */ if (!is_zero_pfn(page_to_pfn(kpage))) { folio_get(kfolio); folio_add_anon_rmap_pte(kfolio, kpage, vma, addr, RMAP_NONE); newpte = mk_pte(kpage, vma->vm_page_prot); } else { /* * Use pte_mkdirty to mark the zero page mapped by KSM, and then * we can easily track all KSM-placed zero pages by checking if * the dirty bit in zero page's PTE is set. */ newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot))); ksm_map_zero_page(mm); /* * We're replacing an anonymous page with a zero page, which is * not anonymous. We need to do proper accounting otherwise we * will get wrong values in /proc, and a BUG message in dmesg * when tearing down the mm. */ dec_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep))); /* * No need to notify as we are replacing a read only page with another * read only page with the same content. * * See Documentation/mm/mmu_notifier.rst */ ptep_clear_flush(vma, addr, ptep); set_pte_at(mm, addr, ptep, newpte); folio_remove_rmap_pte(folio, page, vma); if (!folio_mapped(folio)) folio_free_swap(folio); folio_put(folio); pte_unmap_unlock(ptep, ptl); err = 0; out_mn: mmu_notifier_invalidate_range_end(&range); out: return err; } /* * try_to_merge_one_page - take two pages and merge them into one * @vma: the vma that holds the pte pointing to page * @page: the PageAnon page that we want to replace with kpage * @kpage: the KSM page that we want to map instead of page, * or NULL the first time when we want to use page as kpage. * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ static int try_to_merge_one_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) { struct folio *folio = page_folio(page); pte_t orig_pte = __pte(0); int err = -EFAULT; if (page == kpage) /* ksm page forked */ return 0; if (!folio_test_anon(folio)) goto out; /* * We need the folio lock to read a stable swapcache flag in * write_protect_page(). We trylock because we don't want to wait * here - we prefer to continue scanning and merging different * pages, then come back to this page when it is unlocked. */ if (!folio_trylock(folio)) goto out; if (folio_test_large(folio)) { if (split_huge_page(page)) goto out_unlock; folio = page_folio(page); } /* * If this anonymous page is mapped only here, its pte may need * to be write-protected. If it's mapped elsewhere, all of its * ptes are necessarily already write-protected. But in either * case, we need to lock and check page_count is not raised. */ if (write_protect_page(vma, folio, &orig_pte) == 0) { if (!kpage) { /* * While we hold folio lock, upgrade folio from * anon to a NULL stable_node with the KSM flag set: * stable_tree_insert() will update stable_node. */ folio_set_stable_node(folio, NULL); folio_mark_accessed(folio); /* * Page reclaim just frees a clean folio with no dirty * ptes: make sure that the ksm page would be swapped. */ if (!folio_test_dirty(folio)) folio_mark_dirty(folio); err = 0; } else if (pages_identical(page, kpage)) err = replace_page(vma, page, kpage, orig_pte); } out_unlock: folio_unlock(folio); out: return err; } /* * This function returns 0 if the pages were merged or if they are * no longer merging candidates (e.g., VMA stale), -EFAULT otherwise. */ static int try_to_merge_with_zero_page(struct ksm_rmap_item *rmap_item, struct page *page) { struct mm_struct *mm = rmap_item->mm; int err = -EFAULT; /* * Same checksum as an empty page. We attempt to merge it with the * appropriate zero page if the user enabled this via sysfs. */ if (ksm_use_zero_pages && (rmap_item->oldchecksum == zero_checksum)) { struct vm_area_struct *vma; mmap_read_lock(mm); vma = find_mergeable_vma(mm, rmap_item->address); if (vma) { err = try_to_merge_one_page(vma, page, ZERO_PAGE(rmap_item->address)); trace_ksm_merge_one_page( page_to_pfn(ZERO_PAGE(rmap_item->address)), rmap_item, mm, err); } else { /* * If the vma is out of date, we do not need to * continue. */ err = 0; } mmap_read_unlock(mm); } return err; } /* * try_to_merge_with_ksm_page - like try_to_merge_two_pages, * but no new kernel page is allocated: kpage must already be a ksm page. * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item, struct page *page, struct page *kpage) { struct mm_struct *mm = rmap_item->mm; struct vm_area_struct *vma; int err = -EFAULT; mmap_read_lock(mm); vma = find_mergeable_vma(mm, rmap_item->address); if (!vma) goto out; err = try_to_merge_one_page(vma, page, kpage); if (err) goto out; /* Unstable nid is in union with stable anon_vma: remove first */ remove_rmap_item_from_tree(rmap_item); /* Must get reference to anon_vma while still holding mmap_lock */ rmap_item->anon_vma = vma->anon_vma; get_anon_vma(vma->anon_vma); out: mmap_read_unlock(mm); trace_ksm_merge_with_ksm_page(kpage, page_to_pfn(kpage ? kpage : page), rmap_item, mm, err); return err; } /* * try_to_merge_two_pages - take two identical pages and prepare them * to be merged into one page. * * This function returns the kpage if we successfully merged two identical * pages into one ksm page, NULL otherwise. * * Note that this function upgrades page to ksm page: if one of the pages * is already a ksm page, try_to_merge_with_ksm_page should be used. */ static struct folio *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item, struct page *page, struct ksm_rmap_item *tree_rmap_item, struct page *tree_page) { int err; err = try_to_merge_with_ksm_page(rmap_item, page, NULL); if (!err) { err = try_to_merge_with_ksm_page(tree_rmap_item, tree_page, page); /* * If that fails, we have a ksm page with only one pte * pointing to it: so break it. */ if (err) break_cow(rmap_item); } return err ? NULL : page_folio(page); } static __always_inline bool __is_page_sharing_candidate(struct ksm_stable_node *stable_node, int offset) { VM_BUG_ON(stable_node->rmap_hlist_len < 0); /* * Check that at least one mapping still exists, otherwise * there's no much point to merge and share with this * stable_node, as the underlying tree_page of the other * sharer is going to be freed soon. */ return stable_node->rmap_hlist_len && stable_node->rmap_hlist_len + offset < ksm_max_page_sharing; } static __always_inline bool is_page_sharing_candidate(struct ksm_stable_node *stable_node) { return __is_page_sharing_candidate(stable_node, 0); } static struct folio *stable_node_dup(struct ksm_stable_node **_stable_node_dup, struct ksm_stable_node **_stable_node, struct rb_root *root, bool prune_stale_stable_nodes) { struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node; struct hlist_node *hlist_safe; struct folio *folio, *tree_folio = NULL; int found_rmap_hlist_len; if (!prune_stale_stable_nodes || time_before(jiffies, stable_node->chain_prune_time + msecs_to_jiffies( ksm_stable_node_chains_prune_millisecs))) prune_stale_stable_nodes = false; else stable_node->chain_prune_time = jiffies; hlist_for_each_entry_safe(dup, hlist_safe, &stable_node->hlist, hlist_dup) { cond_resched(); /* * We must walk all stable_node_dup to prune the stale * stable nodes during lookup. * * ksm_get_folio can drop the nodes from the * stable_node->hlist if they point to freed pages * (that's why we do a _safe walk). The "dup" * stable_node parameter itself will be freed from * under us if it returns NULL. */ folio = ksm_get_folio(dup, KSM_GET_FOLIO_NOLOCK); if (!folio) continue; /* Pick the best candidate if possible. */ if (!found || (is_page_sharing_candidate(dup) && (!is_page_sharing_candidate(found) || dup->rmap_hlist_len > found_rmap_hlist_len))) { if (found) folio_put(tree_folio); found = dup; found_rmap_hlist_len = found->rmap_hlist_len; tree_folio = folio; /* skip put_page for found candidate */ if (!prune_stale_stable_nodes && is_page_sharing_candidate(found)) break; continue; } folio_put(folio); } if (found) { if (hlist_is_singular_node(&found->hlist_dup, &stable_node->hlist)) { /* * If there's not just one entry it would * corrupt memory, better BUG_ON. In KSM * context with no lock held it's not even * fatal. */ BUG_ON(stable_node->hlist.first->next); /* * There's just one entry and it is below the * deduplication limit so drop the chain. */ rb_replace_node(&stable_node->node, &found->node, root); free_stable_node(stable_node); ksm_stable_node_chains--; ksm_stable_node_dups--; /* * NOTE: the caller depends on the stable_node * to be equal to stable_node_dup if the chain * was collapsed. */ *_stable_node = found; /* * Just for robustness, as stable_node is * otherwise left as a stable pointer, the * compiler shall optimize it away at build * time. */ stable_node = NULL; } else if (stable_node->hlist.first != &found->hlist_dup && __is_page_sharing_candidate(found, 1)) { /* * If the found stable_node dup can accept one * more future merge (in addition to the one * that is underway) and is not at the head of * the chain, put it there so next search will * be quicker in the !prune_stale_stable_nodes * case. * * NOTE: it would be inaccurate to use nr > 1 * instead of checking the hlist.first pointer * directly, because in the * prune_stale_stable_nodes case "nr" isn't * the position of the found dup in the chain, * but the total number of dups in the chain. */ hlist_del(&found->hlist_dup); hlist_add_head(&found->hlist_dup, &stable_node->hlist); } } else { /* Its hlist must be empty if no one found. */ free_stable_node_chain(stable_node, root); } *_stable_node_dup = found; return tree_folio; } /* * Like for ksm_get_folio, this function can free the *_stable_node and * *_stable_node_dup if the returned tree_page is NULL. * * It can also free and overwrite *_stable_node with the found * stable_node_dup if the chain is collapsed (in which case * *_stable_node will be equal to *_stable_node_dup like if the chain * never existed). It's up to the caller to verify tree_page is not * NULL before dereferencing *_stable_node or *_stable_node_dup. * * *_stable_node_dup is really a second output parameter of this * function and will be overwritten in all cases, the caller doesn't * need to initialize it. */ static struct folio *__stable_node_chain(struct ksm_stable_node **_stable_node_dup, struct ksm_stable_node **_stable_node, struct rb_root *root, bool prune_stale_stable_nodes) { struct ksm_stable_node *stable_node = *_stable_node; if (!is_stable_node_chain(stable_node)) { *_stable_node_dup = stable_node; return ksm_get_folio(stable_node, KSM_GET_FOLIO_NOLOCK); } return stable_node_dup(_stable_node_dup, _stable_node, root, prune_stale_stable_nodes); } static __always_inline struct folio *chain_prune(struct ksm_stable_node **s_n_d, struct ksm_stable_node **s_n, struct rb_root *root) { return __stable_node_chain(s_n_d, s_n, root, true); } static __always_inline struct folio *chain(struct ksm_stable_node **s_n_d, struct ksm_stable_node **s_n, struct rb_root *root) { return __stable_node_chain(s_n_d, s_n, root, false); } /* * stable_tree_search - search for page inside the stable tree * * This function checks if there is a page inside the stable tree * with identical content to the page that we are scanning right now. * * This function returns the stable tree node of identical content if found, * -EBUSY if the stable node's page is being migrated, NULL otherwise. */ static struct folio *stable_tree_search(struct page *page) { int nid; struct rb_root *root; struct rb_node **new; struct rb_node *parent; struct ksm_stable_node *stable_node, *stable_node_dup; struct ksm_stable_node *page_node; struct folio *folio; folio = page_folio(page); page_node = folio_stable_node(folio); if (page_node && page_node->head != &migrate_nodes) { /* ksm page forked */ folio_get(folio); return folio; } nid = get_kpfn_nid(folio_pfn(folio)); root = root_stable_tree + nid; again: new = &root->rb_node; parent = NULL; while (*new) { struct folio *tree_folio; int ret; cond_resched(); stable_node = rb_entry(*new, struct ksm_stable_node, node); tree_folio = chain_prune(&stable_node_dup, &stable_node, root); if (!tree_folio) { /* * If we walked over a stale stable_node, * ksm_get_folio() will call rb_erase() and it * may rebalance the tree from under us. So * restart the search from scratch. Returning * NULL would be safe too, but we'd generate * false negative insertions just because some * stable_node was stale. */ goto again; } ret = memcmp_pages(page, &tree_folio->page); folio_put(tree_folio); parent = *new; if (ret < 0) new = &parent->rb_left; else if (ret > 0) new = &parent->rb_right; else { if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); /* * If the mapcount of our migrated KSM folio is * at most 1, we can merge it with another * KSM folio where we know that we have space * for one more mapping without exceeding the * ksm_max_page_sharing limit: see * chain_prune(). This way, we can avoid adding * this stable node to the chain. */ if (folio_mapcount(folio) > 1) goto chain_append; } if (!is_page_sharing_candidate(stable_node_dup)) { /* * If the stable_node is a chain and * we got a payload match in memcmp * but we cannot merge the scanned * page in any of the existing * stable_node dups because they're * all full, we need to wait the * scanned page to find itself a match * in the unstable tree to create a * brand new KSM page to add later to * the dups of this stable_node. */ return NULL; } /* * Lock and unlock the stable_node's page (which * might already have been migrated) so that page * migration is sure to notice its raised count. * It would be more elegant to return stable_node * than kpage, but that involves more changes. */ tree_folio = ksm_get_folio(stable_node_dup, KSM_GET_FOLIO_TRYLOCK); if (PTR_ERR(tree_folio) == -EBUSY) return ERR_PTR(-EBUSY); if (unlikely(!tree_folio)) /* * The tree may have been rebalanced, * so re-evaluate parent and new. */ goto again; folio_unlock(tree_folio); if (get_kpfn_nid(stable_node_dup->kpfn) != NUMA(stable_node_dup->nid)) { folio_put(tree_folio); goto replace; } return tree_folio; } } if (!page_node) return NULL; list_del(&page_node->list); DO_NUMA(page_node->nid = nid); rb_link_node(&page_node->node, parent, new); rb_insert_color(&page_node->node, root); out: if (is_page_sharing_candidate(page_node)) { folio_get(folio); return folio; } else return NULL; replace: /* * If stable_node was a chain and chain_prune collapsed it, * stable_node has been updated to be the new regular * stable_node. A collapse of the chain is indistinguishable * from the case there was no chain in the stable * rbtree. Otherwise stable_node is the chain and * stable_node_dup is the dup to replace. */ if (stable_node_dup == stable_node) { VM_BUG_ON(is_stable_node_chain(stable_node_dup)); VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* there is no chain */ if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); rb_replace_node(&stable_node_dup->node, &page_node->node, root); if (is_page_sharing_candidate(page_node)) folio_get(folio); else folio = NULL; } else { rb_erase(&stable_node_dup->node, root); folio = NULL; } } else { VM_BUG_ON(!is_stable_node_chain(stable_node)); __stable_node_dup_del(stable_node_dup); if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); stable_node_chain_add_dup(page_node, stable_node); if (is_page_sharing_candidate(page_node)) folio_get(folio); else folio = NULL; } else { folio = NULL; } } stable_node_dup->head = &migrate_nodes; list_add(&stable_node_dup->list, stable_node_dup->head); return folio; chain_append: /* * If stable_node was a chain and chain_prune collapsed it, * stable_node has been updated to be the new regular * stable_node. A collapse of the chain is indistinguishable * from the case there was no chain in the stable * rbtree. Otherwise stable_node is the chain and * stable_node_dup is the dup to replace. */ if (stable_node_dup == stable_node) { VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* chain is missing so create it */ stable_node = alloc_stable_node_chain(stable_node_dup, root); if (!stable_node) return NULL; } /* * Add this stable_node dup that was * migrated to the stable_node chain * of the current nid for this page * content. */ VM_BUG_ON(!is_stable_node_dup(stable_node_dup)); VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); stable_node_chain_add_dup(page_node, stable_node); goto out; } /* * stable_tree_insert - insert stable tree node pointing to new ksm page * into the stable tree. * * This function returns the stable tree node just allocated on success, * NULL otherwise. */ static struct ksm_stable_node *stable_tree_insert(struct folio *kfolio) { int nid; unsigned long kpfn; struct rb_root *root; struct rb_node **new; struct rb_node *parent; struct ksm_stable_node *stable_node, *stable_node_dup; bool need_chain = false; kpfn = folio_pfn(kfolio); nid = get_kpfn_nid(kpfn); root = root_stable_tree + nid; again: parent = NULL; new = &root->rb_node; while (*new) { struct folio *tree_folio; int ret; cond_resched(); stable_node = rb_entry(*new, struct ksm_stable_node, node); tree_folio = chain(&stable_node_dup, &stable_node, root); if (!tree_folio) { /* * If we walked over a stale stable_node, * ksm_get_folio() will call rb_erase() and it * may rebalance the tree from under us. So * restart the search from scratch. Returning * NULL would be safe too, but we'd generate * false negative insertions just because some * stable_node was stale. */ goto again; } ret = memcmp_pages(&kfolio->page, &tree_folio->page); folio_put(tree_folio); parent = *new; if (ret < 0) new = &parent->rb_left; else if (ret > 0) new = &parent->rb_right; else { need_chain = true; break; } } stable_node_dup = alloc_stable_node(); if (!stable_node_dup) return NULL; INIT_HLIST_HEAD(&stable_node_dup->hlist); stable_node_dup->kpfn = kpfn; stable_node_dup->rmap_hlist_len = 0; DO_NUMA(stable_node_dup->nid = nid); if (!need_chain) { rb_link_node(&stable_node_dup->node, parent, new); rb_insert_color(&stable_node_dup->node, root); } else { if (!is_stable_node_chain(stable_node)) { struct ksm_stable_node *orig = stable_node; /* chain is missing so create it */ stable_node = alloc_stable_node_chain(orig, root); if (!stable_node) { free_stable_node(stable_node_dup); return NULL; } } stable_node_chain_add_dup(stable_node_dup, stable_node); } folio_set_stable_node(kfolio, stable_node_dup); return stable_node_dup; } /* * unstable_tree_search_insert - search for identical page, * else insert rmap_item into the unstable tree. * * This function searches for a page in the unstable tree identical to the * page currently being scanned; and if no identical page is found in the * tree, we insert rmap_item as a new object into the unstable tree. * * This function returns pointer to rmap_item found to be identical * to the currently scanned page, NULL otherwise. * * This function does both searching and inserting, because they share * the same walking algorithm in an rbtree. */ static struct ksm_rmap_item *unstable_tree_search_insert(struct ksm_rmap_item *rmap_item, struct page *page, struct page **tree_pagep) { struct rb_node **new; struct rb_root *root; struct rb_node *parent = NULL; int nid; nid = get_kpfn_nid(page_to_pfn(page)); root = root_unstable_tree + nid; new = &root->rb_node; while (*new) { struct ksm_rmap_item *tree_rmap_item; struct page *tree_page; int ret; cond_resched(); tree_rmap_item = rb_entry(*new, struct ksm_rmap_item, node); tree_page = get_mergeable_page(tree_rmap_item); if (!tree_page) return NULL; /* * Don't substitute a ksm page for a forked page. */ if (page == tree_page) { put_page(tree_page); return NULL; } ret = memcmp_pages(page, tree_page); parent = *new; if (ret < 0) { put_page(tree_page); new = &parent->rb_left; } else if (ret > 0) { put_page(tree_page); new = &parent->rb_right; } else if (!ksm_merge_across_nodes && page_to_nid(tree_page) != nid) { /* * If tree_page has been migrated to another NUMA node, * it will be flushed out and put in the right unstable * tree next time: only merge with it when across_nodes. */ put_page(tree_page); return NULL; } else { *tree_pagep = tree_page; return tree_rmap_item; } } rmap_item->address |= UNSTABLE_FLAG; rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); DO_NUMA(rmap_item->nid = nid); rb_link_node(&rmap_item->node, parent, new); rb_insert_color(&rmap_item->node, root); ksm_pages_unshared++; return NULL; } /* * stable_tree_append - add another rmap_item to the linked list of * rmap_items hanging off a given node of the stable tree, all sharing * the same ksm page. */ static void stable_tree_append(struct ksm_rmap_item *rmap_item, struct ksm_stable_node *stable_node, bool max_page_sharing_bypass) { /* * rmap won't find this mapping if we don't insert the * rmap_item in the right stable_node * duplicate. page_migration could break later if rmap breaks, * so we can as well crash here. We really need to check for * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check * for other negative values as an underflow if detected here * for the first time (and not when decreasing rmap_hlist_len) * would be sign of memory corruption in the stable_node. */ BUG_ON(stable_node->rmap_hlist_len < 0); stable_node->rmap_hlist_len++; if (!max_page_sharing_bypass) /* possibly non fatal but unexpected overflow, only warn */ WARN_ON_ONCE(stable_node->rmap_hlist_len > ksm_max_page_sharing); rmap_item->head = stable_node; rmap_item->address |= STABLE_FLAG; hlist_add_head(&rmap_item->hlist, &stable_node->hlist); if (rmap_item->hlist.next) ksm_pages_sharing++; else ksm_pages_shared++; rmap_item->mm->ksm_merging_pages++; } /* * cmp_and_merge_page - first see if page can be merged into the stable tree; * if not, compare checksum to previous and if it's the same, see if page can * be inserted into the unstable tree, or merged with a page already there and * both transferred to the stable tree. * * @page: the page that we are searching identical page to. * @rmap_item: the reverse mapping into the virtual address of this page */ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item) { struct ksm_rmap_item *tree_rmap_item; struct page *tree_page = NULL; struct ksm_stable_node *stable_node; struct folio *kfolio; unsigned int checksum; int err; bool max_page_sharing_bypass = false; stable_node = page_stable_node(page); if (stable_node) { if (stable_node->head != &migrate_nodes && get_kpfn_nid(READ_ONCE(stable_node->kpfn)) != NUMA(stable_node->nid)) { stable_node_dup_del(stable_node); stable_node->head = &migrate_nodes; list_add(&stable_node->list, stable_node->head); } if (stable_node->head != &migrate_nodes && rmap_item->head == stable_node) return; /* * If it's a KSM fork, allow it to go over the sharing limit * without warnings. */ if (!is_page_sharing_candidate(stable_node)) max_page_sharing_bypass = true; } else { remove_rmap_item_from_tree(rmap_item); /* * If the hash value of the page has changed from the last time * we calculated it, this page is changing frequently: therefore we * don't want to insert it in the unstable tree, and we don't want * to waste our time searching for something identical to it there. */ checksum = calc_checksum(page); if (rmap_item->oldchecksum != checksum) { rmap_item->oldchecksum = checksum; return; } if (!try_to_merge_with_zero_page(rmap_item, page)) return; } /* Start by searching for the folio in the stable tree */ kfolio = stable_tree_search(page); if (&kfolio->page == page && rmap_item->head == stable_node) { folio_put(kfolio); return; } remove_rmap_item_from_tree(rmap_item); if (kfolio) { if (kfolio == ERR_PTR(-EBUSY)) return; err = try_to_merge_with_ksm_page(rmap_item, page, &kfolio->page); if (!err) { /* * The page was successfully merged: * add its rmap_item to the stable tree. */ folio_lock(kfolio); stable_tree_append(rmap_item, folio_stable_node(kfolio), max_page_sharing_bypass); folio_unlock(kfolio); } folio_put(kfolio); return; } tree_rmap_item = unstable_tree_search_insert(rmap_item, page, &tree_page); if (tree_rmap_item) { bool split; kfolio = try_to_merge_two_pages(rmap_item, page, tree_rmap_item, tree_page); /* * If both pages we tried to merge belong to the same compound * page, then we actually ended up increasing the reference * count of the same compound page twice, and split_huge_page * failed. * Here we set a flag if that happened, and we use it later to * try split_huge_page again. Since we call put_page right * afterwards, the reference count will be correct and * split_huge_page should succeed. */ split = PageTransCompound(page) && compound_head(page) == compound_head(tree_page); put_page(tree_page); if (kfolio) { /* * The pages were successfully merged: insert new * node in the stable tree and add both rmap_items. */ folio_lock(kfolio); stable_node = stable_tree_insert(kfolio); if (stable_node) { stable_tree_append(tree_rmap_item, stable_node, false); stable_tree_append(rmap_item, stable_node, false); } folio_unlock(kfolio); /* * If we fail to insert the page into the stable tree, * we will have 2 virtual addresses that are pointing * to a ksm page left outside the stable tree, * in which case we need to break_cow on both. */ if (!stable_node) { break_cow(tree_rmap_item); break_cow(rmap_item); } } else if (split) { /* * We are here if we tried to merge two pages and * failed because they both belonged to the same * compound page. We will split the page now, but no * merging will take place. * We do not want to add the cost of a full lock; if * the page is locked, it is better to skip it and * perhaps try again later. */ if (!trylock_page(page)) return; split_huge_page(page); unlock_page(page); } } } static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, struct ksm_rmap_item **rmap_list, unsigned long addr) { struct ksm_rmap_item *rmap_item; while (*rmap_list) { rmap_item = *rmap_list; if ((rmap_item->address & PAGE_MASK) == addr) return rmap_item; if (rmap_item->address > addr) break; *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); free_rmap_item(rmap_item); } rmap_item = alloc_rmap_item(); if (rmap_item) { /* It has already been zeroed */ rmap_item->mm = mm_slot->slot.mm; rmap_item->mm->ksm_rmap_items++; rmap_item->address = addr; rmap_item->rmap_list = *rmap_list; *rmap_list = rmap_item; } return rmap_item; } /* * Calculate skip age for the ksm page age. The age determines how often * de-duplicating has already been tried unsuccessfully. If the age is * smaller, the scanning of this page is skipped for less scans. * * @age: rmap_item age of page */ static unsigned int skip_age(rmap_age_t age) { if (age <= 3) return 1; if (age <= 5) return 2; if (age <= 8) return 4; return 8; } /* * Determines if a page should be skipped for the current scan. * * @folio: folio containing the page to check * @rmap_item: associated rmap_item of page */ static bool should_skip_rmap_item(struct folio *folio, struct ksm_rmap_item *rmap_item) { rmap_age_t age; if (!ksm_smart_scan) return false; /* * Never skip pages that are already KSM; pages cmp_and_merge_page() * will essentially ignore them, but we still have to process them * properly. */ if (folio_test_ksm(folio)) return false; age = rmap_item->age; if (age != U8_MAX) rmap_item->age++; /* * Smaller ages are not skipped, they need to get a chance to go * through the different phases of the KSM merging. */ if (age < 3) return false; /* * Are we still allowed to skip? If not, then don't skip it * and determine how much more often we are allowed to skip next. */ if (!rmap_item->remaining_skips) { rmap_item->remaining_skips = skip_age(age); return false; } /* Skip this page */ ksm_pages_skipped++; rmap_item->remaining_skips--; remove_rmap_item_from_tree(rmap_item); return true; } static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) { struct mm_struct *mm; struct ksm_mm_slot *mm_slot; struct mm_slot *slot; struct vm_area_struct *vma; struct ksm_rmap_item *rmap_item; struct vma_iterator vmi; int nid; if (list_empty(&ksm_mm_head.slot.mm_node)) return NULL; mm_slot = ksm_scan.mm_slot; if (mm_slot == &ksm_mm_head) { advisor_start_scan(); trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items); /* * A number of pages can hang around indefinitely in per-cpu * LRU cache, raised page count preventing write_protect_page * from merging them. Though it doesn't really matter much, * it is puzzling to see some stuck in pages_volatile until * other activity jostles them out, and they also prevented * LTP's KSM test from succeeding deterministically; so drain * them here (here rather than on entry to ksm_do_scan(), * so we don't IPI too often when pages_to_scan is set low). */ lru_add_drain_all(); /* * Whereas stale stable_nodes on the stable_tree itself * get pruned in the regular course of stable_tree_search(), * those moved out to the migrate_nodes list can accumulate: * so prune them once before each full scan. */ if (!ksm_merge_across_nodes) { struct ksm_stable_node *stable_node, *next; struct folio *folio; list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_NOLOCK); if (folio) folio_put(folio); cond_resched(); } } for (nid = 0; nid < ksm_nr_node_ids; nid++) root_unstable_tree[nid] = RB_ROOT; spin_lock(&ksm_mmlist_lock); slot = list_entry(mm_slot->slot.mm_node.next, struct mm_slot, mm_node); mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); ksm_scan.mm_slot = mm_slot; spin_unlock(&ksm_mmlist_lock); /* * Although we tested list_empty() above, a racing __ksm_exit * of the last mm on the list may have removed it since then. */ if (mm_slot == &ksm_mm_head) return NULL; next_mm: ksm_scan.address = 0; ksm_scan.rmap_list = &mm_slot->rmap_list; } slot = &mm_slot->slot; mm = slot->mm; vma_iter_init(&vmi, mm, ksm_scan.address); mmap_read_lock(mm); if (ksm_test_exit(mm)) goto no_vmas; for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_MERGEABLE)) continue; if (ksm_scan.address < vma->vm_start) ksm_scan.address = vma->vm_start; if (!vma->anon_vma) ksm_scan.address = vma->vm_end; while (ksm_scan.address < vma->vm_end) { struct page *tmp_page = NULL; struct folio_walk fw; struct folio *folio; if (ksm_test_exit(mm)) break; folio = folio_walk_start(&fw, vma, ksm_scan.address, 0); if (folio) { if (!folio_is_zone_device(folio) && folio_test_anon(folio)) { folio_get(folio); tmp_page = fw.page; } folio_walk_end(&fw, vma); } if (tmp_page) { flush_anon_page(vma, tmp_page, ksm_scan.address); flush_dcache_page(tmp_page); rmap_item = get_next_rmap_item(mm_slot, ksm_scan.rmap_list, ksm_scan.address); if (rmap_item) { ksm_scan.rmap_list = &rmap_item->rmap_list; if (should_skip_rmap_item(folio, rmap_item)) { folio_put(folio); goto next_page; } ksm_scan.address += PAGE_SIZE; *page = tmp_page; } else { folio_put(folio); } mmap_read_unlock(mm); return rmap_item; } next_page: ksm_scan.address += PAGE_SIZE; cond_resched(); } } if (ksm_test_exit(mm)) { no_vmas: ksm_scan.address = 0; ksm_scan.rmap_list = &mm_slot->rmap_list; } /* * Nuke all the rmap_items that are above this current rmap: * because there were no VM_MERGEABLE vmas with such addresses. */ remove_trailing_rmap_items(ksm_scan.rmap_list); spin_lock(&ksm_mmlist_lock); slot = list_entry(mm_slot->slot.mm_node.next, struct mm_slot, mm_node); ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_scan.address == 0) { /* * We've completed a full scan of all vmas, holding mmap_lock * throughout, and found no VM_MERGEABLE: so do the same as * __ksm_exit does to remove this mm from all our lists now. * This applies either when cleaning up after __ksm_exit * (but beware: we can reach here even before __ksm_exit), * or when all VM_MERGEABLE areas have been unmapped (and * mmap_lock then protects against race with MADV_MERGEABLE). */ hash_del(&mm_slot->slot.hash); list_del(&mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); clear_bit(MMF_VM_MERGE_ANY, &mm->flags); mmap_read_unlock(mm); mmdrop(mm); } else { mmap_read_unlock(mm); /* * mmap_read_unlock(mm) first because after * spin_unlock(&ksm_mmlist_lock) run, the "mm" may * already have been freed under us by __ksm_exit() * because the "mm_slot" is still hashed and * ksm_scan.mm_slot doesn't point to it anymore. */ spin_unlock(&ksm_mmlist_lock); } /* Repeat until we've completed scanning the whole list */ mm_slot = ksm_scan.mm_slot; if (mm_slot != &ksm_mm_head) goto next_mm; advisor_stop_scan(); trace_ksm_stop_scan(ksm_scan.seqnr, ksm_rmap_items); ksm_scan.seqnr++; return NULL; } /** * ksm_do_scan - the ksm scanner main worker function. * @scan_npages: number of pages we want to scan before we return. */ static void ksm_do_scan(unsigned int scan_npages) { struct ksm_rmap_item *rmap_item; struct page *page; while (scan_npages-- && likely(!freezing(current))) { cond_resched(); rmap_item = scan_get_next_rmap_item(&page); if (!rmap_item) return; cmp_and_merge_page(page, rmap_item); put_page(page); ksm_pages_scanned++; } } static int ksmd_should_run(void) { return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.slot.mm_node); } static int ksm_scan_thread(void *nothing) { unsigned int sleep_ms; set_freezable(); set_user_nice(current, 5); while (!kthread_should_stop()) { mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksmd_should_run()) ksm_do_scan(ksm_thread_pages_to_scan); mutex_unlock(&ksm_thread_mutex); if (ksmd_should_run()) { sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs); wait_event_freezable_timeout(ksm_iter_wait, sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), msecs_to_jiffies(sleep_ms)); } else { wait_event_freezable(ksm_thread_wait, ksmd_should_run() || kthread_should_stop()); } } return 0; } static void __ksm_add_vma(struct vm_area_struct *vma) { unsigned long vm_flags = vma->vm_flags; if (vm_flags & VM_MERGEABLE) return; if (vma_ksm_compatible(vma)) vm_flags_set(vma, VM_MERGEABLE); } static int __ksm_del_vma(struct vm_area_struct *vma) { int err; if (!(vma->vm_flags & VM_MERGEABLE)) return 0; if (vma->anon_vma) { err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, true); if (err) return err; } vm_flags_clear(vma, VM_MERGEABLE); return 0; } /** * ksm_add_vma - Mark vma as mergeable if compatible * * @vma: Pointer to vma */ void ksm_add_vma(struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) __ksm_add_vma(vma); } static void ksm_add_vmas(struct mm_struct *mm) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) __ksm_add_vma(vma); } static int ksm_del_vmas(struct mm_struct *mm) { struct vm_area_struct *vma; int err; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { err = __ksm_del_vma(vma); if (err) return err; } return 0; } /** * ksm_enable_merge_any - Add mm to mm ksm list and enable merging on all * compatible VMA's * * @mm: Pointer to mm * * Returns 0 on success, otherwise error code */ int ksm_enable_merge_any(struct mm_struct *mm) { int err; if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) return 0; if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { err = __ksm_enter(mm); if (err) return err; } set_bit(MMF_VM_MERGE_ANY, &mm->flags); ksm_add_vmas(mm); return 0; } /** * ksm_disable_merge_any - Disable merging on all compatible VMA's of the mm, * previously enabled via ksm_enable_merge_any(). * * Disabling merging implies unmerging any merged pages, like setting * MADV_UNMERGEABLE would. If unmerging fails, the whole operation fails and * merging on all compatible VMA's remains enabled. * * @mm: Pointer to mm * * Returns 0 on success, otherwise error code */ int ksm_disable_merge_any(struct mm_struct *mm) { int err; if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags)) return 0; err = ksm_del_vmas(mm); if (err) { ksm_add_vmas(mm); return err; } clear_bit(MMF_VM_MERGE_ANY, &mm->flags); return 0; } int ksm_disable(struct mm_struct *mm) { mmap_assert_write_locked(mm); if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) return 0; if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) return ksm_disable_merge_any(mm); return ksm_del_vmas(mm); } int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; int err; switch (advice) { case MADV_MERGEABLE: if (vma->vm_flags & VM_MERGEABLE) return 0; if (!vma_ksm_compatible(vma)) return 0; if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { err = __ksm_enter(mm); if (err) return err; } *vm_flags |= VM_MERGEABLE; break; case MADV_UNMERGEABLE: if (!(*vm_flags & VM_MERGEABLE)) return 0; /* just ignore the advice */ if (vma->anon_vma) { err = unmerge_ksm_pages(vma, start, end, true); if (err) return err; } *vm_flags &= ~VM_MERGEABLE; break; } return 0; } EXPORT_SYMBOL_GPL(ksm_madvise); int __ksm_enter(struct mm_struct *mm) { struct ksm_mm_slot *mm_slot; struct mm_slot *slot; int needs_wakeup; mm_slot = mm_slot_alloc(mm_slot_cache); if (!mm_slot) return -ENOMEM; slot = &mm_slot->slot; /* Check ksm_run too? Would need tighter locking */ needs_wakeup = list_empty(&ksm_mm_head.slot.mm_node); spin_lock(&ksm_mmlist_lock); mm_slot_insert(mm_slots_hash, mm, slot); /* * When KSM_RUN_MERGE (or KSM_RUN_STOP), * insert just behind the scanning cursor, to let the area settle * down a little; when fork is followed by immediate exec, we don't * want ksmd to waste time setting up and tearing down an rmap_list. * * But when KSM_RUN_UNMERGE, it's important to insert ahead of its * scanning cursor, otherwise KSM pages in newly forked mms will be * missed: then we might as well insert at the end of the list. */ if (ksm_run & KSM_RUN_UNMERGE) list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node); else list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); set_bit(MMF_VM_MERGEABLE, &mm->flags); mmgrab(mm); if (needs_wakeup) wake_up_interruptible(&ksm_thread_wait); trace_ksm_enter(mm); return 0; } void __ksm_exit(struct mm_struct *mm) { struct ksm_mm_slot *mm_slot; struct mm_slot *slot; int easy_to_free = 0; /* * This process is exiting: if it's straightforward (as is the * case when ksmd was never running), free mm_slot immediately. * But if it's at the cursor or has rmap_items linked to it, use * mmap_lock to synchronize with any break_cows before pagetables * are freed, and leave the mm_slot on the list for ksmd to free. * Beware: ksm may already have noticed it exiting and freed the slot. */ spin_lock(&ksm_mmlist_lock); slot = mm_slot_lookup(mm_slots_hash, mm); mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (mm_slot && ksm_scan.mm_slot != mm_slot) { if (!mm_slot->rmap_list) { hash_del(&slot->hash); list_del(&slot->mm_node); easy_to_free = 1; } else { list_move(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); } } spin_unlock(&ksm_mmlist_lock); if (easy_to_free) { mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGE_ANY, &mm->flags); clear_bit(MMF_VM_MERGEABLE, &mm->flags); mmdrop(mm); } else if (mm_slot) { mmap_write_lock(mm); mmap_write_unlock(mm); } trace_ksm_exit(mm); } struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { struct page *page = folio_page(folio, 0); struct anon_vma *anon_vma = folio_anon_vma(folio); struct folio *new_folio; if (folio_test_large(folio)) return folio; if (folio_test_ksm(folio)) { if (folio_stable_node(folio) && !(ksm_run & KSM_RUN_UNMERGE)) return folio; /* no need to copy it */ } else if (!anon_vma) { return folio; /* no need to copy it */ } else if (folio->index == linear_page_index(vma, addr) && anon_vma->root == vma->anon_vma->root) { return folio; /* still no need to copy it */ } if (PageHWPoison(page)) return ERR_PTR(-EHWPOISON); if (!folio_test_uptodate(folio)) return folio; /* let do_swap_page report the error */ new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr); if (new_folio && mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL)) { folio_put(new_folio); new_folio = NULL; } if (new_folio) { if (copy_mc_user_highpage(folio_page(new_folio, 0), page, addr, vma)) { folio_put(new_folio); return ERR_PTR(-EHWPOISON); } folio_set_dirty(new_folio); __folio_mark_uptodate(new_folio); __folio_set_locked(new_folio); #ifdef CONFIG_SWAP count_vm_event(KSM_SWPIN_COPY); #endif } return new_folio; } void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { struct ksm_stable_node *stable_node; struct ksm_rmap_item *rmap_item; int search_new_forks = 0; VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio); /* * Rely on the page lock to protect against concurrent modifications * to that page's node of the stable tree. */ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); stable_node = folio_stable_node(folio); if (!stable_node) return; again: hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; struct anon_vma_chain *vmac; struct vm_area_struct *vma; cond_resched(); if (!anon_vma_trylock_read(anon_vma)) { if (rwc->try_lock) { rwc->contended = true; return; } anon_vma_lock_read(anon_vma); } anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { unsigned long addr; cond_resched(); vma = vmac->vma; /* Ignore the stable/unstable/sqnr flags */ addr = rmap_item->address & PAGE_MASK; if (addr < vma->vm_start || addr >= vma->vm_end) continue; /* * Initially we examine only the vma which covers this * rmap_item; but later, if there is still work to do, * we examine covering vmas in other mms: in case they * were forked from the original since ksmd passed. */ if ((rmap_item->mm == vma->vm_mm) == search_new_forks) continue; if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) { anon_vma_unlock_read(anon_vma); return; } if (rwc->done && rwc->done(folio)) { anon_vma_unlock_read(anon_vma); return; } } anon_vma_unlock_read(anon_vma); } if (!search_new_forks++) goto again; } #ifdef CONFIG_MEMORY_FAILURE /* * Collect processes when the error hit an ksm page. */ void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early) { struct ksm_stable_node *stable_node; struct ksm_rmap_item *rmap_item; struct vm_area_struct *vma; struct task_struct *tsk; stable_node = folio_stable_node(folio); if (!stable_node) return; hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *av = rmap_item->anon_vma; anon_vma_lock_read(av); rcu_read_lock(); for_each_process(tsk) { struct anon_vma_chain *vmac; unsigned long addr; struct task_struct *t = task_early_kill(tsk, force_early); if (!t) continue; anon_vma_interval_tree_foreach(vmac, &av->rb_root, 0, ULONG_MAX) { vma = vmac->vma; if (vma->vm_mm == t->mm) { addr = rmap_item->address & PAGE_MASK; add_to_kill_ksm(t, page, vma, to_kill, addr); } } } rcu_read_unlock(); anon_vma_unlock_read(av); } } #endif #ifdef CONFIG_MIGRATION void folio_migrate_ksm(struct folio *newfolio, struct folio *folio) { struct ksm_stable_node *stable_node; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio); VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio); stable_node = folio_stable_node(folio); if (stable_node) { VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio); stable_node->kpfn = folio_pfn(newfolio); /* * newfolio->mapping was set in advance; now we need smp_wmb() * to make sure that the new stable_node->kpfn is visible * to ksm_get_folio() before it can see that folio->mapping * has gone stale (or that the swapcache flag has been cleared). */ smp_wmb(); folio_set_stable_node(folio, NULL); } } #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_HOTREMOVE static void wait_while_offlining(void) { while (ksm_run & KSM_RUN_OFFLINE) { mutex_unlock(&ksm_thread_mutex); wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE), TASK_UNINTERRUPTIBLE); mutex_lock(&ksm_thread_mutex); } } static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node, unsigned long start_pfn, unsigned long end_pfn) { if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) { /* * Don't ksm_get_folio, page has already gone: * which is why we keep kpfn instead of page* */ remove_node_from_stable_tree(stable_node); return true; } return false; } static bool stable_node_chain_remove_range(struct ksm_stable_node *stable_node, unsigned long start_pfn, unsigned long end_pfn, struct rb_root *root) { struct ksm_stable_node *dup; struct hlist_node *hlist_safe; if (!is_stable_node_chain(stable_node)) { VM_BUG_ON(is_stable_node_dup(stable_node)); return stable_node_dup_remove_range(stable_node, start_pfn, end_pfn); } hlist_for_each_entry_safe(dup, hlist_safe, &stable_node->hlist, hlist_dup) { VM_BUG_ON(!is_stable_node_dup(dup)); stable_node_dup_remove_range(dup, start_pfn, end_pfn); } if (hlist_empty(&stable_node->hlist)) { free_stable_node_chain(stable_node, root); return true; /* notify caller that tree was rebalanced */ } else return false; } static void ksm_check_stable_tree(unsigned long start_pfn, unsigned long end_pfn) { struct ksm_stable_node *stable_node, *next; struct rb_node *node; int nid; for (nid = 0; nid < ksm_nr_node_ids; nid++) { node = rb_first(root_stable_tree + nid); while (node) { stable_node = rb_entry(node, struct ksm_stable_node, node); if (stable_node_chain_remove_range(stable_node, start_pfn, end_pfn, root_stable_tree + nid)) node = rb_first(root_stable_tree + nid); else node = rb_next(node); cond_resched(); } } list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) remove_node_from_stable_tree(stable_node); cond_resched(); } } static int ksm_memory_callback(struct notifier_block *self, unsigned long action, void *arg) { struct memory_notify *mn = arg; switch (action) { case MEM_GOING_OFFLINE: /* * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items() * and remove_all_stable_nodes() while memory is going offline: * it is unsafe for them to touch the stable tree at this time. * But unmerge_ksm_pages(), rmap lookups and other entry points * which do not need the ksm_thread_mutex are all safe. */ mutex_lock(&ksm_thread_mutex); ksm_run |= KSM_RUN_OFFLINE; mutex_unlock(&ksm_thread_mutex); break; case MEM_OFFLINE: /* * Most of the work is done by page migration; but there might * be a few stable_nodes left over, still pointing to struct * pages which have been offlined: prune those from the tree, * otherwise ksm_get_folio() might later try to access a * non-existent struct page. */ ksm_check_stable_tree(mn->start_pfn, mn->start_pfn + mn->nr_pages); fallthrough; case MEM_CANCEL_OFFLINE: mutex_lock(&ksm_thread_mutex); ksm_run &= ~KSM_RUN_OFFLINE; mutex_unlock(&ksm_thread_mutex); smp_mb(); /* wake_up_bit advises this */ wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE)); break; } return NOTIFY_OK; } #else static void wait_while_offlining(void) { } #endif /* CONFIG_MEMORY_HOTREMOVE */ #ifdef CONFIG_PROC_FS long ksm_process_profit(struct mm_struct *mm) { return (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm)) * PAGE_SIZE - mm->ksm_rmap_items * sizeof(struct ksm_rmap_item); } #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SYSFS /* * This all compiles without CONFIG_SYSFS, but is a waste of space. */ #define KSM_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) #define KSM_ATTR(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RW(_name) static ssize_t sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs); } static ssize_t sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int msecs; int err; err = kstrtouint(buf, 10, &msecs); if (err) return -EINVAL; ksm_thread_sleep_millisecs = msecs; wake_up_interruptible(&ksm_iter_wait); return count; } KSM_ATTR(sleep_millisecs); static ssize_t pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan); } static ssize_t pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int nr_pages; int err; if (ksm_advisor != KSM_ADVISOR_NONE) return -EINVAL; err = kstrtouint(buf, 10, &nr_pages); if (err) return -EINVAL; ksm_thread_pages_to_scan = nr_pages; return count; } KSM_ATTR(pages_to_scan); static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_run); } static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int flags; int err; err = kstrtouint(buf, 10, &flags); if (err) return -EINVAL; if (flags > KSM_RUN_UNMERGE) return -EINVAL; /* * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, * breaking COW to free the pages_shared (but leaves mm_slots * on the list for when ksmd may be set running again). */ mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksm_run != flags) { ksm_run = flags; if (flags & KSM_RUN_UNMERGE) { set_current_oom_origin(); err = unmerge_and_remove_all_rmap_items(); clear_current_oom_origin(); if (err) { ksm_run = KSM_RUN_STOP; count = err; } } } mutex_unlock(&ksm_thread_mutex); if (flags & KSM_RUN_MERGE) wake_up_interruptible(&ksm_thread_wait); return count; } KSM_ATTR(run); #ifdef CONFIG_NUMA static ssize_t merge_across_nodes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes); } static ssize_t merge_across_nodes_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long knob; err = kstrtoul(buf, 10, &knob); if (err) return err; if (knob > 1) return -EINVAL; mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksm_merge_across_nodes != knob) { if (ksm_pages_shared || remove_all_stable_nodes()) err = -EBUSY; else if (root_stable_tree == one_stable_tree) { struct rb_root *buf; /* * This is the first time that we switch away from the * default of merging across nodes: must now allocate * a buffer to hold as many roots as may be needed. * Allocate stable and unstable together: * MAXSMP NODES_SHIFT 10 will use 16kB. */ buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf), GFP_KERNEL); /* Let us assume that RB_ROOT is NULL is zero */ if (!buf) err = -ENOMEM; else { root_stable_tree = buf; root_unstable_tree = buf + nr_node_ids; /* Stable tree is empty but not the unstable */ root_unstable_tree[0] = one_unstable_tree[0]; } } if (!err) { ksm_merge_across_nodes = knob; ksm_nr_node_ids = knob ? 1 : nr_node_ids; } } mutex_unlock(&ksm_thread_mutex); return err ? err : count; } KSM_ATTR(merge_across_nodes); #endif static ssize_t use_zero_pages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_use_zero_pages); } static ssize_t use_zero_pages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; bool value; err = kstrtobool(buf, &value); if (err) return -EINVAL; ksm_use_zero_pages = value; return count; } KSM_ATTR(use_zero_pages); static ssize_t max_page_sharing_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_max_page_sharing); } static ssize_t max_page_sharing_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; int knob; err = kstrtoint(buf, 10, &knob); if (err) return err; /* * When a KSM page is created it is shared by 2 mappings. This * being a signed comparison, it implicitly verifies it's not * negative. */ if (knob < 2) return -EINVAL; if (READ_ONCE(ksm_max_page_sharing) == knob) return count; mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksm_max_page_sharing != knob) { if (ksm_pages_shared || remove_all_stable_nodes()) err = -EBUSY; else ksm_max_page_sharing = knob; } mutex_unlock(&ksm_thread_mutex); return err ? err : count; } KSM_ATTR(max_page_sharing); static ssize_t pages_scanned_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_scanned); } KSM_ATTR_RO(pages_scanned); static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_shared); } KSM_ATTR_RO(pages_shared); static ssize_t pages_sharing_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_sharing); } KSM_ATTR_RO(pages_sharing); static ssize_t pages_unshared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_unshared); } KSM_ATTR_RO(pages_unshared); static ssize_t pages_volatile_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { long ksm_pages_volatile; ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared - ksm_pages_sharing - ksm_pages_unshared; /* * It was not worth any locking to calculate that statistic, * but it might therefore sometimes be negative: conceal that. */ if (ksm_pages_volatile < 0) ksm_pages_volatile = 0; return sysfs_emit(buf, "%ld\n", ksm_pages_volatile); } KSM_ATTR_RO(pages_volatile); static ssize_t pages_skipped_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_skipped); } KSM_ATTR_RO(pages_skipped); static ssize_t ksm_zero_pages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%ld\n", atomic_long_read(&ksm_zero_pages)); } KSM_ATTR_RO(ksm_zero_pages); static ssize_t general_profit_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { long general_profit; general_profit = (ksm_pages_sharing + atomic_long_read(&ksm_zero_pages)) * PAGE_SIZE - ksm_rmap_items * sizeof(struct ksm_rmap_item); return sysfs_emit(buf, "%ld\n", general_profit); } KSM_ATTR_RO(general_profit); static ssize_t stable_node_dups_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups); } KSM_ATTR_RO(stable_node_dups); static ssize_t stable_node_chains_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains); } KSM_ATTR_RO(stable_node_chains); static ssize_t stable_node_chains_prune_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs); } static ssize_t stable_node_chains_prune_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int msecs; int err; err = kstrtouint(buf, 10, &msecs); if (err) return -EINVAL; ksm_stable_node_chains_prune_millisecs = msecs; return count; } KSM_ATTR(stable_node_chains_prune_millisecs); static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr); } KSM_ATTR_RO(full_scans); static ssize_t smart_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_smart_scan); } static ssize_t smart_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; bool value; err = kstrtobool(buf, &value); if (err) return -EINVAL; ksm_smart_scan = value; return count; } KSM_ATTR(smart_scan); static ssize_t advisor_mode_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { const char *output; if (ksm_advisor == KSM_ADVISOR_NONE) output = "[none] scan-time"; else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) output = "none [scan-time]"; return sysfs_emit(buf, "%s\n", output); } static ssize_t advisor_mode_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { enum ksm_advisor_type curr_advisor = ksm_advisor; if (sysfs_streq("scan-time", buf)) ksm_advisor = KSM_ADVISOR_SCAN_TIME; else if (sysfs_streq("none", buf)) ksm_advisor = KSM_ADVISOR_NONE; else return -EINVAL; /* Set advisor default values */ if (curr_advisor != ksm_advisor) set_advisor_defaults(); return count; } KSM_ATTR(advisor_mode); static ssize_t advisor_max_cpu_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_advisor_max_cpu); } static ssize_t advisor_max_cpu_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; ksm_advisor_max_cpu = value; return count; } KSM_ATTR(advisor_max_cpu); static ssize_t advisor_min_pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_advisor_min_pages_to_scan); } static ssize_t advisor_min_pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; ksm_advisor_min_pages_to_scan = value; return count; } KSM_ATTR(advisor_min_pages_to_scan); static ssize_t advisor_max_pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_advisor_max_pages_to_scan); } static ssize_t advisor_max_pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; ksm_advisor_max_pages_to_scan = value; return count; } KSM_ATTR(advisor_max_pages_to_scan); static ssize_t advisor_target_scan_time_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_advisor_target_scan_time); } static ssize_t advisor_target_scan_time_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; if (value < 1) return -EINVAL; ksm_advisor_target_scan_time = value; return count; } KSM_ATTR(advisor_target_scan_time); static struct attribute *ksm_attrs[] = { &sleep_millisecs_attr.attr, &pages_to_scan_attr.attr, &run_attr.attr, &pages_scanned_attr.attr, &pages_shared_attr.attr, &pages_sharing_attr.attr, &pages_unshared_attr.attr, &pages_volatile_attr.attr, &pages_skipped_attr.attr, &ksm_zero_pages_attr.attr, &full_scans_attr.attr, #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, #endif &max_page_sharing_attr.attr, &stable_node_chains_attr.attr, &stable_node_dups_attr.attr, &stable_node_chains_prune_millisecs_attr.attr, &use_zero_pages_attr.attr, &general_profit_attr.attr, &smart_scan_attr.attr, &advisor_mode_attr.attr, &advisor_max_cpu_attr.attr, &advisor_min_pages_to_scan_attr.attr, &advisor_max_pages_to_scan_attr.attr, &advisor_target_scan_time_attr.attr, NULL, }; static const struct attribute_group ksm_attr_group = { .attrs = ksm_attrs, .name = "ksm", }; #endif /* CONFIG_SYSFS */ static int __init ksm_init(void) { struct task_struct *ksm_thread; int err; /* The correct value depends on page size and endianness */ zero_checksum = calc_checksum(ZERO_PAGE(0)); /* Default to false for backwards compatibility */ ksm_use_zero_pages = false; err = ksm_slab_init(); if (err) goto out; ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); if (IS_ERR(ksm_thread)) { pr_err("ksm: creating kthread failed\n"); err = PTR_ERR(ksm_thread); goto out_free; } #ifdef CONFIG_SYSFS err = sysfs_create_group(mm_kobj, &ksm_attr_group); if (err) { pr_err("ksm: register sysfs failed\n"); kthread_stop(ksm_thread); goto out_free; } #else ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ #endif /* CONFIG_SYSFS */ #ifdef CONFIG_MEMORY_HOTREMOVE /* There is no significance to this priority 100 */ hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI); #endif return 0; out_free: ksm_slab_free(); out: return err; } subsys_initcall(ksm_init);
3 3 3 8 3 7 8 7 9 9 9 9 8 8 3 3 3 3 3 3 3 3 2 9 10 9 1 10 10 1 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. */ #include <linux/list_sort.h> #include <linux/libnvdimm.h> #include <linux/module.h> #include <linux/nospec.h> #include <linux/mutex.h> #include <linux/ndctl.h> #include <linux/sysfs.h> #include <linux/delay.h> #include <linux/list.h> #include <linux/acpi.h> #include <linux/sort.h> #include <linux/io.h> #include <linux/nd.h> #include <asm/cacheflush.h> #include <acpi/nfit.h> #include "intel.h" #include "nfit.h" /* * For readq() and writeq() on 32-bit builds, the hi-lo, lo-hi order is * irrelevant. */ #include <linux/io-64-nonatomic-hi-lo.h> static bool force_enable_dimms; module_param(force_enable_dimms, bool, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(force_enable_dimms, "Ignore _STA (ACPI DIMM device) status"); static bool disable_vendor_specific; module_param(disable_vendor_specific, bool, S_IRUGO); MODULE_PARM_DESC(disable_vendor_specific, "Limit commands to the publicly specified set"); static unsigned long override_dsm_mask; module_param(override_dsm_mask, ulong, S_IRUGO); MODULE_PARM_DESC(override_dsm_mask, "Bitmask of allowed NVDIMM DSM functions"); static int default_dsm_family = -1; module_param(default_dsm_family, int, S_IRUGO); MODULE_PARM_DESC(default_dsm_family, "Try this DSM type first when identifying NVDIMM family"); static bool no_init_ars; module_param(no_init_ars, bool, 0644); MODULE_PARM_DESC(no_init_ars, "Skip ARS run at nfit init time"); static bool force_labels; module_param(force_labels, bool, 0444); MODULE_PARM_DESC(force_labels, "Opt-in to labels despite missing methods"); LIST_HEAD(acpi_descs); DEFINE_MUTEX(acpi_desc_lock); static struct workqueue_struct *nfit_wq; struct nfit_table_prev { struct list_head spas; struct list_head memdevs; struct list_head dcrs; struct list_head bdws; struct list_head idts; struct list_head flushes; }; static guid_t nfit_uuid[NFIT_UUID_MAX]; const guid_t *to_nfit_uuid(enum nfit_uuids id) { return &nfit_uuid[id]; } EXPORT_SYMBOL(to_nfit_uuid); static const guid_t *to_nfit_bus_uuid(int family) { if (WARN_ONCE(family == NVDIMM_BUS_FAMILY_NFIT, "only secondary bus families can be translated\n")) return NULL; /* * The index of bus UUIDs starts immediately following the last * NVDIMM/leaf family. */ return to_nfit_uuid(family + NVDIMM_FAMILY_MAX); } static struct acpi_device *to_acpi_dev(struct acpi_nfit_desc *acpi_desc) { struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; /* * If provider == 'ACPI.NFIT' we can assume 'dev' is a struct * acpi_device. */ if (!nd_desc->provider_name || strcmp(nd_desc->provider_name, "ACPI.NFIT") != 0) return NULL; return to_acpi_device(acpi_desc->dev); } static int xlat_bus_status(void *buf, unsigned int cmd, u32 status) { struct nd_cmd_clear_error *clear_err; struct nd_cmd_ars_status *ars_status; u16 flags; switch (cmd) { case ND_CMD_ARS_CAP: if ((status & 0xffff) == NFIT_ARS_CAP_NONE) return -ENOTTY; /* Command failed */ if (status & 0xffff) return -EIO; /* No supported scan types for this range */ flags = ND_ARS_PERSISTENT | ND_ARS_VOLATILE; if ((status >> 16 & flags) == 0) return -ENOTTY; return 0; case ND_CMD_ARS_START: /* ARS is in progress */ if ((status & 0xffff) == NFIT_ARS_START_BUSY) return -EBUSY; /* Command failed */ if (status & 0xffff) return -EIO; return 0; case ND_CMD_ARS_STATUS: ars_status = buf; /* Command failed */ if (status & 0xffff) return -EIO; /* Check extended status (Upper two bytes) */ if (status == NFIT_ARS_STATUS_DONE) return 0; /* ARS is in progress */ if (status == NFIT_ARS_STATUS_BUSY) return -EBUSY; /* No ARS performed for the current boot */ if (status == NFIT_ARS_STATUS_NONE) return -EAGAIN; /* * ARS interrupted, either we overflowed or some other * agent wants the scan to stop. If we didn't overflow * then just continue with the returned results. */ if (status == NFIT_ARS_STATUS_INTR) { if (ars_status->out_length >= 40 && (ars_status->flags & NFIT_ARS_F_OVERFLOW)) return -ENOSPC; return 0; } /* Unknown status */ if (status >> 16) return -EIO; return 0; case ND_CMD_CLEAR_ERROR: clear_err = buf; if (status & 0xffff) return -EIO; if (!clear_err->cleared) return -EIO; if (clear_err->length > clear_err->cleared) return clear_err->cleared; return 0; default: break; } /* all other non-zero status results in an error */ if (status) return -EIO; return 0; } #define ACPI_LABELS_LOCKED 3 static int xlat_nvdimm_status(struct nvdimm *nvdimm, void *buf, unsigned int cmd, u32 status) { struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); switch (cmd) { case ND_CMD_GET_CONFIG_SIZE: /* * In the _LSI, _LSR, _LSW case the locked status is * communicated via the read/write commands */ if (test_bit(NFIT_MEM_LSR, &nfit_mem->flags)) break; if (status >> 16 & ND_CONFIG_LOCKED) return -EACCES; break; case ND_CMD_GET_CONFIG_DATA: if (test_bit(NFIT_MEM_LSR, &nfit_mem->flags) && status == ACPI_LABELS_LOCKED) return -EACCES; break; case ND_CMD_SET_CONFIG_DATA: if (test_bit(NFIT_MEM_LSW, &nfit_mem->flags) && status == ACPI_LABELS_LOCKED) return -EACCES; break; default: break; } /* all other non-zero status results in an error */ if (status) return -EIO; return 0; } static int xlat_status(struct nvdimm *nvdimm, void *buf, unsigned int cmd, u32 status) { if (!nvdimm) return xlat_bus_status(buf, cmd, status); return xlat_nvdimm_status(nvdimm, buf, cmd, status); } /* convert _LS{I,R} packages to the buffer object acpi_nfit_ctl expects */ static union acpi_object *pkg_to_buf(union acpi_object *pkg) { int i; void *dst; size_t size = 0; union acpi_object *buf = NULL; if (pkg->type != ACPI_TYPE_PACKAGE) { WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n", pkg->type); goto err; } for (i = 0; i < pkg->package.count; i++) { union acpi_object *obj = &pkg->package.elements[i]; if (obj->type == ACPI_TYPE_INTEGER) size += 4; else if (obj->type == ACPI_TYPE_BUFFER) size += obj->buffer.length; else { WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n", obj->type); goto err; } } buf = ACPI_ALLOCATE(sizeof(*buf) + size); if (!buf) goto err; dst = buf + 1; buf->type = ACPI_TYPE_BUFFER; buf->buffer.length = size; buf->buffer.pointer = dst; for (i = 0; i < pkg->package.count; i++) { union acpi_object *obj = &pkg->package.elements[i]; if (obj->type == ACPI_TYPE_INTEGER) { memcpy(dst, &obj->integer.value, 4); dst += 4; } else if (obj->type == ACPI_TYPE_BUFFER) { memcpy(dst, obj->buffer.pointer, obj->buffer.length); dst += obj->buffer.length; } } err: ACPI_FREE(pkg); return buf; } static union acpi_object *int_to_buf(union acpi_object *integer) { union acpi_object *buf = NULL; void *dst = NULL; if (integer->type != ACPI_TYPE_INTEGER) { WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n", integer->type); goto err; } buf = ACPI_ALLOCATE(sizeof(*buf) + 4); if (!buf) goto err; dst = buf + 1; buf->type = ACPI_TYPE_BUFFER; buf->buffer.length = 4; buf->buffer.pointer = dst; memcpy(dst, &integer->integer.value, 4); err: ACPI_FREE(integer); return buf; } static union acpi_object *acpi_label_write(acpi_handle handle, u32 offset, u32 len, void *data) { acpi_status rc; struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; struct acpi_object_list input = { .count = 3, .pointer = (union acpi_object []) { [0] = { .integer.type = ACPI_TYPE_INTEGER, .integer.value = offset, }, [1] = { .integer.type = ACPI_TYPE_INTEGER, .integer.value = len, }, [2] = { .buffer.type = ACPI_TYPE_BUFFER, .buffer.pointer = data, .buffer.length = len, }, }, }; rc = acpi_evaluate_object(handle, "_LSW", &input, &buf); if (ACPI_FAILURE(rc)) return NULL; return int_to_buf(buf.pointer); } static union acpi_object *acpi_label_read(acpi_handle handle, u32 offset, u32 len) { acpi_status rc; struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; struct acpi_object_list input = { .count = 2, .pointer = (union acpi_object []) { [0] = { .integer.type = ACPI_TYPE_INTEGER, .integer.value = offset, }, [1] = { .integer.type = ACPI_TYPE_INTEGER, .integer.value = len, }, }, }; rc = acpi_evaluate_object(handle, "_LSR", &input, &buf); if (ACPI_FAILURE(rc)) return NULL; return pkg_to_buf(buf.pointer); } static union acpi_object *acpi_label_info(acpi_handle handle) { acpi_status rc; struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; rc = acpi_evaluate_object(handle, "_LSI", NULL, &buf); if (ACPI_FAILURE(rc)) return NULL; return pkg_to_buf(buf.pointer); } static u8 nfit_dsm_revid(unsigned family, unsigned func) { static const u8 revid_table[NVDIMM_FAMILY_MAX+1][NVDIMM_CMD_MAX+1] = { [NVDIMM_FAMILY_INTEL] = { [NVDIMM_INTEL_GET_MODES ... NVDIMM_INTEL_FW_ACTIVATE_ARM] = 2, }, }; u8 id; if (family > NVDIMM_FAMILY_MAX) return 0; if (func > NVDIMM_CMD_MAX) return 0; id = revid_table[family][func]; if (id == 0) return 1; /* default */ return id; } static bool payload_dumpable(struct nvdimm *nvdimm, unsigned int func) { struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); if (nfit_mem && nfit_mem->family == NVDIMM_FAMILY_INTEL && func >= NVDIMM_INTEL_GET_SECURITY_STATE && func <= NVDIMM_INTEL_MASTER_SECURE_ERASE) return IS_ENABLED(CONFIG_NFIT_SECURITY_DEBUG); return true; } static int cmd_to_func(struct nfit_mem *nfit_mem, unsigned int cmd, struct nd_cmd_pkg *call_pkg, int *family) { if (call_pkg) { int i; if (nfit_mem && nfit_mem->family != call_pkg->nd_family) return -ENOTTY; for (i = 0; i < ARRAY_SIZE(call_pkg->nd_reserved2); i++) if (call_pkg->nd_reserved2[i]) return -EINVAL; *family = call_pkg->nd_family; return call_pkg->nd_command; } /* In the !call_pkg case, bus commands == bus functions */ if (!nfit_mem) return cmd; /* Linux ND commands == NVDIMM_FAMILY_INTEL function numbers */ if (nfit_mem->family == NVDIMM_FAMILY_INTEL) return cmd; /* * Force function number validation to fail since 0 is never * published as a valid function in dsm_mask. */ return 0; } int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf, unsigned int buf_len, int *cmd_rc) { struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); union acpi_object in_obj, in_buf, *out_obj; const struct nd_cmd_desc *desc = NULL; struct device *dev = acpi_desc->dev; struct nd_cmd_pkg *call_pkg = NULL; const char *cmd_name, *dimm_name; unsigned long cmd_mask, dsm_mask; u32 offset, fw_status = 0; acpi_handle handle; const guid_t *guid; int func, rc, i; int family = 0; if (cmd_rc) *cmd_rc = -EINVAL; if (cmd == ND_CMD_CALL) { if (!buf || buf_len < sizeof(*call_pkg)) return -EINVAL; call_pkg = buf; } func = cmd_to_func(nfit_mem, cmd, call_pkg, &family); if (func < 0) return func; if (nvdimm) { struct acpi_device *adev = nfit_mem->adev; if (!adev) return -ENOTTY; dimm_name = nvdimm_name(nvdimm); cmd_name = nvdimm_cmd_name(cmd); cmd_mask = nvdimm_cmd_mask(nvdimm); dsm_mask = nfit_mem->dsm_mask; desc = nd_cmd_dimm_desc(cmd); guid = to_nfit_uuid(nfit_mem->family); handle = adev->handle; } else { struct acpi_device *adev = to_acpi_dev(acpi_desc); cmd_name = nvdimm_bus_cmd_name(cmd); cmd_mask = nd_desc->cmd_mask; if (cmd == ND_CMD_CALL && call_pkg->nd_family) { family = call_pkg->nd_family; if (family > NVDIMM_BUS_FAMILY_MAX || !test_bit(family, &nd_desc->bus_family_mask)) return -EINVAL; family = array_index_nospec(family, NVDIMM_BUS_FAMILY_MAX + 1); dsm_mask = acpi_desc->family_dsm_mask[family]; guid = to_nfit_bus_uuid(family); } else { dsm_mask = acpi_desc->bus_dsm_mask; guid = to_nfit_uuid(NFIT_DEV_BUS); } desc = nd_cmd_bus_desc(cmd); handle = adev->handle; dimm_name = "bus"; } if (!desc || (cmd && (desc->out_num + desc->in_num == 0))) return -ENOTTY; /* * Check for a valid command. For ND_CMD_CALL, we also have to * make sure that the DSM function is supported. */ if (cmd == ND_CMD_CALL && (func > NVDIMM_CMD_MAX || !test_bit(func, &dsm_mask))) return -ENOTTY; else if (!test_bit(cmd, &cmd_mask)) return -ENOTTY; in_obj.type = ACPI_TYPE_PACKAGE; in_obj.package.count = 1; in_obj.package.elements = &in_buf; in_buf.type = ACPI_TYPE_BUFFER; in_buf.buffer.pointer = buf; in_buf.buffer.length = 0; /* libnvdimm has already validated the input envelope */ for (i = 0; i < desc->in_num; i++) in_buf.buffer.length += nd_cmd_in_size(nvdimm, cmd, desc, i, buf); if (call_pkg) { /* skip over package wrapper */ in_buf.buffer.pointer = (void *) &call_pkg->nd_payload; in_buf.buffer.length = call_pkg->nd_size_in; } dev_dbg(dev, "%s cmd: %d: family: %d func: %d input length: %d\n", dimm_name, cmd, family, func, in_buf.buffer.length); if (payload_dumpable(nvdimm, func)) print_hex_dump_debug("nvdimm in ", DUMP_PREFIX_OFFSET, 4, 4, in_buf.buffer.pointer, min_t(u32, 256, in_buf.buffer.length), true); /* call the BIOS, prefer the named methods over _DSM if available */ if (nvdimm && cmd == ND_CMD_GET_CONFIG_SIZE && test_bit(NFIT_MEM_LSR, &nfit_mem->flags)) out_obj = acpi_label_info(handle); else if (nvdimm && cmd == ND_CMD_GET_CONFIG_DATA && test_bit(NFIT_MEM_LSR, &nfit_mem->flags)) { struct nd_cmd_get_config_data_hdr *p = buf; out_obj = acpi_label_read(handle, p->in_offset, p->in_length); } else if (nvdimm && cmd == ND_CMD_SET_CONFIG_DATA && test_bit(NFIT_MEM_LSW, &nfit_mem->flags)) { struct nd_cmd_set_config_hdr *p = buf; out_obj = acpi_label_write(handle, p->in_offset, p->in_length, p->in_buf); } else { u8 revid; if (nvdimm) revid = nfit_dsm_revid(nfit_mem->family, func); else revid = 1; out_obj = acpi_evaluate_dsm(handle, guid, revid, func, &in_obj); } if (!out_obj) { dev_dbg(dev, "%s _DSM failed cmd: %s\n", dimm_name, cmd_name); return -EINVAL; } if (out_obj->type != ACPI_TYPE_BUFFER) { dev_dbg(dev, "%s unexpected output object type cmd: %s type: %d\n", dimm_name, cmd_name, out_obj->type); rc = -EINVAL; goto out; } dev_dbg(dev, "%s cmd: %s output length: %d\n", dimm_name, cmd_name, out_obj->buffer.length); print_hex_dump_debug(cmd_name, DUMP_PREFIX_OFFSET, 4, 4, out_obj->buffer.pointer, min_t(u32, 128, out_obj->buffer.length), true); if (call_pkg) { call_pkg->nd_fw_size = out_obj->buffer.length; memcpy(call_pkg->nd_payload + call_pkg->nd_size_in, out_obj->buffer.pointer, min(call_pkg->nd_fw_size, call_pkg->nd_size_out)); ACPI_FREE(out_obj); /* * Need to support FW function w/o known size in advance. * Caller can determine required size based upon nd_fw_size. * If we return an error (like elsewhere) then caller wouldn't * be able to rely upon data returned to make calculation. */ if (cmd_rc) *cmd_rc = 0; return 0; } for (i = 0, offset = 0; i < desc->out_num; i++) { u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i, buf, (u32 *) out_obj->buffer.pointer, out_obj->buffer.length - offset); if (offset + out_size > out_obj->buffer.length) { dev_dbg(dev, "%s output object underflow cmd: %s field: %d\n", dimm_name, cmd_name, i); break; } if (in_buf.buffer.length + offset + out_size > buf_len) { dev_dbg(dev, "%s output overrun cmd: %s field: %d\n", dimm_name, cmd_name, i); rc = -ENXIO; goto out; } memcpy(buf + in_buf.buffer.length + offset, out_obj->buffer.pointer + offset, out_size); offset += out_size; } /* * Set fw_status for all the commands with a known format to be * later interpreted by xlat_status(). */ if (i >= 1 && ((!nvdimm && cmd >= ND_CMD_ARS_CAP && cmd <= ND_CMD_CLEAR_ERROR) || (nvdimm && cmd >= ND_CMD_SMART && cmd <= ND_CMD_VENDOR))) fw_status = *(u32 *) out_obj->buffer.pointer; if (offset + in_buf.buffer.length < buf_len) { if (i >= 1) { /* * status valid, return the number of bytes left * unfilled in the output buffer */ rc = buf_len - offset - in_buf.buffer.length; if (cmd_rc) *cmd_rc = xlat_status(nvdimm, buf, cmd, fw_status); } else { dev_err(dev, "%s:%s underrun cmd: %s buf_len: %d out_len: %d\n", __func__, dimm_name, cmd_name, buf_len, offset); rc = -ENXIO; } } else { rc = 0; if (cmd_rc) *cmd_rc = xlat_status(nvdimm, buf, cmd, fw_status); } out: ACPI_FREE(out_obj); return rc; } EXPORT_SYMBOL_GPL(acpi_nfit_ctl); static const char *spa_type_name(u16 type) { static const char *to_name[] = { [NFIT_SPA_VOLATILE] = "volatile", [NFIT_SPA_PM] = "pmem", [NFIT_SPA_DCR] = "dimm-control-region", [NFIT_SPA_BDW] = "block-data-window", [NFIT_SPA_VDISK] = "volatile-disk", [NFIT_SPA_VCD] = "volatile-cd", [NFIT_SPA_PDISK] = "persistent-disk", [NFIT_SPA_PCD] = "persistent-cd", }; if (type > NFIT_SPA_PCD) return "unknown"; return to_name[type]; } int nfit_spa_type(struct acpi_nfit_system_address *spa) { guid_t guid; int i; import_guid(&guid, spa->range_guid); for (i = 0; i < NFIT_UUID_MAX; i++) if (guid_equal(to_nfit_uuid(i), &guid)) return i; return -1; } static size_t sizeof_spa(struct acpi_nfit_system_address *spa) { if (spa->flags & ACPI_NFIT_LOCATION_COOKIE_VALID) return sizeof(*spa); return sizeof(*spa) - 8; } static bool add_spa(struct acpi_nfit_desc *acpi_desc, struct nfit_table_prev *prev, struct acpi_nfit_system_address *spa) { struct device *dev = acpi_desc->dev; struct nfit_spa *nfit_spa; if (spa->header.length != sizeof_spa(spa)) return false; list_for_each_entry(nfit_spa, &prev->spas, list) { if (memcmp(nfit_spa->spa, spa, sizeof_spa(spa)) == 0) { list_move_tail(&nfit_spa->list, &acpi_desc->spas); return true; } } nfit_spa = devm_kzalloc(dev, sizeof(*nfit_spa) + sizeof_spa(spa), GFP_KERNEL); if (!nfit_spa) return false; INIT_LIST_HEAD(&nfit_spa->list); memcpy(nfit_spa->spa, spa, sizeof_spa(spa)); list_add_tail(&nfit_spa->list, &acpi_desc->spas); dev_dbg(dev, "spa index: %d type: %s\n", spa->range_index, spa_type_name(nfit_spa_type(spa))); return true; } static bool add_memdev(struct acpi_nfit_desc *acpi_desc, struct nfit_table_prev *prev, struct acpi_nfit_memory_map *memdev) { struct device *dev = acpi_desc->dev; struct nfit_memdev *nfit_memdev; if (memdev->header.length != sizeof(*memdev)) return false; list_for_each_entry(nfit_memdev, &prev->memdevs, list) if (memcmp(nfit_memdev->memdev, memdev, sizeof(*memdev)) == 0) { list_move_tail(&nfit_memdev->list, &acpi_desc->memdevs); return true; } nfit_memdev = devm_kzalloc(dev, sizeof(*nfit_memdev) + sizeof(*memdev), GFP_KERNEL); if (!nfit_memdev) return false; INIT_LIST_HEAD(&nfit_memdev->list); memcpy(nfit_memdev->memdev, memdev, sizeof(*memdev)); list_add_tail(&nfit_memdev->list, &acpi_desc->memdevs); dev_dbg(dev, "memdev handle: %#x spa: %d dcr: %d flags: %#x\n", memdev->device_handle, memdev->range_index, memdev->region_index, memdev->flags); return true; } int nfit_get_smbios_id(u32 device_handle, u16 *flags) { struct acpi_nfit_memory_map *memdev; struct acpi_nfit_desc *acpi_desc; struct nfit_mem *nfit_mem; u16 physical_id; mutex_lock(&acpi_desc_lock); list_for_each_entry(acpi_desc, &acpi_descs, list) { mutex_lock(&acpi_desc->init_mutex); list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { memdev = __to_nfit_memdev(nfit_mem); if (memdev->device_handle == device_handle) { *flags = memdev->flags; physical_id = memdev->physical_id; mutex_unlock(&acpi_desc->init_mutex); mutex_unlock(&acpi_desc_lock); return physical_id; } } mutex_unlock(&acpi_desc->init_mutex); } mutex_unlock(&acpi_desc_lock); return -ENODEV; } EXPORT_SYMBOL_GPL(nfit_get_smbios_id); /* * An implementation may provide a truncated control region if no block windows * are defined. */ static size_t sizeof_dcr(struct acpi_nfit_control_region *dcr) { if (dcr->header.length < offsetof(struct acpi_nfit_control_region, window_size)) return 0; if (dcr->windows) return sizeof(*dcr); return offsetof(struct acpi_nfit_control_region, window_size); } static bool add_dcr(struct acpi_nfit_desc *acpi_desc, struct nfit_table_prev *prev, struct acpi_nfit_control_region *dcr) { struct device *dev = acpi_desc->dev; struct nfit_dcr *nfit_dcr; if (!sizeof_dcr(dcr)) return false; list_for_each_entry(nfit_dcr, &prev->dcrs, list) if (memcmp(nfit_dcr->dcr, dcr, sizeof_dcr(dcr)) == 0) { list_move_tail(&nfit_dcr->list, &acpi_desc->dcrs); return true; } nfit_dcr = devm_kzalloc(dev, sizeof(*nfit_dcr) + sizeof(*dcr), GFP_KERNEL); if (!nfit_dcr) return false; INIT_LIST_HEAD(&nfit_dcr->list); memcpy(nfit_dcr->dcr, dcr, sizeof_dcr(dcr)); list_add_tail(&nfit_dcr->list, &acpi_desc->dcrs); dev_dbg(dev, "dcr index: %d windows: %d\n", dcr->region_index, dcr->windows); return true; } static bool add_bdw(struct acpi_nfit_desc *acpi_desc, struct nfit_table_prev *prev, struct acpi_nfit_data_region *bdw) { struct device *dev = acpi_desc->dev; struct nfit_bdw *nfit_bdw; if (bdw->header.length != sizeof(*bdw)) return false; list_for_each_entry(nfit_bdw, &prev->bdws, list) if (memcmp(nfit_bdw->bdw, bdw, sizeof(*bdw)) == 0) { list_move_tail(&nfit_bdw->list, &acpi_desc->bdws); return true; } nfit_bdw = devm_kzalloc(dev, sizeof(*nfit_bdw) + sizeof(*bdw), GFP_KERNEL); if (!nfit_bdw) return false; INIT_LIST_HEAD(&nfit_bdw->list); memcpy(nfit_bdw->bdw, bdw, sizeof(*bdw)); list_add_tail(&nfit_bdw->list, &acpi_desc->bdws); dev_dbg(dev, "bdw dcr: %d windows: %d\n", bdw->region_index, bdw->windows); return true; } static size_t sizeof_idt(struct acpi_nfit_interleave *idt) { if (idt->header.length < sizeof(*idt)) return 0; return sizeof(*idt) + sizeof(u32) * idt->line_count; } static bool add_idt(struct acpi_nfit_desc *acpi_desc, struct nfit_table_prev *prev, struct acpi_nfit_interleave *idt) { struct device *dev = acpi_desc->dev; struct nfit_idt *nfit_idt; if (!sizeof_idt(idt)) return false; list_for_each_entry(nfit_idt, &prev->idts, list) { if (sizeof_idt(nfit_idt->idt) != sizeof_idt(idt)) continue; if (memcmp(nfit_idt->idt, idt, sizeof_idt(idt)) == 0) { list_move_tail(&nfit_idt->list, &acpi_desc->idts); return true; } } nfit_idt = devm_kzalloc(dev, sizeof(*nfit_idt) + sizeof_idt(idt), GFP_KERNEL); if (!nfit_idt) return false; INIT_LIST_HEAD(&nfit_idt->list); memcpy(nfit_idt->idt, idt, sizeof_idt(idt)); list_add_tail(&nfit_idt->list, &acpi_desc->idts); dev_dbg(dev, "idt index: %d num_lines: %d\n", idt->interleave_index, idt->line_count); return true; } static size_t sizeof_flush(struct acpi_nfit_flush_address *flush) { if (flush->header.length < sizeof(*flush)) return 0; return struct_size(flush, hint_address, flush->hint_count); } static bool add_flush(struct acpi_nfit_desc *acpi_desc, struct nfit_table_prev *prev, struct acpi_nfit_flush_address *flush) { struct device *dev = acpi_desc->dev; struct nfit_flush *nfit_flush; if (!sizeof_flush(flush)) return false; list_for_each_entry(nfit_flush, &prev->flushes, list) { if (sizeof_flush(nfit_flush->flush) != sizeof_flush(flush)) continue; if (memcmp(nfit_flush->flush, flush, sizeof_flush(flush)) == 0) { list_move_tail(&nfit_flush->list, &acpi_desc->flushes); return true; } } nfit_flush = devm_kzalloc(dev, sizeof(*nfit_flush) + sizeof_flush(flush), GFP_KERNEL); if (!nfit_flush) return false; INIT_LIST_HEAD(&nfit_flush->list); memcpy(nfit_flush->flush, flush, sizeof_flush(flush)); list_add_tail(&nfit_flush->list, &acpi_desc->flushes); dev_dbg(dev, "nfit_flush handle: %d hint_count: %d\n", flush->device_handle, flush->hint_count); return true; } static bool add_platform_cap(struct acpi_nfit_desc *acpi_desc, struct acpi_nfit_capabilities *pcap) { struct device *dev = acpi_desc->dev; u32 mask; mask = (1 << (pcap->highest_capability + 1)) - 1; acpi_desc->platform_cap = pcap->capabilities & mask; dev_dbg(dev, "cap: %#x\n", acpi_desc->platform_cap); return true; } static void *add_table(struct acpi_nfit_desc *acpi_desc, struct nfit_table_prev *prev, void *table, const void *end) { struct device *dev = acpi_desc->dev; struct acpi_nfit_header *hdr; void *err = ERR_PTR(-ENOMEM); if (table >= end) return NULL; hdr = table; if (!hdr->length) { dev_warn(dev, "found a zero length table '%d' parsing nfit\n", hdr->type); return NULL; } switch (hdr->type) { case ACPI_NFIT_TYPE_SYSTEM_ADDRESS: if (!add_spa(acpi_desc, prev, table)) return err; break; case ACPI_NFIT_TYPE_MEMORY_MAP: if (!add_memdev(acpi_desc, prev, table)) return err; break; case ACPI_NFIT_TYPE_CONTROL_REGION: if (!add_dcr(acpi_desc, prev, table)) return err; break; case ACPI_NFIT_TYPE_DATA_REGION: if (!add_bdw(acpi_desc, prev, table)) return err; break; case ACPI_NFIT_TYPE_INTERLEAVE: if (!add_idt(acpi_desc, prev, table)) return err; break; case ACPI_NFIT_TYPE_FLUSH_ADDRESS: if (!add_flush(acpi_desc, prev, table)) return err; break; case ACPI_NFIT_TYPE_SMBIOS: dev_dbg(dev, "smbios\n"); break; case ACPI_NFIT_TYPE_CAPABILITIES: if (!add_platform_cap(acpi_desc, table)) return err; break; default: dev_err(dev, "unknown table '%d' parsing nfit\n", hdr->type); break; } return table + hdr->length; } static int __nfit_mem_init(struct acpi_nfit_desc *acpi_desc, struct acpi_nfit_system_address *spa) { struct nfit_mem *nfit_mem, *found; struct nfit_memdev *nfit_memdev; int type = spa ? nfit_spa_type(spa) : 0; switch (type) { case NFIT_SPA_DCR: case NFIT_SPA_PM: break; default: if (spa) return 0; } /* * This loop runs in two modes, when a dimm is mapped the loop * adds memdev associations to an existing dimm, or creates a * dimm. In the unmapped dimm case this loop sweeps for memdev * instances with an invalid / zero range_index and adds those * dimms without spa associations. */ list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { struct nfit_flush *nfit_flush; struct nfit_dcr *nfit_dcr; u32 device_handle; u16 dcr; if (spa && nfit_memdev->memdev->range_index != spa->range_index) continue; if (!spa && nfit_memdev->memdev->range_index) continue; found = NULL; dcr = nfit_memdev->memdev->region_index; device_handle = nfit_memdev->memdev->device_handle; list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) if (__to_nfit_memdev(nfit_mem)->device_handle == device_handle) { found = nfit_mem; break; } if (found) nfit_mem = found; else { nfit_mem = devm_kzalloc(acpi_desc->dev, sizeof(*nfit_mem), GFP_KERNEL); if (!nfit_mem) return -ENOMEM; INIT_LIST_HEAD(&nfit_mem->list); nfit_mem->acpi_desc = acpi_desc; list_add(&nfit_mem->list, &acpi_desc->dimms); } list_for_each_entry(nfit_dcr, &acpi_desc->dcrs, list) { if (nfit_dcr->dcr->region_index != dcr) continue; /* * Record the control region for the dimm. For * the ACPI 6.1 case, where there are separate * control regions for the pmem vs blk * interfaces, be sure to record the extended * blk details. */ if (!nfit_mem->dcr) nfit_mem->dcr = nfit_dcr->dcr; else if (nfit_mem->dcr->windows == 0 && nfit_dcr->dcr->windows) nfit_mem->dcr = nfit_dcr->dcr; break; } list_for_each_entry(nfit_flush, &acpi_desc->flushes, list) { struct acpi_nfit_flush_address *flush; u16 i; if (nfit_flush->flush->device_handle != device_handle) continue; nfit_mem->nfit_flush = nfit_flush; flush = nfit_flush->flush; nfit_mem->flush_wpq = devm_kcalloc(acpi_desc->dev, flush->hint_count, sizeof(struct resource), GFP_KERNEL); if (!nfit_mem->flush_wpq) return -ENOMEM; for (i = 0; i < flush->hint_count; i++) { struct resource *res = &nfit_mem->flush_wpq[i]; res->start = flush->hint_address[i]; res->end = res->start + 8 - 1; } break; } if (dcr && !nfit_mem->dcr) { dev_err(acpi_desc->dev, "SPA %d missing DCR %d\n", spa->range_index, dcr); return -ENODEV; } if (type == NFIT_SPA_DCR) { struct nfit_idt *nfit_idt; u16 idt_idx; /* multiple dimms may share a SPA when interleaved */ nfit_mem->spa_dcr = spa; nfit_mem->memdev_dcr = nfit_memdev->memdev; idt_idx = nfit_memdev->memdev->interleave_index; list_for_each_entry(nfit_idt, &acpi_desc->idts, list) { if (nfit_idt->idt->interleave_index != idt_idx) continue; nfit_mem->idt_dcr = nfit_idt->idt; break; } } else if (type == NFIT_SPA_PM) { /* * A single dimm may belong to multiple SPA-PM * ranges, record at least one in addition to * any SPA-DCR range. */ nfit_mem->memdev_pmem = nfit_memdev->memdev; } else nfit_mem->memdev_dcr = nfit_memdev->memdev; } return 0; } static int nfit_mem_cmp(void *priv, const struct list_head *_a, const struct list_head *_b) { struct nfit_mem *a = container_of(_a, typeof(*a), list); struct nfit_mem *b = container_of(_b, typeof(*b), list); u32 handleA, handleB; handleA = __to_nfit_memdev(a)->device_handle; handleB = __to_nfit_memdev(b)->device_handle; if (handleA < handleB) return -1; else if (handleA > handleB) return 1; return 0; } static int nfit_mem_init(struct acpi_nfit_desc *acpi_desc) { struct nfit_spa *nfit_spa; int rc; /* * For each SPA-DCR or SPA-PMEM address range find its * corresponding MEMDEV(s). From each MEMDEV find the * corresponding DCR. Then, if we're operating on a SPA-DCR, * try to find a SPA-BDW and a corresponding BDW that references * the DCR. Throw it all into an nfit_mem object. Note, that * BDWs are optional. */ list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { rc = __nfit_mem_init(acpi_desc, nfit_spa->spa); if (rc) return rc; } /* * If a DIMM has failed to be mapped into SPA there will be no * SPA entries above. Find and register all the unmapped DIMMs * for reporting and recovery purposes. */ rc = __nfit_mem_init(acpi_desc, NULL); if (rc) return rc; list_sort(NULL, &acpi_desc->dimms, nfit_mem_cmp); return 0; } static ssize_t bus_dsm_mask_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus); struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); return sysfs_emit(buf, "%#lx\n", acpi_desc->bus_dsm_mask); } static struct device_attribute dev_attr_bus_dsm_mask = __ATTR(dsm_mask, 0444, bus_dsm_mask_show, NULL); static ssize_t revision_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus); struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); return sysfs_emit(buf, "%d\n", acpi_desc->acpi_header.revision); } static DEVICE_ATTR_RO(revision); static ssize_t hw_error_scrub_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus); struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); return sysfs_emit(buf, "%d\n", acpi_desc->scrub_mode); } /* * The 'hw_error_scrub' attribute can have the following values written to it: * '0': Switch to the default mode where an exception will only insert * the address of the memory error into the poison and badblocks lists. * '1': Enable a full scrub to happen if an exception for a memory error is * received. */ static ssize_t hw_error_scrub_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { struct nvdimm_bus_descriptor *nd_desc; ssize_t rc; long val; rc = kstrtol(buf, 0, &val); if (rc) return rc; device_lock(dev); nd_desc = dev_get_drvdata(dev); if (nd_desc) { struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); switch (val) { case HW_ERROR_SCRUB_ON: acpi_desc->scrub_mode = HW_ERROR_SCRUB_ON; break; case HW_ERROR_SCRUB_OFF: acpi_desc->scrub_mode = HW_ERROR_SCRUB_OFF; break; default: rc = -EINVAL; break; } } device_unlock(dev); if (rc) return rc; return size; } static DEVICE_ATTR_RW(hw_error_scrub); /* * This shows the number of full Address Range Scrubs that have been * completed since driver load time. Userspace can wait on this using * select/poll etc. A '+' at the end indicates an ARS is in progress */ static ssize_t scrub_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvdimm_bus_descriptor *nd_desc; struct acpi_nfit_desc *acpi_desc; ssize_t rc = -ENXIO; bool busy; device_lock(dev); nd_desc = dev_get_drvdata(dev); if (!nd_desc) { device_unlock(dev); return rc; } acpi_desc = to_acpi_desc(nd_desc); mutex_lock(&acpi_desc->init_mutex); busy = test_bit(ARS_BUSY, &acpi_desc->scrub_flags) && !test_bit(ARS_CANCEL, &acpi_desc->scrub_flags); rc = sysfs_emit(buf, "%d%s", acpi_desc->scrub_count, busy ? "+\n" : "\n"); /* Allow an admin to poll the busy state at a higher rate */ if (busy && capable(CAP_SYS_RAWIO) && !test_and_set_bit(ARS_POLL, &acpi_desc->scrub_flags)) { acpi_desc->scrub_tmo = 1; mod_delayed_work(nfit_wq, &acpi_desc->dwork, HZ); } mutex_unlock(&acpi_desc->init_mutex); device_unlock(dev); return rc; } static ssize_t scrub_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { struct nvdimm_bus_descriptor *nd_desc; ssize_t rc; long val; rc = kstrtol(buf, 0, &val); if (rc) return rc; if (val != 1) return -EINVAL; device_lock(dev); nd_desc = dev_get_drvdata(dev); if (nd_desc) { struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); rc = acpi_nfit_ars_rescan(acpi_desc, ARS_REQ_LONG); } device_unlock(dev); if (rc) return rc; return size; } static DEVICE_ATTR_RW(scrub); static bool ars_supported(struct nvdimm_bus *nvdimm_bus) { struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus); const unsigned long mask = 1 << ND_CMD_ARS_CAP | 1 << ND_CMD_ARS_START | 1 << ND_CMD_ARS_STATUS; return (nd_desc->cmd_mask & mask) == mask; } static umode_t nfit_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = kobj_to_dev(kobj); struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); if (a == &dev_attr_scrub.attr) return ars_supported(nvdimm_bus) ? a->mode : 0; if (a == &dev_attr_firmware_activate_noidle.attr) return intel_fwa_supported(nvdimm_bus) ? a->mode : 0; return a->mode; } static struct attribute *acpi_nfit_attributes[] = { &dev_attr_revision.attr, &dev_attr_scrub.attr, &dev_attr_hw_error_scrub.attr, &dev_attr_bus_dsm_mask.attr, &dev_attr_firmware_activate_noidle.attr, NULL, }; static const struct attribute_group acpi_nfit_attribute_group = { .name = "nfit", .attrs = acpi_nfit_attributes, .is_visible = nfit_visible, }; static const struct attribute_group *acpi_nfit_attribute_groups[] = { &acpi_nfit_attribute_group, NULL, }; static struct acpi_nfit_memory_map *to_nfit_memdev(struct device *dev) { struct nvdimm *nvdimm = to_nvdimm(dev); struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); return __to_nfit_memdev(nfit_mem); } static struct acpi_nfit_control_region *to_nfit_dcr(struct device *dev) { struct nvdimm *nvdimm = to_nvdimm(dev); struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); return nfit_mem->dcr; } static ssize_t handle_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_nfit_memory_map *memdev = to_nfit_memdev(dev); return sysfs_emit(buf, "%#x\n", memdev->device_handle); } static DEVICE_ATTR_RO(handle); static ssize_t phys_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_nfit_memory_map *memdev = to_nfit_memdev(dev); return sysfs_emit(buf, "%#x\n", memdev->physical_id); } static DEVICE_ATTR_RO(phys_id); static ssize_t vendor_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); return sysfs_emit(buf, "0x%04x\n", be16_to_cpu(dcr->vendor_id)); } static DEVICE_ATTR_RO(vendor); static ssize_t rev_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); return sysfs_emit(buf, "0x%04x\n", be16_to_cpu(dcr->revision_id)); } static DEVICE_ATTR_RO(rev_id); static ssize_t device_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); return sysfs_emit(buf, "0x%04x\n", be16_to_cpu(dcr->device_id)); } static DEVICE_ATTR_RO(device); static ssize_t subsystem_vendor_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); return sysfs_emit(buf, "0x%04x\n", be16_to_cpu(dcr->subsystem_vendor_id)); } static DEVICE_ATTR_RO(subsystem_vendor); static ssize_t subsystem_rev_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); return sysfs_emit(buf, "0x%04x\n", be16_to_cpu(dcr->subsystem_revision_id)); } static DEVICE_ATTR_RO(subsystem_rev_id); static ssize_t subsystem_device_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); return sysfs_emit(buf, "0x%04x\n", be16_to_cpu(dcr->subsystem_device_id)); } static DEVICE_ATTR_RO(subsystem_device); static int num_nvdimm_formats(struct nvdimm *nvdimm) { struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); int formats = 0; if (nfit_mem->memdev_pmem) formats++; return formats; } static ssize_t format_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); return sysfs_emit(buf, "0x%04x\n", le16_to_cpu(dcr->code)); } static DEVICE_ATTR_RO(format); static ssize_t format1_show(struct device *dev, struct device_attribute *attr, char *buf) { u32 handle; ssize_t rc = -ENXIO; struct nfit_mem *nfit_mem; struct nfit_memdev *nfit_memdev; struct acpi_nfit_desc *acpi_desc; struct nvdimm *nvdimm = to_nvdimm(dev); struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); nfit_mem = nvdimm_provider_data(nvdimm); acpi_desc = nfit_mem->acpi_desc; handle = to_nfit_memdev(dev)->device_handle; /* assumes DIMMs have at most 2 published interface codes */ mutex_lock(&acpi_desc->init_mutex); list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev; struct nfit_dcr *nfit_dcr; if (memdev->device_handle != handle) continue; list_for_each_entry(nfit_dcr, &acpi_desc->dcrs, list) { if (nfit_dcr->dcr->region_index != memdev->region_index) continue; if (nfit_dcr->dcr->code == dcr->code) continue; rc = sysfs_emit(buf, "0x%04x\n", le16_to_cpu(nfit_dcr->dcr->code)); break; } if (rc != -ENXIO) break; } mutex_unlock(&acpi_desc->init_mutex); return rc; } static DEVICE_ATTR_RO(format1); static ssize_t formats_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvdimm *nvdimm = to_nvdimm(dev); return sysfs_emit(buf, "%d\n", num_nvdimm_formats(nvdimm)); } static DEVICE_ATTR_RO(formats); static ssize_t serial_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); return sysfs_emit(buf, "0x%08x\n", be32_to_cpu(dcr->serial_number)); } static DEVICE_ATTR_RO(serial); static ssize_t family_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvdimm *nvdimm = to_nvdimm(dev); struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); if (nfit_mem->family < 0) return -ENXIO; return sysfs_emit(buf, "%d\n", nfit_mem->family); } static DEVICE_ATTR_RO(family); static ssize_t dsm_mask_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvdimm *nvdimm = to_nvdimm(dev); struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); if (nfit_mem->family < 0) return -ENXIO; return sysfs_emit(buf, "%#lx\n", nfit_mem->dsm_mask); } static DEVICE_ATTR_RO(dsm_mask); static ssize_t flags_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvdimm *nvdimm = to_nvdimm(dev); struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); u16 flags = __to_nfit_memdev(nfit_mem)->flags; if (test_bit(NFIT_MEM_DIRTY, &nfit_mem->flags)) flags |= ACPI_NFIT_MEM_FLUSH_FAILED; return sysfs_emit(buf, "%s%s%s%s%s%s%s\n", flags & ACPI_NFIT_MEM_SAVE_FAILED ? "save_fail " : "", flags & ACPI_NFIT_MEM_RESTORE_FAILED ? "restore_fail " : "", flags & ACPI_NFIT_MEM_FLUSH_FAILED ? "flush_fail " : "", flags & ACPI_NFIT_MEM_NOT_ARMED ? "not_armed " : "", flags & ACPI_NFIT_MEM_HEALTH_OBSERVED ? "smart_event " : "", flags & ACPI_NFIT_MEM_MAP_FAILED ? "map_fail " : "", flags & ACPI_NFIT_MEM_HEALTH_ENABLED ? "smart_notify " : ""); } static DEVICE_ATTR_RO(flags); static ssize_t id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvdimm *nvdimm = to_nvdimm(dev); struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); return sysfs_emit(buf, "%s\n", nfit_mem->id); } static DEVICE_ATTR_RO(id); static ssize_t dirty_shutdown_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvdimm *nvdimm = to_nvdimm(dev); struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); return sysfs_emit(buf, "%d\n", nfit_mem->dirty_shutdown); } static DEVICE_ATTR_RO(dirty_shutdown); static struct attribute *acpi_nfit_dimm_attributes[] = { &dev_attr_handle.attr, &dev_attr_phys_id.attr, &dev_attr_vendor.attr, &dev_attr_device.attr, &dev_attr_rev_id.attr, &dev_attr_subsystem_vendor.attr, &dev_attr_subsystem_device.attr, &dev_attr_subsystem_rev_id.attr, &dev_attr_format.attr, &dev_attr_formats.attr, &dev_attr_format1.attr, &dev_attr_serial.attr, &dev_attr_flags.attr, &dev_attr_id.attr, &dev_attr_family.attr, &dev_attr_dsm_mask.attr, &dev_attr_dirty_shutdown.attr, NULL, }; static umode_t acpi_nfit_dimm_attr_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = kobj_to_dev(kobj); struct nvdimm *nvdimm = to_nvdimm(dev); struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); if (!to_nfit_dcr(dev)) { /* Without a dcr only the memdev attributes can be surfaced */ if (a == &dev_attr_handle.attr || a == &dev_attr_phys_id.attr || a == &dev_attr_flags.attr || a == &dev_attr_family.attr || a == &dev_attr_dsm_mask.attr) return a->mode; return 0; } if (a == &dev_attr_format1.attr && num_nvdimm_formats(nvdimm) <= 1) return 0; if (!test_bit(NFIT_MEM_DIRTY_COUNT, &nfit_mem->flags) && a == &dev_attr_dirty_shutdown.attr) return 0; return a->mode; } static const struct attribute_group acpi_nfit_dimm_attribute_group = { .name = "nfit", .attrs = acpi_nfit_dimm_attributes, .is_visible = acpi_nfit_dimm_attr_visible, }; static const struct attribute_group *acpi_nfit_dimm_attribute_groups[] = { &acpi_nfit_dimm_attribute_group, NULL, }; static struct nvdimm *acpi_nfit_dimm_by_handle(struct acpi_nfit_desc *acpi_desc, u32 device_handle) { struct nfit_mem *nfit_mem; list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) if (__to_nfit_memdev(nfit_mem)->device_handle == device_handle) return nfit_mem->nvdimm; return NULL; } void __acpi_nvdimm_notify(struct device *dev, u32 event) { struct nfit_mem *nfit_mem; struct acpi_nfit_desc *acpi_desc; dev_dbg(dev->parent, "%s: event: %d\n", dev_name(dev), event); if (event != NFIT_NOTIFY_DIMM_HEALTH) { dev_dbg(dev->parent, "%s: unknown event: %d\n", dev_name(dev), event); return; } acpi_desc = dev_get_drvdata(dev->parent); if (!acpi_desc) return; /* * If we successfully retrieved acpi_desc, then we know nfit_mem data * is still valid. */ nfit_mem = dev_get_drvdata(dev); if (nfit_mem && nfit_mem->flags_attr) sysfs_notify_dirent(nfit_mem->flags_attr); } EXPORT_SYMBOL_GPL(__acpi_nvdimm_notify); static void acpi_nvdimm_notify(acpi_handle handle, u32 event, void *data) { struct acpi_device *adev = data; struct device *dev = &adev->dev; device_lock(dev->parent); __acpi_nvdimm_notify(dev, event); device_unlock(dev->parent); } static bool acpi_nvdimm_has_method(struct acpi_device *adev, char *method) { acpi_handle handle; acpi_status status; status = acpi_get_handle(adev->handle, method, &handle); if (ACPI_SUCCESS(status)) return true; return false; } __weak void nfit_intel_shutdown_status(struct nfit_mem *nfit_mem) { struct device *dev = &nfit_mem->adev->dev; struct nd_intel_smart smart = { 0 }; union acpi_object in_buf = { .buffer.type = ACPI_TYPE_BUFFER, .buffer.length = 0, }; union acpi_object in_obj = { .package.type = ACPI_TYPE_PACKAGE, .package.count = 1, .package.elements = &in_buf, }; const u8 func = ND_INTEL_SMART; const guid_t *guid = to_nfit_uuid(nfit_mem->family); u8 revid = nfit_dsm_revid(nfit_mem->family, func); struct acpi_device *adev = nfit_mem->adev; acpi_handle handle = adev->handle; union acpi_object *out_obj; if ((nfit_mem->dsm_mask & (1 << func)) == 0) return; out_obj = acpi_evaluate_dsm_typed(handle, guid, revid, func, &in_obj, ACPI_TYPE_BUFFER); if (!out_obj || out_obj->buffer.length < sizeof(smart)) { dev_dbg(dev->parent, "%s: failed to retrieve initial health\n", dev_name(dev)); ACPI_FREE(out_obj); return; } memcpy(&smart, out_obj->buffer.pointer, sizeof(smart)); ACPI_FREE(out_obj); if (smart.flags & ND_INTEL_SMART_SHUTDOWN_VALID) { if (smart.shutdown_state) set_bit(NFIT_MEM_DIRTY, &nfit_mem->flags); } if (smart.flags & ND_INTEL_SMART_SHUTDOWN_COUNT_VALID) { set_bit(NFIT_MEM_DIRTY_COUNT, &nfit_mem->flags); nfit_mem->dirty_shutdown = smart.shutdown_count; } } static void populate_shutdown_status(struct nfit_mem *nfit_mem) { /* * For DIMMs that provide a dynamic facility to retrieve a * dirty-shutdown status and/or a dirty-shutdown count, cache * these values in nfit_mem. */ if (nfit_mem->family == NVDIMM_FAMILY_INTEL) nfit_intel_shutdown_status(nfit_mem); } static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, struct nfit_mem *nfit_mem, u32 device_handle) { struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; struct acpi_device *adev, *adev_dimm; struct device *dev = acpi_desc->dev; unsigned long dsm_mask, label_mask; const guid_t *guid; int i; int family = -1; struct acpi_nfit_control_region *dcr = nfit_mem->dcr; /* nfit test assumes 1:1 relationship between commands and dsms */ nfit_mem->dsm_mask = acpi_desc->dimm_cmd_force_en; nfit_mem->family = NVDIMM_FAMILY_INTEL; set_bit(NVDIMM_FAMILY_INTEL, &nd_desc->dimm_family_mask); if (dcr->valid_fields & ACPI_NFIT_CONTROL_MFG_INFO_VALID) sprintf(nfit_mem->id, "%04x-%02x-%04x-%08x", be16_to_cpu(dcr->vendor_id), dcr->manufacturing_location, be16_to_cpu(dcr->manufacturing_date), be32_to_cpu(dcr->serial_number)); else sprintf(nfit_mem->id, "%04x-%08x", be16_to_cpu(dcr->vendor_id), be32_to_cpu(dcr->serial_number)); adev = to_acpi_dev(acpi_desc); if (!adev) { /* unit test case */ populate_shutdown_status(nfit_mem); return 0; } adev_dimm = acpi_find_child_device(adev, device_handle, false); nfit_mem->adev = adev_dimm; if (!adev_dimm) { dev_err(dev, "no ACPI.NFIT device with _ADR %#x, disabling...\n", device_handle); return force_enable_dimms ? 0 : -ENODEV; } if (ACPI_FAILURE(acpi_install_notify_handler(adev_dimm->handle, ACPI_DEVICE_NOTIFY, acpi_nvdimm_notify, adev_dimm))) { dev_err(dev, "%s: notification registration failed\n", dev_name(&adev_dimm->dev)); return -ENXIO; } /* * Record nfit_mem for the notification path to track back to * the nfit sysfs attributes for this dimm device object. */ dev_set_drvdata(&adev_dimm->dev, nfit_mem); /* * There are 4 "legacy" NVDIMM command sets * (NVDIMM_FAMILY_{INTEL,MSFT,HPE1,HPE2}) that were created before * an EFI working group was established to constrain this * proliferation. The nfit driver probes for the supported command * set by GUID. Note, if you're a platform developer looking to add * a new command set to this probe, consider using an existing set, * or otherwise seek approval to publish the command set at * http://www.uefi.org/RFIC_LIST. * * Note, that checking for function0 (bit0) tells us if any commands * are reachable through this GUID. */ clear_bit(NVDIMM_FAMILY_INTEL, &nd_desc->dimm_family_mask); for (i = 0; i <= NVDIMM_FAMILY_MAX; i++) if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1)) { set_bit(i, &nd_desc->dimm_family_mask); if (family < 0 || i == default_dsm_family) family = i; } /* limit the supported commands to those that are publicly documented */ nfit_mem->family = family; if (override_dsm_mask && !disable_vendor_specific) dsm_mask = override_dsm_mask; else if (nfit_mem->family == NVDIMM_FAMILY_INTEL) { dsm_mask = NVDIMM_INTEL_CMDMASK; if (disable_vendor_specific) dsm_mask &= ~(1 << ND_CMD_VENDOR); } else if (nfit_mem->family == NVDIMM_FAMILY_HPE1) { dsm_mask = 0x1c3c76; } else if (nfit_mem->family == NVDIMM_FAMILY_HPE2) { dsm_mask = 0x1fe; if (disable_vendor_specific) dsm_mask &= ~(1 << 8); } else if (nfit_mem->family == NVDIMM_FAMILY_MSFT) { dsm_mask = 0xffffffff; } else if (nfit_mem->family == NVDIMM_FAMILY_HYPERV) { dsm_mask = 0x1f; } else { dev_dbg(dev, "unknown dimm command family\n"); nfit_mem->family = -1; /* DSMs are optional, continue loading the driver... */ return 0; } /* * Function 0 is the command interrogation function, don't * export it to potential userspace use, and enable it to be * used as an error value in acpi_nfit_ctl(). */ dsm_mask &= ~1UL; guid = to_nfit_uuid(nfit_mem->family); for_each_set_bit(i, &dsm_mask, BITS_PER_LONG) if (acpi_check_dsm(adev_dimm->handle, guid, nfit_dsm_revid(nfit_mem->family, i), 1ULL << i)) set_bit(i, &nfit_mem->dsm_mask); /* * Prefer the NVDIMM_FAMILY_INTEL label read commands if present * due to their better semantics handling locked capacity. */ label_mask = 1 << ND_CMD_GET_CONFIG_SIZE | 1 << ND_CMD_GET_CONFIG_DATA | 1 << ND_CMD_SET_CONFIG_DATA; if (family == NVDIMM_FAMILY_INTEL && (dsm_mask & label_mask) == label_mask) /* skip _LS{I,R,W} enabling */; else { if (acpi_nvdimm_has_method(adev_dimm, "_LSI") && acpi_nvdimm_has_method(adev_dimm, "_LSR")) { dev_dbg(dev, "%s: has _LSR\n", dev_name(&adev_dimm->dev)); set_bit(NFIT_MEM_LSR, &nfit_mem->flags); } if (test_bit(NFIT_MEM_LSR, &nfit_mem->flags) && acpi_nvdimm_has_method(adev_dimm, "_LSW")) { dev_dbg(dev, "%s: has _LSW\n", dev_name(&adev_dimm->dev)); set_bit(NFIT_MEM_LSW, &nfit_mem->flags); } /* * Quirk read-only label configurations to preserve * access to label-less namespaces by default. */ if (!test_bit(NFIT_MEM_LSW, &nfit_mem->flags) && !force_labels) { dev_dbg(dev, "%s: No _LSW, disable labels\n", dev_name(&adev_dimm->dev)); clear_bit(NFIT_MEM_LSR, &nfit_mem->flags); } else dev_dbg(dev, "%s: Force enable labels\n", dev_name(&adev_dimm->dev)); } populate_shutdown_status(nfit_mem); return 0; } static void shutdown_dimm_notify(void *data) { struct acpi_nfit_desc *acpi_desc = data; struct nfit_mem *nfit_mem; mutex_lock(&acpi_desc->init_mutex); /* * Clear out the nfit_mem->flags_attr and shut down dimm event * notifications. */ list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { struct acpi_device *adev_dimm = nfit_mem->adev; if (nfit_mem->flags_attr) { sysfs_put(nfit_mem->flags_attr); nfit_mem->flags_attr = NULL; } if (adev_dimm) { acpi_remove_notify_handler(adev_dimm->handle, ACPI_DEVICE_NOTIFY, acpi_nvdimm_notify); dev_set_drvdata(&adev_dimm->dev, NULL); } } mutex_unlock(&acpi_desc->init_mutex); } static const struct nvdimm_security_ops *acpi_nfit_get_security_ops(int family) { switch (family) { case NVDIMM_FAMILY_INTEL: return intel_security_ops; default: return NULL; } } static const struct nvdimm_fw_ops *acpi_nfit_get_fw_ops( struct nfit_mem *nfit_mem) { unsigned long mask; struct acpi_nfit_desc *acpi_desc = nfit_mem->acpi_desc; struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; if (!nd_desc->fw_ops) return NULL; if (nfit_mem->family != NVDIMM_FAMILY_INTEL) return NULL; mask = nfit_mem->dsm_mask & NVDIMM_INTEL_FW_ACTIVATE_CMDMASK; if (mask != NVDIMM_INTEL_FW_ACTIVATE_CMDMASK) return NULL; return intel_fw_ops; } static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) { struct nfit_mem *nfit_mem; int dimm_count = 0, rc; struct nvdimm *nvdimm; list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { struct acpi_nfit_flush_address *flush; unsigned long flags = 0, cmd_mask; struct nfit_memdev *nfit_memdev; u32 device_handle; u16 mem_flags; device_handle = __to_nfit_memdev(nfit_mem)->device_handle; nvdimm = acpi_nfit_dimm_by_handle(acpi_desc, device_handle); if (nvdimm) { dimm_count++; continue; } /* collate flags across all memdevs for this dimm */ list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { struct acpi_nfit_memory_map *dimm_memdev; dimm_memdev = __to_nfit_memdev(nfit_mem); if (dimm_memdev->device_handle != nfit_memdev->memdev->device_handle) continue; dimm_memdev->flags |= nfit_memdev->memdev->flags; } mem_flags = __to_nfit_memdev(nfit_mem)->flags; if (mem_flags & ACPI_NFIT_MEM_NOT_ARMED) set_bit(NDD_UNARMED, &flags); rc = acpi_nfit_add_dimm(acpi_desc, nfit_mem, device_handle); if (rc) continue; /* * TODO: provide translation for non-NVDIMM_FAMILY_INTEL * devices (i.e. from nd_cmd to acpi_dsm) to standardize the * userspace interface. */ cmd_mask = 1UL << ND_CMD_CALL; if (nfit_mem->family == NVDIMM_FAMILY_INTEL) { /* * These commands have a 1:1 correspondence * between DSM payload and libnvdimm ioctl * payload format. */ cmd_mask |= nfit_mem->dsm_mask & NVDIMM_STANDARD_CMDMASK; } if (test_bit(NFIT_MEM_LSR, &nfit_mem->flags)) { set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask); set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask); } if (test_bit(NFIT_MEM_LSW, &nfit_mem->flags)) set_bit(ND_CMD_SET_CONFIG_DATA, &cmd_mask); flush = nfit_mem->nfit_flush ? nfit_mem->nfit_flush->flush : NULL; nvdimm = __nvdimm_create(acpi_desc->nvdimm_bus, nfit_mem, acpi_nfit_dimm_attribute_groups, flags, cmd_mask, flush ? flush->hint_count : 0, nfit_mem->flush_wpq, &nfit_mem->id[0], acpi_nfit_get_security_ops(nfit_mem->family), acpi_nfit_get_fw_ops(nfit_mem)); if (!nvdimm) return -ENOMEM; nfit_mem->nvdimm = nvdimm; dimm_count++; if ((mem_flags & ACPI_NFIT_MEM_FAILED_MASK) == 0) continue; dev_err(acpi_desc->dev, "Error found in NVDIMM %s flags:%s%s%s%s%s\n", nvdimm_name(nvdimm), mem_flags & ACPI_NFIT_MEM_SAVE_FAILED ? " save_fail" : "", mem_flags & ACPI_NFIT_MEM_RESTORE_FAILED ? " restore_fail":"", mem_flags & ACPI_NFIT_MEM_FLUSH_FAILED ? " flush_fail" : "", mem_flags & ACPI_NFIT_MEM_NOT_ARMED ? " not_armed" : "", mem_flags & ACPI_NFIT_MEM_MAP_FAILED ? " map_fail" : ""); } rc = nvdimm_bus_check_dimm_count(acpi_desc->nvdimm_bus, dimm_count); if (rc) return rc; /* * Now that dimms are successfully registered, and async registration * is flushed, attempt to enable event notification. */ list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { struct kernfs_node *nfit_kernfs; nvdimm = nfit_mem->nvdimm; if (!nvdimm) continue; nfit_kernfs = sysfs_get_dirent(nvdimm_kobj(nvdimm)->sd, "nfit"); if (nfit_kernfs) nfit_mem->flags_attr = sysfs_get_dirent(nfit_kernfs, "flags"); sysfs_put(nfit_kernfs); if (!nfit_mem->flags_attr) dev_warn(acpi_desc->dev, "%s: notifications disabled\n", nvdimm_name(nvdimm)); } return devm_add_action_or_reset(acpi_desc->dev, shutdown_dimm_notify, acpi_desc); } /* * These constants are private because there are no kernel consumers of * these commands. */ enum nfit_aux_cmds { NFIT_CMD_TRANSLATE_SPA = 5, NFIT_CMD_ARS_INJECT_SET = 7, NFIT_CMD_ARS_INJECT_CLEAR = 8, NFIT_CMD_ARS_INJECT_GET = 9, }; static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) { struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; const guid_t *guid = to_nfit_uuid(NFIT_DEV_BUS); unsigned long dsm_mask, *mask; struct acpi_device *adev; int i; set_bit(ND_CMD_CALL, &nd_desc->cmd_mask); set_bit(NVDIMM_BUS_FAMILY_NFIT, &nd_desc->bus_family_mask); /* enable nfit_test to inject bus command emulation */ if (acpi_desc->bus_cmd_force_en) { nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en; mask = &nd_desc->bus_family_mask; if (acpi_desc->family_dsm_mask[NVDIMM_BUS_FAMILY_INTEL]) { set_bit(NVDIMM_BUS_FAMILY_INTEL, mask); nd_desc->fw_ops = intel_bus_fw_ops; } } adev = to_acpi_dev(acpi_desc); if (!adev) return; for (i = ND_CMD_ARS_CAP; i <= ND_CMD_CLEAR_ERROR; i++) if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i)) set_bit(i, &nd_desc->cmd_mask); dsm_mask = (1 << ND_CMD_ARS_CAP) | (1 << ND_CMD_ARS_START) | (1 << ND_CMD_ARS_STATUS) | (1 << ND_CMD_CLEAR_ERROR) | (1 << NFIT_CMD_TRANSLATE_SPA) | (1 << NFIT_CMD_ARS_INJECT_SET) | (1 << NFIT_CMD_ARS_INJECT_CLEAR) | (1 << NFIT_CMD_ARS_INJECT_GET); for_each_set_bit(i, &dsm_mask, BITS_PER_LONG) if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i)) set_bit(i, &acpi_desc->bus_dsm_mask); /* Enumerate allowed NVDIMM_BUS_FAMILY_INTEL commands */ dsm_mask = NVDIMM_BUS_INTEL_FW_ACTIVATE_CMDMASK; guid = to_nfit_bus_uuid(NVDIMM_BUS_FAMILY_INTEL); mask = &acpi_desc->family_dsm_mask[NVDIMM_BUS_FAMILY_INTEL]; for_each_set_bit(i, &dsm_mask, BITS_PER_LONG) if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i)) set_bit(i, mask); if (*mask == dsm_mask) { set_bit(NVDIMM_BUS_FAMILY_INTEL, &nd_desc->bus_family_mask); nd_desc->fw_ops = intel_bus_fw_ops; } } static ssize_t range_index_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nd_region *nd_region = to_nd_region(dev); struct nfit_spa *nfit_spa = nd_region_provider_data(nd_region); return sysfs_emit(buf, "%d\n", nfit_spa->spa->range_index); } static DEVICE_ATTR_RO(range_index); static struct attribute *acpi_nfit_region_attributes[] = { &dev_attr_range_index.attr, NULL, }; static const struct attribute_group acpi_nfit_region_attribute_group = { .name = "nfit", .attrs = acpi_nfit_region_attributes, }; static const struct attribute_group *acpi_nfit_region_attribute_groups[] = { &acpi_nfit_region_attribute_group, NULL, }; /* enough info to uniquely specify an interleave set */ struct nfit_set_info { u64 region_offset; u32 serial_number; u32 pad; }; struct nfit_set_info2 { u64 region_offset; u32 serial_number; u16 vendor_id; u16 manufacturing_date; u8 manufacturing_location; u8 reserved[31]; }; static int cmp_map_compat(const void *m0, const void *m1) { const struct nfit_set_info *map0 = m0; const struct nfit_set_info *map1 = m1; return memcmp(&map0->region_offset, &map1->region_offset, sizeof(u64)); } static int cmp_map(const void *m0, const void *m1) { const struct nfit_set_info *map0 = m0; const struct nfit_set_info *map1 = m1; if (map0->region_offset < map1->region_offset) return -1; else if (map0->region_offset > map1->region_offset) return 1; return 0; } static int cmp_map2(const void *m0, const void *m1) { const struct nfit_set_info2 *map0 = m0; const struct nfit_set_info2 *map1 = m1; if (map0->region_offset < map1->region_offset) return -1; else if (map0->region_offset > map1->region_offset) return 1; return 0; } /* Retrieve the nth entry referencing this spa */ static struct acpi_nfit_memory_map *memdev_from_spa( struct acpi_nfit_desc *acpi_desc, u16 range_index, int n) { struct nfit_memdev *nfit_memdev; list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) if (nfit_memdev->memdev->range_index == range_index) if (n-- == 0) return nfit_memdev->memdev; return NULL; } static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc, struct nd_region_desc *ndr_desc, struct acpi_nfit_system_address *spa) { u16 nr = ndr_desc->num_mappings; struct nfit_set_info2 *info2 __free(kfree) = kcalloc(nr, sizeof(*info2), GFP_KERNEL); struct nfit_set_info *info __free(kfree) = kcalloc(nr, sizeof(*info), GFP_KERNEL); struct device *dev = acpi_desc->dev; struct nd_interleave_set *nd_set; int i; if (!info || !info2) return -ENOMEM; nd_set = devm_kzalloc(dev, sizeof(*nd_set), GFP_KERNEL); if (!nd_set) return -ENOMEM; import_guid(&nd_set->type_guid, spa->range_guid); for (i = 0; i < nr; i++) { struct nd_mapping_desc *mapping = &ndr_desc->mapping[i]; struct nvdimm *nvdimm = mapping->nvdimm; struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); struct nfit_set_info *map = &info[i]; struct nfit_set_info2 *map2 = &info2[i]; struct acpi_nfit_memory_map *memdev = memdev_from_spa(acpi_desc, spa->range_index, i); struct acpi_nfit_control_region *dcr = nfit_mem->dcr; if (!memdev || !nfit_mem->dcr) { dev_err(dev, "%s: failed to find DCR\n", __func__); return -ENODEV; } map->region_offset = memdev->region_offset; map->serial_number = dcr->serial_number; map2->region_offset = memdev->region_offset; map2->serial_number = dcr->serial_number; map2->vendor_id = dcr->vendor_id; map2->manufacturing_date = dcr->manufacturing_date; map2->manufacturing_location = dcr->manufacturing_location; } /* v1.1 namespaces */ sort(info, nr, sizeof(*info), cmp_map, NULL); nd_set->cookie1 = nd_fletcher64(info, sizeof(*info) * nr, 0); /* v1.2 namespaces */ sort(info2, nr, sizeof(*info2), cmp_map2, NULL); nd_set->cookie2 = nd_fletcher64(info2, sizeof(*info2) * nr, 0); /* support v1.1 namespaces created with the wrong sort order */ sort(info, nr, sizeof(*info), cmp_map_compat, NULL); nd_set->altcookie = nd_fletcher64(info, sizeof(*info) * nr, 0); /* record the result of the sort for the mapping position */ for (i = 0; i < nr; i++) { struct nfit_set_info2 *map2 = &info2[i]; int j; for (j = 0; j < nr; j++) { struct nd_mapping_desc *mapping = &ndr_desc->mapping[j]; struct nvdimm *nvdimm = mapping->nvdimm; struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); struct acpi_nfit_control_region *dcr = nfit_mem->dcr; if (map2->serial_number == dcr->serial_number && map2->vendor_id == dcr->vendor_id && map2->manufacturing_date == dcr->manufacturing_date && map2->manufacturing_location == dcr->manufacturing_location) { mapping->position = i; break; } } } ndr_desc->nd_set = nd_set; return 0; } static int ars_get_cap(struct acpi_nfit_desc *acpi_desc, struct nd_cmd_ars_cap *cmd, struct nfit_spa *nfit_spa) { struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; struct acpi_nfit_system_address *spa = nfit_spa->spa; int cmd_rc, rc; cmd->address = spa->address; cmd->length = spa->length; rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, cmd, sizeof(*cmd), &cmd_rc); if (rc < 0) return rc; return cmd_rc; } static int ars_start(struct acpi_nfit_desc *acpi_desc, struct nfit_spa *nfit_spa, enum nfit_ars_state req_type) { int rc; int cmd_rc; struct nd_cmd_ars_start ars_start; struct acpi_nfit_system_address *spa = nfit_spa->spa; struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; memset(&ars_start, 0, sizeof(ars_start)); ars_start.address = spa->address; ars_start.length = spa->length; if (req_type == ARS_REQ_SHORT) ars_start.flags = ND_ARS_RETURN_PREV_DATA; if (nfit_spa_type(spa) == NFIT_SPA_PM) ars_start.type = ND_ARS_PERSISTENT; else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) ars_start.type = ND_ARS_VOLATILE; else return -ENOTTY; rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, &ars_start, sizeof(ars_start), &cmd_rc); if (rc < 0) return rc; if (cmd_rc < 0) return cmd_rc; set_bit(ARS_VALID, &acpi_desc->scrub_flags); return 0; } static int ars_continue(struct acpi_nfit_desc *acpi_desc) { int rc, cmd_rc; struct nd_cmd_ars_start ars_start; struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; struct nd_cmd_ars_status *ars_status = acpi_desc->ars_status; ars_start = (struct nd_cmd_ars_start) { .address = ars_status->restart_address, .length = ars_status->restart_length, .type = ars_status->type, }; rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, &ars_start, sizeof(ars_start), &cmd_rc); if (rc < 0) return rc; return cmd_rc; } static int ars_get_status(struct acpi_nfit_desc *acpi_desc) { struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; struct nd_cmd_ars_status *ars_status = acpi_desc->ars_status; int rc, cmd_rc; rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_STATUS, ars_status, acpi_desc->max_ars, &cmd_rc); if (rc < 0) return rc; return cmd_rc; } static void ars_complete(struct acpi_nfit_desc *acpi_desc, struct nfit_spa *nfit_spa) { struct nd_cmd_ars_status *ars_status = acpi_desc->ars_status; struct acpi_nfit_system_address *spa = nfit_spa->spa; struct nd_region *nd_region = nfit_spa->nd_region; struct device *dev; lockdep_assert_held(&acpi_desc->init_mutex); /* * Only advance the ARS state for ARS runs initiated by the * kernel, ignore ARS results from BIOS initiated runs for scrub * completion tracking. */ if (acpi_desc->scrub_spa != nfit_spa) return; if ((ars_status->address >= spa->address && ars_status->address < spa->address + spa->length) || (ars_status->address < spa->address)) { /* * Assume that if a scrub starts at an offset from the * start of nfit_spa that we are in the continuation * case. * * Otherwise, if the scrub covers the spa range, mark * any pending request complete. */ if (ars_status->address + ars_status->length >= spa->address + spa->length) /* complete */; else return; } else return; acpi_desc->scrub_spa = NULL; if (nd_region) { dev = nd_region_dev(nd_region); nvdimm_region_notify(nd_region, NVDIMM_REVALIDATE_POISON); } else dev = acpi_desc->dev; dev_dbg(dev, "ARS: range %d complete\n", spa->range_index); } static int ars_status_process_records(struct acpi_nfit_desc *acpi_desc) { struct nvdimm_bus *nvdimm_bus = acpi_desc->nvdimm_bus; struct nd_cmd_ars_status *ars_status = acpi_desc->ars_status; int rc; u32 i; /* * First record starts at 44 byte offset from the start of the * payload. */ if (ars_status->out_length < 44) return 0; /* * Ignore potentially stale results that are only refreshed * after a start-ARS event. */ if (!test_and_clear_bit(ARS_VALID, &acpi_desc->scrub_flags)) { dev_dbg(acpi_desc->dev, "skip %d stale records\n", ars_status->num_records); return 0; } for (i = 0; i < ars_status->num_records; i++) { /* only process full records */ if (ars_status->out_length < 44 + sizeof(struct nd_ars_record) * (i + 1)) break; rc = nvdimm_bus_add_badrange(nvdimm_bus, ars_status->records[i].err_address, ars_status->records[i].length); if (rc) return rc; } if (i < ars_status->num_records) dev_warn(acpi_desc->dev, "detected truncated ars results\n"); return 0; } static void acpi_nfit_remove_resource(void *data) { struct resource *res = data; remove_resource(res); } static int acpi_nfit_insert_resource(struct acpi_nfit_desc *acpi_desc, struct nd_region_desc *ndr_desc) { struct resource *res, *nd_res = ndr_desc->res; int is_pmem, ret; /* No operation if the region is already registered as PMEM */ is_pmem = region_intersects(nd_res->start, resource_size(nd_res), IORESOURCE_MEM, IORES_DESC_PERSISTENT_MEMORY); if (is_pmem == REGION_INTERSECTS) return 0; res = devm_kzalloc(acpi_desc->dev, sizeof(*res), GFP_KERNEL); if (!res) return -ENOMEM; res->name = "Persistent Memory"; res->start = nd_res->start; res->end = nd_res->end; res->flags = IORESOURCE_MEM; res->desc = IORES_DESC_PERSISTENT_MEMORY; ret = insert_resource(&iomem_resource, res); if (ret) return ret; ret = devm_add_action_or_reset(acpi_desc->dev, acpi_nfit_remove_resource, res); if (ret) return ret; return 0; } static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, struct nd_mapping_desc *mapping, struct nd_region_desc *ndr_desc, struct acpi_nfit_memory_map *memdev, struct nfit_spa *nfit_spa) { struct nvdimm *nvdimm = acpi_nfit_dimm_by_handle(acpi_desc, memdev->device_handle); struct acpi_nfit_system_address *spa = nfit_spa->spa; if (!nvdimm) { dev_err(acpi_desc->dev, "spa%d dimm: %#x not found\n", spa->range_index, memdev->device_handle); return -ENODEV; } mapping->nvdimm = nvdimm; switch (nfit_spa_type(spa)) { case NFIT_SPA_PM: case NFIT_SPA_VOLATILE: mapping->start = memdev->address; mapping->size = memdev->region_size; break; } return 0; } static bool nfit_spa_is_virtual(struct acpi_nfit_system_address *spa) { return (nfit_spa_type(spa) == NFIT_SPA_VDISK || nfit_spa_type(spa) == NFIT_SPA_VCD || nfit_spa_type(spa) == NFIT_SPA_PDISK || nfit_spa_type(spa) == NFIT_SPA_PCD); } static bool nfit_spa_is_volatile(struct acpi_nfit_system_address *spa) { return (nfit_spa_type(spa) == NFIT_SPA_VDISK || nfit_spa_type(spa) == NFIT_SPA_VCD || nfit_spa_type(spa) == NFIT_SPA_VOLATILE); } static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, struct nfit_spa *nfit_spa) { static struct nd_mapping_desc mappings[ND_MAX_MAPPINGS]; struct acpi_nfit_system_address *spa = nfit_spa->spa; struct nd_region_desc *ndr_desc, _ndr_desc; struct nfit_memdev *nfit_memdev; struct nvdimm_bus *nvdimm_bus; struct resource res; int count = 0, rc; if (nfit_spa->nd_region) return 0; if (spa->range_index == 0 && !nfit_spa_is_virtual(spa)) { dev_dbg(acpi_desc->dev, "detected invalid spa index\n"); return 0; } memset(&res, 0, sizeof(res)); memset(&mappings, 0, sizeof(mappings)); memset(&_ndr_desc, 0, sizeof(_ndr_desc)); res.start = spa->address; res.end = res.start + spa->length - 1; ndr_desc = &_ndr_desc; ndr_desc->res = &res; ndr_desc->provider_data = nfit_spa; ndr_desc->attr_groups = acpi_nfit_region_attribute_groups; if (spa->flags & ACPI_NFIT_PROXIMITY_VALID) { ndr_desc->numa_node = pxm_to_online_node(spa->proximity_domain); ndr_desc->target_node = pxm_to_node(spa->proximity_domain); } else { ndr_desc->numa_node = NUMA_NO_NODE; ndr_desc->target_node = NUMA_NO_NODE; } /* Fallback to address based numa information if node lookup failed */ if (ndr_desc->numa_node == NUMA_NO_NODE) { ndr_desc->numa_node = memory_add_physaddr_to_nid(spa->address); dev_info(acpi_desc->dev, "changing numa node from %d to %d for nfit region [%pa-%pa]", NUMA_NO_NODE, ndr_desc->numa_node, &res.start, &res.end); } if (ndr_desc->target_node == NUMA_NO_NODE) { ndr_desc->target_node = phys_to_target_node(spa->address); dev_info(acpi_desc->dev, "changing target node from %d to %d for nfit region [%pa-%pa]", NUMA_NO_NODE, ndr_desc->numa_node, &res.start, &res.end); } /* * Persistence domain bits are hierarchical, if * ACPI_NFIT_CAPABILITY_CACHE_FLUSH is set then * ACPI_NFIT_CAPABILITY_MEM_FLUSH is implied. */ if (acpi_desc->platform_cap & ACPI_NFIT_CAPABILITY_CACHE_FLUSH) set_bit(ND_REGION_PERSIST_CACHE, &ndr_desc->flags); else if (acpi_desc->platform_cap & ACPI_NFIT_CAPABILITY_MEM_FLUSH) set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc->flags); list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev; struct nd_mapping_desc *mapping; /* range index 0 == unmapped in SPA or invalid-SPA */ if (memdev->range_index == 0 || spa->range_index == 0) continue; if (memdev->range_index != spa->range_index) continue; if (count >= ND_MAX_MAPPINGS) { dev_err(acpi_desc->dev, "spa%d exceeds max mappings %d\n", spa->range_index, ND_MAX_MAPPINGS); return -ENXIO; } mapping = &mappings[count++]; rc = acpi_nfit_init_mapping(acpi_desc, mapping, ndr_desc, memdev, nfit_spa); if (rc) goto out; } ndr_desc->mapping = mappings; ndr_desc->num_mappings = count; rc = acpi_nfit_init_interleave_set(acpi_desc, ndr_desc, spa); if (rc) goto out; nvdimm_bus = acpi_desc->nvdimm_bus; if (nfit_spa_type(spa) == NFIT_SPA_PM) { rc = acpi_nfit_insert_resource(acpi_desc, ndr_desc); if (rc) { dev_warn(acpi_desc->dev, "failed to insert pmem resource to iomem: %d\n", rc); goto out; } nfit_spa->nd_region = nvdimm_pmem_region_create(nvdimm_bus, ndr_desc); if (!nfit_spa->nd_region) rc = -ENOMEM; } else if (nfit_spa_is_volatile(spa)) { nfit_spa->nd_region = nvdimm_volatile_region_create(nvdimm_bus, ndr_desc); if (!nfit_spa->nd_region) rc = -ENOMEM; } else if (nfit_spa_is_virtual(spa)) { nfit_spa->nd_region = nvdimm_pmem_region_create(nvdimm_bus, ndr_desc); if (!nfit_spa->nd_region) rc = -ENOMEM; } out: if (rc) dev_err(acpi_desc->dev, "failed to register spa range %d\n", nfit_spa->spa->range_index); return rc; } static int ars_status_alloc(struct acpi_nfit_desc *acpi_desc) { struct device *dev = acpi_desc->dev; struct nd_cmd_ars_status *ars_status; if (acpi_desc->ars_status) { memset(acpi_desc->ars_status, 0, acpi_desc->max_ars); return 0; } ars_status = devm_kzalloc(dev, acpi_desc->max_ars, GFP_KERNEL); if (!ars_status) return -ENOMEM; acpi_desc->ars_status = ars_status; return 0; } static int acpi_nfit_query_poison(struct acpi_nfit_desc *acpi_desc) { int rc; if (ars_status_alloc(acpi_desc)) return -ENOMEM; rc = ars_get_status(acpi_desc); if (rc < 0 && rc != -ENOSPC) return rc; if (ars_status_process_records(acpi_desc)) dev_err(acpi_desc->dev, "Failed to process ARS records\n"); return rc; } static int ars_register(struct acpi_nfit_desc *acpi_desc, struct nfit_spa *nfit_spa) { int rc; if (test_bit(ARS_FAILED, &nfit_spa->ars_state)) return acpi_nfit_register_region(acpi_desc, nfit_spa); set_bit(ARS_REQ_SHORT, &nfit_spa->ars_state); if (!no_init_ars) set_bit(ARS_REQ_LONG, &nfit_spa->ars_state); switch (acpi_nfit_query_poison(acpi_desc)) { case 0: case -ENOSPC: case -EAGAIN: rc = ars_start(acpi_desc, nfit_spa, ARS_REQ_SHORT); /* shouldn't happen, try again later */ if (rc == -EBUSY) break; if (rc) { set_bit(ARS_FAILED, &nfit_spa->ars_state); break; } clear_bit(ARS_REQ_SHORT, &nfit_spa->ars_state); rc = acpi_nfit_query_poison(acpi_desc); if (rc) break; acpi_desc->scrub_spa = nfit_spa; ars_complete(acpi_desc, nfit_spa); /* * If ars_complete() says we didn't complete the * short scrub, we'll try again with a long * request. */ acpi_desc->scrub_spa = NULL; break; case -EBUSY: case -ENOMEM: /* * BIOS was using ARS, wait for it to complete (or * resources to become available) and then perform our * own scrubs. */ break; default: set_bit(ARS_FAILED, &nfit_spa->ars_state); break; } return acpi_nfit_register_region(acpi_desc, nfit_spa); } static void ars_complete_all(struct acpi_nfit_desc *acpi_desc) { struct nfit_spa *nfit_spa; list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { if (test_bit(ARS_FAILED, &nfit_spa->ars_state)) continue; ars_complete(acpi_desc, nfit_spa); } } static unsigned int __acpi_nfit_scrub(struct acpi_nfit_desc *acpi_desc, int query_rc) { unsigned int tmo = acpi_desc->scrub_tmo; struct device *dev = acpi_desc->dev; struct nfit_spa *nfit_spa; lockdep_assert_held(&acpi_desc->init_mutex); if (test_bit(ARS_CANCEL, &acpi_desc->scrub_flags)) return 0; if (query_rc == -EBUSY) { dev_dbg(dev, "ARS: ARS busy\n"); return min(30U * 60U, tmo * 2); } if (query_rc == -ENOSPC) { dev_dbg(dev, "ARS: ARS continue\n"); ars_continue(acpi_desc); return 1; } if (query_rc && query_rc != -EAGAIN) { unsigned long long addr, end; addr = acpi_desc->ars_status->address; end = addr + acpi_desc->ars_status->length; dev_dbg(dev, "ARS: %llx-%llx failed (%d)\n", addr, end, query_rc); } ars_complete_all(acpi_desc); list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { enum nfit_ars_state req_type; int rc; if (test_bit(ARS_FAILED, &nfit_spa->ars_state)) continue; /* prefer short ARS requests first */ if (test_bit(ARS_REQ_SHORT, &nfit_spa->ars_state)) req_type = ARS_REQ_SHORT; else if (test_bit(ARS_REQ_LONG, &nfit_spa->ars_state)) req_type = ARS_REQ_LONG; else continue; rc = ars_start(acpi_desc, nfit_spa, req_type); dev = nd_region_dev(nfit_spa->nd_region); dev_dbg(dev, "ARS: range %d ARS start %s (%d)\n", nfit_spa->spa->range_index, req_type == ARS_REQ_SHORT ? "short" : "long", rc); /* * Hmm, we raced someone else starting ARS? Try again in * a bit. */ if (rc == -EBUSY) return 1; if (rc == 0) { dev_WARN_ONCE(dev, acpi_desc->scrub_spa, "scrub start while range %d active\n", acpi_desc->scrub_spa->spa->range_index); clear_bit(req_type, &nfit_spa->ars_state); acpi_desc->scrub_spa = nfit_spa; /* * Consider this spa last for future scrub * requests */ list_move_tail(&nfit_spa->list, &acpi_desc->spas); return 1; } dev_err(dev, "ARS: range %d ARS failed (%d)\n", nfit_spa->spa->range_index, rc); set_bit(ARS_FAILED, &nfit_spa->ars_state); } return 0; } static void __sched_ars(struct acpi_nfit_desc *acpi_desc, unsigned int tmo) { lockdep_assert_held(&acpi_desc->init_mutex); set_bit(ARS_BUSY, &acpi_desc->scrub_flags); /* note this should only be set from within the workqueue */ if (tmo) acpi_desc->scrub_tmo = tmo; queue_delayed_work(nfit_wq, &acpi_desc->dwork, tmo * HZ); } static void sched_ars(struct acpi_nfit_desc *acpi_desc) { __sched_ars(acpi_desc, 0); } static void notify_ars_done(struct acpi_nfit_desc *acpi_desc) { lockdep_assert_held(&acpi_desc->init_mutex); clear_bit(ARS_BUSY, &acpi_desc->scrub_flags); acpi_desc->scrub_count++; if (acpi_desc->scrub_count_state) sysfs_notify_dirent(acpi_desc->scrub_count_state); } static void acpi_nfit_scrub(struct work_struct *work) { struct acpi_nfit_desc *acpi_desc; unsigned int tmo; int query_rc; acpi_desc = container_of(work, typeof(*acpi_desc), dwork.work); mutex_lock(&acpi_desc->init_mutex); query_rc = acpi_nfit_query_poison(acpi_desc); tmo = __acpi_nfit_scrub(acpi_desc, query_rc); if (tmo) __sched_ars(acpi_desc, tmo); else notify_ars_done(acpi_desc); memset(acpi_desc->ars_status, 0, acpi_desc->max_ars); clear_bit(ARS_POLL, &acpi_desc->scrub_flags); mutex_unlock(&acpi_desc->init_mutex); } static void acpi_nfit_init_ars(struct acpi_nfit_desc *acpi_desc, struct nfit_spa *nfit_spa) { int type = nfit_spa_type(nfit_spa->spa); struct nd_cmd_ars_cap ars_cap; int rc; set_bit(ARS_FAILED, &nfit_spa->ars_state); memset(&ars_cap, 0, sizeof(ars_cap)); rc = ars_get_cap(acpi_desc, &ars_cap, nfit_spa); if (rc < 0) return; /* check that the supported scrub types match the spa type */ if (type == NFIT_SPA_VOLATILE && ((ars_cap.status >> 16) & ND_ARS_VOLATILE) == 0) return; if (type == NFIT_SPA_PM && ((ars_cap.status >> 16) & ND_ARS_PERSISTENT) == 0) return; nfit_spa->max_ars = ars_cap.max_ars_out; nfit_spa->clear_err_unit = ars_cap.clear_err_unit; acpi_desc->max_ars = max(nfit_spa->max_ars, acpi_desc->max_ars); clear_bit(ARS_FAILED, &nfit_spa->ars_state); } static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc) { struct nfit_spa *nfit_spa; int rc, do_sched_ars = 0; set_bit(ARS_VALID, &acpi_desc->scrub_flags); list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { switch (nfit_spa_type(nfit_spa->spa)) { case NFIT_SPA_VOLATILE: case NFIT_SPA_PM: acpi_nfit_init_ars(acpi_desc, nfit_spa); break; } } list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { switch (nfit_spa_type(nfit_spa->spa)) { case NFIT_SPA_VOLATILE: case NFIT_SPA_PM: /* register regions and kick off initial ARS run */ rc = ars_register(acpi_desc, nfit_spa); if (rc) return rc; /* * Kick off background ARS if at least one * region successfully registered ARS */ if (!test_bit(ARS_FAILED, &nfit_spa->ars_state)) do_sched_ars++; break; case NFIT_SPA_BDW: /* nothing to register */ break; case NFIT_SPA_DCR: case NFIT_SPA_VDISK: case NFIT_SPA_VCD: case NFIT_SPA_PDISK: case NFIT_SPA_PCD: /* register known regions that don't support ARS */ rc = acpi_nfit_register_region(acpi_desc, nfit_spa); if (rc) return rc; break; default: /* don't register unknown regions */ break; } } if (do_sched_ars) sched_ars(acpi_desc); return 0; } static int acpi_nfit_check_deletions(struct acpi_nfit_desc *acpi_desc, struct nfit_table_prev *prev) { struct device *dev = acpi_desc->dev; if (!list_empty(&prev->spas) || !list_empty(&prev->memdevs) || !list_empty(&prev->dcrs) || !list_empty(&prev->bdws) || !list_empty(&prev->idts) || !list_empty(&prev->flushes)) { dev_err(dev, "new nfit deletes entries (unsupported)\n"); return -ENXIO; } return 0; } static int acpi_nfit_desc_init_scrub_attr(struct acpi_nfit_desc *acpi_desc) { struct device *dev = acpi_desc->dev; struct kernfs_node *nfit; struct device *bus_dev; if (!ars_supported(acpi_desc->nvdimm_bus)) return 0; bus_dev = to_nvdimm_bus_dev(acpi_desc->nvdimm_bus); nfit = sysfs_get_dirent(bus_dev->kobj.sd, "nfit"); if (!nfit) { dev_err(dev, "sysfs_get_dirent 'nfit' failed\n"); return -ENODEV; } acpi_desc->scrub_count_state = sysfs_get_dirent(nfit, "scrub"); sysfs_put(nfit); if (!acpi_desc->scrub_count_state) { dev_err(dev, "sysfs_get_dirent 'scrub' failed\n"); return -ENODEV; } return 0; } static void acpi_nfit_unregister(void *data) { struct acpi_nfit_desc *acpi_desc = data; nvdimm_bus_unregister(acpi_desc->nvdimm_bus); } int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *data, acpi_size sz) { struct device *dev = acpi_desc->dev; struct nfit_table_prev prev; const void *end; int rc; if (!acpi_desc->nvdimm_bus) { acpi_nfit_init_dsms(acpi_desc); acpi_desc->nvdimm_bus = nvdimm_bus_register(dev, &acpi_desc->nd_desc); if (!acpi_desc->nvdimm_bus) return -ENOMEM; rc = devm_add_action_or_reset(dev, acpi_nfit_unregister, acpi_desc); if (rc) return rc; rc = acpi_nfit_desc_init_scrub_attr(acpi_desc); if (rc) return rc; /* register this acpi_desc for mce notifications */ mutex_lock(&acpi_desc_lock); list_add_tail(&acpi_desc->list, &acpi_descs); mutex_unlock(&acpi_desc_lock); } mutex_lock(&acpi_desc->init_mutex); INIT_LIST_HEAD(&prev.spas); INIT_LIST_HEAD(&prev.memdevs); INIT_LIST_HEAD(&prev.dcrs); INIT_LIST_HEAD(&prev.bdws); INIT_LIST_HEAD(&prev.idts); INIT_LIST_HEAD(&prev.flushes); list_cut_position(&prev.spas, &acpi_desc->spas, acpi_desc->spas.prev); list_cut_position(&prev.memdevs, &acpi_desc->memdevs, acpi_desc->memdevs.prev); list_cut_position(&prev.dcrs, &acpi_desc->dcrs, acpi_desc->dcrs.prev); list_cut_position(&prev.bdws, &acpi_desc->bdws, acpi_desc->bdws.prev); list_cut_position(&prev.idts, &acpi_desc->idts, acpi_desc->idts.prev); list_cut_position(&prev.flushes, &acpi_desc->flushes, acpi_desc->flushes.prev); end = data + sz; while (!IS_ERR_OR_NULL(data)) data = add_table(acpi_desc, &prev, data, end); if (IS_ERR(data)) { dev_dbg(dev, "nfit table parsing error: %ld\n", PTR_ERR(data)); rc = PTR_ERR(data); goto out_unlock; } rc = acpi_nfit_check_deletions(acpi_desc, &prev); if (rc) goto out_unlock; rc = nfit_mem_init(acpi_desc); if (rc) goto out_unlock; rc = acpi_nfit_register_dimms(acpi_desc); if (rc) goto out_unlock; rc = acpi_nfit_register_regions(acpi_desc); out_unlock: mutex_unlock(&acpi_desc->init_mutex); return rc; } EXPORT_SYMBOL_GPL(acpi_nfit_init); static int acpi_nfit_flush_probe(struct nvdimm_bus_descriptor *nd_desc) { struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); struct device *dev = acpi_desc->dev; /* Bounce the device lock to flush acpi_nfit_add / acpi_nfit_notify */ device_lock(dev); device_unlock(dev); /* Bounce the init_mutex to complete initial registration */ mutex_lock(&acpi_desc->init_mutex); mutex_unlock(&acpi_desc->init_mutex); return 0; } static int __acpi_nfit_clear_to_send(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd) { struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); if (nvdimm) return 0; if (cmd != ND_CMD_ARS_START) return 0; /* * The kernel and userspace may race to initiate a scrub, but * the scrub thread is prepared to lose that initial race. It * just needs guarantees that any ARS it initiates are not * interrupted by any intervening start requests from userspace. */ if (work_busy(&acpi_desc->dwork.work)) return -EBUSY; return 0; } /* * Prevent security and firmware activate commands from being issued via * ioctl. */ static int acpi_nfit_clear_to_send(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf) { struct nd_cmd_pkg *call_pkg = buf; unsigned int func; if (nvdimm && cmd == ND_CMD_CALL && call_pkg->nd_family == NVDIMM_FAMILY_INTEL) { func = call_pkg->nd_command; if (func > NVDIMM_CMD_MAX || (1 << func) & NVDIMM_INTEL_DENY_CMDMASK) return -EOPNOTSUPP; } /* block all non-nfit bus commands */ if (!nvdimm && cmd == ND_CMD_CALL && call_pkg->nd_family != NVDIMM_BUS_FAMILY_NFIT) return -EOPNOTSUPP; return __acpi_nfit_clear_to_send(nd_desc, nvdimm, cmd); } int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc, enum nfit_ars_state req_type) { struct device *dev = acpi_desc->dev; int scheduled = 0, busy = 0; struct nfit_spa *nfit_spa; mutex_lock(&acpi_desc->init_mutex); if (test_bit(ARS_CANCEL, &acpi_desc->scrub_flags)) { mutex_unlock(&acpi_desc->init_mutex); return 0; } list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { int type = nfit_spa_type(nfit_spa->spa); if (type != NFIT_SPA_PM && type != NFIT_SPA_VOLATILE) continue; if (test_bit(ARS_FAILED, &nfit_spa->ars_state)) continue; if (test_and_set_bit(req_type, &nfit_spa->ars_state)) busy++; else scheduled++; } if (scheduled) { sched_ars(acpi_desc); dev_dbg(dev, "ars_scan triggered\n"); } mutex_unlock(&acpi_desc->init_mutex); if (scheduled) return 0; if (busy) return -EBUSY; return -ENOTTY; } void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev) { struct nvdimm_bus_descriptor *nd_desc; dev_set_drvdata(dev, acpi_desc); acpi_desc->dev = dev; nd_desc = &acpi_desc->nd_desc; nd_desc->provider_name = "ACPI.NFIT"; nd_desc->module = THIS_MODULE; nd_desc->ndctl = acpi_nfit_ctl; nd_desc->flush_probe = acpi_nfit_flush_probe; nd_desc->clear_to_send = acpi_nfit_clear_to_send; nd_desc->attr_groups = acpi_nfit_attribute_groups; INIT_LIST_HEAD(&acpi_desc->spas); INIT_LIST_HEAD(&acpi_desc->dcrs); INIT_LIST_HEAD(&acpi_desc->bdws); INIT_LIST_HEAD(&acpi_desc->idts); INIT_LIST_HEAD(&acpi_desc->flushes); INIT_LIST_HEAD(&acpi_desc->memdevs); INIT_LIST_HEAD(&acpi_desc->dimms); INIT_LIST_HEAD(&acpi_desc->list); mutex_init(&acpi_desc->init_mutex); acpi_desc->scrub_tmo = 1; INIT_DELAYED_WORK(&acpi_desc->dwork, acpi_nfit_scrub); } EXPORT_SYMBOL_GPL(acpi_nfit_desc_init); static void acpi_nfit_put_table(void *table) { acpi_put_table(table); } static void acpi_nfit_notify(acpi_handle handle, u32 event, void *data) { struct acpi_device *adev = data; device_lock(&adev->dev); __acpi_nfit_notify(&adev->dev, handle, event); device_unlock(&adev->dev); } static void acpi_nfit_remove_notify_handler(void *data) { struct acpi_device *adev = data; acpi_dev_remove_notify_handler(adev, ACPI_DEVICE_NOTIFY, acpi_nfit_notify); } void acpi_nfit_shutdown(void *data) { struct acpi_nfit_desc *acpi_desc = data; struct device *bus_dev = to_nvdimm_bus_dev(acpi_desc->nvdimm_bus); /* * Destruct under acpi_desc_lock so that nfit_handle_mce does not * race teardown */ mutex_lock(&acpi_desc_lock); list_del(&acpi_desc->list); mutex_unlock(&acpi_desc_lock); mutex_lock(&acpi_desc->init_mutex); set_bit(ARS_CANCEL, &acpi_desc->scrub_flags); mutex_unlock(&acpi_desc->init_mutex); cancel_delayed_work_sync(&acpi_desc->dwork); /* * Bounce the nvdimm bus lock to make sure any in-flight * acpi_nfit_ars_rescan() submissions have had a chance to * either submit or see ->cancel set. */ device_lock(bus_dev); device_unlock(bus_dev); flush_workqueue(nfit_wq); } EXPORT_SYMBOL_GPL(acpi_nfit_shutdown); static int acpi_nfit_add(struct acpi_device *adev) { struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; struct acpi_nfit_desc *acpi_desc; struct device *dev = &adev->dev; struct acpi_table_header *tbl; acpi_status status = AE_OK; acpi_size sz; int rc = 0; rc = acpi_dev_install_notify_handler(adev, ACPI_DEVICE_NOTIFY, acpi_nfit_notify, adev); if (rc) return rc; rc = devm_add_action_or_reset(dev, acpi_nfit_remove_notify_handler, adev); if (rc) return rc; status = acpi_get_table(ACPI_SIG_NFIT, 0, &tbl); if (ACPI_FAILURE(status)) { /* The NVDIMM root device allows OS to trigger enumeration of * NVDIMMs through NFIT at boot time and re-enumeration at * root level via the _FIT method during runtime. * This is ok to return 0 here, we could have an nvdimm * hotplugged later and evaluate _FIT method which returns * data in the format of a series of NFIT Structures. */ dev_dbg(dev, "failed to find NFIT at startup\n"); return 0; } rc = devm_add_action_or_reset(dev, acpi_nfit_put_table, tbl); if (rc) return rc; sz = tbl->length; acpi_desc = devm_kzalloc(dev, sizeof(*acpi_desc), GFP_KERNEL); if (!acpi_desc) return -ENOMEM; acpi_nfit_desc_init(acpi_desc, &adev->dev); /* Save the acpi header for exporting the revision via sysfs */ acpi_desc->acpi_header = *tbl; /* Evaluate _FIT and override with that if present */ status = acpi_evaluate_object(adev->handle, "_FIT", NULL, &buf); if (ACPI_SUCCESS(status) && buf.length > 0) { union acpi_object *obj = buf.pointer; if (obj->type == ACPI_TYPE_BUFFER) rc = acpi_nfit_init(acpi_desc, obj->buffer.pointer, obj->buffer.length); else dev_dbg(dev, "invalid type %d, ignoring _FIT\n", (int) obj->type); kfree(buf.pointer); } else /* skip over the lead-in header table */ rc = acpi_nfit_init(acpi_desc, (void *) tbl + sizeof(struct acpi_table_nfit), sz - sizeof(struct acpi_table_nfit)); if (rc) return rc; return devm_add_action_or_reset(dev, acpi_nfit_shutdown, acpi_desc); } static void acpi_nfit_update_notify(struct device *dev, acpi_handle handle) { struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(dev); struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object *obj; acpi_status status; int ret; if (!dev->driver) { /* dev->driver may be null if we're being removed */ dev_dbg(dev, "no driver found for dev\n"); return; } if (!acpi_desc) { acpi_desc = devm_kzalloc(dev, sizeof(*acpi_desc), GFP_KERNEL); if (!acpi_desc) return; acpi_nfit_desc_init(acpi_desc, dev); } else { /* * Finish previous registration before considering new * regions. */ flush_workqueue(nfit_wq); } /* Evaluate _FIT */ status = acpi_evaluate_object(handle, "_FIT", NULL, &buf); if (ACPI_FAILURE(status)) { dev_err(dev, "failed to evaluate _FIT\n"); return; } obj = buf.pointer; if (obj->type == ACPI_TYPE_BUFFER) { ret = acpi_nfit_init(acpi_desc, obj->buffer.pointer, obj->buffer.length); if (ret) dev_err(dev, "failed to merge updated NFIT\n"); } else dev_err(dev, "Invalid _FIT\n"); kfree(buf.pointer); } static void acpi_nfit_uc_error_notify(struct device *dev, acpi_handle handle) { struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(dev); if (acpi_desc->scrub_mode == HW_ERROR_SCRUB_ON) acpi_nfit_ars_rescan(acpi_desc, ARS_REQ_LONG); else acpi_nfit_ars_rescan(acpi_desc, ARS_REQ_SHORT); } void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event) { dev_dbg(dev, "event: 0x%x\n", event); switch (event) { case NFIT_NOTIFY_UPDATE: return acpi_nfit_update_notify(dev, handle); case NFIT_NOTIFY_UC_MEMORY_ERROR: return acpi_nfit_uc_error_notify(dev, handle); default: return; } } EXPORT_SYMBOL_GPL(__acpi_nfit_notify); static const struct acpi_device_id acpi_nfit_ids[] = { { "ACPI0012", 0 }, { "", 0 }, }; MODULE_DEVICE_TABLE(acpi, acpi_nfit_ids); static struct acpi_driver acpi_nfit_driver = { .name = KBUILD_MODNAME, .ids = acpi_nfit_ids, .ops = { .add = acpi_nfit_add, }, }; static __init int nfit_init(void) { int ret; BUILD_BUG_ON(sizeof(struct acpi_table_nfit) != 40); BUILD_BUG_ON(sizeof(struct acpi_nfit_system_address) != 64); BUILD_BUG_ON(sizeof(struct acpi_nfit_memory_map) != 48); BUILD_BUG_ON(sizeof(struct acpi_nfit_interleave) != 16); BUILD_BUG_ON(sizeof(struct acpi_nfit_smbios) != 8); BUILD_BUG_ON(sizeof(struct acpi_nfit_control_region) != 80); BUILD_BUG_ON(sizeof(struct acpi_nfit_data_region) != 40); BUILD_BUG_ON(sizeof(struct acpi_nfit_capabilities) != 16); guid_parse(UUID_VOLATILE_MEMORY, &nfit_uuid[NFIT_SPA_VOLATILE]); guid_parse(UUID_PERSISTENT_MEMORY, &nfit_uuid[NFIT_SPA_PM]); guid_parse(UUID_CONTROL_REGION, &nfit_uuid[NFIT_SPA_DCR]); guid_parse(UUID_DATA_REGION, &nfit_uuid[NFIT_SPA_BDW]); guid_parse(UUID_VOLATILE_VIRTUAL_DISK, &nfit_uuid[NFIT_SPA_VDISK]); guid_parse(UUID_VOLATILE_VIRTUAL_CD, &nfit_uuid[NFIT_SPA_VCD]); guid_parse(UUID_PERSISTENT_VIRTUAL_DISK, &nfit_uuid[NFIT_SPA_PDISK]); guid_parse(UUID_PERSISTENT_VIRTUAL_CD, &nfit_uuid[NFIT_SPA_PCD]); guid_parse(UUID_NFIT_BUS, &nfit_uuid[NFIT_DEV_BUS]); guid_parse(UUID_NFIT_DIMM, &nfit_uuid[NFIT_DEV_DIMM]); guid_parse(UUID_NFIT_DIMM_N_HPE1, &nfit_uuid[NFIT_DEV_DIMM_N_HPE1]); guid_parse(UUID_NFIT_DIMM_N_HPE2, &nfit_uuid[NFIT_DEV_DIMM_N_HPE2]); guid_parse(UUID_NFIT_DIMM_N_MSFT, &nfit_uuid[NFIT_DEV_DIMM_N_MSFT]); guid_parse(UUID_NFIT_DIMM_N_HYPERV, &nfit_uuid[NFIT_DEV_DIMM_N_HYPERV]); guid_parse(UUID_INTEL_BUS, &nfit_uuid[NFIT_BUS_INTEL]); nfit_wq = create_singlethread_workqueue("nfit"); if (!nfit_wq) return -ENOMEM; nfit_mce_register(); ret = acpi_bus_register_driver(&acpi_nfit_driver); if (ret) { nfit_mce_unregister(); destroy_workqueue(nfit_wq); } return ret; } static __exit void nfit_exit(void) { nfit_mce_unregister(); acpi_bus_unregister_driver(&acpi_nfit_driver); destroy_workqueue(nfit_wq); WARN_ON(!list_empty(&acpi_descs)); } module_init(nfit_init); module_exit(nfit_exit); MODULE_DESCRIPTION("ACPI NVDIMM Firmware Interface Table (NFIT) driver"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Intel Corporation");
15 66 66 65 41 42 36 30 42 66 66 39 39 38 1 1 38 21 21 25 25 25 6 5 5 28 31 25 20 37 2 20 34 35 31 28 27 1 1 1 26 65 26 65 64 65 65 65 32 28 28 28 64 65 65 65 35 35 35 35 35 35 35 35 35 31 31 31 31 31 31 31 6 21 5 4 20 1 31 18 31 30 29 29 29 29 28 28 27 27 27 2 2 25 25 25 25 22 4 4 4 25 25 25 25 25 25 25 2 2 2 2 2 1 2 2 1 2 2 2 2 2 2 2 2 2 2 5 3 2 5 5 4 8 8 8 14 5 14 7 14 14 5 14 12 1 10 11 8 11 4 4 11 14 5 6 14 13 2 1 10 13 13 6 13 7 13 12 11 5 1 5 5 2 3 3 1 3 3 1 2 2 1 1 1 2 4 1 14 14 16 6 15 3 2 2 13 1 16 9 5 7 7 5 10 7 7 7 7 7 5 6 2 2 1 1 1 2 2 2 21 20 3 19 19 1 18 1 21 2 18 5 14 18 17 17 15 7 2 5 2 8 7 15 15 30 30 7 7 4 4 4 4 4 8 8 8 12 8 4 3 8 8 6 7 5 4 2 1 2 10 10 1 10 6 6 10 10 10 10 10 10 9 11 8 4 7 6 6 9 9 6 4 3 2 1 3 1 4 4 1 1 1 1 2 4 8 10 9 7 8 1 2 3 17 19 19 18 17 7 3 2 3 3 5 10 2 2 1 1 2 1 13 10 2 1 1 1 1 1 1 1 1 1 1 1 13 16 143 11 105 10 20 25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 // SPDX-License-Identifier: GPL-2.0-only /* * Functions to manage eBPF programs attached to cgroups * * Copyright (c) 2016 Daniel Mack */ #include <linux/kernel.h> #include <linux/atomic.h> #include <linux/cgroup.h> #include <linux/filter.h> #include <linux/slab.h> #include <linux/sysctl.h> #include <linux/string.h> #include <linux/bpf.h> #include <linux/bpf-cgroup.h> #include <linux/bpf_lsm.h> #include <linux/bpf_verifier.h> #include <net/sock.h> #include <net/bpf_sk_storage.h> #include "../cgroup/cgroup-internal.h" DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); /* * cgroup bpf destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup bpf * destruction work items don't end up filling up max_active of system_wq * which may lead to deadlock. */ static struct workqueue_struct *cgroup_bpf_destroy_wq; static int __init cgroup_bpf_wq_init(void) { cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1); if (!cgroup_bpf_destroy_wq) panic("Failed to alloc workqueue for cgroup bpf destroy.\n"); return 0; } core_initcall(cgroup_bpf_wq_init); /* __always_inline is necessary to prevent indirect call through run_prog * function pointer. */ static __always_inline int bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, enum cgroup_bpf_attach_type atype, const void *ctx, bpf_prog_run_fn run_prog, int retval, u32 *ret_flags) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; const struct bpf_prog_array *array; struct bpf_run_ctx *old_run_ctx; struct bpf_cg_run_ctx run_ctx; u32 func_ret; run_ctx.retval = retval; migrate_disable(); rcu_read_lock(); array = rcu_dereference(cgrp->effective[atype]); item = &array->items[0]; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); while ((prog = READ_ONCE(item->prog))) { run_ctx.prog_item = item; func_ret = run_prog(prog, ctx); if (ret_flags) { *(ret_flags) |= (func_ret >> 1); func_ret &= 1; } if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval)) run_ctx.retval = -EPERM; item++; } bpf_reset_run_ctx(old_run_ctx); rcu_read_unlock(); migrate_enable(); return run_ctx.retval; } unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx, const struct bpf_insn *insn) { const struct bpf_prog *shim_prog; struct sock *sk; struct cgroup *cgrp; int ret = 0; u64 *args; args = (u64 *)ctx; sk = (void *)(unsigned long)args[0]; /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); if (likely(cgrp)) ret = bpf_prog_run_array_cg(&cgrp->bpf, shim_prog->aux->cgroup_atype, ctx, bpf_prog_run, 0, NULL); return ret; } unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx, const struct bpf_insn *insn) { const struct bpf_prog *shim_prog; struct socket *sock; struct cgroup *cgrp; int ret = 0; u64 *args; args = (u64 *)ctx; sock = (void *)(unsigned long)args[0]; /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data); if (likely(cgrp)) ret = bpf_prog_run_array_cg(&cgrp->bpf, shim_prog->aux->cgroup_atype, ctx, bpf_prog_run, 0, NULL); return ret; } unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, const struct bpf_insn *insn) { const struct bpf_prog *shim_prog; struct cgroup *cgrp; int ret = 0; /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); /* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */ cgrp = task_dfl_cgroup(current); if (likely(cgrp)) ret = bpf_prog_run_array_cg(&cgrp->bpf, shim_prog->aux->cgroup_atype, ctx, bpf_prog_run, 0, NULL); return ret; } #ifdef CONFIG_BPF_LSM struct cgroup_lsm_atype { u32 attach_btf_id; int refcnt; }; static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM]; static enum cgroup_bpf_attach_type bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) { int i; lockdep_assert_held(&cgroup_mutex); if (attach_type != BPF_LSM_CGROUP) return to_cgroup_bpf_attach_type(attach_type); for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++) if (cgroup_lsm_atype[i].attach_btf_id == attach_btf_id) return CGROUP_LSM_START + i; for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++) if (cgroup_lsm_atype[i].attach_btf_id == 0) return CGROUP_LSM_START + i; return -E2BIG; } void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) { int i = cgroup_atype - CGROUP_LSM_START; lockdep_assert_held(&cgroup_mutex); WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id && cgroup_lsm_atype[i].attach_btf_id != attach_btf_id); cgroup_lsm_atype[i].attach_btf_id = attach_btf_id; cgroup_lsm_atype[i].refcnt++; } void bpf_cgroup_atype_put(int cgroup_atype) { int i = cgroup_atype - CGROUP_LSM_START; cgroup_lock(); if (--cgroup_lsm_atype[i].refcnt <= 0) cgroup_lsm_atype[i].attach_btf_id = 0; WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0); cgroup_unlock(); } #else static enum cgroup_bpf_attach_type bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) { if (attach_type != BPF_LSM_CGROUP) return to_cgroup_bpf_attach_type(attach_type); return -EOPNOTSUPP; } #endif /* CONFIG_BPF_LSM */ void cgroup_bpf_offline(struct cgroup *cgrp) { cgroup_get(cgrp); percpu_ref_kill(&cgrp->bpf.refcnt); } static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[]) { enum bpf_cgroup_storage_type stype; for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(storages[stype]); } static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[], struct bpf_cgroup_storage *new_storages[], enum bpf_attach_type type, struct bpf_prog *prog, struct cgroup *cgrp) { enum bpf_cgroup_storage_type stype; struct bpf_cgroup_storage_key key; struct bpf_map *map; key.cgroup_inode_id = cgroup_id(cgrp); key.attach_type = type; for_each_cgroup_storage_type(stype) { map = prog->aux->cgroup_storage[stype]; if (!map) continue; storages[stype] = cgroup_storage_lookup((void *)map, &key, false); if (storages[stype]) continue; storages[stype] = bpf_cgroup_storage_alloc(prog, stype); if (IS_ERR(storages[stype])) { bpf_cgroup_storages_free(new_storages); return -ENOMEM; } new_storages[stype] = storages[stype]; } return 0; } static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[], struct bpf_cgroup_storage *src[]) { enum bpf_cgroup_storage_type stype; for_each_cgroup_storage_type(stype) dst[stype] = src[stype]; } static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[], struct cgroup *cgrp, enum bpf_attach_type attach_type) { enum bpf_cgroup_storage_type stype; for_each_cgroup_storage_type(stype) bpf_cgroup_storage_link(storages[stype], cgrp, attach_type); } /* Called when bpf_cgroup_link is auto-detached from dying cgroup. * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It * doesn't free link memory, which will eventually be done by bpf_link's * release() callback, when its last FD is closed. */ static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link) { cgroup_put(link->cgroup); link->cgroup = NULL; } /** * cgroup_bpf_release() - put references of all bpf programs and * release all cgroup bpf data * @work: work structure embedded into the cgroup to modify */ static void cgroup_bpf_release(struct work_struct *work) { struct cgroup *p, *cgrp = container_of(work, struct cgroup, bpf.release_work); struct bpf_prog_array *old_array; struct list_head *storages = &cgrp->bpf.storages; struct bpf_cgroup_storage *storage, *stmp; unsigned int atype; cgroup_lock(); for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) { struct hlist_head *progs = &cgrp->bpf.progs[atype]; struct bpf_prog_list *pl; struct hlist_node *pltmp; hlist_for_each_entry_safe(pl, pltmp, progs, node) { hlist_del(&pl->node); if (pl->prog) { if (pl->prog->expected_attach_type == BPF_LSM_CGROUP) bpf_trampoline_unlink_cgroup_shim(pl->prog); bpf_prog_put(pl->prog); } if (pl->link) { if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP) bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog); bpf_cgroup_link_auto_detach(pl->link); } kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key[atype]); } old_array = rcu_dereference_protected( cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); bpf_prog_array_free(old_array); } list_for_each_entry_safe(storage, stmp, storages, list_cg) { bpf_cgroup_storage_unlink(storage); bpf_cgroup_storage_free(storage); } cgroup_unlock(); for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) cgroup_bpf_put(p); percpu_ref_exit(&cgrp->bpf.refcnt); cgroup_put(cgrp); } /** * cgroup_bpf_release_fn() - callback used to schedule releasing * of bpf cgroup data * @ref: percpu ref counter structure */ static void cgroup_bpf_release_fn(struct percpu_ref *ref) { struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt); INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release); queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work); } /* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through * link or direct prog. */ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl) { if (pl->prog) return pl->prog; if (pl->link) return pl->link->link.prog; return NULL; } /* count number of elements in the list. * it's slow but the list cannot be long */ static u32 prog_list_length(struct hlist_head *head) { struct bpf_prog_list *pl; u32 cnt = 0; hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; cnt++; } return cnt; } /* if parent has non-overridable prog attached, * disallow attaching new programs to the descendent cgroup. * if parent has overridable or multi-prog, allow attaching */ static bool hierarchy_allows_attach(struct cgroup *cgrp, enum cgroup_bpf_attach_type atype) { struct cgroup *p; p = cgroup_parent(cgrp); if (!p) return true; do { u32 flags = p->bpf.flags[atype]; u32 cnt; if (flags & BPF_F_ALLOW_MULTI) return true; cnt = prog_list_length(&p->bpf.progs[atype]); WARN_ON_ONCE(cnt > 1); if (cnt == 1) return !!(flags & BPF_F_ALLOW_OVERRIDE); p = cgroup_parent(p); } while (p); return true; } /* compute a chain of effective programs for a given cgroup: * start from the list of programs in this cgroup and add * all parent programs. * Note that parent's F_ALLOW_OVERRIDE-type program is yielding * to programs in this cgroup */ static int compute_effective_progs(struct cgroup *cgrp, enum cgroup_bpf_attach_type atype, struct bpf_prog_array **array) { struct bpf_prog_array_item *item; struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct cgroup *p = cgrp; int cnt = 0; /* count number of effective programs by walking parents */ do { if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) cnt += prog_list_length(&p->bpf.progs[atype]); p = cgroup_parent(p); } while (p); progs = bpf_prog_array_alloc(cnt, GFP_KERNEL); if (!progs) return -ENOMEM; /* populate the array with effective progs */ cnt = 0; p = cgrp; do { if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; hlist_for_each_entry(pl, &p->bpf.progs[atype], node) { if (!prog_list_prog(pl)) continue; item = &progs->items[cnt]; item->prog = prog_list_prog(pl); bpf_cgroup_storages_assign(item->cgroup_storage, pl->storage); cnt++; } } while ((p = cgroup_parent(p))); *array = progs; return 0; } static void activate_effective_progs(struct cgroup *cgrp, enum cgroup_bpf_attach_type atype, struct bpf_prog_array *old_array) { old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array, lockdep_is_held(&cgroup_mutex)); /* free prog array after grace period, since __cgroup_bpf_run_*() * might be still walking the array */ bpf_prog_array_free(old_array); } /** * cgroup_bpf_inherit() - inherit effective programs from parent * @cgrp: the cgroup to modify */ int cgroup_bpf_inherit(struct cgroup *cgrp) { /* has to use marco instead of const int, since compiler thinks * that array below is variable length */ #define NR ARRAY_SIZE(cgrp->bpf.effective) struct bpf_prog_array *arrays[NR] = {}; struct cgroup *p; int ret, i; ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0, GFP_KERNEL); if (ret) return ret; for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) cgroup_bpf_get(p); for (i = 0; i < NR; i++) INIT_HLIST_HEAD(&cgrp->bpf.progs[i]); INIT_LIST_HEAD(&cgrp->bpf.storages); for (i = 0; i < NR; i++) if (compute_effective_progs(cgrp, i, &arrays[i])) goto cleanup; for (i = 0; i < NR; i++) activate_effective_progs(cgrp, i, arrays[i]); return 0; cleanup: for (i = 0; i < NR; i++) bpf_prog_array_free(arrays[i]); for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) cgroup_bpf_put(p); percpu_ref_exit(&cgrp->bpf.refcnt); return -ENOMEM; } static int update_effective_progs(struct cgroup *cgrp, enum cgroup_bpf_attach_type atype) { struct cgroup_subsys_state *css; int err; /* allocate and recompute effective prog arrays */ css_for_each_descendant_pre(css, &cgrp->self) { struct cgroup *desc = container_of(css, struct cgroup, self); if (percpu_ref_is_zero(&desc->bpf.refcnt)) continue; err = compute_effective_progs(desc, atype, &desc->bpf.inactive); if (err) goto cleanup; } /* all allocations were successful. Activate all prog arrays */ css_for_each_descendant_pre(css, &cgrp->self) { struct cgroup *desc = container_of(css, struct cgroup, self); if (percpu_ref_is_zero(&desc->bpf.refcnt)) { if (unlikely(desc->bpf.inactive)) { bpf_prog_array_free(desc->bpf.inactive); desc->bpf.inactive = NULL; } continue; } activate_effective_progs(desc, atype, desc->bpf.inactive); desc->bpf.inactive = NULL; } return 0; cleanup: /* oom while computing effective. Free all computed effective arrays * since they were not activated */ css_for_each_descendant_pre(css, &cgrp->self) { struct cgroup *desc = container_of(css, struct cgroup, self); bpf_prog_array_free(desc->bpf.inactive); desc->bpf.inactive = NULL; } return err; } #define BPF_CGROUP_MAX_PROGS 64 static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs, struct bpf_prog *prog, struct bpf_cgroup_link *link, struct bpf_prog *replace_prog, bool allow_multi) { struct bpf_prog_list *pl; /* single-attach case */ if (!allow_multi) { if (hlist_empty(progs)) return NULL; return hlist_entry(progs->first, typeof(*pl), node); } hlist_for_each_entry(pl, progs, node) { if (prog && pl->prog == prog && prog != replace_prog) /* disallow attaching the same prog twice */ return ERR_PTR(-EINVAL); if (link && pl->link == link) /* disallow attaching the same link twice */ return ERR_PTR(-EINVAL); } /* direct prog multi-attach w/ replacement case */ if (replace_prog) { hlist_for_each_entry(pl, progs, node) { if (pl->prog == replace_prog) /* a match found */ return pl; } /* prog to replace not found for cgroup */ return ERR_PTR(-ENOENT); } return NULL; } /** * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse * @prog: A program to attach * @link: A link to attach * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set * @type: Type of attach operation * @flags: Option flags * * Exactly one of @prog or @link can be non-null. * Must be called with cgroup_mutex held. */ static int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *replace_prog, struct bpf_cgroup_link *link, enum bpf_attach_type type, u32 flags) { u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_prog *new_prog = prog ? : link->link.prog; enum cgroup_bpf_attach_type atype; struct bpf_prog_list *pl; struct hlist_head *progs; int err; if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI))) /* invalid combination */ return -EINVAL; if (link && (prog || replace_prog)) /* only either link or prog/replace_prog can be specified */ return -EINVAL; if (!!replace_prog != !!(flags & BPF_F_REPLACE)) /* replace_prog implies BPF_F_REPLACE, and vice versa */ return -EINVAL; atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; progs = &cgrp->bpf.progs[atype]; if (!hierarchy_allows_attach(cgrp, atype)) return -EPERM; if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags) /* Disallow attaching non-overridable on top * of existing overridable in this cgroup. * Disallow attaching multi-prog if overridable or none */ return -EPERM; if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; pl = find_attach_entry(progs, prog, link, replace_prog, flags & BPF_F_ALLOW_MULTI); if (IS_ERR(pl)) return PTR_ERR(pl); if (bpf_cgroup_storages_alloc(storage, new_storage, type, prog ? : link->link.prog, cgrp)) return -ENOMEM; if (pl) { old_prog = pl->prog; } else { struct hlist_node *last = NULL; pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { bpf_cgroup_storages_free(new_storage); return -ENOMEM; } if (hlist_empty(progs)) hlist_add_head(&pl->node, progs); else hlist_for_each(last, progs) { if (last->next) continue; hlist_add_behind(&pl->node, last); break; } } pl->prog = prog; pl->link = link; bpf_cgroup_storages_assign(pl->storage, storage); cgrp->bpf.flags[atype] = saved_flags; if (type == BPF_LSM_CGROUP) { err = bpf_trampoline_link_cgroup_shim(new_prog, atype); if (err) goto cleanup; } err = update_effective_progs(cgrp, atype); if (err) goto cleanup_trampoline; if (old_prog) { if (type == BPF_LSM_CGROUP) bpf_trampoline_unlink_cgroup_shim(old_prog); bpf_prog_put(old_prog); } else { static_branch_inc(&cgroup_bpf_enabled_key[atype]); } bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; cleanup_trampoline: if (type == BPF_LSM_CGROUP) bpf_trampoline_unlink_cgroup_shim(new_prog); cleanup: if (old_prog) { pl->prog = old_prog; pl->link = NULL; } bpf_cgroup_storages_free(new_storage); if (!old_prog) { hlist_del(&pl->node); kfree(pl); } return err; } static int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *replace_prog, struct bpf_cgroup_link *link, enum bpf_attach_type type, u32 flags) { int ret; cgroup_lock(); ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags); cgroup_unlock(); return ret; } /* Swap updated BPF program for given link in effective program arrays across * all descendant cgroups. This function is guaranteed to succeed. */ static void replace_effective_prog(struct cgroup *cgrp, enum cgroup_bpf_attach_type atype, struct bpf_cgroup_link *link) { struct bpf_prog_array_item *item; struct cgroup_subsys_state *css; struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct hlist_head *head; struct cgroup *cg; int pos; css_for_each_descendant_pre(css, &cgrp->self) { struct cgroup *desc = container_of(css, struct cgroup, self); if (percpu_ref_is_zero(&desc->bpf.refcnt)) continue; /* find position of link in effective progs array */ for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) { if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; head = &cg->bpf.progs[atype]; hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; if (pl->link == link) goto found; pos++; } } found: BUG_ON(!cg); progs = rcu_dereference_protected( desc->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); item = &progs->items[pos]; WRITE_ONCE(item->prog, link->link.prog); } } /** * __cgroup_bpf_replace() - Replace link's program and propagate the change * to descendants * @cgrp: The cgroup which descendants to traverse * @link: A link for which to replace BPF program * @new_prog: &struct bpf_prog for the target BPF program with its refcnt * incremented * * Must be called with cgroup_mutex held. */ static int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, struct bpf_prog *new_prog) { enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; struct bpf_prog_list *pl; struct hlist_head *progs; bool found = false; atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; progs = &cgrp->bpf.progs[atype]; if (link->link.prog->type != new_prog->type) return -EINVAL; hlist_for_each_entry(pl, progs, node) { if (pl->link == link) { found = true; break; } } if (!found) return -ENOENT; old_prog = xchg(&link->link.prog, new_prog); replace_effective_prog(cgrp, atype, link); bpf_prog_put(old_prog); return 0; } static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog) { struct bpf_cgroup_link *cg_link; int ret; cg_link = container_of(link, struct bpf_cgroup_link, link); cgroup_lock(); /* link might have been auto-released by dying cgroup, so fail */ if (!cg_link->cgroup) { ret = -ENOLINK; goto out_unlock; } if (old_prog && link->prog != old_prog) { ret = -EPERM; goto out_unlock; } ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog); out_unlock: cgroup_unlock(); return ret; } static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs, struct bpf_prog *prog, struct bpf_cgroup_link *link, bool allow_multi) { struct bpf_prog_list *pl; if (!allow_multi) { if (hlist_empty(progs)) /* report error when trying to detach and nothing is attached */ return ERR_PTR(-ENOENT); /* to maintain backward compatibility NONE and OVERRIDE cgroups * allow detaching with invalid FD (prog==NULL) in legacy mode */ return hlist_entry(progs->first, typeof(*pl), node); } if (!prog && !link) /* to detach MULTI prog the user has to specify valid FD * of the program or link to be detached */ return ERR_PTR(-EINVAL); /* find the prog or link and detach it */ hlist_for_each_entry(pl, progs, node) { if (pl->prog == prog && pl->link == link) return pl; } return ERR_PTR(-ENOENT); } /** * purge_effective_progs() - After compute_effective_progs fails to alloc new * cgrp->bpf.inactive table we can recover by * recomputing the array in place. * * @cgrp: The cgroup which descendants to travers * @prog: A program to detach or NULL * @link: A link to detach or NULL * @atype: Type of detach operation */ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_cgroup_link *link, enum cgroup_bpf_attach_type atype) { struct cgroup_subsys_state *css; struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct hlist_head *head; struct cgroup *cg; int pos; /* recompute effective prog array in place */ css_for_each_descendant_pre(css, &cgrp->self) { struct cgroup *desc = container_of(css, struct cgroup, self); if (percpu_ref_is_zero(&desc->bpf.refcnt)) continue; /* find position of link or prog in effective progs array */ for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) { if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; head = &cg->bpf.progs[atype]; hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; if (pl->prog == prog && pl->link == link) goto found; pos++; } } /* no link or prog match, skip the cgroup of this layer */ continue; found: progs = rcu_dereference_protected( desc->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); /* Remove the program from the array */ WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos), "Failed to purge a prog from array at index %d", pos); } } /** * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse * @prog: A program to detach or NULL * @link: A link to detach or NULL * @type: Type of detach operation * * At most one of @prog or @link can be non-NULL. * Must be called with cgroup_mutex held. */ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_cgroup_link *link, enum bpf_attach_type type) { enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; struct bpf_prog_list *pl; struct hlist_head *progs; u32 attach_btf_id = 0; u32 flags; if (prog) attach_btf_id = prog->aux->attach_btf_id; if (link) attach_btf_id = link->link.prog->aux->attach_btf_id; atype = bpf_cgroup_atype_find(type, attach_btf_id); if (atype < 0) return -EINVAL; progs = &cgrp->bpf.progs[atype]; flags = cgrp->bpf.flags[atype]; if (prog && link) /* only one of prog or link can be specified */ return -EINVAL; pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI); if (IS_ERR(pl)) return PTR_ERR(pl); /* mark it deleted, so it's ignored while recomputing effective */ old_prog = pl->prog; pl->prog = NULL; pl->link = NULL; if (update_effective_progs(cgrp, atype)) { /* if update effective array failed replace the prog with a dummy prog*/ pl->prog = old_prog; pl->link = link; purge_effective_progs(cgrp, old_prog, link, atype); } /* now can actually delete it from this cgroup list */ hlist_del(&pl->node); kfree(pl); if (hlist_empty(progs)) /* last program was detached, reset flags to zero */ cgrp->bpf.flags[atype] = 0; if (old_prog) { if (type == BPF_LSM_CGROUP) bpf_trampoline_unlink_cgroup_shim(old_prog); bpf_prog_put(old_prog); } static_branch_dec(&cgroup_bpf_enabled_key[atype]); return 0; } static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type) { int ret; cgroup_lock(); ret = __cgroup_bpf_detach(cgrp, prog, NULL, type); cgroup_unlock(); return ret; } /* Must be called with cgroup_mutex held to avoid races. */ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr) { __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags); bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE; __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); enum bpf_attach_type type = attr->query.attach_type; enum cgroup_bpf_attach_type from_atype, to_atype; enum cgroup_bpf_attach_type atype; struct bpf_prog_array *effective; int cnt, ret = 0, i; int total_cnt = 0; u32 flags; if (effective_query && prog_attach_flags) return -EINVAL; if (type == BPF_LSM_CGROUP) { if (!effective_query && attr->query.prog_cnt && prog_ids && !prog_attach_flags) return -EINVAL; from_atype = CGROUP_LSM_START; to_atype = CGROUP_LSM_END; flags = 0; } else { from_atype = to_cgroup_bpf_attach_type(type); if (from_atype < 0) return -EINVAL; to_atype = from_atype; flags = cgrp->bpf.flags[from_atype]; } for (atype = from_atype; atype <= to_atype; atype++) { if (effective_query) { effective = rcu_dereference_protected(cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); total_cnt += bpf_prog_array_length(effective); } else { total_cnt += prog_list_length(&cgrp->bpf.progs[atype]); } } /* always output uattr->query.attach_flags as 0 during effective query */ flags = effective_query ? 0 : flags; if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) return -EFAULT; if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt))) return -EFAULT; if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt) /* return early if user requested only program count + flags */ return 0; if (attr->query.prog_cnt < total_cnt) { total_cnt = attr->query.prog_cnt; ret = -ENOSPC; } for (atype = from_atype; atype <= to_atype && total_cnt; atype++) { if (effective_query) { effective = rcu_dereference_protected(cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); cnt = min_t(int, bpf_prog_array_length(effective), total_cnt); ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt); } else { struct hlist_head *progs; struct bpf_prog_list *pl; struct bpf_prog *prog; u32 id; progs = &cgrp->bpf.progs[atype]; cnt = min_t(int, prog_list_length(progs), total_cnt); i = 0; hlist_for_each_entry(pl, progs, node) { prog = prog_list_prog(pl); id = prog->aux->id; if (copy_to_user(prog_ids + i, &id, sizeof(id))) return -EFAULT; if (++i == cnt) break; } if (prog_attach_flags) { flags = cgrp->bpf.flags[atype]; for (i = 0; i < cnt; i++) if (copy_to_user(prog_attach_flags + i, &flags, sizeof(flags))) return -EFAULT; prog_attach_flags += cnt; } } prog_ids += cnt; total_cnt -= cnt; } return ret; } static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr) { int ret; cgroup_lock(); ret = __cgroup_bpf_query(cgrp, attr, uattr); cgroup_unlock(); return ret; } int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog) { struct bpf_prog *replace_prog = NULL; struct cgroup *cgrp; int ret; cgrp = cgroup_get_from_fd(attr->target_fd); if (IS_ERR(cgrp)) return PTR_ERR(cgrp); if ((attr->attach_flags & BPF_F_ALLOW_MULTI) && (attr->attach_flags & BPF_F_REPLACE)) { replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype); if (IS_ERR(replace_prog)) { cgroup_put(cgrp); return PTR_ERR(replace_prog); } } ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL, attr->attach_type, attr->attach_flags); if (replace_prog) bpf_prog_put(replace_prog); cgroup_put(cgrp); return ret; } int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { struct bpf_prog *prog; struct cgroup *cgrp; int ret; cgrp = cgroup_get_from_fd(attr->target_fd); if (IS_ERR(cgrp)) return PTR_ERR(cgrp); prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) prog = NULL; ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type); if (prog) bpf_prog_put(prog); cgroup_put(cgrp); return ret; } static void bpf_cgroup_link_release(struct bpf_link *link) { struct bpf_cgroup_link *cg_link = container_of(link, struct bpf_cgroup_link, link); struct cgroup *cg; /* link might have been auto-detached by dying cgroup already, * in that case our work is done here */ if (!cg_link->cgroup) return; cgroup_lock(); /* re-check cgroup under lock again */ if (!cg_link->cgroup) { cgroup_unlock(); return; } WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, cg_link->type)); if (cg_link->type == BPF_LSM_CGROUP) bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog); cg = cg_link->cgroup; cg_link->cgroup = NULL; cgroup_unlock(); cgroup_put(cg); } static void bpf_cgroup_link_dealloc(struct bpf_link *link) { struct bpf_cgroup_link *cg_link = container_of(link, struct bpf_cgroup_link, link); kfree(cg_link); } static int bpf_cgroup_link_detach(struct bpf_link *link) { bpf_cgroup_link_release(link); return 0; } static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_cgroup_link *cg_link = container_of(link, struct bpf_cgroup_link, link); u64 cg_id = 0; cgroup_lock(); if (cg_link->cgroup) cg_id = cgroup_id(cg_link->cgroup); cgroup_unlock(); seq_printf(seq, "cgroup_id:\t%llu\n" "attach_type:\t%d\n", cg_id, cg_link->type); } static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_cgroup_link *cg_link = container_of(link, struct bpf_cgroup_link, link); u64 cg_id = 0; cgroup_lock(); if (cg_link->cgroup) cg_id = cgroup_id(cg_link->cgroup); cgroup_unlock(); info->cgroup.cgroup_id = cg_id; info->cgroup.attach_type = cg_link->type; return 0; } static const struct bpf_link_ops bpf_cgroup_link_lops = { .release = bpf_cgroup_link_release, .dealloc = bpf_cgroup_link_dealloc, .detach = bpf_cgroup_link_detach, .update_prog = cgroup_bpf_replace, .show_fdinfo = bpf_cgroup_link_show_fdinfo, .fill_link_info = bpf_cgroup_link_fill_link_info, }; int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; struct bpf_cgroup_link *link; struct cgroup *cgrp; int err; if (attr->link_create.flags) return -EINVAL; cgrp = cgroup_get_from_fd(attr->link_create.target_fd); if (IS_ERR(cgrp)) return PTR_ERR(cgrp); link = kzalloc(sizeof(*link), GFP_USER); if (!link) { err = -ENOMEM; goto out_put_cgroup; } bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops, prog); link->cgroup = cgrp; link->type = attr->link_create.attach_type; err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); goto out_put_cgroup; } err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type, BPF_F_ALLOW_MULTI); if (err) { bpf_link_cleanup(&link_primer); goto out_put_cgroup; } return bpf_link_settle(&link_primer); out_put_cgroup: cgroup_put(cgrp); return err; } int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { struct cgroup *cgrp; int ret; cgrp = cgroup_get_from_fd(attr->query.target_fd); if (IS_ERR(cgrp)) return PTR_ERR(cgrp); ret = cgroup_bpf_query(cgrp, attr, uattr); cgroup_put(cgrp); return ret; } /** * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic * @skb: The skb that is being sent or received * @atype: The type of program to be executed * * If no socket is passed, or the socket is not of type INET or INET6, * this function does nothing and returns 0. * * The program type passed in via @type must be suitable for network * filtering. No further check is performed to assert that. * * For egress packets, this function can return: * NET_XMIT_SUCCESS (0) - continue with packet output * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr * NET_XMIT_CN (2) - continue with packet output and notify TCP * to call cwr * -err - drop packet * * For ingress packets, this function will return -EPERM if any * attached program was found and if it returned != 1 during execution. * Otherwise 0 is returned. */ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, enum cgroup_bpf_attach_type atype) { unsigned int offset = -skb_network_offset(skb); struct sock *save_sk; void *saved_data_end; struct cgroup *cgrp; int ret; if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); save_sk = skb->sk; skb->sk = sk; __skb_push(skb, offset); /* compute pointers for the bpf prog */ bpf_compute_and_save_data_end(skb, &saved_data_end); if (atype == CGROUP_INET_EGRESS) { u32 flags = 0; bool cn; ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb, __bpf_prog_run_save_cb, 0, &flags); /* Return values of CGROUP EGRESS BPF programs are: * 0: drop packet * 1: keep packet * 2: drop packet and cn * 3: keep packet and cn * * The returned value is then converted to one of the NET_XMIT * or an error code that is then interpreted as drop packet * (and no cn): * 0: NET_XMIT_SUCCESS skb should be transmitted * 1: NET_XMIT_DROP skb should be dropped and cn * 2: NET_XMIT_CN skb should be transmitted and cn * 3: -err skb should be dropped */ cn = flags & BPF_RET_SET_CN; if (ret && !IS_ERR_VALUE((long)ret)) ret = -EFAULT; if (!ret) ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); else ret = (cn ? NET_XMIT_DROP : ret); } else { ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb, __bpf_prog_run_save_cb, 0, NULL); if (ret && !IS_ERR_VALUE((long)ret)) ret = -EFAULT; } bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); skb->sk = save_sk; return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); /** * __cgroup_bpf_run_filter_sk() - Run a program on a sock * @sk: sock structure to manipulate * @atype: The type of program to be executed * * socket is passed is expected to be of type INET or INET6. * * The program type passed in via @type must be suitable for sock * filtering. No further check is performed to assert that. * * This function will return %-EPERM if any if an attached program was found * and if it returned != 1 during execution. In all other cases, 0 is returned. */ int __cgroup_bpf_run_filter_sk(struct sock *sk, enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0, NULL); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); /** * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and * provided by user sockaddr * @sk: sock struct that will use sockaddr * @uaddr: sockaddr struct provided by user * @uaddrlen: Pointer to the size of the sockaddr struct provided by user. It is * read-only for AF_INET[6] uaddr but can be modified for AF_UNIX * uaddr. * @atype: The type of program to be executed * @t_ctx: Pointer to attach type specific context * @flags: Pointer to u32 which contains higher bits of BPF program * return value (OR'ed together). * * socket is expected to be of type INET, INET6 or UNIX. * * This function will return %-EPERM if an attached program is found and * returned value != 1 during execution. In all other cases, 0 is returned. */ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, int *uaddrlen, enum cgroup_bpf_attach_type atype, void *t_ctx, u32 *flags) { struct bpf_sock_addr_kern ctx = { .sk = sk, .uaddr = uaddr, .t_ctx = t_ctx, }; struct sockaddr_storage unspec; struct cgroup *cgrp; int ret; /* Check socket family since not all sockets represent network * endpoint (e.g. AF_UNIX). */ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 && sk->sk_family != AF_UNIX) return 0; if (!ctx.uaddr) { memset(&unspec, 0, sizeof(unspec)); ctx.uaddr = (struct sockaddr *)&unspec; ctx.uaddrlen = 0; } else { ctx.uaddrlen = *uaddrlen; } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0, flags); if (!ret && uaddr) *uaddrlen = ctx.uaddrlen; return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); /** * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock * @sk: socket to get cgroup from * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains * sk with connection information (IP addresses, etc.) May not contain * cgroup info if it is a req sock. * @atype: The type of program to be executed * * socket passed is expected to be of type INET or INET6. * * The program type passed in via @type must be suitable for sock_ops * filtering. No further check is performed to assert that. * * This function will return %-EPERM if any if an attached program was found * and if it returned != 1 during execution. In all other cases, 0 is returned. */ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run, 0, NULL); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, short access, enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp; struct bpf_cgroup_dev_ctx ctx = { .access_type = (access << 16) | dev_type, .major = major, .minor = minor, }; int ret; rcu_read_lock(); cgrp = task_dfl_cgroup(current); ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0, NULL); rcu_read_unlock(); return ret; } BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) { /* flags argument is not used now, * but provides an ability to extend the API. * verifier checks that its value is correct. */ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); struct bpf_cgroup_storage *storage; struct bpf_cg_run_ctx *ctx; void *ptr; /* get current cgroup storage from BPF run context */ ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); storage = ctx->prog_item->cgroup_storage[stype]; if (stype == BPF_CGROUP_STORAGE_SHARED) ptr = &READ_ONCE(storage->buf)->data[0]; else ptr = this_cpu_ptr(storage->percpu_buf); return (unsigned long)ptr; } const struct bpf_func_proto bpf_get_local_storage_proto = { .func = bpf_get_local_storage, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, }; BPF_CALL_0(bpf_get_retval) { struct bpf_cg_run_ctx *ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); return ctx->retval; } const struct bpf_func_proto bpf_get_retval_proto = { .func = bpf_get_retval, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_1(bpf_set_retval, int, retval) { struct bpf_cg_run_ctx *ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); ctx->retval = retval; return 0; } const struct bpf_func_proto bpf_set_retval_proto = { .func = bpf_set_retval, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, }; static const struct bpf_func_proto * cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; func_proto = cgroup_current_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; default: return bpf_base_func_proto(func_id, prog); } } static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (type == BPF_WRITE) return false; if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx)) return false; /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; switch (off) { case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type): bpf_ctx_record_field_size(info, size_default); if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; break; default: if (size != size_default) return false; } return true; } const struct bpf_prog_ops cg_dev_prog_ops = { }; const struct bpf_verifier_ops cg_dev_verifier_ops = { .get_func_proto = cgroup_dev_func_proto, .is_valid_access = cgroup_dev_is_valid_access, }; /** * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl * * @head: sysctl table header * @table: sysctl table * @write: sysctl is being read (= 0) or written (= 1) * @buf: pointer to buffer (in and out) * @pcount: value-result argument: value is size of buffer pointed to by @buf, * result is size of @new_buf if program set new value, initial value * otherwise * @ppos: value-result argument: value is position at which read from or write * to sysctl is happening, result is new position if program overrode it, * initial value otherwise * @atype: type of program to be executed * * Program is run when sysctl is being accessed, either read or written, and * can allow or deny such access. * * This function will return %-EPERM if an attached program is found and * returned value != 1 during execution. In all other cases 0 is returned. */ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, const struct ctl_table *table, int write, char **buf, size_t *pcount, loff_t *ppos, enum cgroup_bpf_attach_type atype) { struct bpf_sysctl_kern ctx = { .head = head, .table = table, .write = write, .ppos = ppos, .cur_val = NULL, .cur_len = PAGE_SIZE, .new_val = NULL, .new_len = 0, .new_updated = 0, }; struct cgroup *cgrp; loff_t pos = 0; int ret; ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); if (!ctx.cur_val || table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) { /* Let BPF program decide how to proceed. */ ctx.cur_len = 0; } if (write && *buf && *pcount) { /* BPF program should be able to override new value with a * buffer bigger than provided by user. */ ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount); if (ctx.new_val) { memcpy(ctx.new_val, *buf, ctx.new_len); } else { /* Let BPF program decide how to proceed. */ ctx.new_len = 0; } } rcu_read_lock(); cgrp = task_dfl_cgroup(current); ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0, NULL); rcu_read_unlock(); kfree(ctx.cur_val); if (ret == 1 && ctx.new_updated) { kfree(*buf); *buf = ctx.new_val; *pcount = ctx.new_len; } else { kfree(ctx.new_val); } return ret; } #ifdef CONFIG_NET static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen, struct bpf_sockopt_buf *buf) { if (unlikely(max_optlen < 0)) return -EINVAL; if (unlikely(max_optlen > PAGE_SIZE)) { /* We don't expose optvals that are greater than PAGE_SIZE * to the BPF program. */ max_optlen = PAGE_SIZE; } if (max_optlen <= sizeof(buf->data)) { /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE * bytes avoid the cost of kzalloc. */ ctx->optval = buf->data; ctx->optval_end = ctx->optval + max_optlen; return max_optlen; } ctx->optval = kzalloc(max_optlen, GFP_USER); if (!ctx->optval) return -ENOMEM; ctx->optval_end = ctx->optval + max_optlen; return max_optlen; } static void sockopt_free_buf(struct bpf_sockopt_kern *ctx, struct bpf_sockopt_buf *buf) { if (ctx->optval == buf->data) return; kfree(ctx->optval); } static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx, struct bpf_sockopt_buf *buf) { return ctx->optval != buf->data; } int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, int *optname, sockptr_t optval, int *optlen, char **kernel_optval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); struct bpf_sockopt_buf buf = {}; struct bpf_sockopt_kern ctx = { .sk = sk, .level = *level, .optname = *optname, }; int ret, max_optlen; /* Allocate a bit more than the initial user buffer for * BPF program. The canonical use case is overriding * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic). */ max_optlen = max_t(int, 16, *optlen); max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; ctx.optlen = *optlen; if (copy_from_sockptr(ctx.optval, optval, min(*optlen, max_optlen))) { ret = -EFAULT; goto out; } lock_sock(sk); ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT, &ctx, bpf_prog_run, 0, NULL); release_sock(sk); if (ret) goto out; if (ctx.optlen == -1) { /* optlen set to -1, bypass kernel */ ret = 1; } else if (ctx.optlen > max_optlen || ctx.optlen < -1) { /* optlen is out of bounds */ if (*optlen > PAGE_SIZE && ctx.optlen >= 0) { pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n", ctx.optlen, max_optlen); ret = 0; goto out; } ret = -EFAULT; } else { /* optlen within bounds, run kernel handler */ ret = 0; /* export any potential modifications */ *level = ctx.level; *optname = ctx.optname; /* optlen == 0 from BPF indicates that we should * use original userspace data. */ if (ctx.optlen != 0) { *optlen = ctx.optlen; /* We've used bpf_sockopt_kern->buf as an intermediary * storage, but the BPF program indicates that we need * to pass this data to the kernel setsockopt handler. * No way to export on-stack buf, have to allocate a * new buffer. */ if (!sockopt_buf_allocated(&ctx, &buf)) { void *p = kmalloc(ctx.optlen, GFP_USER); if (!p) { ret = -ENOMEM; goto out; } memcpy(p, ctx.optval, ctx.optlen); *kernel_optval = p; } else { *kernel_optval = ctx.optval; } /* export and don't free sockopt buf */ return 0; } } out: sockopt_free_buf(&ctx, &buf); return ret; } int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen, int max_optlen, int retval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); struct bpf_sockopt_buf buf = {}; struct bpf_sockopt_kern ctx = { .sk = sk, .level = level, .optname = optname, .current_task = current, }; int orig_optlen; int ret; orig_optlen = max_optlen; ctx.optlen = max_optlen; max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; if (!retval) { /* If kernel getsockopt finished successfully, * copy whatever was returned to the user back * into our temporary buffer. Set optlen to the * one that kernel returned as well to let * BPF programs inspect the value. */ if (copy_from_sockptr(&ctx.optlen, optlen, sizeof(ctx.optlen))) { ret = -EFAULT; goto out; } if (ctx.optlen < 0) { ret = -EFAULT; goto out; } orig_optlen = ctx.optlen; if (copy_from_sockptr(ctx.optval, optval, min(ctx.optlen, max_optlen))) { ret = -EFAULT; goto out; } } lock_sock(sk); ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT, &ctx, bpf_prog_run, retval, NULL); release_sock(sk); if (ret < 0) goto out; if (!sockptr_is_null(optval) && (ctx.optlen > max_optlen || ctx.optlen < 0)) { if (orig_optlen > PAGE_SIZE && ctx.optlen >= 0) { pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n", ctx.optlen, max_optlen); ret = retval; goto out; } ret = -EFAULT; goto out; } if (ctx.optlen != 0) { if (!sockptr_is_null(optval) && copy_to_sockptr(optval, ctx.optval, ctx.optlen)) { ret = -EFAULT; goto out; } if (copy_to_sockptr(optlen, &ctx.optlen, sizeof(ctx.optlen))) { ret = -EFAULT; goto out; } } out: sockopt_free_buf(&ctx, &buf); return ret; } int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, int optname, void *optval, int *optlen, int retval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); struct bpf_sockopt_kern ctx = { .sk = sk, .level = level, .optname = optname, .optlen = *optlen, .optval = optval, .optval_end = optval + *optlen, .current_task = current, }; int ret; /* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy * user data back into BPF buffer when reval != 0. This is * done as an optimization to avoid extra copy, assuming * kernel won't populate the data in case of an error. * Here we always pass the data and memset() should * be called if that data shouldn't be "exported". */ ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT, &ctx, bpf_prog_run, retval, NULL); if (ret < 0) return ret; if (ctx.optlen > *optlen) return -EFAULT; /* BPF programs can shrink the buffer, export the modifications. */ if (ctx.optlen != 0) *optlen = ctx.optlen; return ret; } #endif static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, size_t *lenp) { ssize_t tmp_ret = 0, ret; if (dir->header.parent) { tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp); if (tmp_ret < 0) return tmp_ret; } ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp); if (ret < 0) return ret; *bufp += ret; *lenp -= ret; ret += tmp_ret; /* Avoid leading slash. */ if (!ret) return ret; tmp_ret = strscpy(*bufp, "/", *lenp); if (tmp_ret < 0) return tmp_ret; *bufp += tmp_ret; *lenp -= tmp_ret; return ret + tmp_ret; } BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf, size_t, buf_len, u64, flags) { ssize_t tmp_ret = 0, ret; if (!buf) return -EINVAL; if (!(flags & BPF_F_SYSCTL_BASE_NAME)) { if (!ctx->head) return -EINVAL; tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len); if (tmp_ret < 0) return tmp_ret; } ret = strscpy(buf, ctx->table->procname, buf_len); return ret < 0 ? ret : tmp_ret + ret; } static const struct bpf_func_proto bpf_sysctl_get_name_proto = { .func = bpf_sysctl_get_name, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; static int copy_sysctl_value(char *dst, size_t dst_len, char *src, size_t src_len) { if (!dst) return -EINVAL; if (!dst_len) return -E2BIG; if (!src || !src_len) { memset(dst, 0, dst_len); return -EINVAL; } memcpy(dst, src, min(dst_len, src_len)); if (dst_len > src_len) { memset(dst + src_len, '\0', dst_len - src_len); return src_len; } dst[dst_len - 1] = '\0'; return -E2BIG; } BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx, char *, buf, size_t, buf_len) { return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len); } static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = { .func = bpf_sysctl_get_current_value, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE, }; BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf, size_t, buf_len) { if (!ctx->write) { if (buf && buf_len) memset(buf, '\0', buf_len); return -EINVAL; } return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len); } static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = { .func = bpf_sysctl_get_new_value, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE, }; BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx, const char *, buf, size_t, buf_len) { if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len) return -EINVAL; if (buf_len > PAGE_SIZE - 1) return -E2BIG; memcpy(ctx->new_val, buf, buf_len); ctx->new_len = buf_len; ctx->new_updated = 1; return 0; } static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { .func = bpf_sysctl_set_new_value, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; func_proto = cgroup_current_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_sysctl_get_name: return &bpf_sysctl_get_name_proto; case BPF_FUNC_sysctl_get_current_value: return &bpf_sysctl_get_current_value_proto; case BPF_FUNC_sysctl_get_new_value: return &bpf_sysctl_get_new_value_proto; case BPF_FUNC_sysctl_set_new_value: return &bpf_sysctl_set_new_value_proto; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; default: return bpf_base_func_proto(func_id, prog); } } static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size) return false; switch (off) { case bpf_ctx_range(struct bpf_sysctl, write): if (type != BPF_READ) return false; bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); case bpf_ctx_range(struct bpf_sysctl, file_pos): if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); } else { return size == size_default; } default: return false; } } static u32 sysctl_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; u32 read_size; switch (si->off) { case offsetof(struct bpf_sysctl, write): *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sysctl_kern, write, sizeof_field(struct bpf_sysctl_kern, write), target_size)); break; case offsetof(struct bpf_sysctl, file_pos): /* ppos is a pointer so it should be accessed via indirect * loads and stores. Also for stores additional temporary * register is used since neither src_reg nor dst_reg can be * overridden. */ if (type == BPF_WRITE) { int treg = BPF_REG_9; if (si->src_reg == treg || si->dst_reg == treg) --treg; if (si->src_reg == treg || si->dst_reg == treg) --treg; *insn++ = BPF_STX_MEM( BPF_DW, si->dst_reg, treg, offsetof(struct bpf_sysctl_kern, tmp_reg)); *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), treg, si->dst_reg, offsetof(struct bpf_sysctl_kern, ppos)); *insn++ = BPF_RAW_INSN( BPF_CLASS(si->code) | BPF_MEM | BPF_SIZEOF(u32), treg, si->src_reg, bpf_ctx_narrow_access_offset( 0, sizeof(u32), sizeof(loff_t)), si->imm); *insn++ = BPF_LDX_MEM( BPF_DW, treg, si->dst_reg, offsetof(struct bpf_sysctl_kern, tmp_reg)); } else { *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), si->dst_reg, si->src_reg, offsetof(struct bpf_sysctl_kern, ppos)); read_size = bpf_size_to_bytes(BPF_SIZE(si->code)); *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->dst_reg, bpf_ctx_narrow_access_offset( 0, read_size, sizeof(loff_t))); } *target_size = sizeof(u32); break; } return insn - insn_buf; } const struct bpf_verifier_ops cg_sysctl_verifier_ops = { .get_func_proto = sysctl_func_proto, .is_valid_access = sysctl_is_valid_access, .convert_ctx_access = sysctl_convert_ctx_access, }; const struct bpf_prog_ops cg_sysctl_prog_ops = { }; #ifdef CONFIG_NET BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx) { const struct net *net = ctx ? sock_net(ctx->sk) : &init_net; return net->net_cookie; } static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = { .func = bpf_get_netns_cookie_sockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; #endif static const struct bpf_func_proto * cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; func_proto = cgroup_current_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { #ifdef CONFIG_NET case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sockopt_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_setsockopt: if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT) return &bpf_sk_setsockopt_proto; return NULL; case BPF_FUNC_getsockopt: if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT) return &bpf_sk_getsockopt_proto; return NULL; #endif #ifdef CONFIG_INET case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; #endif case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; default: return bpf_base_func_proto(func_id, prog); } } static bool cg_sockopt_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct bpf_sockopt)) return false; if (off % size != 0) return false; if (type == BPF_WRITE) { switch (off) { case offsetof(struct bpf_sockopt, retval): if (size != size_default) return false; return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT; case offsetof(struct bpf_sockopt, optname): fallthrough; case offsetof(struct bpf_sockopt, level): if (size != size_default) return false; return prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT; case offsetof(struct bpf_sockopt, optlen): return size == size_default; default: return false; } } switch (off) { case offsetof(struct bpf_sockopt, sk): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET; break; case offsetof(struct bpf_sockopt, optval): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET; break; case offsetof(struct bpf_sockopt, optval_end): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET_END; break; case offsetof(struct bpf_sockopt, retval): if (size != size_default) return false; return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT; default: if (size != size_default) return false; break; } return true; } #define CG_SOCKOPT_READ_FIELD(F) \ BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sockopt_kern, F)) #define CG_SOCKOPT_WRITE_FIELD(F) \ BPF_RAW_INSN((BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F) | \ BPF_MEM | BPF_CLASS(si->code)), \ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sockopt_kern, F), \ si->imm) static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct bpf_sockopt, sk): *insn++ = CG_SOCKOPT_READ_FIELD(sk); break; case offsetof(struct bpf_sockopt, level): if (type == BPF_WRITE) *insn++ = CG_SOCKOPT_WRITE_FIELD(level); else *insn++ = CG_SOCKOPT_READ_FIELD(level); break; case offsetof(struct bpf_sockopt, optname): if (type == BPF_WRITE) *insn++ = CG_SOCKOPT_WRITE_FIELD(optname); else *insn++ = CG_SOCKOPT_READ_FIELD(optname); break; case offsetof(struct bpf_sockopt, optlen): if (type == BPF_WRITE) *insn++ = CG_SOCKOPT_WRITE_FIELD(optlen); else *insn++ = CG_SOCKOPT_READ_FIELD(optlen); break; case offsetof(struct bpf_sockopt, retval): BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0); if (type == BPF_WRITE) { int treg = BPF_REG_9; if (si->src_reg == treg || si->dst_reg == treg) --treg; if (si->src_reg == treg || si->dst_reg == treg) --treg; *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg, offsetof(struct bpf_sockopt_kern, tmp_reg)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task), treg, si->dst_reg, offsetof(struct bpf_sockopt_kern, current_task)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx), treg, treg, offsetof(struct task_struct, bpf_ctx)); *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_MEM | BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval), treg, si->src_reg, offsetof(struct bpf_cg_run_ctx, retval), si->imm); *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg, offsetof(struct bpf_sockopt_kern, tmp_reg)); } else { *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task), si->dst_reg, si->src_reg, offsetof(struct bpf_sockopt_kern, current_task)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx), si->dst_reg, si->dst_reg, offsetof(struct task_struct, bpf_ctx)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval), si->dst_reg, si->dst_reg, offsetof(struct bpf_cg_run_ctx, retval)); } break; case offsetof(struct bpf_sockopt, optval): *insn++ = CG_SOCKOPT_READ_FIELD(optval); break; case offsetof(struct bpf_sockopt, optval_end): *insn++ = CG_SOCKOPT_READ_FIELD(optval_end); break; } return insn - insn_buf; } static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { /* Nothing to do for sockopt argument. The data is kzalloc'ated. */ return 0; } const struct bpf_verifier_ops cg_sockopt_verifier_ops = { .get_func_proto = cg_sockopt_func_proto, .is_valid_access = cg_sockopt_is_valid_access, .convert_ctx_access = cg_sockopt_convert_ctx_access, .gen_prologue = cg_sockopt_get_prologue, }; const struct bpf_prog_ops cg_sockopt_prog_ops = { }; /* Common helpers for cgroup hooks. */ const struct bpf_func_proto * cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; case BPF_FUNC_get_retval: switch (prog->expected_attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UNIX_GETSOCKNAME: return NULL; default: return &bpf_get_retval_proto; } case BPF_FUNC_set_retval: switch (prog->expected_attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UNIX_GETSOCKNAME: return NULL; default: return &bpf_set_retval_proto; } default: return NULL; } } /* Common helpers for cgroup hooks with valid process context. */ const struct bpf_func_proto * cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_comm: return &bpf_get_current_comm_proto; #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; #endif case BPF_FUNC_current_task_under_cgroup: return &bpf_current_task_under_cgroup_proto; default: return NULL; } }
4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 /* SPDX-License-Identifier: GPL-2.0-only */ /* * SM3 secure hash, as specified by OSCCA GM/T 0004-2012 SM3 and described * at https://datatracker.ietf.org/doc/html/draft-sca-cfrg-sm3-02 * * Copyright (C) 2017 ARM Limited or its affiliates. * Copyright (C) 2017 Gilad Ben-Yossef <gilad@benyossef.com> * Copyright (C) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> */ #include <linux/module.h> #include <linux/unaligned.h> #include <crypto/sm3.h> static const u32 ____cacheline_aligned K[64] = { 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb, 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc, 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce, 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6, 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c, 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce, 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec, 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5, 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53, 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d, 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4, 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43, 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c, 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce, 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec, 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5 }; /* * Transform the message X which consists of 16 32-bit-words. See * GM/T 004-2012 for details. */ #define R(i, a, b, c, d, e, f, g, h, t, w1, w2) \ do { \ ss1 = rol32((rol32((a), 12) + (e) + (t)), 7); \ ss2 = ss1 ^ rol32((a), 12); \ d += FF ## i(a, b, c) + ss2 + ((w1) ^ (w2)); \ h += GG ## i(e, f, g) + ss1 + (w1); \ b = rol32((b), 9); \ f = rol32((f), 19); \ h = P0((h)); \ } while (0) #define R1(a, b, c, d, e, f, g, h, t, w1, w2) \ R(1, a, b, c, d, e, f, g, h, t, w1, w2) #define R2(a, b, c, d, e, f, g, h, t, w1, w2) \ R(2, a, b, c, d, e, f, g, h, t, w1, w2) #define FF1(x, y, z) (x ^ y ^ z) #define FF2(x, y, z) ((x & y) | (x & z) | (y & z)) #define GG1(x, y, z) FF1(x, y, z) #define GG2(x, y, z) ((x & y) | (~x & z)) /* Message expansion */ #define P0(x) ((x) ^ rol32((x), 9) ^ rol32((x), 17)) #define P1(x) ((x) ^ rol32((x), 15) ^ rol32((x), 23)) #define I(i) (W[i] = get_unaligned_be32(data + i * 4)) #define W1(i) (W[i & 0x0f]) #define W2(i) (W[i & 0x0f] = \ P1(W[i & 0x0f] \ ^ W[(i-9) & 0x0f] \ ^ rol32(W[(i-3) & 0x0f], 15)) \ ^ rol32(W[(i-13) & 0x0f], 7) \ ^ W[(i-6) & 0x0f]) static void sm3_transform(struct sm3_state *sctx, u8 const *data, u32 W[16]) { u32 a, b, c, d, e, f, g, h, ss1, ss2; a = sctx->state[0]; b = sctx->state[1]; c = sctx->state[2]; d = sctx->state[3]; e = sctx->state[4]; f = sctx->state[5]; g = sctx->state[6]; h = sctx->state[7]; R1(a, b, c, d, e, f, g, h, K[0], I(0), I(4)); R1(d, a, b, c, h, e, f, g, K[1], I(1), I(5)); R1(c, d, a, b, g, h, e, f, K[2], I(2), I(6)); R1(b, c, d, a, f, g, h, e, K[3], I(3), I(7)); R1(a, b, c, d, e, f, g, h, K[4], W1(4), I(8)); R1(d, a, b, c, h, e, f, g, K[5], W1(5), I(9)); R1(c, d, a, b, g, h, e, f, K[6], W1(6), I(10)); R1(b, c, d, a, f, g, h, e, K[7], W1(7), I(11)); R1(a, b, c, d, e, f, g, h, K[8], W1(8), I(12)); R1(d, a, b, c, h, e, f, g, K[9], W1(9), I(13)); R1(c, d, a, b, g, h, e, f, K[10], W1(10), I(14)); R1(b, c, d, a, f, g, h, e, K[11], W1(11), I(15)); R1(a, b, c, d, e, f, g, h, K[12], W1(12), W2(16)); R1(d, a, b, c, h, e, f, g, K[13], W1(13), W2(17)); R1(c, d, a, b, g, h, e, f, K[14], W1(14), W2(18)); R1(b, c, d, a, f, g, h, e, K[15], W1(15), W2(19)); R2(a, b, c, d, e, f, g, h, K[16], W1(16), W2(20)); R2(d, a, b, c, h, e, f, g, K[17], W1(17), W2(21)); R2(c, d, a, b, g, h, e, f, K[18], W1(18), W2(22)); R2(b, c, d, a, f, g, h, e, K[19], W1(19), W2(23)); R2(a, b, c, d, e, f, g, h, K[20], W1(20), W2(24)); R2(d, a, b, c, h, e, f, g, K[21], W1(21), W2(25)); R2(c, d, a, b, g, h, e, f, K[22], W1(22), W2(26)); R2(b, c, d, a, f, g, h, e, K[23], W1(23), W2(27)); R2(a, b, c, d, e, f, g, h, K[24], W1(24), W2(28)); R2(d, a, b, c, h, e, f, g, K[25], W1(25), W2(29)); R2(c, d, a, b, g, h, e, f, K[26], W1(26), W2(30)); R2(b, c, d, a, f, g, h, e, K[27], W1(27), W2(31)); R2(a, b, c, d, e, f, g, h, K[28], W1(28), W2(32)); R2(d, a, b, c, h, e, f, g, K[29], W1(29), W2(33)); R2(c, d, a, b, g, h, e, f, K[30], W1(30), W2(34)); R2(b, c, d, a, f, g, h, e, K[31], W1(31), W2(35)); R2(a, b, c, d, e, f, g, h, K[32], W1(32), W2(36)); R2(d, a, b, c, h, e, f, g, K[33], W1(33), W2(37)); R2(c, d, a, b, g, h, e, f, K[34], W1(34), W2(38)); R2(b, c, d, a, f, g, h, e, K[35], W1(35), W2(39)); R2(a, b, c, d, e, f, g, h, K[36], W1(36), W2(40)); R2(d, a, b, c, h, e, f, g, K[37], W1(37), W2(41)); R2(c, d, a, b, g, h, e, f, K[38], W1(38), W2(42)); R2(b, c, d, a, f, g, h, e, K[39], W1(39), W2(43)); R2(a, b, c, d, e, f, g, h, K[40], W1(40), W2(44)); R2(d, a, b, c, h, e, f, g, K[41], W1(41), W2(45)); R2(c, d, a, b, g, h, e, f, K[42], W1(42), W2(46)); R2(b, c, d, a, f, g, h, e, K[43], W1(43), W2(47)); R2(a, b, c, d, e, f, g, h, K[44], W1(44), W2(48)); R2(d, a, b, c, h, e, f, g, K[45], W1(45), W2(49)); R2(c, d, a, b, g, h, e, f, K[46], W1(46), W2(50)); R2(b, c, d, a, f, g, h, e, K[47], W1(47), W2(51)); R2(a, b, c, d, e, f, g, h, K[48], W1(48), W2(52)); R2(d, a, b, c, h, e, f, g, K[49], W1(49), W2(53)); R2(c, d, a, b, g, h, e, f, K[50], W1(50), W2(54)); R2(b, c, d, a, f, g, h, e, K[51], W1(51), W2(55)); R2(a, b, c, d, e, f, g, h, K[52], W1(52), W2(56)); R2(d, a, b, c, h, e, f, g, K[53], W1(53), W2(57)); R2(c, d, a, b, g, h, e, f, K[54], W1(54), W2(58)); R2(b, c, d, a, f, g, h, e, K[55], W1(55), W2(59)); R2(a, b, c, d, e, f, g, h, K[56], W1(56), W2(60)); R2(d, a, b, c, h, e, f, g, K[57], W1(57), W2(61)); R2(c, d, a, b, g, h, e, f, K[58], W1(58), W2(62)); R2(b, c, d, a, f, g, h, e, K[59], W1(59), W2(63)); R2(a, b, c, d, e, f, g, h, K[60], W1(60), W2(64)); R2(d, a, b, c, h, e, f, g, K[61], W1(61), W2(65)); R2(c, d, a, b, g, h, e, f, K[62], W1(62), W2(66)); R2(b, c, d, a, f, g, h, e, K[63], W1(63), W2(67)); sctx->state[0] ^= a; sctx->state[1] ^= b; sctx->state[2] ^= c; sctx->state[3] ^= d; sctx->state[4] ^= e; sctx->state[5] ^= f; sctx->state[6] ^= g; sctx->state[7] ^= h; } #undef R #undef R1 #undef R2 #undef I #undef W1 #undef W2 static inline void sm3_block(struct sm3_state *sctx, u8 const *data, int blocks, u32 W[16]) { while (blocks--) { sm3_transform(sctx, data, W); data += SM3_BLOCK_SIZE; } } void sm3_update(struct sm3_state *sctx, const u8 *data, unsigned int len) { unsigned int partial = sctx->count % SM3_BLOCK_SIZE; u32 W[16]; sctx->count += len; if ((partial + len) >= SM3_BLOCK_SIZE) { int blocks; if (partial) { int p = SM3_BLOCK_SIZE - partial; memcpy(sctx->buffer + partial, data, p); data += p; len -= p; sm3_block(sctx, sctx->buffer, 1, W); } blocks = len / SM3_BLOCK_SIZE; len %= SM3_BLOCK_SIZE; if (blocks) { sm3_block(sctx, data, blocks, W); data += blocks * SM3_BLOCK_SIZE; } memzero_explicit(W, sizeof(W)); partial = 0; } if (len) memcpy(sctx->buffer + partial, data, len); } EXPORT_SYMBOL_GPL(sm3_update); void sm3_final(struct sm3_state *sctx, u8 *out) { const int bit_offset = SM3_BLOCK_SIZE - sizeof(u64); __be64 *bits = (__be64 *)(sctx->buffer + bit_offset); __be32 *digest = (__be32 *)out; unsigned int partial = sctx->count % SM3_BLOCK_SIZE; u32 W[16]; int i; sctx->buffer[partial++] = 0x80; if (partial > bit_offset) { memset(sctx->buffer + partial, 0, SM3_BLOCK_SIZE - partial); partial = 0; sm3_block(sctx, sctx->buffer, 1, W); } memset(sctx->buffer + partial, 0, bit_offset - partial); *bits = cpu_to_be64(sctx->count << 3); sm3_block(sctx, sctx->buffer, 1, W); for (i = 0; i < 8; i++) put_unaligned_be32(sctx->state[i], digest++); /* Zeroize sensitive information. */ memzero_explicit(W, sizeof(W)); memzero_explicit(sctx, sizeof(*sctx)); } EXPORT_SYMBOL_GPL(sm3_final); MODULE_DESCRIPTION("Generic SM3 library"); MODULE_LICENSE("GPL v2");
14 11 8 8 16 16 3 3 14 14 13 14 14 14 14 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 // SPDX-License-Identifier: GPL-2.0-only /* * pcrypt - Parallel crypto wrapper. * * Copyright (C) 2009 secunet Security Networks AG * Copyright (C) 2009 Steffen Klassert <steffen.klassert@secunet.com> */ #include <crypto/algapi.h> #include <crypto/internal/aead.h> #include <linux/atomic.h> #include <linux/err.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/kobject.h> #include <linux/cpu.h> #include <crypto/pcrypt.h> static struct padata_instance *pencrypt; static struct padata_instance *pdecrypt; static struct kset *pcrypt_kset; struct pcrypt_instance_ctx { struct crypto_aead_spawn spawn; struct padata_shell *psenc; struct padata_shell *psdec; atomic_t tfm_count; }; struct pcrypt_aead_ctx { struct crypto_aead *child; unsigned int cb_cpu; }; static inline struct pcrypt_instance_ctx *pcrypt_tfm_ictx( struct crypto_aead *tfm) { return aead_instance_ctx(aead_alg_instance(tfm)); } static int pcrypt_aead_setkey(struct crypto_aead *parent, const u8 *key, unsigned int keylen) { struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(parent); return crypto_aead_setkey(ctx->child, key, keylen); } static int pcrypt_aead_setauthsize(struct crypto_aead *parent, unsigned int authsize) { struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(parent); return crypto_aead_setauthsize(ctx->child, authsize); } static void pcrypt_aead_serial(struct padata_priv *padata) { struct pcrypt_request *preq = pcrypt_padata_request(padata); struct aead_request *req = pcrypt_request_ctx(preq); aead_request_complete(req->base.data, padata->info); } static void pcrypt_aead_done(void *data, int err) { struct aead_request *req = data; struct pcrypt_request *preq = aead_request_ctx(req); struct padata_priv *padata = pcrypt_request_padata(preq); padata->info = err; padata_do_serial(padata); } static void pcrypt_aead_enc(struct padata_priv *padata) { struct pcrypt_request *preq = pcrypt_padata_request(padata); struct aead_request *req = pcrypt_request_ctx(preq); int ret; ret = crypto_aead_encrypt(req); if (ret == -EINPROGRESS) return; padata->info = ret; padata_do_serial(padata); } static int pcrypt_aead_encrypt(struct aead_request *req) { int err; struct pcrypt_request *preq = aead_request_ctx(req); struct aead_request *creq = pcrypt_request_ctx(preq); struct padata_priv *padata = pcrypt_request_padata(preq); struct crypto_aead *aead = crypto_aead_reqtfm(req); struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(aead); u32 flags = aead_request_flags(req); struct pcrypt_instance_ctx *ictx; ictx = pcrypt_tfm_ictx(aead); memset(padata, 0, sizeof(struct padata_priv)); padata->parallel = pcrypt_aead_enc; padata->serial = pcrypt_aead_serial; aead_request_set_tfm(creq, ctx->child); aead_request_set_callback(creq, flags & ~CRYPTO_TFM_REQ_MAY_SLEEP, pcrypt_aead_done, req); aead_request_set_crypt(creq, req->src, req->dst, req->cryptlen, req->iv); aead_request_set_ad(creq, req->assoclen); err = padata_do_parallel(ictx->psenc, padata, &ctx->cb_cpu); if (!err) return -EINPROGRESS; if (err == -EBUSY) { /* try non-parallel mode */ return crypto_aead_encrypt(creq); } return err; } static void pcrypt_aead_dec(struct padata_priv *padata) { struct pcrypt_request *preq = pcrypt_padata_request(padata); struct aead_request *req = pcrypt_request_ctx(preq); int ret; ret = crypto_aead_decrypt(req); if (ret == -EINPROGRESS) return; padata->info = ret; padata_do_serial(padata); } static int pcrypt_aead_decrypt(struct aead_request *req) { int err; struct pcrypt_request *preq = aead_request_ctx(req); struct aead_request *creq = pcrypt_request_ctx(preq); struct padata_priv *padata = pcrypt_request_padata(preq); struct crypto_aead *aead = crypto_aead_reqtfm(req); struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(aead); u32 flags = aead_request_flags(req); struct pcrypt_instance_ctx *ictx; ictx = pcrypt_tfm_ictx(aead); memset(padata, 0, sizeof(struct padata_priv)); padata->parallel = pcrypt_aead_dec; padata->serial = pcrypt_aead_serial; aead_request_set_tfm(creq, ctx->child); aead_request_set_callback(creq, flags & ~CRYPTO_TFM_REQ_MAY_SLEEP, pcrypt_aead_done, req); aead_request_set_crypt(creq, req->src, req->dst, req->cryptlen, req->iv); aead_request_set_ad(creq, req->assoclen); err = padata_do_parallel(ictx->psdec, padata, &ctx->cb_cpu); if (!err) return -EINPROGRESS; if (err == -EBUSY) { /* try non-parallel mode */ return crypto_aead_decrypt(creq); } return err; } static int pcrypt_aead_init_tfm(struct crypto_aead *tfm) { int cpu, cpu_index; struct aead_instance *inst = aead_alg_instance(tfm); struct pcrypt_instance_ctx *ictx = aead_instance_ctx(inst); struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_aead *cipher; cpu_index = (unsigned int)atomic_inc_return(&ictx->tfm_count) % cpumask_weight(cpu_online_mask); ctx->cb_cpu = cpumask_first(cpu_online_mask); for (cpu = 0; cpu < cpu_index; cpu++) ctx->cb_cpu = cpumask_next(ctx->cb_cpu, cpu_online_mask); cipher = crypto_spawn_aead(&ictx->spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; crypto_aead_set_reqsize(tfm, sizeof(struct pcrypt_request) + sizeof(struct aead_request) + crypto_aead_reqsize(cipher)); return 0; } static void pcrypt_aead_exit_tfm(struct crypto_aead *tfm) { struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_aead(ctx->child); } static void pcrypt_free(struct aead_instance *inst) { struct pcrypt_instance_ctx *ctx = aead_instance_ctx(inst); crypto_drop_aead(&ctx->spawn); padata_free_shell(ctx->psdec); padata_free_shell(ctx->psenc); kfree(inst); } static int pcrypt_init_instance(struct crypto_instance *inst, struct crypto_alg *alg) { if (snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME, "pcrypt(%s)", alg->cra_driver_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; memcpy(inst->alg.cra_name, alg->cra_name, CRYPTO_MAX_ALG_NAME); inst->alg.cra_priority = alg->cra_priority + 100; inst->alg.cra_blocksize = alg->cra_blocksize; inst->alg.cra_alignmask = alg->cra_alignmask; return 0; } static int pcrypt_create_aead(struct crypto_template *tmpl, struct rtattr **tb, struct crypto_attr_type *algt) { struct pcrypt_instance_ctx *ctx; struct aead_instance *inst; struct aead_alg *alg; u32 mask = crypto_algt_inherited_mask(algt); int err; inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL); if (!inst) return -ENOMEM; err = -ENOMEM; ctx = aead_instance_ctx(inst); ctx->psenc = padata_alloc_shell(pencrypt); if (!ctx->psenc) goto err_free_inst; ctx->psdec = padata_alloc_shell(pdecrypt); if (!ctx->psdec) goto err_free_inst; err = crypto_grab_aead(&ctx->spawn, aead_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_aead_alg(&ctx->spawn); err = pcrypt_init_instance(aead_crypto_instance(inst), &alg->base); if (err) goto err_free_inst; inst->alg.base.cra_flags |= CRYPTO_ALG_ASYNC; inst->alg.ivsize = crypto_aead_alg_ivsize(alg); inst->alg.maxauthsize = crypto_aead_alg_maxauthsize(alg); inst->alg.base.cra_ctxsize = sizeof(struct pcrypt_aead_ctx); inst->alg.init = pcrypt_aead_init_tfm; inst->alg.exit = pcrypt_aead_exit_tfm; inst->alg.setkey = pcrypt_aead_setkey; inst->alg.setauthsize = pcrypt_aead_setauthsize; inst->alg.encrypt = pcrypt_aead_encrypt; inst->alg.decrypt = pcrypt_aead_decrypt; inst->free = pcrypt_free; err = aead_register_instance(tmpl, inst); if (err) { err_free_inst: pcrypt_free(inst); } return err; } static int pcrypt_create(struct crypto_template *tmpl, struct rtattr **tb) { struct crypto_attr_type *algt; algt = crypto_get_attr_type(tb); if (IS_ERR(algt)) return PTR_ERR(algt); switch (algt->type & algt->mask & CRYPTO_ALG_TYPE_MASK) { case CRYPTO_ALG_TYPE_AEAD: return pcrypt_create_aead(tmpl, tb, algt); } return -EINVAL; } static int pcrypt_sysfs_add(struct padata_instance *pinst, const char *name) { int ret; pinst->kobj.kset = pcrypt_kset; ret = kobject_add(&pinst->kobj, NULL, "%s", name); if (!ret) kobject_uevent(&pinst->kobj, KOBJ_ADD); return ret; } static int pcrypt_init_padata(struct padata_instance **pinst, const char *name) { int ret = -ENOMEM; *pinst = padata_alloc(name); if (!*pinst) return ret; ret = pcrypt_sysfs_add(*pinst, name); if (ret) padata_free(*pinst); return ret; } static struct crypto_template pcrypt_tmpl = { .name = "pcrypt", .create = pcrypt_create, .module = THIS_MODULE, }; static int __init pcrypt_init(void) { int err = -ENOMEM; pcrypt_kset = kset_create_and_add("pcrypt", NULL, kernel_kobj); if (!pcrypt_kset) goto err; err = pcrypt_init_padata(&pencrypt, "pencrypt"); if (err) goto err_unreg_kset; err = pcrypt_init_padata(&pdecrypt, "pdecrypt"); if (err) goto err_deinit_pencrypt; return crypto_register_template(&pcrypt_tmpl); err_deinit_pencrypt: padata_free(pencrypt); err_unreg_kset: kset_unregister(pcrypt_kset); err: return err; } static void __exit pcrypt_exit(void) { crypto_unregister_template(&pcrypt_tmpl); padata_free(pencrypt); padata_free(pdecrypt); kset_unregister(pcrypt_kset); } subsys_initcall(pcrypt_init); module_exit(pcrypt_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); MODULE_DESCRIPTION("Parallel crypto wrapper"); MODULE_ALIAS_CRYPTO("pcrypt");
1963 104 1 821 819 409 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NETLINK_H #define __LINUX_NETLINK_H #include <linux/capability.h> #include <linux/skbuff.h> #include <linux/export.h> #include <net/scm.h> #include <uapi/linux/netlink.h> struct net; void do_trace_netlink_extack(const char *msg); static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb) { return (struct nlmsghdr *)skb->data; } enum netlink_skb_flags { NETLINK_SKB_DST = 0x8, /* Dst set in sendto or sendmsg */ }; struct netlink_skb_parms { struct scm_creds creds; /* Skb credentials */ __u32 portid; __u32 dst_group; __u32 flags; struct sock *sk; bool nsid_is_set; int nsid; }; #define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb)) #define NETLINK_CREDS(skb) (&NETLINK_CB((skb)).creds) #define NETLINK_CTX_SIZE 48 void netlink_table_grab(void); void netlink_table_ungrab(void); #define NL_CFG_F_NONROOT_RECV (1 << 0) #define NL_CFG_F_NONROOT_SEND (1 << 1) /* optional Netlink kernel configuration parameters */ struct netlink_kernel_cfg { unsigned int groups; unsigned int flags; void (*input)(struct sk_buff *skb); int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); void (*release) (struct sock *sk, unsigned long *groups); }; struct sock *__netlink_kernel_create(struct net *net, int unit, struct module *module, struct netlink_kernel_cfg *cfg); static inline struct sock * netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) { return __netlink_kernel_create(net, unit, THIS_MODULE, cfg); } /* this can be increased when necessary - don't expose to userland */ #define NETLINK_MAX_COOKIE_LEN 20 #define NETLINK_MAX_FMTMSG_LEN 80 /** * struct netlink_ext_ack - netlink extended ACK report struct * @_msg: message string to report - don't access directly, use * %NL_SET_ERR_MSG * @bad_attr: attribute with error * @policy: policy for a bad attribute * @miss_type: attribute type which was missing * @miss_nest: nest missing an attribute (%NULL if missing top level attr) * @cookie: cookie data to return to userspace (for success) * @cookie_len: actual cookie data length * @_msg_buf: output buffer for formatted message strings - don't access * directly, use %NL_SET_ERR_MSG_FMT */ struct netlink_ext_ack { const char *_msg; const struct nlattr *bad_attr; const struct nla_policy *policy; const struct nlattr *miss_nest; u16 miss_type; u8 cookie[NETLINK_MAX_COOKIE_LEN]; u8 cookie_len; char _msg_buf[NETLINK_MAX_FMTMSG_LEN]; }; /* Always use this macro, this allows later putting the * message into a separate section or such for things * like translation or listing all possible messages. * If string formatting is needed use NL_SET_ERR_MSG_FMT. */ #define NL_SET_ERR_MSG(extack, msg) do { \ static const char __msg[] = msg; \ struct netlink_ext_ack *__extack = (extack); \ \ do_trace_netlink_extack(__msg); \ \ if (__extack) \ __extack->_msg = __msg; \ } while (0) /* We splice fmt with %s at each end even in the snprintf so that both calls * can use the same string constant, avoiding its duplication in .ro */ #define NL_SET_ERR_MSG_FMT(extack, fmt, args...) do { \ struct netlink_ext_ack *__extack = (extack); \ \ if (!__extack) \ break; \ if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN, \ "%s" fmt "%s", "", ##args, "") >= \ NETLINK_MAX_FMTMSG_LEN) \ net_warn_ratelimited("%s" fmt "%s", "truncated extack: ", \ ##args, "\n"); \ \ do_trace_netlink_extack(__extack->_msg_buf); \ \ __extack->_msg = __extack->_msg_buf; \ } while (0) #define NL_SET_ERR_MSG_MOD(extack, msg) \ NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg) #define NL_SET_ERR_MSG_FMT_MOD(extack, fmt, args...) \ NL_SET_ERR_MSG_FMT((extack), KBUILD_MODNAME ": " fmt, ##args) #define NL_SET_ERR_MSG_WEAK(extack, msg) do { \ if ((extack) && !(extack)->_msg) \ NL_SET_ERR_MSG((extack), msg); \ } while (0) #define NL_SET_ERR_MSG_WEAK_MOD(extack, msg) do { \ if ((extack) && !(extack)->_msg) \ NL_SET_ERR_MSG_MOD((extack), msg); \ } while (0) #define NL_SET_BAD_ATTR_POLICY(extack, attr, pol) do { \ if ((extack)) { \ (extack)->bad_attr = (attr); \ (extack)->policy = (pol); \ } \ } while (0) #define NL_SET_BAD_ATTR(extack, attr) NL_SET_BAD_ATTR_POLICY(extack, attr, NULL) #define NL_SET_ERR_MSG_ATTR_POL(extack, attr, pol, msg) do { \ static const char __msg[] = msg; \ struct netlink_ext_ack *__extack = (extack); \ \ do_trace_netlink_extack(__msg); \ \ if (__extack) { \ __extack->_msg = __msg; \ __extack->bad_attr = (attr); \ __extack->policy = (pol); \ } \ } while (0) #define NL_SET_ERR_MSG_ATTR_POL_FMT(extack, attr, pol, fmt, args...) do { \ struct netlink_ext_ack *__extack = (extack); \ \ if (!__extack) \ break; \ \ if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN, \ "%s" fmt "%s", "", ##args, "") >= \ NETLINK_MAX_FMTMSG_LEN) \ net_warn_ratelimited("%s" fmt "%s", "truncated extack: ", \ ##args, "\n"); \ \ do_trace_netlink_extack(__extack->_msg_buf); \ \ __extack->_msg = __extack->_msg_buf; \ __extack->bad_attr = (attr); \ __extack->policy = (pol); \ } while (0) #define NL_SET_ERR_MSG_ATTR(extack, attr, msg) \ NL_SET_ERR_MSG_ATTR_POL(extack, attr, NULL, msg) #define NL_SET_ERR_MSG_ATTR_FMT(extack, attr, msg, args...) \ NL_SET_ERR_MSG_ATTR_POL_FMT(extack, attr, NULL, msg, ##args) #define NL_SET_ERR_ATTR_MISS(extack, nest, type) do { \ struct netlink_ext_ack *__extack = (extack); \ \ if (__extack) { \ __extack->miss_nest = (nest); \ __extack->miss_type = (type); \ } \ } while (0) #define NL_REQ_ATTR_CHECK(extack, nest, tb, type) ({ \ struct nlattr **__tb = (tb); \ u32 __attr = (type); \ int __retval; \ \ __retval = !__tb[__attr]; \ if (__retval) \ NL_SET_ERR_ATTR_MISS((extack), (nest), __attr); \ __retval; \ }) static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, u64 cookie) { if (!extack) return; memcpy(extack->cookie, &cookie, sizeof(cookie)); extack->cookie_len = sizeof(cookie); } void netlink_kernel_release(struct sock *sk); int __netlink_change_ngroups(struct sock *sk, unsigned int groups); int netlink_change_ngroups(struct sock *sk, unsigned int groups); void __netlink_clear_multicast_users(struct sock *sk, unsigned int group); void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack); int netlink_has_listeners(struct sock *sk, unsigned int group); bool netlink_strict_get_check(struct sk_buff *skb); int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation); typedef int (*netlink_filter_fn)(struct sock *dsk, struct sk_buff *skb, void *data); int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation, netlink_filter_fn filter, void *filter_data); int netlink_set_err(struct sock *ssk, __u32 portid, __u32 group, int code); int netlink_register_notifier(struct notifier_block *nb); int netlink_unregister_notifier(struct notifier_block *nb); /* finegrained unicast helpers: */ struct sock *netlink_getsockbyfd(int fd); int netlink_attachskb(struct sock *sk, struct sk_buff *skb, long *timeo, struct sock *ssk); void netlink_detachskb(struct sock *sk, struct sk_buff *skb); int netlink_sendskb(struct sock *sk, struct sk_buff *skb); static inline struct sk_buff * netlink_skb_clone(struct sk_buff *skb, gfp_t gfp_mask) { struct sk_buff *nskb; nskb = skb_clone(skb, gfp_mask); if (!nskb) return NULL; /* This is a large skb, set destructor callback to release head */ if (is_vmalloc_addr(skb->head)) nskb->destructor = skb->destructor; return nskb; } /* * skb should fit one page. This choice is good for headerless malloc. * But we should limit to 8K so that userspace does not have to * use enormous buffer sizes on recvmsg() calls just to avoid * MSG_TRUNC when PAGE_SIZE is very large. */ #if PAGE_SIZE < 8192UL #define NLMSG_GOODSIZE SKB_WITH_OVERHEAD(PAGE_SIZE) #else #define NLMSG_GOODSIZE SKB_WITH_OVERHEAD(8192UL) #endif #define NLMSG_DEFAULT_SIZE (NLMSG_GOODSIZE - NLMSG_HDRLEN) struct netlink_callback { struct sk_buff *skb; const struct nlmsghdr *nlh; int (*dump)(struct sk_buff * skb, struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); void *data; /* the module that dump function belong to */ struct module *module; struct netlink_ext_ack *extack; u16 family; u16 answer_flags; u32 min_dump_alloc; unsigned int prev_seq, seq; int flags; bool strict_check; union { u8 ctx[NETLINK_CTX_SIZE]; /* args is deprecated. Cast a struct over ctx instead * for proper type safety. */ long args[6]; }; }; #define NL_ASSERT_CTX_FITS(type_name) \ BUILD_BUG_ON(sizeof(type_name) > \ sizeof_field(struct netlink_callback, ctx)) struct netlink_notify { struct net *net; u32 portid; int protocol; }; struct nlmsghdr * __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags); struct netlink_dump_control { int (*start)(struct netlink_callback *); int (*dump)(struct sk_buff *skb, struct netlink_callback *); int (*done)(struct netlink_callback *); struct netlink_ext_ack *extack; void *data; struct module *module; u32 min_dump_alloc; int flags; }; int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control); static inline int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control) { if (!control->module) control->module = THIS_MODULE; return __netlink_dump_start(ssk, skb, nlh, control); } struct netlink_tap { struct net_device *dev; struct module *module; struct list_head list; }; int netlink_add_tap(struct netlink_tap *nt); int netlink_remove_tap(struct netlink_tap *nt); bool __netlink_ns_capable(const struct netlink_skb_parms *nsp, struct user_namespace *ns, int cap); bool netlink_ns_capable(const struct sk_buff *skb, struct user_namespace *ns, int cap); bool netlink_capable(const struct sk_buff *skb, int cap); bool netlink_net_capable(const struct sk_buff *skb, int cap); struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast); #endif /* __LINUX_NETLINK_H */
10 10 7 13 13 13 10 10 7 7 13 13 104 104 104 104 104 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 /* * Copyright (c) 2006,2007 The Regents of the University of Michigan. * All rights reserved. * * Andy Adamson <andros@citi.umich.edu> * Fred Isaman <iisaman@umich.edu> * * permission is granted to use, copy, create derivative works and * redistribute this software and such derivative works for any purpose, * so long as the name of the university of michigan is not used in * any advertising or publicity pertaining to the use or distribution * of this software without specific, written prior authorization. if * the above copyright notice or any other identification of the * university of michigan is included in any copy of any portion of * this software, then the disclaimer below must also be included. * * this software is provided as is, without representation from the * university of michigan as to its fitness for any purpose, and without * warranty by the university of michigan of any kind, either express * or implied, including without limitation the implied warranties of * merchantability and fitness for a particular purpose. the regents * of the university of michigan shall not be liable for any damages, * including special, indirect, incidental, or consequential damages, * with respect to any claim arising out or in connection with the use * of the software, even if it has been or is hereafter advised of the * possibility of such damages. */ #include <linux/module.h> #include <linux/blkdev.h> #include "blocklayout.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD static void nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b) { int i; *p++ = cpu_to_be32(1); *p++ = cpu_to_be32(b->type); *p++ = cpu_to_be32(b->simple.nr_sigs); for (i = 0; i < b->simple.nr_sigs; i++) { p = xdr_encode_hyper(p, b->simple.sigs[i].offset); p = xdr_encode_opaque(p, b->simple.sigs[i].sig, b->simple.sigs[i].sig_len); } } dev_t bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b, gfp_t gfp_mask) { struct net *net = server->nfs_client->cl_net; struct nfs_net *nn = net_generic(net, nfs_net_id); struct bl_dev_msg *reply = &nn->bl_mount_reply; struct bl_pipe_msg bl_pipe_msg; struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; struct bl_msg_hdr *bl_msg; DECLARE_WAITQUEUE(wq, current); dev_t dev = 0; int rc; dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); mutex_lock(&nn->bl_mutex); bl_pipe_msg.bl_wq = &nn->bl_wq; b->simple.len += 4; /* single volume */ if (b->simple.len > PAGE_SIZE) goto out_unlock; memset(msg, 0, sizeof(*msg)); msg->len = sizeof(*bl_msg) + b->simple.len; msg->data = kzalloc(msg->len, gfp_mask); if (!msg->data) goto out_unlock; bl_msg = msg->data; bl_msg->type = BL_DEVICE_MOUNT; bl_msg->totallen = b->simple.len; nfs4_encode_simple(msg->data + sizeof(*bl_msg), b); dprintk("%s CALLING USERSPACE DAEMON\n", __func__); add_wait_queue(&nn->bl_wq, &wq); rc = rpc_queue_upcall(nn->bl_device_pipe, msg); if (rc < 0) { remove_wait_queue(&nn->bl_wq, &wq); goto out_free_data; } set_current_state(TASK_UNINTERRUPTIBLE); schedule(); remove_wait_queue(&nn->bl_wq, &wq); if (reply->status != BL_DEVICE_REQUEST_PROC) { printk(KERN_WARNING "%s failed to decode device: %d\n", __func__, reply->status); goto out_free_data; } dev = MKDEV(reply->major, reply->minor); out_free_data: kfree(msg->data); out_unlock: mutex_unlock(&nn->bl_mutex); return dev; } static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { struct nfs_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info, nfs_net_id); if (mlen != sizeof (struct bl_dev_msg)) return -EINVAL; if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0) return -EFAULT; wake_up(&nn->bl_wq); return mlen; } static void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) { struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg); if (msg->errno >= 0) return; wake_up(bl_pipe_msg->bl_wq); } static const struct rpc_pipe_ops bl_upcall_ops = { .upcall = rpc_pipe_generic_upcall, .downcall = bl_pipe_downcall, .destroy_msg = bl_pipe_destroy_msg, }; static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb, struct rpc_pipe *pipe) { struct dentry *dir, *dentry; dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME); if (dir == NULL) return ERR_PTR(-ENOENT); dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe); dput(dir); return dentry; } static void nfs4blocklayout_unregister_sb(struct super_block *sb, struct rpc_pipe *pipe) { if (pipe->dentry) rpc_unlink(pipe->dentry); } static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct super_block *sb = ptr; struct net *net = sb->s_fs_info; struct nfs_net *nn = net_generic(net, nfs_net_id); struct dentry *dentry; int ret = 0; if (!try_module_get(THIS_MODULE)) return 0; if (nn->bl_device_pipe == NULL) { module_put(THIS_MODULE); return 0; } switch (event) { case RPC_PIPEFS_MOUNT: dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); break; } nn->bl_device_pipe->dentry = dentry; break; case RPC_PIPEFS_UMOUNT: if (nn->bl_device_pipe->dentry) nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe); break; default: ret = -ENOTSUPP; break; } module_put(THIS_MODULE); return ret; } static struct notifier_block nfs4blocklayout_block = { .notifier_call = rpc_pipefs_event, }; static struct dentry *nfs4blocklayout_register_net(struct net *net, struct rpc_pipe *pipe) { struct super_block *pipefs_sb; struct dentry *dentry; pipefs_sb = rpc_get_sb_net(net); if (!pipefs_sb) return NULL; dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe); rpc_put_sb_net(net); return dentry; } static void nfs4blocklayout_unregister_net(struct net *net, struct rpc_pipe *pipe) { struct super_block *pipefs_sb; pipefs_sb = rpc_get_sb_net(net); if (pipefs_sb) { nfs4blocklayout_unregister_sb(pipefs_sb, pipe); rpc_put_sb_net(net); } } static int nfs4blocklayout_net_init(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); struct dentry *dentry; mutex_init(&nn->bl_mutex); init_waitqueue_head(&nn->bl_wq); nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0); if (IS_ERR(nn->bl_device_pipe)) return PTR_ERR(nn->bl_device_pipe); dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe); if (IS_ERR(dentry)) { rpc_destroy_pipe_data(nn->bl_device_pipe); return PTR_ERR(dentry); } nn->bl_device_pipe->dentry = dentry; return 0; } static void nfs4blocklayout_net_exit(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); nfs4blocklayout_unregister_net(net, nn->bl_device_pipe); rpc_destroy_pipe_data(nn->bl_device_pipe); nn->bl_device_pipe = NULL; } static struct pernet_operations nfs4blocklayout_net_ops = { .init = nfs4blocklayout_net_init, .exit = nfs4blocklayout_net_exit, }; int __init bl_init_pipefs(void) { int ret; ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block); if (ret) goto out; ret = register_pernet_subsys(&nfs4blocklayout_net_ops); if (ret) goto out_unregister_notifier; return 0; out_unregister_notifier: rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); out: return ret; } void bl_cleanup_pipefs(void) { rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); unregister_pernet_subsys(&nfs4blocklayout_net_ops); }
26 18 16 16 8 8 8 1 8 2 8 8 8 8 8 8 8 8 8 8 2 18 8 18 6 6 5 5 5 4 10 11 13 2 3 13 12 9 10 10 3 3 2 1 2 1 1 10 10 10 10 10 9 15 13 15 15 14 13 2 13 13 3 26 26 26 8 26 25 4 2 24 6 26 26 26 14 14 1 13 12 12 12 1 1 1 1 1 1 1 1 1 28 28 14 2 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 /* FUSE: Filesystem in Userspace Copyright (C) 2001-2018 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. */ #include "fuse_i.h" #include <linux/iversion.h> #include <linux/posix_acl.h> #include <linux/pagemap.h> #include <linux/highmem.h> static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) { struct fuse_conn *fc = get_fuse_conn(dir); struct fuse_inode *fi = get_fuse_inode(dir); if (!fc->do_readdirplus) return false; if (!fc->readdirplus_auto) return true; if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) return true; if (ctx->pos == 0) return true; return false; } static void fuse_add_dirent_to_cache(struct file *file, struct fuse_dirent *dirent, loff_t pos) { struct fuse_inode *fi = get_fuse_inode(file_inode(file)); size_t reclen = FUSE_DIRENT_SIZE(dirent); pgoff_t index; struct page *page; loff_t size; u64 version; unsigned int offset; void *addr; spin_lock(&fi->rdc.lock); /* * Is cache already completed? Or this entry does not go at the end of * cache? */ if (fi->rdc.cached || pos != fi->rdc.pos) { spin_unlock(&fi->rdc.lock); return; } version = fi->rdc.version; size = fi->rdc.size; offset = size & ~PAGE_MASK; index = size >> PAGE_SHIFT; /* Dirent doesn't fit in current page? Jump to next page. */ if (offset + reclen > PAGE_SIZE) { index++; offset = 0; } spin_unlock(&fi->rdc.lock); if (offset) { page = find_lock_page(file->f_mapping, index); } else { page = find_or_create_page(file->f_mapping, index, mapping_gfp_mask(file->f_mapping)); } if (!page) return; spin_lock(&fi->rdc.lock); /* Raced with another readdir */ if (fi->rdc.version != version || fi->rdc.size != size || WARN_ON(fi->rdc.pos != pos)) goto unlock; addr = kmap_local_page(page); if (!offset) { clear_page(addr); SetPageUptodate(page); } memcpy(addr + offset, dirent, reclen); kunmap_local(addr); fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen; fi->rdc.pos = dirent->off; unlock: spin_unlock(&fi->rdc.lock); unlock_page(page); put_page(page); } static void fuse_readdir_cache_end(struct file *file, loff_t pos) { struct fuse_inode *fi = get_fuse_inode(file_inode(file)); loff_t end; spin_lock(&fi->rdc.lock); /* does cache end position match current position? */ if (fi->rdc.pos != pos) { spin_unlock(&fi->rdc.lock); return; } fi->rdc.cached = true; end = ALIGN(fi->rdc.size, PAGE_SIZE); spin_unlock(&fi->rdc.lock); /* truncate unused tail of cache */ truncate_inode_pages(file->f_mapping, end); } static bool fuse_emit(struct file *file, struct dir_context *ctx, struct fuse_dirent *dirent) { struct fuse_file *ff = file->private_data; if (ff->open_flags & FOPEN_CACHE_DIR) fuse_add_dirent_to_cache(file, dirent, ctx->pos); return dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino, dirent->type); } static int parse_dirfile(char *buf, size_t nbytes, struct file *file, struct dir_context *ctx) { while (nbytes >= FUSE_NAME_OFFSET) { struct fuse_dirent *dirent = (struct fuse_dirent *) buf; size_t reclen = FUSE_DIRENT_SIZE(dirent); if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) return -EIO; if (reclen > nbytes) break; if (memchr(dirent->name, '/', dirent->namelen) != NULL) return -EIO; if (!fuse_emit(file, ctx, dirent)) break; buf += reclen; nbytes -= reclen; ctx->pos = dirent->off; } return 0; } static int fuse_direntplus_link(struct file *file, struct fuse_direntplus *direntplus, u64 attr_version, u64 evict_ctr) { struct fuse_entry_out *o = &direntplus->entry_out; struct fuse_dirent *dirent = &direntplus->dirent; struct dentry *parent = file->f_path.dentry; struct qstr name = QSTR_INIT(dirent->name, dirent->namelen); struct dentry *dentry; struct dentry *alias; struct inode *dir = d_inode(parent); struct fuse_conn *fc; struct inode *inode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); if (!o->nodeid) { /* * Unlike in the case of fuse_lookup, zero nodeid does not mean * ENOENT. Instead, it only means the userspace filesystem did * not want to return attributes/handle for this entry. * * So do nothing. */ return 0; } if (name.name[0] == '.') { /* * We could potentially refresh the attributes of the directory * and its parent? */ if (name.len == 1) return 0; if (name.name[1] == '.' && name.len == 2) return 0; } if (invalid_nodeid(o->nodeid)) return -EIO; if (fuse_invalid_attr(&o->attr)) return -EIO; fc = get_fuse_conn(dir); name.hash = full_name_hash(parent, name.name, name.len); dentry = d_lookup(parent, &name); if (!dentry) { retry: dentry = d_alloc_parallel(parent, &name, &wq); if (IS_ERR(dentry)) return PTR_ERR(dentry); } if (!d_in_lookup(dentry)) { struct fuse_inode *fi; inode = d_inode(dentry); if (inode && get_node_id(inode) != o->nodeid) inode = NULL; if (!inode || fuse_stale_inode(inode, o->generation, &o->attr)) { if (inode) fuse_make_bad(inode); d_invalidate(dentry); dput(dentry); goto retry; } if (fuse_is_bad(inode)) { dput(dentry); return -EIO; } fi = get_fuse_inode(inode); spin_lock(&fi->lock); fi->nlookup++; spin_unlock(&fi->lock); forget_all_cached_acls(inode); fuse_change_attributes(inode, &o->attr, NULL, ATTR_TIMEOUT(o), attr_version); /* * The other branch comes via fuse_iget() * which bumps nlookup inside */ } else { inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, &o->attr, ATTR_TIMEOUT(o), attr_version, evict_ctr); if (!inode) inode = ERR_PTR(-ENOMEM); alias = d_splice_alias(inode, dentry); d_lookup_done(dentry); if (alias) { dput(dentry); dentry = alias; } if (IS_ERR(dentry)) { if (!IS_ERR(inode)) { struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fi->lock); fi->nlookup--; spin_unlock(&fi->lock); } return PTR_ERR(dentry); } } if (fc->readdirplus_auto) set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); fuse_change_entry_timeout(dentry, o); dput(dentry); return 0; } static void fuse_force_forget(struct file *file, u64 nodeid) { struct inode *inode = file_inode(file); struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_forget_in inarg; FUSE_ARGS(args); memset(&inarg, 0, sizeof(inarg)); inarg.nlookup = 1; args.opcode = FUSE_FORGET; args.nodeid = nodeid; args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.force = true; args.noreply = true; fuse_simple_request(fm, &args); /* ignore errors */ } static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, struct dir_context *ctx, u64 attr_version, u64 evict_ctr) { struct fuse_direntplus *direntplus; struct fuse_dirent *dirent; size_t reclen; int over = 0; int ret; while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) { direntplus = (struct fuse_direntplus *) buf; dirent = &direntplus->dirent; reclen = FUSE_DIRENTPLUS_SIZE(direntplus); if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) return -EIO; if (reclen > nbytes) break; if (memchr(dirent->name, '/', dirent->namelen) != NULL) return -EIO; if (!over) { /* We fill entries into dstbuf only as much as it can hold. But we still continue iterating over remaining entries to link them. If not, we need to send a FORGET for each of those which we did not link. */ over = !fuse_emit(file, ctx, dirent); if (!over) ctx->pos = dirent->off; } buf += reclen; nbytes -= reclen; ret = fuse_direntplus_link(file, direntplus, attr_version, evict_ctr); if (ret) fuse_force_forget(file, direntplus->entry_out.nodeid); } return 0; } static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) { int plus; ssize_t res; struct folio *folio; struct inode *inode = file_inode(file); struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_io_args ia = {}; struct fuse_args_pages *ap = &ia.ap; struct fuse_folio_desc desc = { .length = PAGE_SIZE }; u64 attr_version = 0, evict_ctr = 0; bool locked; folio = folio_alloc(GFP_KERNEL, 0); if (!folio) return -ENOMEM; plus = fuse_use_readdirplus(inode, ctx); ap->args.out_pages = true; ap->num_folios = 1; ap->folios = &folio; ap->descs = &desc; if (plus) { attr_version = fuse_get_attr_version(fm->fc); evict_ctr = fuse_get_evict_ctr(fm->fc); fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, FUSE_READDIRPLUS); } else { fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, FUSE_READDIR); } locked = fuse_lock_inode(inode); res = fuse_simple_request(fm, &ap->args); fuse_unlock_inode(inode, locked); if (res >= 0) { if (!res) { struct fuse_file *ff = file->private_data; if (ff->open_flags & FOPEN_CACHE_DIR) fuse_readdir_cache_end(file, ctx->pos); } else if (plus) { res = parse_dirplusfile(folio_address(folio), res, file, ctx, attr_version, evict_ctr); } else { res = parse_dirfile(folio_address(folio), res, file, ctx); } } folio_put(folio); fuse_invalidate_atime(inode); return res; } enum fuse_parse_result { FOUND_ERR = -1, FOUND_NONE = 0, FOUND_SOME, FOUND_ALL, }; static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff, void *addr, unsigned int size, struct dir_context *ctx) { unsigned int offset = ff->readdir.cache_off & ~PAGE_MASK; enum fuse_parse_result res = FOUND_NONE; WARN_ON(offset >= size); for (;;) { struct fuse_dirent *dirent = addr + offset; unsigned int nbytes = size - offset; size_t reclen; if (nbytes < FUSE_NAME_OFFSET || !dirent->namelen) break; reclen = FUSE_DIRENT_SIZE(dirent); /* derefs ->namelen */ if (WARN_ON(dirent->namelen > FUSE_NAME_MAX)) return FOUND_ERR; if (WARN_ON(reclen > nbytes)) return FOUND_ERR; if (WARN_ON(memchr(dirent->name, '/', dirent->namelen) != NULL)) return FOUND_ERR; if (ff->readdir.pos == ctx->pos) { res = FOUND_SOME; if (!dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino, dirent->type)) return FOUND_ALL; ctx->pos = dirent->off; } ff->readdir.pos = dirent->off; ff->readdir.cache_off += reclen; offset += reclen; } return res; } static void fuse_rdc_reset(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); fi->rdc.cached = false; fi->rdc.version++; fi->rdc.size = 0; fi->rdc.pos = 0; } #define UNCACHED 1 static int fuse_readdir_cached(struct file *file, struct dir_context *ctx) { struct fuse_file *ff = file->private_data; struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); enum fuse_parse_result res; pgoff_t index; unsigned int size; struct page *page; void *addr; /* Seeked? If so, reset the cache stream */ if (ff->readdir.pos != ctx->pos) { ff->readdir.pos = 0; ff->readdir.cache_off = 0; } /* * We're just about to start reading into the cache or reading the * cache; both cases require an up-to-date mtime value. */ if (!ctx->pos && fc->auto_inval_data) { int err = fuse_update_attributes(inode, file, STATX_MTIME); if (err) return err; } retry: spin_lock(&fi->rdc.lock); retry_locked: if (!fi->rdc.cached) { /* Starting cache? Set cache mtime. */ if (!ctx->pos && !fi->rdc.size) { fi->rdc.mtime = inode_get_mtime(inode); fi->rdc.iversion = inode_query_iversion(inode); } spin_unlock(&fi->rdc.lock); return UNCACHED; } /* * When at the beginning of the directory (i.e. just after opendir(3) or * rewinddir(3)), then need to check whether directory contents have * changed, and reset the cache if so. */ if (!ctx->pos) { struct timespec64 mtime = inode_get_mtime(inode); if (inode_peek_iversion(inode) != fi->rdc.iversion || !timespec64_equal(&fi->rdc.mtime, &mtime)) { fuse_rdc_reset(inode); goto retry_locked; } } /* * If cache version changed since the last getdents() call, then reset * the cache stream. */ if (ff->readdir.version != fi->rdc.version) { ff->readdir.pos = 0; ff->readdir.cache_off = 0; } /* * If at the beginning of the cache, than reset version to * current. */ if (ff->readdir.pos == 0) ff->readdir.version = fi->rdc.version; WARN_ON(fi->rdc.size < ff->readdir.cache_off); index = ff->readdir.cache_off >> PAGE_SHIFT; if (index == (fi->rdc.size >> PAGE_SHIFT)) size = fi->rdc.size & ~PAGE_MASK; else size = PAGE_SIZE; spin_unlock(&fi->rdc.lock); /* EOF? */ if ((ff->readdir.cache_off & ~PAGE_MASK) == size) return 0; page = find_get_page_flags(file->f_mapping, index, FGP_ACCESSED | FGP_LOCK); /* Page gone missing, then re-added to cache, but not initialized? */ if (page && !PageUptodate(page)) { unlock_page(page); put_page(page); page = NULL; } spin_lock(&fi->rdc.lock); if (!page) { /* * Uh-oh: page gone missing, cache is useless */ if (fi->rdc.version == ff->readdir.version) fuse_rdc_reset(inode); goto retry_locked; } /* Make sure it's still the same version after getting the page. */ if (ff->readdir.version != fi->rdc.version) { spin_unlock(&fi->rdc.lock); unlock_page(page); put_page(page); goto retry; } spin_unlock(&fi->rdc.lock); /* * Contents of the page are now protected against changing by holding * the page lock. */ addr = kmap_local_page(page); res = fuse_parse_cache(ff, addr, size, ctx); kunmap_local(addr); unlock_page(page); put_page(page); if (res == FOUND_ERR) return -EIO; if (res == FOUND_ALL) return 0; if (size == PAGE_SIZE) { /* We hit end of page: skip to next page. */ ff->readdir.cache_off = ALIGN(ff->readdir.cache_off, PAGE_SIZE); goto retry; } /* * End of cache reached. If found position, then we are done, otherwise * need to fall back to uncached, since the position we were looking for * wasn't in the cache. */ return res == FOUND_SOME ? 0 : UNCACHED; } int fuse_readdir(struct file *file, struct dir_context *ctx) { struct fuse_file *ff = file->private_data; struct inode *inode = file_inode(file); int err; if (fuse_is_bad(inode)) return -EIO; err = UNCACHED; if (ff->open_flags & FOPEN_CACHE_DIR) err = fuse_readdir_cached(file, ctx); if (err == UNCACHED) err = fuse_readdir_uncached(file, ctx); return err; }
3 3 3 3 3 3 1 2 2 9 9 3 1 3 2 2 3 8 8 5 5 1 4 2 1 8 8 8 8 4 8 8 4 8 8 8 8 4 1 3 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ * Copyright (C) 2021, Alibaba Cloud */ #include <linux/statfs.h> #include <linux/seq_file.h> #include <linux/crc32c.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/exportfs.h> #include <linux/backing-dev.h> #include "xattr.h" #define CREATE_TRACE_POINTS #include <trace/events/erofs.h> static struct kmem_cache *erofs_inode_cachep __read_mostly; void _erofs_printk(struct super_block *sb, const char *fmt, ...) { struct va_format vaf; va_list args; int level; va_start(args, fmt); level = printk_get_level(fmt); vaf.fmt = printk_skip_level(fmt); vaf.va = &args; if (sb) printk("%c%cerofs (device %s): %pV", KERN_SOH_ASCII, level, sb->s_id, &vaf); else printk("%c%cerofs: %pV", KERN_SOH_ASCII, level, &vaf); va_end(args); } static int erofs_superblock_csum_verify(struct super_block *sb, void *sbdata) { size_t len = 1 << EROFS_SB(sb)->blkszbits; struct erofs_super_block *dsb; u32 expected_crc, crc; if (len > EROFS_SUPER_OFFSET) len -= EROFS_SUPER_OFFSET; dsb = kmemdup(sbdata + EROFS_SUPER_OFFSET, len, GFP_KERNEL); if (!dsb) return -ENOMEM; expected_crc = le32_to_cpu(dsb->checksum); dsb->checksum = 0; /* to allow for x86 boot sectors and other oddities. */ crc = crc32c(~0, dsb, len); kfree(dsb); if (crc != expected_crc) { erofs_err(sb, "invalid checksum 0x%08x, 0x%08x expected", crc, expected_crc); return -EBADMSG; } return 0; } static void erofs_inode_init_once(void *ptr) { struct erofs_inode *vi = ptr; inode_init_once(&vi->vfs_inode); } static struct inode *erofs_alloc_inode(struct super_block *sb) { struct erofs_inode *vi = alloc_inode_sb(sb, erofs_inode_cachep, GFP_KERNEL); if (!vi) return NULL; /* zero out everything except vfs_inode */ memset(vi, 0, offsetof(struct erofs_inode, vfs_inode)); return &vi->vfs_inode; } static void erofs_free_inode(struct inode *inode) { struct erofs_inode *vi = EROFS_I(inode); if (inode->i_op == &erofs_fast_symlink_iops) kfree(inode->i_link); kfree(vi->xattr_shared_xattrs); kmem_cache_free(erofs_inode_cachep, vi); } /* read variable-sized metadata, offset will be aligned by 4-byte */ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, erofs_off_t *offset, int *lengthp) { u8 *buffer, *ptr; int len, i, cnt; *offset = round_up(*offset, 4); ptr = erofs_bread(buf, *offset, EROFS_KMAP); if (IS_ERR(ptr)) return ptr; len = le16_to_cpu(*(__le16 *)ptr); if (!len) len = U16_MAX + 1; buffer = kmalloc(len, GFP_KERNEL); if (!buffer) return ERR_PTR(-ENOMEM); *offset += sizeof(__le16); *lengthp = len; for (i = 0; i < len; i += cnt) { cnt = min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset), len - i); ptr = erofs_bread(buf, *offset, EROFS_KMAP); if (IS_ERR(ptr)) { kfree(buffer); return ptr; } memcpy(buffer + i, ptr, cnt); *offset += cnt; } return buffer; } #ifndef CONFIG_EROFS_FS_ZIP static int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb) { if (!dsb->u1.available_compr_algs) return 0; erofs_err(sb, "compression disabled, unable to mount compressed EROFS"); return -EOPNOTSUPP; } #endif static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_device_info *dif, erofs_off_t *pos) { struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; struct erofs_deviceslot *dis; struct file *file; dis = erofs_read_metabuf(buf, sb, *pos, EROFS_KMAP); if (IS_ERR(dis)) return PTR_ERR(dis); if (!sbi->devs->flatdev && !dif->path) { if (!dis->tag[0]) { erofs_err(sb, "empty device tag @ pos %llu", *pos); return -EINVAL; } dif->path = kmemdup_nul(dis->tag, sizeof(dis->tag), GFP_KERNEL); if (!dif->path) return -ENOMEM; } if (erofs_is_fscache_mode(sb)) { fscache = erofs_fscache_register_cookie(sb, dif->path, 0); if (IS_ERR(fscache)) return PTR_ERR(fscache); dif->fscache = fscache; } else if (!sbi->devs->flatdev) { file = erofs_is_fileio_mode(sbi) ? filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0) : bdev_file_open_by_path(dif->path, BLK_OPEN_READ, sb->s_type, NULL); if (IS_ERR(file)) return PTR_ERR(file); if (!erofs_is_fileio_mode(sbi)) { dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file), &dif->dax_part_off, NULL, NULL); } else if (!S_ISREG(file_inode(file)->i_mode)) { fput(file); return -EINVAL; } dif->file = file; } dif->blocks = le32_to_cpu(dis->blocks); dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr); sbi->total_blocks += dif->blocks; *pos += EROFS_DEVT_SLOT_SIZE; return 0; } static int erofs_scan_devices(struct super_block *sb, struct erofs_super_block *dsb) { struct erofs_sb_info *sbi = EROFS_SB(sb); unsigned int ondisk_extradevs; erofs_off_t pos; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_device_info *dif; int id, err = 0; sbi->total_blocks = sbi->dif0.blocks; if (!erofs_sb_has_device_table(sbi)) ondisk_extradevs = 0; else ondisk_extradevs = le16_to_cpu(dsb->extra_devices); if (sbi->devs->extra_devices && ondisk_extradevs != sbi->devs->extra_devices) { erofs_err(sb, "extra devices don't match (ondisk %u, given %u)", ondisk_extradevs, sbi->devs->extra_devices); return -EINVAL; } if (!ondisk_extradevs) return 0; if (!sbi->devs->extra_devices && !erofs_is_fscache_mode(sb)) sbi->devs->flatdev = true; sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1; pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE; down_read(&sbi->devs->rwsem); if (sbi->devs->extra_devices) { idr_for_each_entry(&sbi->devs->tree, dif, id) { err = erofs_init_device(&buf, sb, dif, &pos); if (err) break; } } else { for (id = 0; id < ondisk_extradevs; id++) { dif = kzalloc(sizeof(*dif), GFP_KERNEL); if (!dif) { err = -ENOMEM; break; } err = idr_alloc(&sbi->devs->tree, dif, 0, 0, GFP_KERNEL); if (err < 0) { kfree(dif); break; } ++sbi->devs->extra_devices; err = erofs_init_device(&buf, sb, dif, &pos); if (err) break; } } up_read(&sbi->devs->rwsem); erofs_put_metabuf(&buf); return err; } static int erofs_read_superblock(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_super_block *dsb; void *data; int ret; data = erofs_read_metabuf(&buf, sb, 0, EROFS_KMAP); if (IS_ERR(data)) { erofs_err(sb, "cannot read erofs superblock"); return PTR_ERR(data); } dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET); ret = -EINVAL; if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) { erofs_err(sb, "cannot find valid erofs superblock"); goto out; } sbi->blkszbits = dsb->blkszbits; if (sbi->blkszbits < 9 || sbi->blkszbits > PAGE_SHIFT) { erofs_err(sb, "blkszbits %u isn't supported", sbi->blkszbits); goto out; } if (dsb->dirblkbits) { erofs_err(sb, "dirblkbits %u isn't supported", dsb->dirblkbits); goto out; } sbi->feature_compat = le32_to_cpu(dsb->feature_compat); if (erofs_sb_has_sb_chksum(sbi)) { ret = erofs_superblock_csum_verify(sb, data); if (ret) goto out; } ret = -EINVAL; sbi->feature_incompat = le32_to_cpu(dsb->feature_incompat); if (sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT) { erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel", sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT); goto out; } sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE; if (sbi->sb_size > PAGE_SIZE - EROFS_SUPER_OFFSET) { erofs_err(sb, "invalid sb_extslots %u (more than a fs block)", sbi->sb_size); goto out; } sbi->dif0.blocks = le32_to_cpu(dsb->blocks); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); sbi->xattr_prefix_start = le32_to_cpu(dsb->xattr_prefix_start); sbi->xattr_prefix_count = dsb->xattr_prefix_count; sbi->xattr_filter_reserved = dsb->xattr_filter_reserved; #endif sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); sbi->root_nid = le16_to_cpu(dsb->root_nid); sbi->packed_nid = le64_to_cpu(dsb->packed_nid); sbi->inos = le64_to_cpu(dsb->inos); sbi->build_time = le64_to_cpu(dsb->build_time); sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec); super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid)); ret = strscpy(sbi->volume_name, dsb->volume_name, sizeof(dsb->volume_name)); if (ret < 0) { /* -E2BIG */ erofs_err(sb, "bad volume name without NIL terminator"); ret = -EFSCORRUPTED; goto out; } /* parse on-disk compression configurations */ ret = z_erofs_parse_cfgs(sb, dsb); if (ret < 0) goto out; /* handle multiple devices */ ret = erofs_scan_devices(sb, dsb); if (erofs_is_fscache_mode(sb)) erofs_info(sb, "[deprecated] fscache-based on-demand read feature in use. Use at your own risk!"); out: erofs_put_metabuf(&buf); return ret; } static void erofs_default_options(struct erofs_sb_info *sbi) { #ifdef CONFIG_EROFS_FS_ZIP sbi->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND; sbi->opt.max_sync_decompress_pages = 3; sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_AUTO; #endif #ifdef CONFIG_EROFS_FS_XATTR set_opt(&sbi->opt, XATTR_USER); #endif #ifdef CONFIG_EROFS_FS_POSIX_ACL set_opt(&sbi->opt, POSIX_ACL); #endif } enum { Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum, Opt_device, Opt_fsid, Opt_domain_id, Opt_directio, Opt_err }; static const struct constant_table erofs_param_cache_strategy[] = { {"disabled", EROFS_ZIP_CACHE_DISABLED}, {"readahead", EROFS_ZIP_CACHE_READAHEAD}, {"readaround", EROFS_ZIP_CACHE_READAROUND}, {} }; static const struct constant_table erofs_dax_param_enums[] = { {"always", EROFS_MOUNT_DAX_ALWAYS}, {"never", EROFS_MOUNT_DAX_NEVER}, {} }; static const struct fs_parameter_spec erofs_fs_parameters[] = { fsparam_flag_no("user_xattr", Opt_user_xattr), fsparam_flag_no("acl", Opt_acl), fsparam_enum("cache_strategy", Opt_cache_strategy, erofs_param_cache_strategy), fsparam_flag("dax", Opt_dax), fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums), fsparam_string("device", Opt_device), fsparam_string("fsid", Opt_fsid), fsparam_string("domain_id", Opt_domain_id), fsparam_flag_no("directio", Opt_directio), {} }; static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) { #ifdef CONFIG_FS_DAX struct erofs_sb_info *sbi = fc->s_fs_info; switch (mode) { case EROFS_MOUNT_DAX_ALWAYS: set_opt(&sbi->opt, DAX_ALWAYS); clear_opt(&sbi->opt, DAX_NEVER); return true; case EROFS_MOUNT_DAX_NEVER: set_opt(&sbi->opt, DAX_NEVER); clear_opt(&sbi->opt, DAX_ALWAYS); return true; default: DBG_BUGON(1); return false; } #else errorfc(fc, "dax options not supported"); return false; #endif } static int erofs_fc_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct erofs_sb_info *sbi = fc->s_fs_info; struct fs_parse_result result; struct erofs_device_info *dif; int opt, ret; opt = fs_parse(fc, erofs_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_user_xattr: #ifdef CONFIG_EROFS_FS_XATTR if (result.boolean) set_opt(&sbi->opt, XATTR_USER); else clear_opt(&sbi->opt, XATTR_USER); #else errorfc(fc, "{,no}user_xattr options not supported"); #endif break; case Opt_acl: #ifdef CONFIG_EROFS_FS_POSIX_ACL if (result.boolean) set_opt(&sbi->opt, POSIX_ACL); else clear_opt(&sbi->opt, POSIX_ACL); #else errorfc(fc, "{,no}acl options not supported"); #endif break; case Opt_cache_strategy: #ifdef CONFIG_EROFS_FS_ZIP sbi->opt.cache_strategy = result.uint_32; #else errorfc(fc, "compression not supported, cache_strategy ignored"); #endif break; case Opt_dax: if (!erofs_fc_set_dax_mode(fc, EROFS_MOUNT_DAX_ALWAYS)) return -EINVAL; break; case Opt_dax_enum: if (!erofs_fc_set_dax_mode(fc, result.uint_32)) return -EINVAL; break; case Opt_device: dif = kzalloc(sizeof(*dif), GFP_KERNEL); if (!dif) return -ENOMEM; dif->path = kstrdup(param->string, GFP_KERNEL); if (!dif->path) { kfree(dif); return -ENOMEM; } down_write(&sbi->devs->rwsem); ret = idr_alloc(&sbi->devs->tree, dif, 0, 0, GFP_KERNEL); up_write(&sbi->devs->rwsem); if (ret < 0) { kfree(dif->path); kfree(dif); return ret; } ++sbi->devs->extra_devices; break; #ifdef CONFIG_EROFS_FS_ONDEMAND case Opt_fsid: kfree(sbi->fsid); sbi->fsid = kstrdup(param->string, GFP_KERNEL); if (!sbi->fsid) return -ENOMEM; break; case Opt_domain_id: kfree(sbi->domain_id); sbi->domain_id = kstrdup(param->string, GFP_KERNEL); if (!sbi->domain_id) return -ENOMEM; break; #else case Opt_fsid: case Opt_domain_id: errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); break; #endif case Opt_directio: #ifdef CONFIG_EROFS_FS_BACKED_BY_FILE if (result.boolean) set_opt(&sbi->opt, DIRECT_IO); else clear_opt(&sbi->opt, DIRECT_IO); #else errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); #endif break; default: return -ENOPARAM; } return 0; } static struct inode *erofs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { return erofs_iget(sb, ino); } static struct dentry *erofs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, erofs_nfs_get_inode); } static struct dentry *erofs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, erofs_nfs_get_inode); } static struct dentry *erofs_get_parent(struct dentry *child) { erofs_nid_t nid; unsigned int d_type; int err; err = erofs_namei(d_inode(child), &dotdot_name, &nid, &d_type); if (err) return ERR_PTR(err); return d_obtain_alias(erofs_iget(child->d_sb, nid)); } static const struct export_operations erofs_export_ops = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = erofs_fh_to_dentry, .fh_to_parent = erofs_fh_to_parent, .get_parent = erofs_get_parent, }; static void erofs_set_sysfs_name(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); if (sbi->domain_id) super_set_sysfs_name_generic(sb, "%s,%s", sbi->domain_id, sbi->fsid); else if (sbi->fsid) super_set_sysfs_name_generic(sb, "%s", sbi->fsid); else if (erofs_is_fileio_mode(sbi)) super_set_sysfs_name_generic(sb, "%s", bdi_dev_name(sb->s_bdi)); else super_set_sysfs_name_id(sb); } static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; struct erofs_sb_info *sbi = EROFS_SB(sb); int err; sb->s_magic = EROFS_SUPER_MAGIC; sb->s_flags |= SB_RDONLY | SB_NOATIME; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_op = &erofs_sops; sbi->blkszbits = PAGE_SHIFT; if (!sb->s_bdev) { sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; if (erofs_is_fscache_mode(sb)) { err = erofs_fscache_register_fs(sb); if (err) return err; } err = super_setup_bdi(sb); if (err) return err; } else { if (!sb_set_blocksize(sb, PAGE_SIZE)) { errorfc(fc, "failed to set initial blksize"); return -EINVAL; } sbi->dif0.dax_dev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->dif0.dax_part_off, NULL, NULL); } err = erofs_read_superblock(sb); if (err) return err; if (sb->s_blocksize_bits != sbi->blkszbits) { if (erofs_is_fscache_mode(sb)) { errorfc(fc, "unsupported blksize for fscache mode"); return -EINVAL; } if (erofs_is_fileio_mode(sbi)) { sb->s_blocksize = 1 << sbi->blkszbits; sb->s_blocksize_bits = sbi->blkszbits; } else if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) { errorfc(fc, "failed to set erofs blksize"); return -EINVAL; } } if (test_opt(&sbi->opt, DAX_ALWAYS)) { if (!sbi->dif0.dax_dev) { errorfc(fc, "DAX unsupported by block device. Turning off DAX."); clear_opt(&sbi->opt, DAX_ALWAYS); } else if (sbi->blkszbits != PAGE_SHIFT) { errorfc(fc, "unsupported blocksize for DAX"); clear_opt(&sbi->opt, DAX_ALWAYS); } } sb->s_time_gran = 1; sb->s_xattr = erofs_xattr_handlers; sb->s_export_op = &erofs_export_ops; if (test_opt(&sbi->opt, POSIX_ACL)) sb->s_flags |= SB_POSIXACL; else sb->s_flags &= ~SB_POSIXACL; #ifdef CONFIG_EROFS_FS_ZIP xa_init(&sbi->managed_pslots); #endif inode = erofs_iget(sb, sbi->root_nid); if (IS_ERR(inode)) return PTR_ERR(inode); if (!S_ISDIR(inode->i_mode)) { erofs_err(sb, "rootino(nid %llu) is not a directory(i_mode %o)", sbi->root_nid, inode->i_mode); iput(inode); return -EINVAL; } sb->s_root = d_make_root(inode); if (!sb->s_root) return -ENOMEM; erofs_shrinker_register(sb); if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { sbi->packed_inode = erofs_iget(sb, sbi->packed_nid); if (IS_ERR(sbi->packed_inode)) { err = PTR_ERR(sbi->packed_inode); sbi->packed_inode = NULL; return err; } } err = erofs_init_managed_cache(sb); if (err) return err; err = erofs_xattr_prefixes_init(sb); if (err) return err; erofs_set_sysfs_name(sb); err = erofs_register_sysfs(sb); if (err) return err; erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid); return 0; } static int erofs_fc_get_tree(struct fs_context *fc) { struct erofs_sb_info *sbi = fc->s_fs_info; int ret; if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) return get_tree_nodev(fc, erofs_fc_fill_super); ret = get_tree_bdev_flags(fc, erofs_fc_fill_super, IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) ? GET_TREE_BDEV_QUIET_LOOKUP : 0); #ifdef CONFIG_EROFS_FS_BACKED_BY_FILE if (ret == -ENOTBLK) { struct file *file; if (!fc->source) return invalf(fc, "No source specified"); file = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); if (IS_ERR(file)) return PTR_ERR(file); sbi->dif0.file = file; if (S_ISREG(file_inode(sbi->dif0.file)->i_mode) && sbi->dif0.file->f_mapping->a_ops->read_folio) return get_tree_nodev(fc, erofs_fc_fill_super); } #endif return ret; } static int erofs_fc_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_sb_info *new_sbi = fc->s_fs_info; DBG_BUGON(!sb_rdonly(sb)); if (new_sbi->fsid || new_sbi->domain_id) erofs_info(sb, "ignoring reconfiguration for fsid|domain_id."); if (test_opt(&new_sbi->opt, POSIX_ACL)) fc->sb_flags |= SB_POSIXACL; else fc->sb_flags &= ~SB_POSIXACL; sbi->opt = new_sbi->opt; fc->sb_flags |= SB_RDONLY; return 0; } static int erofs_release_device_info(int id, void *ptr, void *data) { struct erofs_device_info *dif = ptr; fs_put_dax(dif->dax_dev, NULL); if (dif->file) fput(dif->file); erofs_fscache_unregister_cookie(dif->fscache); dif->fscache = NULL; kfree(dif->path); kfree(dif); return 0; } static void erofs_free_dev_context(struct erofs_dev_context *devs) { if (!devs) return; idr_for_each(&devs->tree, &erofs_release_device_info, NULL); idr_destroy(&devs->tree); kfree(devs); } static void erofs_sb_free(struct erofs_sb_info *sbi) { erofs_free_dev_context(sbi->devs); kfree(sbi->fsid); kfree(sbi->domain_id); if (sbi->dif0.file) fput(sbi->dif0.file); kfree(sbi); } static void erofs_fc_free(struct fs_context *fc) { struct erofs_sb_info *sbi = fc->s_fs_info; if (sbi) /* free here if an error occurs before transferring to sb */ erofs_sb_free(sbi); } static const struct fs_context_operations erofs_context_ops = { .parse_param = erofs_fc_parse_param, .get_tree = erofs_fc_get_tree, .reconfigure = erofs_fc_reconfigure, .free = erofs_fc_free, }; static int erofs_init_fs_context(struct fs_context *fc) { struct erofs_sb_info *sbi; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; sbi->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL); if (!sbi->devs) { kfree(sbi); return -ENOMEM; } fc->s_fs_info = sbi; idr_init(&sbi->devs->tree); init_rwsem(&sbi->devs->rwsem); erofs_default_options(sbi); fc->ops = &erofs_context_ops; return 0; } static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) || sbi->dif0.file) kill_anon_super(sb); else kill_block_super(sb); fs_put_dax(sbi->dif0.dax_dev, NULL); erofs_fscache_unregister_fs(sb); erofs_sb_free(sbi); sb->s_fs_info = NULL; } static void erofs_put_super(struct super_block *sb) { struct erofs_sb_info *const sbi = EROFS_SB(sb); DBG_BUGON(!sbi); erofs_unregister_sysfs(sb); erofs_shrinker_unregister(sb); erofs_xattr_prefixes_cleanup(sb); #ifdef CONFIG_EROFS_FS_ZIP iput(sbi->managed_cache); sbi->managed_cache = NULL; #endif iput(sbi->packed_inode); sbi->packed_inode = NULL; erofs_free_dev_context(sbi->devs); sbi->devs = NULL; erofs_fscache_unregister_fs(sb); } static struct file_system_type erofs_fs_type = { .owner = THIS_MODULE, .name = "erofs", .init_fs_context = erofs_init_fs_context, .kill_sb = erofs_kill_sb, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("erofs"); static int __init erofs_module_init(void) { int err; erofs_check_ondisk_layout_definitions(); erofs_inode_cachep = kmem_cache_create("erofs_inode", sizeof(struct erofs_inode), 0, SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, erofs_inode_init_once); if (!erofs_inode_cachep) return -ENOMEM; err = erofs_init_shrinker(); if (err) goto shrinker_err; err = z_erofs_init_subsystem(); if (err) goto zip_err; err = erofs_init_sysfs(); if (err) goto sysfs_err; err = register_filesystem(&erofs_fs_type); if (err) goto fs_err; return 0; fs_err: erofs_exit_sysfs(); sysfs_err: z_erofs_exit_subsystem(); zip_err: erofs_exit_shrinker(); shrinker_err: kmem_cache_destroy(erofs_inode_cachep); return err; } static void __exit erofs_module_exit(void) { unregister_filesystem(&erofs_fs_type); /* Ensure all RCU free inodes / pclusters are safe to be destroyed. */ rcu_barrier(); erofs_exit_sysfs(); z_erofs_exit_subsystem(); erofs_exit_shrinker(); kmem_cache_destroy(erofs_inode_cachep); } static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); buf->f_type = sb->s_magic; buf->f_bsize = sb->s_blocksize; buf->f_blocks = sbi->total_blocks; buf->f_bfree = buf->f_bavail = 0; buf->f_files = ULLONG_MAX; buf->f_ffree = ULLONG_MAX - sbi->inos; buf->f_namelen = EROFS_NAME_LEN; if (uuid_is_null(&sb->s_uuid)) buf->f_fsid = u64_to_fsid(!sb->s_bdev ? 0 : huge_encode_dev(sb->s_bdev->bd_dev)); else buf->f_fsid = uuid_to_fsid(sb->s_uuid.b); return 0; } static int erofs_show_options(struct seq_file *seq, struct dentry *root) { struct erofs_sb_info *sbi = EROFS_SB(root->d_sb); struct erofs_mount_opts *opt = &sbi->opt; if (IS_ENABLED(CONFIG_EROFS_FS_XATTR)) seq_puts(seq, test_opt(opt, XATTR_USER) ? ",user_xattr" : ",nouser_xattr"); if (IS_ENABLED(CONFIG_EROFS_FS_POSIX_ACL)) seq_puts(seq, test_opt(opt, POSIX_ACL) ? ",acl" : ",noacl"); if (IS_ENABLED(CONFIG_EROFS_FS_ZIP)) seq_printf(seq, ",cache_strategy=%s", erofs_param_cache_strategy[opt->cache_strategy].name); if (test_opt(opt, DAX_ALWAYS)) seq_puts(seq, ",dax=always"); if (test_opt(opt, DAX_NEVER)) seq_puts(seq, ",dax=never"); if (erofs_is_fileio_mode(sbi) && test_opt(opt, DIRECT_IO)) seq_puts(seq, ",directio"); #ifdef CONFIG_EROFS_FS_ONDEMAND if (sbi->fsid) seq_printf(seq, ",fsid=%s", sbi->fsid); if (sbi->domain_id) seq_printf(seq, ",domain_id=%s", sbi->domain_id); #endif return 0; } const struct super_operations erofs_sops = { .put_super = erofs_put_super, .alloc_inode = erofs_alloc_inode, .free_inode = erofs_free_inode, .statfs = erofs_statfs, .show_options = erofs_show_options, }; module_init(erofs_module_init); module_exit(erofs_module_exit); MODULE_DESCRIPTION("Enhanced ROM File System"); MODULE_AUTHOR("Gao Xiang, Chao Yu, Miao Xie, CONSUMER BG, HUAWEI Inc."); MODULE_LICENSE("GPL");
4 4 4 4 4 4 4 4 4 2 6 9 6 6 9 9 7 7 1 7 7 11 11 3 2 1 8 3 3 3 10 9 9 9 9 4 11 11 4 9 9 9 9 15 15 13 2 12 1 11 11 11 11 9 7 7 4 4 4 2 4 4 4 4 19 16 15 7 19 29 6 6 5 4 4 1 2 2 4 4 3 3 2 3 29 29 16 29 6 3 9 9 9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 /* RFCOMM implementation for Linux Bluetooth stack (BlueZ). Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com> Copyright (C) 2002 Marcel Holtmann <marcel@holtmann.org> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ /* * RFCOMM TTY. */ #include <linux/module.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/rfcomm.h> #define RFCOMM_TTY_PORTS RFCOMM_MAX_DEV /* whole lotta rfcomm devices */ #define RFCOMM_TTY_MAJOR 216 /* device node major id of the usb/bluetooth.c driver */ #define RFCOMM_TTY_MINOR 0 static DEFINE_MUTEX(rfcomm_ioctl_mutex); static struct tty_driver *rfcomm_tty_driver; struct rfcomm_dev { struct tty_port port; struct list_head list; char name[12]; int id; unsigned long flags; int err; unsigned long status; /* don't export to userspace */ bdaddr_t src; bdaddr_t dst; u8 channel; uint modem_status; struct rfcomm_dlc *dlc; struct device *tty_dev; atomic_t wmem_alloc; struct sk_buff_head pending; }; static LIST_HEAD(rfcomm_dev_list); static DEFINE_MUTEX(rfcomm_dev_lock); static void rfcomm_dev_data_ready(struct rfcomm_dlc *dlc, struct sk_buff *skb); static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err); static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig); /* ---- Device functions ---- */ static void rfcomm_dev_destruct(struct tty_port *port) { struct rfcomm_dev *dev = container_of(port, struct rfcomm_dev, port); struct rfcomm_dlc *dlc = dev->dlc; BT_DBG("dev %p dlc %p", dev, dlc); rfcomm_dlc_lock(dlc); /* Detach DLC if it's owned by this dev */ if (dlc->owner == dev) dlc->owner = NULL; rfcomm_dlc_unlock(dlc); rfcomm_dlc_put(dlc); if (dev->tty_dev) tty_unregister_device(rfcomm_tty_driver, dev->id); mutex_lock(&rfcomm_dev_lock); list_del(&dev->list); mutex_unlock(&rfcomm_dev_lock); kfree(dev); /* It's safe to call module_put() here because socket still holds reference to this module. */ module_put(THIS_MODULE); } /* device-specific initialization: open the dlc */ static int rfcomm_dev_activate(struct tty_port *port, struct tty_struct *tty) { struct rfcomm_dev *dev = container_of(port, struct rfcomm_dev, port); int err; err = rfcomm_dlc_open(dev->dlc, &dev->src, &dev->dst, dev->channel); if (err) set_bit(TTY_IO_ERROR, &tty->flags); return err; } /* we block the open until the dlc->state becomes BT_CONNECTED */ static bool rfcomm_dev_carrier_raised(struct tty_port *port) { struct rfcomm_dev *dev = container_of(port, struct rfcomm_dev, port); return (dev->dlc->state == BT_CONNECTED); } /* device-specific cleanup: close the dlc */ static void rfcomm_dev_shutdown(struct tty_port *port) { struct rfcomm_dev *dev = container_of(port, struct rfcomm_dev, port); if (dev->tty_dev->parent) device_move(dev->tty_dev, NULL, DPM_ORDER_DEV_LAST); /* close the dlc */ rfcomm_dlc_close(dev->dlc, 0); } static const struct tty_port_operations rfcomm_port_ops = { .destruct = rfcomm_dev_destruct, .activate = rfcomm_dev_activate, .shutdown = rfcomm_dev_shutdown, .carrier_raised = rfcomm_dev_carrier_raised, }; static struct rfcomm_dev *__rfcomm_dev_lookup(int id) { struct rfcomm_dev *dev; list_for_each_entry(dev, &rfcomm_dev_list, list) if (dev->id == id) return dev; return NULL; } static struct rfcomm_dev *rfcomm_dev_get(int id) { struct rfcomm_dev *dev; mutex_lock(&rfcomm_dev_lock); dev = __rfcomm_dev_lookup(id); if (dev && !tty_port_get(&dev->port)) dev = NULL; mutex_unlock(&rfcomm_dev_lock); return dev; } static void rfcomm_reparent_device(struct rfcomm_dev *dev) { struct hci_dev *hdev; struct hci_conn *conn; hdev = hci_get_route(&dev->dst, &dev->src, BDADDR_BREDR); if (!hdev) return; /* The lookup results are unsafe to access without the * hci device lock (FIXME: why is this not documented?) */ hci_dev_lock(hdev); conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &dev->dst); /* Just because the acl link is in the hash table is no * guarantee the sysfs device has been added ... */ if (conn && device_is_registered(&conn->dev)) device_move(dev->tty_dev, &conn->dev, DPM_ORDER_DEV_AFTER_PARENT); hci_dev_unlock(hdev); hci_dev_put(hdev); } static ssize_t address_show(struct device *tty_dev, struct device_attribute *attr, char *buf) { struct rfcomm_dev *dev = dev_get_drvdata(tty_dev); return sprintf(buf, "%pMR\n", &dev->dst); } static ssize_t channel_show(struct device *tty_dev, struct device_attribute *attr, char *buf) { struct rfcomm_dev *dev = dev_get_drvdata(tty_dev); return sprintf(buf, "%d\n", dev->channel); } static DEVICE_ATTR_RO(address); static DEVICE_ATTR_RO(channel); static struct rfcomm_dev *__rfcomm_dev_add(struct rfcomm_dev_req *req, struct rfcomm_dlc *dlc) { struct rfcomm_dev *dev, *entry; struct list_head *head = &rfcomm_dev_list; int err = 0; dev = kzalloc(sizeof(struct rfcomm_dev), GFP_KERNEL); if (!dev) return ERR_PTR(-ENOMEM); mutex_lock(&rfcomm_dev_lock); if (req->dev_id < 0) { dev->id = 0; list_for_each_entry(entry, &rfcomm_dev_list, list) { if (entry->id != dev->id) break; dev->id++; head = &entry->list; } } else { dev->id = req->dev_id; list_for_each_entry(entry, &rfcomm_dev_list, list) { if (entry->id == dev->id) { err = -EADDRINUSE; goto out; } if (entry->id > dev->id - 1) break; head = &entry->list; } } if ((dev->id < 0) || (dev->id > RFCOMM_MAX_DEV - 1)) { err = -ENFILE; goto out; } sprintf(dev->name, "rfcomm%d", dev->id); list_add(&dev->list, head); bacpy(&dev->src, &req->src); bacpy(&dev->dst, &req->dst); dev->channel = req->channel; dev->flags = req->flags & ((1 << RFCOMM_RELEASE_ONHUP) | (1 << RFCOMM_REUSE_DLC)); tty_port_init(&dev->port); dev->port.ops = &rfcomm_port_ops; skb_queue_head_init(&dev->pending); rfcomm_dlc_lock(dlc); if (req->flags & (1 << RFCOMM_REUSE_DLC)) { struct sock *sk = dlc->owner; struct sk_buff *skb; BUG_ON(!sk); rfcomm_dlc_throttle(dlc); while ((skb = skb_dequeue(&sk->sk_receive_queue))) { skb_orphan(skb); skb_queue_tail(&dev->pending, skb); atomic_sub(skb->len, &sk->sk_rmem_alloc); } } dlc->data_ready = rfcomm_dev_data_ready; dlc->state_change = rfcomm_dev_state_change; dlc->modem_status = rfcomm_dev_modem_status; dlc->owner = dev; dev->dlc = dlc; rfcomm_dev_modem_status(dlc, dlc->remote_v24_sig); rfcomm_dlc_unlock(dlc); /* It's safe to call __module_get() here because socket already holds reference to this module. */ __module_get(THIS_MODULE); mutex_unlock(&rfcomm_dev_lock); return dev; out: mutex_unlock(&rfcomm_dev_lock); kfree(dev); return ERR_PTR(err); } static int rfcomm_dev_add(struct rfcomm_dev_req *req, struct rfcomm_dlc *dlc) { struct rfcomm_dev *dev; struct device *tty; BT_DBG("id %d channel %d", req->dev_id, req->channel); dev = __rfcomm_dev_add(req, dlc); if (IS_ERR(dev)) { rfcomm_dlc_put(dlc); return PTR_ERR(dev); } tty = tty_port_register_device(&dev->port, rfcomm_tty_driver, dev->id, NULL); if (IS_ERR(tty)) { tty_port_put(&dev->port); return PTR_ERR(tty); } dev->tty_dev = tty; rfcomm_reparent_device(dev); dev_set_drvdata(dev->tty_dev, dev); if (device_create_file(dev->tty_dev, &dev_attr_address) < 0) BT_ERR("Failed to create address attribute"); if (device_create_file(dev->tty_dev, &dev_attr_channel) < 0) BT_ERR("Failed to create channel attribute"); return dev->id; } /* ---- Send buffer ---- */ static inline unsigned int rfcomm_room(struct rfcomm_dev *dev) { struct rfcomm_dlc *dlc = dev->dlc; /* Limit the outstanding number of packets not yet sent to 40 */ int pending = 40 - atomic_read(&dev->wmem_alloc); return max(0, pending) * dlc->mtu; } static void rfcomm_wfree(struct sk_buff *skb) { struct rfcomm_dev *dev = (void *) skb->sk; atomic_dec(&dev->wmem_alloc); if (test_bit(RFCOMM_TTY_ATTACHED, &dev->flags)) tty_port_tty_wakeup(&dev->port); tty_port_put(&dev->port); } static void rfcomm_set_owner_w(struct sk_buff *skb, struct rfcomm_dev *dev) { tty_port_get(&dev->port); atomic_inc(&dev->wmem_alloc); skb->sk = (void *) dev; skb->destructor = rfcomm_wfree; } static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, gfp_t priority) { struct sk_buff *skb = alloc_skb(size, priority); if (skb) rfcomm_set_owner_w(skb, dev); return skb; } /* ---- Device IOCTLs ---- */ #define NOCAP_FLAGS ((1 << RFCOMM_REUSE_DLC) | (1 << RFCOMM_RELEASE_ONHUP)) static int __rfcomm_create_dev(struct sock *sk, void __user *arg) { struct rfcomm_dev_req req; struct rfcomm_dlc *dlc; int id; if (copy_from_user(&req, arg, sizeof(req))) return -EFAULT; BT_DBG("sk %p dev_id %d flags 0x%x", sk, req.dev_id, req.flags); if (req.flags != NOCAP_FLAGS && !capable(CAP_NET_ADMIN)) return -EPERM; if (req.flags & (1 << RFCOMM_REUSE_DLC)) { /* Socket must be connected */ if (sk->sk_state != BT_CONNECTED) return -EBADFD; dlc = rfcomm_pi(sk)->dlc; rfcomm_dlc_hold(dlc); } else { /* Validate the channel is unused */ dlc = rfcomm_dlc_exists(&req.src, &req.dst, req.channel); if (IS_ERR(dlc)) return PTR_ERR(dlc); if (dlc) return -EBUSY; dlc = rfcomm_dlc_alloc(GFP_KERNEL); if (!dlc) return -ENOMEM; } id = rfcomm_dev_add(&req, dlc); if (id < 0) return id; if (req.flags & (1 << RFCOMM_REUSE_DLC)) { /* DLC is now used by device. * Socket must be disconnected */ sk->sk_state = BT_CLOSED; } return id; } static int __rfcomm_release_dev(void __user *arg) { struct rfcomm_dev_req req; struct rfcomm_dev *dev; struct tty_struct *tty; if (copy_from_user(&req, arg, sizeof(req))) return -EFAULT; BT_DBG("dev_id %d flags 0x%x", req.dev_id, req.flags); dev = rfcomm_dev_get(req.dev_id); if (!dev) return -ENODEV; if (dev->flags != NOCAP_FLAGS && !capable(CAP_NET_ADMIN)) { tty_port_put(&dev->port); return -EPERM; } /* only release once */ if (test_and_set_bit(RFCOMM_DEV_RELEASED, &dev->status)) { tty_port_put(&dev->port); return -EALREADY; } if (req.flags & (1 << RFCOMM_HANGUP_NOW)) rfcomm_dlc_close(dev->dlc, 0); /* Shut down TTY synchronously before freeing rfcomm_dev */ tty = tty_port_tty_get(&dev->port); if (tty) { tty_vhangup(tty); tty_kref_put(tty); } if (!test_bit(RFCOMM_TTY_OWNED, &dev->status)) tty_port_put(&dev->port); tty_port_put(&dev->port); return 0; } static int rfcomm_create_dev(struct sock *sk, void __user *arg) { int ret; mutex_lock(&rfcomm_ioctl_mutex); ret = __rfcomm_create_dev(sk, arg); mutex_unlock(&rfcomm_ioctl_mutex); return ret; } static int rfcomm_release_dev(void __user *arg) { int ret; mutex_lock(&rfcomm_ioctl_mutex); ret = __rfcomm_release_dev(arg); mutex_unlock(&rfcomm_ioctl_mutex); return ret; } static int rfcomm_get_dev_list(void __user *arg) { struct rfcomm_dev *dev; struct rfcomm_dev_list_req *dl; struct rfcomm_dev_info *di; int n = 0, err; u16 dev_num; BT_DBG(""); if (get_user(dev_num, (u16 __user *) arg)) return -EFAULT; if (!dev_num || dev_num > (PAGE_SIZE * 4) / sizeof(*di)) return -EINVAL; dl = kzalloc(struct_size(dl, dev_info, dev_num), GFP_KERNEL); if (!dl) return -ENOMEM; dl->dev_num = dev_num; di = dl->dev_info; mutex_lock(&rfcomm_dev_lock); list_for_each_entry(dev, &rfcomm_dev_list, list) { if (!tty_port_get(&dev->port)) continue; di[n].id = dev->id; di[n].flags = dev->flags; di[n].state = dev->dlc->state; di[n].channel = dev->channel; bacpy(&di[n].src, &dev->src); bacpy(&di[n].dst, &dev->dst); tty_port_put(&dev->port); if (++n >= dev_num) break; } mutex_unlock(&rfcomm_dev_lock); dl->dev_num = n; err = copy_to_user(arg, dl, struct_size(dl, dev_info, n)); kfree(dl); return err ? -EFAULT : 0; } static int rfcomm_get_dev_info(void __user *arg) { struct rfcomm_dev *dev; struct rfcomm_dev_info di; int err = 0; BT_DBG(""); if (copy_from_user(&di, arg, sizeof(di))) return -EFAULT; dev = rfcomm_dev_get(di.id); if (!dev) return -ENODEV; di.flags = dev->flags; di.channel = dev->channel; di.state = dev->dlc->state; bacpy(&di.src, &dev->src); bacpy(&di.dst, &dev->dst); if (copy_to_user(arg, &di, sizeof(di))) err = -EFAULT; tty_port_put(&dev->port); return err; } int rfcomm_dev_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { BT_DBG("cmd %d arg %p", cmd, arg); switch (cmd) { case RFCOMMCREATEDEV: return rfcomm_create_dev(sk, arg); case RFCOMMRELEASEDEV: return rfcomm_release_dev(arg); case RFCOMMGETDEVLIST: return rfcomm_get_dev_list(arg); case RFCOMMGETDEVINFO: return rfcomm_get_dev_info(arg); } return -EINVAL; } /* ---- DLC callbacks ---- */ static void rfcomm_dev_data_ready(struct rfcomm_dlc *dlc, struct sk_buff *skb) { struct rfcomm_dev *dev = dlc->owner; if (!dev) { kfree_skb(skb); return; } if (!skb_queue_empty(&dev->pending)) { skb_queue_tail(&dev->pending, skb); return; } BT_DBG("dlc %p len %d", dlc, skb->len); tty_insert_flip_string(&dev->port, skb->data, skb->len); tty_flip_buffer_push(&dev->port); kfree_skb(skb); } static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err) { struct rfcomm_dev *dev = dlc->owner; if (!dev) return; BT_DBG("dlc %p dev %p err %d", dlc, dev, err); dev->err = err; if (dlc->state == BT_CONNECTED) { rfcomm_reparent_device(dev); wake_up_interruptible(&dev->port.open_wait); } else if (dlc->state == BT_CLOSED) tty_port_tty_hangup(&dev->port, false); } static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig) { struct rfcomm_dev *dev = dlc->owner; if (!dev) return; BT_DBG("dlc %p dev %p v24_sig 0x%02x", dlc, dev, v24_sig); if ((dev->modem_status & TIOCM_CD) && !(v24_sig & RFCOMM_V24_DV)) tty_port_tty_hangup(&dev->port, true); dev->modem_status = ((v24_sig & RFCOMM_V24_RTC) ? (TIOCM_DSR | TIOCM_DTR) : 0) | ((v24_sig & RFCOMM_V24_RTR) ? (TIOCM_RTS | TIOCM_CTS) : 0) | ((v24_sig & RFCOMM_V24_IC) ? TIOCM_RI : 0) | ((v24_sig & RFCOMM_V24_DV) ? TIOCM_CD : 0); } /* ---- TTY functions ---- */ static void rfcomm_tty_copy_pending(struct rfcomm_dev *dev) { struct sk_buff *skb; int inserted = 0; BT_DBG("dev %p", dev); rfcomm_dlc_lock(dev->dlc); while ((skb = skb_dequeue(&dev->pending))) { inserted += tty_insert_flip_string(&dev->port, skb->data, skb->len); kfree_skb(skb); } rfcomm_dlc_unlock(dev->dlc); if (inserted > 0) tty_flip_buffer_push(&dev->port); } /* do the reverse of install, clearing the tty fields and releasing the * reference to tty_port */ static void rfcomm_tty_cleanup(struct tty_struct *tty) { struct rfcomm_dev *dev = tty->driver_data; clear_bit(RFCOMM_TTY_ATTACHED, &dev->flags); rfcomm_dlc_lock(dev->dlc); tty->driver_data = NULL; rfcomm_dlc_unlock(dev->dlc); /* * purge the dlc->tx_queue to avoid circular dependencies * between dev and dlc */ skb_queue_purge(&dev->dlc->tx_queue); tty_port_put(&dev->port); } /* we acquire the tty_port reference since it's here the tty is first used * by setting the termios. We also populate the driver_data field and install * the tty port */ static int rfcomm_tty_install(struct tty_driver *driver, struct tty_struct *tty) { struct rfcomm_dev *dev; struct rfcomm_dlc *dlc; int err; dev = rfcomm_dev_get(tty->index); if (!dev) return -ENODEV; dlc = dev->dlc; /* Attach TTY and open DLC */ rfcomm_dlc_lock(dlc); tty->driver_data = dev; rfcomm_dlc_unlock(dlc); set_bit(RFCOMM_TTY_ATTACHED, &dev->flags); /* install the tty_port */ err = tty_port_install(&dev->port, driver, tty); if (err) { rfcomm_tty_cleanup(tty); return err; } /* take over the tty_port reference if the port was created with the * flag RFCOMM_RELEASE_ONHUP. This will force the release of the port * when the last process closes the tty. The behaviour is expected by * userspace. */ if (test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags)) { set_bit(RFCOMM_TTY_OWNED, &dev->status); tty_port_put(&dev->port); } return 0; } static int rfcomm_tty_open(struct tty_struct *tty, struct file *filp) { struct rfcomm_dev *dev = tty->driver_data; int err; BT_DBG("tty %p id %d", tty, tty->index); BT_DBG("dev %p dst %pMR channel %d opened %d", dev, &dev->dst, dev->channel, dev->port.count); err = tty_port_open(&dev->port, tty, filp); if (err) return err; /* * FIXME: rfcomm should use proper flow control for * received data. This hack will be unnecessary and can * be removed when that's implemented */ rfcomm_tty_copy_pending(dev); rfcomm_dlc_unthrottle(dev->dlc); return 0; } static void rfcomm_tty_close(struct tty_struct *tty, struct file *filp) { struct rfcomm_dev *dev = tty->driver_data; BT_DBG("tty %p dev %p dlc %p opened %d", tty, dev, dev->dlc, dev->port.count); tty_port_close(&dev->port, tty, filp); } static ssize_t rfcomm_tty_write(struct tty_struct *tty, const u8 *buf, size_t count) { struct rfcomm_dev *dev = tty->driver_data; struct rfcomm_dlc *dlc = dev->dlc; struct sk_buff *skb; size_t sent = 0, size; BT_DBG("tty %p count %zu", tty, count); while (count) { size = min_t(size_t, count, dlc->mtu); skb = rfcomm_wmalloc(dev, size + RFCOMM_SKB_RESERVE, GFP_ATOMIC); if (!skb) break; skb_reserve(skb, RFCOMM_SKB_HEAD_RESERVE); skb_put_data(skb, buf + sent, size); rfcomm_dlc_send_noerror(dlc, skb); sent += size; count -= size; } return sent; } static unsigned int rfcomm_tty_write_room(struct tty_struct *tty) { struct rfcomm_dev *dev = tty->driver_data; int room = 0; if (dev && dev->dlc) room = rfcomm_room(dev); BT_DBG("tty %p room %d", tty, room); return room; } static int rfcomm_tty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { BT_DBG("tty %p cmd 0x%02x", tty, cmd); switch (cmd) { case TCGETS: BT_DBG("TCGETS is not supported"); return -ENOIOCTLCMD; case TCSETS: BT_DBG("TCSETS is not supported"); return -ENOIOCTLCMD; case TIOCMIWAIT: BT_DBG("TIOCMIWAIT"); break; case TIOCSERGETLSR: BT_ERR("TIOCSERGETLSR is not supported"); return -ENOIOCTLCMD; case TIOCSERCONFIG: BT_ERR("TIOCSERCONFIG is not supported"); return -ENOIOCTLCMD; default: return -ENOIOCTLCMD; /* ioctls which we must ignore */ } return -ENOIOCTLCMD; } static void rfcomm_tty_set_termios(struct tty_struct *tty, const struct ktermios *old) { struct ktermios *new = &tty->termios; int old_baud_rate = tty_termios_baud_rate(old); int new_baud_rate = tty_termios_baud_rate(new); u8 baud, data_bits, stop_bits, parity, x_on, x_off; u16 changes = 0; struct rfcomm_dev *dev = tty->driver_data; BT_DBG("tty %p termios %p", tty, old); if (!dev || !dev->dlc || !dev->dlc->session) return; /* Handle turning off CRTSCTS */ if ((old->c_cflag & CRTSCTS) && !(new->c_cflag & CRTSCTS)) BT_DBG("Turning off CRTSCTS unsupported"); /* Parity on/off and when on, odd/even */ if (((old->c_cflag & PARENB) != (new->c_cflag & PARENB)) || ((old->c_cflag & PARODD) != (new->c_cflag & PARODD))) { changes |= RFCOMM_RPN_PM_PARITY; BT_DBG("Parity change detected."); } /* Mark and space parity are not supported! */ if (new->c_cflag & PARENB) { if (new->c_cflag & PARODD) { BT_DBG("Parity is ODD"); parity = RFCOMM_RPN_PARITY_ODD; } else { BT_DBG("Parity is EVEN"); parity = RFCOMM_RPN_PARITY_EVEN; } } else { BT_DBG("Parity is OFF"); parity = RFCOMM_RPN_PARITY_NONE; } /* Setting the x_on / x_off characters */ if (old->c_cc[VSTOP] != new->c_cc[VSTOP]) { BT_DBG("XOFF custom"); x_on = new->c_cc[VSTOP]; changes |= RFCOMM_RPN_PM_XON; } else { BT_DBG("XOFF default"); x_on = RFCOMM_RPN_XON_CHAR; } if (old->c_cc[VSTART] != new->c_cc[VSTART]) { BT_DBG("XON custom"); x_off = new->c_cc[VSTART]; changes |= RFCOMM_RPN_PM_XOFF; } else { BT_DBG("XON default"); x_off = RFCOMM_RPN_XOFF_CHAR; } /* Handle setting of stop bits */ if ((old->c_cflag & CSTOPB) != (new->c_cflag & CSTOPB)) changes |= RFCOMM_RPN_PM_STOP; /* POSIX does not support 1.5 stop bits and RFCOMM does not * support 2 stop bits. So a request for 2 stop bits gets * translated to 1.5 stop bits */ if (new->c_cflag & CSTOPB) stop_bits = RFCOMM_RPN_STOP_15; else stop_bits = RFCOMM_RPN_STOP_1; /* Handle number of data bits [5-8] */ if ((old->c_cflag & CSIZE) != (new->c_cflag & CSIZE)) changes |= RFCOMM_RPN_PM_DATA; switch (new->c_cflag & CSIZE) { case CS5: data_bits = RFCOMM_RPN_DATA_5; break; case CS6: data_bits = RFCOMM_RPN_DATA_6; break; case CS7: data_bits = RFCOMM_RPN_DATA_7; break; case CS8: data_bits = RFCOMM_RPN_DATA_8; break; default: data_bits = RFCOMM_RPN_DATA_8; break; } /* Handle baudrate settings */ if (old_baud_rate != new_baud_rate) changes |= RFCOMM_RPN_PM_BITRATE; switch (new_baud_rate) { case 2400: baud = RFCOMM_RPN_BR_2400; break; case 4800: baud = RFCOMM_RPN_BR_4800; break; case 7200: baud = RFCOMM_RPN_BR_7200; break; case 9600: baud = RFCOMM_RPN_BR_9600; break; case 19200: baud = RFCOMM_RPN_BR_19200; break; case 38400: baud = RFCOMM_RPN_BR_38400; break; case 57600: baud = RFCOMM_RPN_BR_57600; break; case 115200: baud = RFCOMM_RPN_BR_115200; break; case 230400: baud = RFCOMM_RPN_BR_230400; break; default: /* 9600 is standard accordinag to the RFCOMM specification */ baud = RFCOMM_RPN_BR_9600; break; } if (changes) rfcomm_send_rpn(dev->dlc->session, 1, dev->dlc->dlci, baud, data_bits, stop_bits, parity, RFCOMM_RPN_FLOW_NONE, x_on, x_off, changes); } static void rfcomm_tty_throttle(struct tty_struct *tty) { struct rfcomm_dev *dev = tty->driver_data; BT_DBG("tty %p dev %p", tty, dev); rfcomm_dlc_throttle(dev->dlc); } static void rfcomm_tty_unthrottle(struct tty_struct *tty) { struct rfcomm_dev *dev = tty->driver_data; BT_DBG("tty %p dev %p", tty, dev); rfcomm_dlc_unthrottle(dev->dlc); } static unsigned int rfcomm_tty_chars_in_buffer(struct tty_struct *tty) { struct rfcomm_dev *dev = tty->driver_data; BT_DBG("tty %p dev %p", tty, dev); if (!dev || !dev->dlc) return 0; if (!skb_queue_empty(&dev->dlc->tx_queue)) return dev->dlc->mtu; return 0; } static void rfcomm_tty_flush_buffer(struct tty_struct *tty) { struct rfcomm_dev *dev = tty->driver_data; BT_DBG("tty %p dev %p", tty, dev); if (!dev || !dev->dlc) return; skb_queue_purge(&dev->dlc->tx_queue); tty_wakeup(tty); } static void rfcomm_tty_send_xchar(struct tty_struct *tty, u8 ch) { BT_DBG("tty %p ch %c", tty, ch); } static void rfcomm_tty_wait_until_sent(struct tty_struct *tty, int timeout) { BT_DBG("tty %p timeout %d", tty, timeout); } static void rfcomm_tty_hangup(struct tty_struct *tty) { struct rfcomm_dev *dev = tty->driver_data; BT_DBG("tty %p dev %p", tty, dev); tty_port_hangup(&dev->port); } static int rfcomm_tty_tiocmget(struct tty_struct *tty) { struct rfcomm_dev *dev = tty->driver_data; BT_DBG("tty %p dev %p", tty, dev); return dev->modem_status; } static int rfcomm_tty_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct rfcomm_dev *dev = tty->driver_data; struct rfcomm_dlc *dlc = dev->dlc; u8 v24_sig; BT_DBG("tty %p dev %p set 0x%02x clear 0x%02x", tty, dev, set, clear); rfcomm_dlc_get_modem_status(dlc, &v24_sig); if (set & TIOCM_DSR || set & TIOCM_DTR) v24_sig |= RFCOMM_V24_RTC; if (set & TIOCM_RTS || set & TIOCM_CTS) v24_sig |= RFCOMM_V24_RTR; if (set & TIOCM_RI) v24_sig |= RFCOMM_V24_IC; if (set & TIOCM_CD) v24_sig |= RFCOMM_V24_DV; if (clear & TIOCM_DSR || clear & TIOCM_DTR) v24_sig &= ~RFCOMM_V24_RTC; if (clear & TIOCM_RTS || clear & TIOCM_CTS) v24_sig &= ~RFCOMM_V24_RTR; if (clear & TIOCM_RI) v24_sig &= ~RFCOMM_V24_IC; if (clear & TIOCM_CD) v24_sig &= ~RFCOMM_V24_DV; rfcomm_dlc_set_modem_status(dlc, v24_sig); return 0; } /* ---- TTY structure ---- */ static const struct tty_operations rfcomm_ops = { .open = rfcomm_tty_open, .close = rfcomm_tty_close, .write = rfcomm_tty_write, .write_room = rfcomm_tty_write_room, .chars_in_buffer = rfcomm_tty_chars_in_buffer, .flush_buffer = rfcomm_tty_flush_buffer, .ioctl = rfcomm_tty_ioctl, .throttle = rfcomm_tty_throttle, .unthrottle = rfcomm_tty_unthrottle, .set_termios = rfcomm_tty_set_termios, .send_xchar = rfcomm_tty_send_xchar, .hangup = rfcomm_tty_hangup, .wait_until_sent = rfcomm_tty_wait_until_sent, .tiocmget = rfcomm_tty_tiocmget, .tiocmset = rfcomm_tty_tiocmset, .install = rfcomm_tty_install, .cleanup = rfcomm_tty_cleanup, }; int __init rfcomm_init_ttys(void) { int error; rfcomm_tty_driver = tty_alloc_driver(RFCOMM_TTY_PORTS, TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV); if (IS_ERR(rfcomm_tty_driver)) return PTR_ERR(rfcomm_tty_driver); rfcomm_tty_driver->driver_name = "rfcomm"; rfcomm_tty_driver->name = "rfcomm"; rfcomm_tty_driver->major = RFCOMM_TTY_MAJOR; rfcomm_tty_driver->minor_start = RFCOMM_TTY_MINOR; rfcomm_tty_driver->type = TTY_DRIVER_TYPE_SERIAL; rfcomm_tty_driver->subtype = SERIAL_TYPE_NORMAL; rfcomm_tty_driver->init_termios = tty_std_termios; rfcomm_tty_driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL; rfcomm_tty_driver->init_termios.c_lflag &= ~ICANON; tty_set_operations(rfcomm_tty_driver, &rfcomm_ops); error = tty_register_driver(rfcomm_tty_driver); if (error) { BT_ERR("Can't register RFCOMM TTY driver"); tty_driver_kref_put(rfcomm_tty_driver); return error; } BT_INFO("RFCOMM TTY layer initialized"); return 0; } void rfcomm_cleanup_ttys(void) { tty_unregister_driver(rfcomm_tty_driver); tty_driver_kref_put(rfcomm_tty_driver); }
17 17 3 3 3 17 13 13 13 3 3 8 13 13 13 13 13 13 9 6 12 2 13 10 10 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2007-2014 Nicira, Inc. */ #include <linux/etherdevice.h> #include <linux/if.h> #include <linux/if_vlan.h> #include <linux/jhash.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/rcupdate.h> #include <linux/rtnetlink.h> #include <linux/compat.h> #include <net/net_namespace.h> #include <linux/module.h> #include "datapath.h" #include "vport.h" #include "vport-internal_dev.h" static LIST_HEAD(vport_ops_list); /* Protected by RCU read lock for reading, ovs_mutex for writing. */ static struct hlist_head *dev_table; #define VPORT_HASH_BUCKETS 1024 /** * ovs_vport_init - initialize vport subsystem * * Called at module load time to initialize the vport subsystem. */ int ovs_vport_init(void) { dev_table = kcalloc(VPORT_HASH_BUCKETS, sizeof(struct hlist_head), GFP_KERNEL); if (!dev_table) return -ENOMEM; return 0; } /** * ovs_vport_exit - shutdown vport subsystem * * Called at module exit time to shutdown the vport subsystem. */ void ovs_vport_exit(void) { kfree(dev_table); } static struct hlist_head *hash_bucket(const struct net *net, const char *name) { unsigned int hash = jhash(name, strlen(name), (unsigned long) net); return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)]; } int __ovs_vport_ops_register(struct vport_ops *ops) { int err = -EEXIST; struct vport_ops *o; ovs_lock(); list_for_each_entry(o, &vport_ops_list, list) if (ops->type == o->type) goto errout; list_add_tail(&ops->list, &vport_ops_list); err = 0; errout: ovs_unlock(); return err; } EXPORT_SYMBOL_GPL(__ovs_vport_ops_register); void ovs_vport_ops_unregister(struct vport_ops *ops) { ovs_lock(); list_del(&ops->list); ovs_unlock(); } EXPORT_SYMBOL_GPL(ovs_vport_ops_unregister); /** * ovs_vport_locate - find a port that has already been created * * @net: network namespace * @name: name of port to find * * Must be called with ovs or RCU read lock. */ struct vport *ovs_vport_locate(const struct net *net, const char *name) { struct hlist_head *bucket = hash_bucket(net, name); struct vport *vport; hlist_for_each_entry_rcu(vport, bucket, hash_node, lockdep_ovsl_is_held()) if (!strcmp(name, ovs_vport_name(vport)) && net_eq(ovs_dp_get_net(vport->dp), net)) return vport; return NULL; } /** * ovs_vport_alloc - allocate and initialize new vport * * @priv_size: Size of private data area to allocate. * @ops: vport device ops * @parms: information about new vport. * * Allocate and initialize a new vport defined by @ops. The vport will contain * a private data area of size @priv_size that can be accessed using * vport_priv(). Some parameters of the vport will be initialized from @parms. * @vports that are no longer needed should be released with * vport_free(). */ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, const struct vport_parms *parms) { struct vport *vport; size_t alloc_size; int err; alloc_size = sizeof(struct vport); if (priv_size) { alloc_size = ALIGN(alloc_size, VPORT_ALIGN); alloc_size += priv_size; } vport = kzalloc(alloc_size, GFP_KERNEL); if (!vport) return ERR_PTR(-ENOMEM); vport->upcall_stats = netdev_alloc_pcpu_stats(struct vport_upcall_stats_percpu); if (!vport->upcall_stats) { err = -ENOMEM; goto err_kfree_vport; } vport->dp = parms->dp; vport->port_no = parms->port_no; vport->ops = ops; INIT_HLIST_NODE(&vport->dp_hash_node); if (ovs_vport_set_upcall_portids(vport, parms->upcall_portids)) { err = -EINVAL; goto err_free_percpu; } return vport; err_free_percpu: free_percpu(vport->upcall_stats); err_kfree_vport: kfree(vport); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(ovs_vport_alloc); /** * ovs_vport_free - uninitialize and free vport * * @vport: vport to free * * Frees a vport allocated with vport_alloc() when it is no longer needed. * * The caller must ensure that an RCU grace period has passed since the last * time @vport was in a datapath. */ void ovs_vport_free(struct vport *vport) { /* vport is freed from RCU callback or error path, Therefore * it is safe to use raw dereference. */ kfree(rcu_dereference_raw(vport->upcall_portids)); free_percpu(vport->upcall_stats); kfree(vport); } EXPORT_SYMBOL_GPL(ovs_vport_free); static struct vport_ops *ovs_vport_lookup(const struct vport_parms *parms) { struct vport_ops *ops; list_for_each_entry(ops, &vport_ops_list, list) if (ops->type == parms->type) return ops; return NULL; } /** * ovs_vport_add - add vport device (for kernel callers) * * @parms: Information about new vport. * * Creates a new vport with the specified configuration (which is dependent on * device type). ovs_mutex must be held. */ struct vport *ovs_vport_add(const struct vport_parms *parms) { struct vport_ops *ops; struct vport *vport; ops = ovs_vport_lookup(parms); if (ops) { struct hlist_head *bucket; if (!try_module_get(ops->owner)) return ERR_PTR(-EAFNOSUPPORT); vport = ops->create(parms); if (IS_ERR(vport)) { module_put(ops->owner); return vport; } bucket = hash_bucket(ovs_dp_get_net(vport->dp), ovs_vport_name(vport)); hlist_add_head_rcu(&vport->hash_node, bucket); return vport; } /* Unlock to attempt module load and return -EAGAIN if load * was successful as we need to restart the port addition * workflow. */ ovs_unlock(); request_module("vport-type-%d", parms->type); ovs_lock(); if (!ovs_vport_lookup(parms)) return ERR_PTR(-EAFNOSUPPORT); else return ERR_PTR(-EAGAIN); } /** * ovs_vport_set_options - modify existing vport device (for kernel callers) * * @vport: vport to modify. * @options: New configuration. * * Modifies an existing device with the specified configuration (which is * dependent on device type). ovs_mutex must be held. */ int ovs_vport_set_options(struct vport *vport, struct nlattr *options) { if (!vport->ops->set_options) return -EOPNOTSUPP; return vport->ops->set_options(vport, options); } /** * ovs_vport_del - delete existing vport device * * @vport: vport to delete. * * Detaches @vport from its datapath and destroys it. ovs_mutex must * be held. */ void ovs_vport_del(struct vport *vport) { hlist_del_rcu(&vport->hash_node); module_put(vport->ops->owner); vport->ops->destroy(vport); } /** * ovs_vport_get_stats - retrieve device stats * * @vport: vport from which to retrieve the stats * @stats: location to store stats * * Retrieves transmit, receive, and error stats for the given device. * * Must be called with ovs_mutex or rcu_read_lock. */ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) { const struct rtnl_link_stats64 *dev_stats; struct rtnl_link_stats64 temp; dev_stats = dev_get_stats(vport->dev, &temp); stats->rx_errors = dev_stats->rx_errors; stats->tx_errors = dev_stats->tx_errors; stats->tx_dropped = dev_stats->tx_dropped; stats->rx_dropped = dev_stats->rx_dropped; stats->rx_bytes = dev_stats->rx_bytes; stats->rx_packets = dev_stats->rx_packets; stats->tx_bytes = dev_stats->tx_bytes; stats->tx_packets = dev_stats->tx_packets; } /** * ovs_vport_get_upcall_stats - retrieve upcall stats * * @vport: vport from which to retrieve the stats. * @skb: sk_buff where upcall stats should be appended. * * Retrieves upcall stats for the given device. * * Must be called with ovs_mutex or rcu_read_lock. */ int ovs_vport_get_upcall_stats(struct vport *vport, struct sk_buff *skb) { struct nlattr *nla; int i; __u64 tx_success = 0; __u64 tx_fail = 0; for_each_possible_cpu(i) { const struct vport_upcall_stats_percpu *stats; unsigned int start; stats = per_cpu_ptr(vport->upcall_stats, i); do { start = u64_stats_fetch_begin(&stats->syncp); tx_success += u64_stats_read(&stats->n_success); tx_fail += u64_stats_read(&stats->n_fail); } while (u64_stats_fetch_retry(&stats->syncp, start)); } nla = nla_nest_start_noflag(skb, OVS_VPORT_ATTR_UPCALL_STATS); if (!nla) return -EMSGSIZE; if (nla_put_u64_64bit(skb, OVS_VPORT_UPCALL_ATTR_SUCCESS, tx_success, OVS_VPORT_ATTR_PAD)) { nla_nest_cancel(skb, nla); return -EMSGSIZE; } if (nla_put_u64_64bit(skb, OVS_VPORT_UPCALL_ATTR_FAIL, tx_fail, OVS_VPORT_ATTR_PAD)) { nla_nest_cancel(skb, nla); return -EMSGSIZE; } nla_nest_end(skb, nla); return 0; } /** * ovs_vport_get_options - retrieve device options * * @vport: vport from which to retrieve the options. * @skb: sk_buff where options should be appended. * * Retrieves the configuration of the given device, appending an * %OVS_VPORT_ATTR_OPTIONS attribute that in turn contains nested * vport-specific attributes to @skb. * * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room, or another * negative error code if a real error occurred. If an error occurs, @skb is * left unmodified. * * Must be called with ovs_mutex or rcu_read_lock. */ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) { struct nlattr *nla; int err; if (!vport->ops->get_options) return 0; nla = nla_nest_start_noflag(skb, OVS_VPORT_ATTR_OPTIONS); if (!nla) return -EMSGSIZE; err = vport->ops->get_options(vport, skb); if (err) { nla_nest_cancel(skb, nla); return err; } nla_nest_end(skb, nla); return 0; } /** * ovs_vport_set_upcall_portids - set upcall portids of @vport. * * @vport: vport to modify. * @ids: new configuration, an array of port ids. * * Sets the vport's upcall_portids to @ids. * * Returns 0 if successful, -EINVAL if @ids is zero length or cannot be parsed * as an array of U32. * * Must be called with ovs_mutex. */ int ovs_vport_set_upcall_portids(struct vport *vport, const struct nlattr *ids) { struct vport_portids *old, *vport_portids; if (!nla_len(ids) || nla_len(ids) % sizeof(u32)) return -EINVAL; old = ovsl_dereference(vport->upcall_portids); vport_portids = kmalloc(sizeof(*vport_portids) + nla_len(ids), GFP_KERNEL); if (!vport_portids) return -ENOMEM; vport_portids->n_ids = nla_len(ids) / sizeof(u32); vport_portids->rn_ids = reciprocal_value(vport_portids->n_ids); nla_memcpy(vport_portids->ids, ids, nla_len(ids)); rcu_assign_pointer(vport->upcall_portids, vport_portids); if (old) kfree_rcu(old, rcu); return 0; } /** * ovs_vport_get_upcall_portids - get the upcall_portids of @vport. * * @vport: vport from which to retrieve the portids. * @skb: sk_buff where portids should be appended. * * Retrieves the configuration of the given vport, appending the * %OVS_VPORT_ATTR_UPCALL_PID attribute which is the array of upcall * portids to @skb. * * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room. * If an error occurs, @skb is left unmodified. Must be called with * ovs_mutex or rcu_read_lock. */ int ovs_vport_get_upcall_portids(const struct vport *vport, struct sk_buff *skb) { struct vport_portids *ids; ids = rcu_dereference_ovsl(vport->upcall_portids); if (vport->dp->user_features & OVS_DP_F_VPORT_PIDS) return nla_put(skb, OVS_VPORT_ATTR_UPCALL_PID, ids->n_ids * sizeof(u32), (void *)ids->ids); else return nla_put_u32(skb, OVS_VPORT_ATTR_UPCALL_PID, ids->ids[0]); } /** * ovs_vport_find_upcall_portid - find the upcall portid to send upcall. * * @vport: vport from which the missed packet is received. * @skb: skb that the missed packet was received. * * Uses the skb_get_hash() to select the upcall portid to send the * upcall. * * Returns the portid of the target socket. Must be called with rcu_read_lock. */ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) { struct vport_portids *ids; u32 ids_index; u32 hash; ids = rcu_dereference(vport->upcall_portids); /* If there is only one portid, select it in the fast-path. */ if (ids->n_ids == 1) return ids->ids[0]; hash = skb_get_hash(skb); ids_index = hash - ids->n_ids * reciprocal_divide(hash, ids->rn_ids); return ids->ids[ids_index]; } /** * ovs_vport_receive - pass up received packet to the datapath for processing * * @vport: vport that received the packet * @skb: skb that was received * @tun_info: tunnel (if any) that carried packet * * Must be called with rcu_read_lock. The packet cannot be shared and * skb->data should point to the Ethernet header. */ int ovs_vport_receive(struct vport *vport, struct sk_buff *skb, const struct ip_tunnel_info *tun_info) { struct sw_flow_key key; int error; OVS_CB(skb)->input_vport = vport; OVS_CB(skb)->mru = 0; OVS_CB(skb)->cutlen = 0; OVS_CB(skb)->probability = 0; if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) { u32 mark; mark = skb->mark; skb_scrub_packet(skb, true); skb->mark = mark; tun_info = NULL; } /* Extract flow from 'skb' into 'key'. */ error = ovs_flow_key_extract(tun_info, skb, &key); if (unlikely(error)) { kfree_skb(skb); return error; } ovs_dp_process_packet(skb, &key); return 0; } static int packet_length(const struct sk_buff *skb, struct net_device *dev) { int length = skb->len - dev->hard_header_len; if (!skb_vlan_tag_present(skb) && eth_type_vlan(skb->protocol)) length -= VLAN_HLEN; /* Don't subtract for multiple VLAN tags. Most (all?) drivers allow * (ETH_LEN + VLAN_HLEN) in addition to the mtu value, but almost none * account for 802.1ad. e.g. is_skb_forwardable(). */ return length > 0 ? length : 0; } void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto) { int mtu = vport->dev->mtu; switch (vport->dev->type) { case ARPHRD_NONE: if (mac_proto == MAC_PROTO_ETHERNET) { skb_reset_network_header(skb); skb_reset_mac_len(skb); skb->protocol = htons(ETH_P_TEB); } else if (mac_proto != MAC_PROTO_NONE) { WARN_ON_ONCE(1); goto drop; } break; case ARPHRD_ETHER: if (mac_proto != MAC_PROTO_ETHERNET) goto drop; break; default: goto drop; } if (unlikely(packet_length(skb, vport->dev) > mtu && !skb_is_gso(skb))) { vport->dev->stats.tx_errors++; if (vport->dev->flags & IFF_UP) net_warn_ratelimited("%s: dropped over-mtu packet: " "%d > %d\n", vport->dev->name, packet_length(skb, vport->dev), mtu); goto drop; } skb->dev = vport->dev; skb_clear_tstamp(skb); vport->ops->send(skb); return; drop: kfree_skb(skb); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BLOCKGROUP_LOCK_H #define _LINUX_BLOCKGROUP_LOCK_H /* * Per-blockgroup locking for ext2 and ext3. * * Simple hashed spinlocking. */ #include <linux/spinlock.h> #include <linux/cache.h> #ifdef CONFIG_SMP #define NR_BG_LOCKS (4 << ilog2(NR_CPUS < 32 ? NR_CPUS : 32)) #else #define NR_BG_LOCKS 1 #endif struct bgl_lock { spinlock_t lock; } ____cacheline_aligned_in_smp; struct blockgroup_lock { struct bgl_lock locks[NR_BG_LOCKS]; }; static inline void bgl_lock_init(struct blockgroup_lock *bgl) { int i; for (i = 0; i < NR_BG_LOCKS; i++) spin_lock_init(&bgl->locks[i].lock); } static inline spinlock_t * bgl_lock_ptr(struct blockgroup_lock *bgl, unsigned int block_group) { return &bgl->locks[block_group & (NR_BG_LOCKS-1)].lock; } #endif
15 15 15 15 15 15 15 15 15 30 15 15 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2007-2012 Siemens AG * * Written by: * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> * Sergey Lapin <slapin@ossfans.org> * Maxim Gorbachyov <maxim.gorbachev@siemens.com> * Alexander Smirnov <alex.bluesman.smirnov@gmail.com> */ #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/crc-ccitt.h> #include <linux/unaligned.h> #include <net/rtnetlink.h> #include <net/ieee802154_netdev.h> #include <net/mac802154.h> #include <net/cfg802154.h> #include "ieee802154_i.h" #include "driver-ops.h" void ieee802154_xmit_sync_worker(struct work_struct *work) { struct ieee802154_local *local = container_of(work, struct ieee802154_local, sync_tx_work); struct sk_buff *skb = local->tx_skb; struct net_device *dev = skb->dev; int res; res = drv_xmit_sync(local, skb); if (res) goto err_tx; DEV_STATS_INC(dev, tx_packets); DEV_STATS_ADD(dev, tx_bytes, skb->len); ieee802154_xmit_complete(&local->hw, skb, false); return; err_tx: /* Restart the netif queue on each sub_if_data object. */ ieee802154_release_queue(local); if (atomic_dec_and_test(&local->phy->ongoing_txs)) wake_up(&local->phy->sync_txq); kfree_skb(skb); netdev_dbg(dev, "transmission failed\n"); } static netdev_tx_t ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) { struct net_device *dev = skb->dev; int ret; if (!(local->hw.flags & IEEE802154_HW_TX_OMIT_CKSUM)) { struct sk_buff *nskb; u16 crc; if (unlikely(skb_tailroom(skb) < IEEE802154_FCS_LEN)) { nskb = skb_copy_expand(skb, 0, IEEE802154_FCS_LEN, GFP_ATOMIC); if (likely(nskb)) { consume_skb(skb); skb = nskb; } else { goto err_free_skb; } } crc = crc_ccitt(0, skb->data, skb->len); put_unaligned_le16(crc, skb_put(skb, 2)); } /* Stop the netif queue on each sub_if_data object. */ ieee802154_hold_queue(local); atomic_inc(&local->phy->ongoing_txs); /* Drivers should preferably implement the async callback. In some rare * cases they only provide a sync callback which we will use as a * fallback. */ if (local->ops->xmit_async) { unsigned int len = skb->len; ret = drv_xmit_async(local, skb); if (ret) goto err_wake_netif_queue; DEV_STATS_INC(dev, tx_packets); DEV_STATS_ADD(dev, tx_bytes, len); } else { local->tx_skb = skb; queue_work(local->workqueue, &local->sync_tx_work); } return NETDEV_TX_OK; err_wake_netif_queue: ieee802154_release_queue(local); if (atomic_dec_and_test(&local->phy->ongoing_txs)) wake_up(&local->phy->sync_txq); err_free_skb: kfree_skb(skb); return NETDEV_TX_OK; } static int ieee802154_sync_queue(struct ieee802154_local *local) { int ret; ieee802154_hold_queue(local); ieee802154_disable_queue(local); wait_event(local->phy->sync_txq, !atomic_read(&local->phy->ongoing_txs)); ret = local->tx_result; ieee802154_release_queue(local); return ret; } int ieee802154_sync_and_hold_queue(struct ieee802154_local *local) { int ret; ieee802154_hold_queue(local); ret = ieee802154_sync_queue(local); set_bit(WPAN_PHY_FLAG_STATE_QUEUE_STOPPED, &local->phy->flags); return ret; } int ieee802154_mlme_op_pre(struct ieee802154_local *local) { return ieee802154_sync_and_hold_queue(local); } int ieee802154_mlme_tx_locked(struct ieee802154_local *local, struct ieee802154_sub_if_data *sdata, struct sk_buff *skb) { /* Avoid possible calls to ->ndo_stop() when we asynchronously perform * MLME transmissions. */ ASSERT_RTNL(); /* Ensure the device was not stopped, otherwise error out */ if (!local->open_count) return -ENETDOWN; /* Warn if the ieee802154 core thinks MLME frames can be sent while the * net interface expects this cannot happen. */ if (WARN_ON_ONCE(!netif_running(sdata->dev))) return -ENETDOWN; ieee802154_tx(local, skb); return ieee802154_sync_queue(local); } int ieee802154_mlme_tx(struct ieee802154_local *local, struct ieee802154_sub_if_data *sdata, struct sk_buff *skb) { int ret; rtnl_lock(); ret = ieee802154_mlme_tx_locked(local, sdata, skb); rtnl_unlock(); return ret; } void ieee802154_mlme_op_post(struct ieee802154_local *local) { ieee802154_release_queue(local); } int ieee802154_mlme_tx_one(struct ieee802154_local *local, struct ieee802154_sub_if_data *sdata, struct sk_buff *skb) { int ret; ieee802154_mlme_op_pre(local); ret = ieee802154_mlme_tx(local, sdata, skb); ieee802154_mlme_op_post(local); return ret; } int ieee802154_mlme_tx_one_locked(struct ieee802154_local *local, struct ieee802154_sub_if_data *sdata, struct sk_buff *skb) { int ret; ieee802154_mlme_op_pre(local); ret = ieee802154_mlme_tx_locked(local, sdata, skb); ieee802154_mlme_op_post(local); return ret; } static bool ieee802154_queue_is_stopped(struct ieee802154_local *local) { return test_bit(WPAN_PHY_FLAG_STATE_QUEUE_STOPPED, &local->phy->flags); } static netdev_tx_t ieee802154_hot_tx(struct ieee802154_local *local, struct sk_buff *skb) { /* Warn if the net interface tries to transmit frames while the * ieee802154 core assumes the queue is stopped. */ WARN_ON_ONCE(ieee802154_queue_is_stopped(local)); return ieee802154_tx(local, skb); } netdev_tx_t ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); skb->skb_iif = dev->ifindex; return ieee802154_hot_tx(sdata->local, skb); } netdev_tx_t ieee802154_subif_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); int rc; /* TODO we should move it to wpan_dev_hard_header and dev_hard_header * functions. The reason is wireshark will show a mac header which is * with security fields but the payload is not encrypted. */ rc = mac802154_llsec_encrypt(&sdata->sec, skb); if (rc) { netdev_warn(dev, "encryption failed: %i\n", rc); kfree_skb(skb); return NETDEV_TX_OK; } skb->skb_iif = dev->ifindex; return ieee802154_hot_tx(sdata->local, skb); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 /* SPDX-License-Identifier: GPL-2.0 */ /* atmdev.h - ATM device driver declarations and various related items */ #ifndef LINUX_ATMDEV_H #define LINUX_ATMDEV_H #include <linux/wait.h> /* wait_queue_head_t */ #include <linux/time.h> /* struct timeval */ #include <linux/net.h> #include <linux/bug.h> #include <linux/skbuff.h> /* struct sk_buff */ #include <linux/uio.h> #include <net/sock.h> #include <linux/atomic.h> #include <linux/refcount.h> #include <uapi/linux/atmdev.h> #ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> extern struct proc_dir_entry *atm_proc_root; #endif #ifdef CONFIG_COMPAT #include <linux/compat.h> struct compat_atm_iobuf { int length; compat_uptr_t buffer; }; #endif struct k_atm_aal_stats { #define __HANDLE_ITEM(i) atomic_t i __AAL_STAT_ITEMS #undef __HANDLE_ITEM }; struct k_atm_dev_stats { struct k_atm_aal_stats aal0; struct k_atm_aal_stats aal34; struct k_atm_aal_stats aal5; }; struct device; enum { ATM_VF_ADDR, /* Address is in use. Set by anybody, cleared by device driver. */ ATM_VF_READY, /* VC is ready to transfer data. Set by device driver, cleared by anybody. */ ATM_VF_PARTIAL, /* resources are bound to PVC (partial PVC setup), controlled by socket layer */ ATM_VF_REGIS, /* registered with demon, controlled by SVC socket layer */ ATM_VF_BOUND, /* local SAP is set, controlled by SVC socket layer */ ATM_VF_RELEASED, /* demon has indicated/requested release, controlled by SVC socket layer */ ATM_VF_HASQOS, /* QOS parameters have been set */ ATM_VF_LISTEN, /* socket is used for listening */ ATM_VF_META, /* SVC socket isn't used for normal data traffic and doesn't depend on signaling to be available */ ATM_VF_SESSION, /* VCC is p2mp session control descriptor */ ATM_VF_HASSAP, /* SAP has been set */ ATM_VF_CLOSE, /* asynchronous close - treat like VF_RELEASED*/ ATM_VF_WAITING, /* waiting for reply from sigd */ ATM_VF_IS_CLIP, /* in use by CLIP protocol */ }; #define ATM_VF2VS(flags) \ (test_bit(ATM_VF_READY,&(flags)) ? ATM_VS_CONNECTED : \ test_bit(ATM_VF_RELEASED,&(flags)) ? ATM_VS_CLOSING : \ test_bit(ATM_VF_LISTEN,&(flags)) ? ATM_VS_LISTEN : \ test_bit(ATM_VF_REGIS,&(flags)) ? ATM_VS_INUSE : \ test_bit(ATM_VF_BOUND,&(flags)) ? ATM_VS_BOUND : ATM_VS_IDLE) enum { ATM_DF_REMOVED, /* device was removed from atm_devs list */ }; #define ATM_PHY_SIG_LOST 0 /* no carrier/light */ #define ATM_PHY_SIG_UNKNOWN 1 /* carrier/light status is unknown */ #define ATM_PHY_SIG_FOUND 2 /* carrier/light okay */ #define ATM_ATMOPT_CLP 1 /* set CLP bit */ struct atm_vcc { /* struct sock has to be the first member of atm_vcc */ struct sock sk; unsigned long flags; /* VCC flags (ATM_VF_*) */ short vpi; /* VPI and VCI (types must be equal */ /* with sockaddr) */ int vci; unsigned long aal_options; /* AAL layer options */ unsigned long atm_options; /* ATM layer options */ struct atm_dev *dev; /* device back pointer */ struct atm_qos qos; /* QOS */ struct atm_sap sap; /* SAP */ void (*release_cb)(struct atm_vcc *vcc); /* release_sock callback */ void (*push)(struct atm_vcc *vcc,struct sk_buff *skb); void (*pop)(struct atm_vcc *vcc,struct sk_buff *skb); /* optional */ int (*push_oam)(struct atm_vcc *vcc,void *cell); int (*send)(struct atm_vcc *vcc,struct sk_buff *skb); void *dev_data; /* per-device data */ void *proto_data; /* per-protocol data */ struct k_atm_aal_stats *stats; /* pointer to AAL stats group */ struct module *owner; /* owner of ->push function */ /* SVC part --- may move later ------------------------------------- */ short itf; /* interface number */ struct sockaddr_atmsvc local; struct sockaddr_atmsvc remote; /* Multipoint part ------------------------------------------------- */ struct atm_vcc *session; /* session VCC descriptor */ /* Other stuff ----------------------------------------------------- */ void *user_back; /* user backlink - not touched by */ /* native ATM stack. Currently used */ /* by CLIP and sch_atm. */ }; static inline struct atm_vcc *atm_sk(struct sock *sk) { return (struct atm_vcc *)sk; } static inline struct atm_vcc *ATM_SD(struct socket *sock) { return atm_sk(sock->sk); } static inline struct sock *sk_atm(struct atm_vcc *vcc) { return (struct sock *)vcc; } struct atm_dev_addr { struct sockaddr_atmsvc addr; /* ATM address */ struct list_head entry; /* next address */ }; enum atm_addr_type_t { ATM_ADDR_LOCAL, ATM_ADDR_LECS }; struct atm_dev { const struct atmdev_ops *ops; /* device operations; NULL if unused */ const struct atmphy_ops *phy; /* PHY operations, may be undefined */ /* (NULL) */ const char *type; /* device type name */ int number; /* device index */ void *dev_data; /* per-device data */ void *phy_data; /* private PHY data */ unsigned long flags; /* device flags (ATM_DF_*) */ struct list_head local; /* local ATM addresses */ struct list_head lecs; /* LECS ATM addresses learned via ILMI */ unsigned char esi[ESI_LEN]; /* ESI ("MAC" addr) */ struct atm_cirange ci_range; /* VPI/VCI range */ struct k_atm_dev_stats stats; /* statistics */ char signal; /* signal status (ATM_PHY_SIG_*) */ int link_rate; /* link rate (default: OC3) */ refcount_t refcnt; /* reference count */ spinlock_t lock; /* protect internal members */ #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_entry; /* proc entry */ char *proc_name; /* proc entry name */ #endif struct device class_dev; /* sysfs device */ struct list_head dev_list; /* linkage */ }; /* OF: send_Oam Flags */ #define ATM_OF_IMMED 1 /* Attempt immediate delivery */ #define ATM_OF_INRATE 2 /* Attempt in-rate delivery */ struct atmdev_ops { /* only send is required */ void (*dev_close)(struct atm_dev *dev); int (*open)(struct atm_vcc *vcc); void (*close)(struct atm_vcc *vcc); int (*ioctl)(struct atm_dev *dev,unsigned int cmd,void __user *arg); #ifdef CONFIG_COMPAT int (*compat_ioctl)(struct atm_dev *dev,unsigned int cmd, void __user *arg); #endif int (*send)(struct atm_vcc *vcc,struct sk_buff *skb); int (*send_bh)(struct atm_vcc *vcc, struct sk_buff *skb); int (*send_oam)(struct atm_vcc *vcc,void *cell,int flags); void (*phy_put)(struct atm_dev *dev,unsigned char value, unsigned long addr); unsigned char (*phy_get)(struct atm_dev *dev,unsigned long addr); int (*change_qos)(struct atm_vcc *vcc,struct atm_qos *qos,int flags); int (*proc_read)(struct atm_dev *dev,loff_t *pos,char *page); struct module *owner; }; struct atmphy_ops { int (*start)(struct atm_dev *dev); int (*ioctl)(struct atm_dev *dev,unsigned int cmd,void __user *arg); void (*interrupt)(struct atm_dev *dev); int (*stop)(struct atm_dev *dev); }; struct atm_skb_data { struct atm_vcc *vcc; /* ATM VCC */ unsigned long atm_options; /* ATM layer options */ unsigned int acct_truesize; /* truesize accounted to vcc */ } __packed; #define VCC_HTABLE_SIZE 32 extern struct hlist_head vcc_hash[VCC_HTABLE_SIZE]; extern rwlock_t vcc_sklist_lock; #define ATM_SKB(skb) (((struct atm_skb_data *) (skb)->cb)) struct atm_dev *atm_dev_register(const char *type, struct device *parent, const struct atmdev_ops *ops, int number, /* -1 == pick first available */ unsigned long *flags); struct atm_dev *atm_dev_lookup(int number); void atm_dev_deregister(struct atm_dev *dev); /* atm_dev_signal_change * * Propagate lower layer signal change in atm_dev->signal to netdevice. * The event will be sent via a notifier call chain. */ void atm_dev_signal_change(struct atm_dev *dev, char signal); void vcc_insert_socket(struct sock *sk); void atm_dev_release_vccs(struct atm_dev *dev); static inline void atm_account_tx(struct atm_vcc *vcc, struct sk_buff *skb) { /* * Because ATM skbs may not belong to a sock (and we don't * necessarily want to), skb->truesize may be adjusted, * escaping the hack in pskb_expand_head() which avoids * doing so for some cases. So stash the value of truesize * at the time we accounted it, and atm_pop_raw() can use * that value later, in case it changes. */ refcount_add(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc); ATM_SKB(skb)->acct_truesize = skb->truesize; ATM_SKB(skb)->atm_options = vcc->atm_options; } static inline void atm_force_charge(struct atm_vcc *vcc,int truesize) { atomic_add(truesize, &sk_atm(vcc)->sk_rmem_alloc); } static inline void atm_return(struct atm_vcc *vcc,int truesize) { atomic_sub(truesize, &sk_atm(vcc)->sk_rmem_alloc); } static inline int atm_may_send(struct atm_vcc *vcc,unsigned int size) { return (size + refcount_read(&sk_atm(vcc)->sk_wmem_alloc)) < sk_atm(vcc)->sk_sndbuf; } static inline void atm_dev_hold(struct atm_dev *dev) { refcount_inc(&dev->refcnt); } static inline void atm_dev_put(struct atm_dev *dev) { if (refcount_dec_and_test(&dev->refcnt)) { BUG_ON(!test_bit(ATM_DF_REMOVED, &dev->flags)); if (dev->ops->dev_close) dev->ops->dev_close(dev); put_device(&dev->class_dev); } } int atm_charge(struct atm_vcc *vcc,int truesize); struct sk_buff *atm_alloc_charge(struct atm_vcc *vcc,int pdu_size, gfp_t gfp_flags); int atm_pcr_goal(const struct atm_trafprm *tp); void vcc_release_async(struct atm_vcc *vcc, int reply); struct atm_ioctl { struct module *owner; /* A module reference is kept if appropriate over this call. * Return -ENOIOCTLCMD if you don't handle it. */ int (*ioctl)(struct socket *, unsigned int cmd, unsigned long arg); struct list_head list; }; /** * register_atm_ioctl - register handler for ioctl operations * * Special (non-device) handlers of ioctl's should * register here. If you're a normal device, you should * set .ioctl in your atmdev_ops instead. */ void register_atm_ioctl(struct atm_ioctl *); /** * deregister_atm_ioctl - remove the ioctl handler */ void deregister_atm_ioctl(struct atm_ioctl *); /* register_atmdevice_notifier - register atm_dev notify events * * Clients like br2684 will register notify events * Currently we notify of signal found/lost */ int register_atmdevice_notifier(struct notifier_block *nb); void unregister_atmdevice_notifier(struct notifier_block *nb); #endif
2 2 2 2 2 20 20 12 12 3 1 2 2 2 3 2 3 3 2 3 2 3 2 1 2 1 3 1 1 1 3 1 3 3 3 3 8 8 8 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 // SPDX-License-Identifier: GPL-2.0-only /* * mac80211 ethtool hooks for cfg80211 * * Copied from cfg.c - originally * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2014 Intel Corporation (Author: Johannes Berg) * Copyright (C) 2018, 2022-2023 Intel Corporation */ #include <linux/types.h> #include <net/cfg80211.h> #include "ieee80211_i.h" #include "sta_info.h" #include "driver-ops.h" static int ieee80211_set_ringparam(struct net_device *dev, struct ethtool_ringparam *rp, struct kernel_ethtool_ringparam *kernel_rp, struct netlink_ext_ack *extack) { struct ieee80211_local *local = wiphy_priv(dev->ieee80211_ptr->wiphy); int ret; if (rp->rx_mini_pending != 0 || rp->rx_jumbo_pending != 0) return -EINVAL; wiphy_lock(local->hw.wiphy); ret = drv_set_ringparam(local, rp->tx_pending, rp->rx_pending); wiphy_unlock(local->hw.wiphy); return ret; } static void ieee80211_get_ringparam(struct net_device *dev, struct ethtool_ringparam *rp, struct kernel_ethtool_ringparam *kernel_rp, struct netlink_ext_ack *extack) { struct ieee80211_local *local = wiphy_priv(dev->ieee80211_ptr->wiphy); memset(rp, 0, sizeof(*rp)); wiphy_lock(local->hw.wiphy); drv_get_ringparam(local, &rp->tx_pending, &rp->tx_max_pending, &rp->rx_pending, &rp->rx_max_pending); wiphy_unlock(local->hw.wiphy); } static const char ieee80211_gstrings_sta_stats[][ETH_GSTRING_LEN] = { "rx_packets", "rx_bytes", "rx_duplicates", "rx_fragments", "rx_dropped", "tx_packets", "tx_bytes", "tx_filtered", "tx_retry_failed", "tx_retries", "sta_state", "txrate", "rxrate", "signal", "channel", "noise", "ch_time", "ch_time_busy", "ch_time_ext_busy", "ch_time_rx", "ch_time_tx" }; #define STA_STATS_LEN ARRAY_SIZE(ieee80211_gstrings_sta_stats) static int ieee80211_get_sset_count(struct net_device *dev, int sset) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); int rv = 0; if (sset == ETH_SS_STATS) rv += STA_STATS_LEN; rv += drv_get_et_sset_count(sdata, sset); if (rv == 0) return -EOPNOTSUPP; return rv; } static void ieee80211_get_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_channel *channel; struct sta_info *sta; struct ieee80211_local *local = sdata->local; struct station_info sinfo; struct survey_info survey; int i, q; #define STA_STATS_SURVEY_LEN 7 memset(data, 0, sizeof(u64) * STA_STATS_LEN); #define ADD_STA_STATS(sta) \ do { \ data[i++] += sinfo.rx_packets; \ data[i++] += sinfo.rx_bytes; \ data[i++] += (sta)->rx_stats.num_duplicates; \ data[i++] += (sta)->rx_stats.fragments; \ data[i++] += sinfo.rx_dropped_misc; \ \ data[i++] += sinfo.tx_packets; \ data[i++] += sinfo.tx_bytes; \ data[i++] += (sta)->status_stats.filtered; \ data[i++] += sinfo.tx_failed; \ data[i++] += sinfo.tx_retries; \ } while (0) /* For Managed stations, find the single station based on BSSID * and use that. For interface types, iterate through all available * stations and add stats for any station that is assigned to this * network device. */ wiphy_lock(local->hw.wiphy); if (sdata->vif.type == NL80211_IFTYPE_STATION) { sta = sta_info_get_bss(sdata, sdata->deflink.u.mgd.bssid); if (!(sta && !WARN_ON(sta->sdata->dev != dev))) goto do_survey; memset(&sinfo, 0, sizeof(sinfo)); sta_set_sinfo(sta, &sinfo, false); i = 0; ADD_STA_STATS(&sta->deflink); data[i++] = sta->sta_state; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) data[i] = 100000ULL * cfg80211_calculate_bitrate(&sinfo.txrate); i++; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) data[i] = 100000ULL * cfg80211_calculate_bitrate(&sinfo.rxrate); i++; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG)) data[i] = (u8)sinfo.signal_avg; i++; } else { list_for_each_entry(sta, &local->sta_list, list) { /* Make sure this station belongs to the proper dev */ if (sta->sdata->dev != dev) continue; memset(&sinfo, 0, sizeof(sinfo)); sta_set_sinfo(sta, &sinfo, false); i = 0; ADD_STA_STATS(&sta->deflink); } } do_survey: i = STA_STATS_LEN - STA_STATS_SURVEY_LEN; /* Get survey stats for current channel */ survey.filled = 0; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (chanctx_conf) channel = chanctx_conf->def.chan; else channel = NULL; rcu_read_unlock(); if (channel) { q = 0; do { survey.filled = 0; if (drv_get_survey(local, q, &survey) != 0) { survey.filled = 0; break; } q++; } while (channel != survey.channel); } if (survey.filled) data[i++] = survey.channel->center_freq; else data[i++] = 0; if (survey.filled & SURVEY_INFO_NOISE_DBM) data[i++] = (u8)survey.noise; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME) data[i++] = survey.time; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_BUSY) data[i++] = survey.time_busy; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_EXT_BUSY) data[i++] = survey.time_ext_busy; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_RX) data[i++] = survey.time_rx; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_TX) data[i++] = survey.time_tx; else data[i++] = -1LL; if (WARN_ON(i != STA_STATS_LEN)) { wiphy_unlock(local->hw.wiphy); return; } drv_get_et_stats(sdata, stats, &(data[STA_STATS_LEN])); wiphy_unlock(local->hw.wiphy); } static void ieee80211_get_strings(struct net_device *dev, u32 sset, u8 *data) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); int sz_sta_stats = 0; if (sset == ETH_SS_STATS) { sz_sta_stats = sizeof(ieee80211_gstrings_sta_stats); memcpy(data, ieee80211_gstrings_sta_stats, sz_sta_stats); } drv_get_et_strings(sdata, sset, &(data[sz_sta_stats])); } static int ieee80211_get_regs_len(struct net_device *dev) { return 0; } static void ieee80211_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *data) { struct wireless_dev *wdev = dev->ieee80211_ptr; regs->version = wdev->wiphy->hw_version; regs->len = 0; } const struct ethtool_ops ieee80211_ethtool_ops = { .get_drvinfo = cfg80211_get_drvinfo, .get_regs_len = ieee80211_get_regs_len, .get_regs = ieee80211_get_regs, .get_link = ethtool_op_get_link, .get_ringparam = ieee80211_get_ringparam, .set_ringparam = ieee80211_set_ringparam, .get_strings = ieee80211_get_strings, .get_ethtool_stats = ieee80211_get_stats, .get_sset_count = ieee80211_get_sset_count, };
104 104 104 104 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 5 5 5 5 3 3 3 3 3 3 3 3 3 3 5 5 3 5 5 5 5 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 // SPDX-License-Identifier: GPL-2.0-only /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * SMC statistics netlink routines * * Copyright IBM Corp. 2021 * * Author(s): Guvenc Gulce */ #include <linux/init.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/ctype.h> #include <linux/smc.h> #include <net/genetlink.h> #include <net/sock.h> #include "smc_netlink.h" #include "smc_stats.h" int smc_stats_init(struct net *net) { net->smc.fback_rsn = kzalloc(sizeof(*net->smc.fback_rsn), GFP_KERNEL); if (!net->smc.fback_rsn) goto err_fback; net->smc.smc_stats = alloc_percpu(struct smc_stats); if (!net->smc.smc_stats) goto err_stats; mutex_init(&net->smc.mutex_fback_rsn); return 0; err_stats: kfree(net->smc.fback_rsn); err_fback: return -ENOMEM; } void smc_stats_exit(struct net *net) { kfree(net->smc.fback_rsn); if (net->smc.smc_stats) free_percpu(net->smc.smc_stats); } static int smc_nl_fill_stats_rmb_data(struct sk_buff *skb, struct smc_stats *stats, int tech, int type) { struct smc_stats_rmbcnt *stats_rmb_cnt; struct nlattr *attrs; if (type == SMC_NLA_STATS_T_TX_RMB_STATS) stats_rmb_cnt = &stats->smc[tech].rmb_tx; else stats_rmb_cnt = &stats->smc[tech].rmb_rx; attrs = nla_nest_start(skb, type); if (!attrs) goto errout; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_REUSE_CNT, stats_rmb_cnt->reuse_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_SIZE_SM_PEER_CNT, stats_rmb_cnt->buf_size_small_peer_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_SIZE_SM_CNT, stats_rmb_cnt->buf_size_small_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_FULL_PEER_CNT, stats_rmb_cnt->buf_full_peer_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_FULL_CNT, stats_rmb_cnt->buf_full_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_ALLOC_CNT, stats_rmb_cnt->alloc_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_DGRADE_CNT, stats_rmb_cnt->dgrade_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; nla_nest_end(skb, attrs); return 0; errattr: nla_nest_cancel(skb, attrs); errout: return -EMSGSIZE; } static int smc_nl_fill_stats_bufsize_data(struct sk_buff *skb, struct smc_stats *stats, int tech, int type) { struct smc_stats_memsize *stats_pload; struct nlattr *attrs; if (type == SMC_NLA_STATS_T_TXPLOAD_SIZE) stats_pload = &stats->smc[tech].tx_pd; else if (type == SMC_NLA_STATS_T_RXPLOAD_SIZE) stats_pload = &stats->smc[tech].rx_pd; else if (type == SMC_NLA_STATS_T_TX_RMB_SIZE) stats_pload = &stats->smc[tech].tx_rmbsize; else if (type == SMC_NLA_STATS_T_RX_RMB_SIZE) stats_pload = &stats->smc[tech].rx_rmbsize; else goto errout; attrs = nla_nest_start(skb, type); if (!attrs) goto errout; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_8K, stats_pload->buf[SMC_BUF_8K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_16K, stats_pload->buf[SMC_BUF_16K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_32K, stats_pload->buf[SMC_BUF_32K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_64K, stats_pload->buf[SMC_BUF_64K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_128K, stats_pload->buf[SMC_BUF_128K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_256K, stats_pload->buf[SMC_BUF_256K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_512K, stats_pload->buf[SMC_BUF_512K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_1024K, stats_pload->buf[SMC_BUF_1024K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_G_1024K, stats_pload->buf[SMC_BUF_G_1024K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; nla_nest_end(skb, attrs); return 0; errattr: nla_nest_cancel(skb, attrs); errout: return -EMSGSIZE; } static int smc_nl_fill_stats_tech_data(struct sk_buff *skb, struct smc_stats *stats, int tech) { struct smc_stats_tech *smc_tech; struct nlattr *attrs; smc_tech = &stats->smc[tech]; if (tech == SMC_TYPE_D) attrs = nla_nest_start(skb, SMC_NLA_STATS_SMCD_TECH); else attrs = nla_nest_start(skb, SMC_NLA_STATS_SMCR_TECH); if (!attrs) goto errout; if (smc_nl_fill_stats_rmb_data(skb, stats, tech, SMC_NLA_STATS_T_TX_RMB_STATS)) goto errattr; if (smc_nl_fill_stats_rmb_data(skb, stats, tech, SMC_NLA_STATS_T_RX_RMB_STATS)) goto errattr; if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, SMC_NLA_STATS_T_TXPLOAD_SIZE)) goto errattr; if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, SMC_NLA_STATS_T_RXPLOAD_SIZE)) goto errattr; if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, SMC_NLA_STATS_T_TX_RMB_SIZE)) goto errattr; if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, SMC_NLA_STATS_T_RX_RMB_SIZE)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CLNT_V1_SUCC, smc_tech->clnt_v1_succ_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CLNT_V2_SUCC, smc_tech->clnt_v2_succ_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SRV_V1_SUCC, smc_tech->srv_v1_succ_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SRV_V2_SUCC, smc_tech->srv_v2_succ_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_RX_BYTES, smc_tech->rx_bytes, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_TX_BYTES, smc_tech->tx_bytes, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_uint(skb, SMC_NLA_STATS_T_RX_RMB_USAGE, smc_tech->rx_rmbuse)) goto errattr; if (nla_put_uint(skb, SMC_NLA_STATS_T_TX_RMB_USAGE, smc_tech->tx_rmbuse)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_RX_CNT, smc_tech->rx_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_TX_CNT, smc_tech->tx_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SENDPAGE_CNT, 0, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CORK_CNT, smc_tech->cork_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_NDLY_CNT, smc_tech->ndly_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SPLICE_CNT, smc_tech->splice_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_URG_DATA_CNT, smc_tech->urg_data_cnt, SMC_NLA_STATS_PAD)) goto errattr; nla_nest_end(skb, attrs); return 0; errattr: nla_nest_cancel(skb, attrs); errout: return -EMSGSIZE; } int smc_nl_get_stats(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); struct net *net = sock_net(skb->sk); struct smc_stats *stats; struct nlattr *attrs; int cpu, i, size; void *nlh; u64 *src; u64 *sum; if (cb_ctx->pos[0]) goto errmsg; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_STATS); if (!nlh) goto errmsg; attrs = nla_nest_start(skb, SMC_GEN_STATS); if (!attrs) goto errnest; stats = kzalloc(sizeof(*stats), GFP_KERNEL); if (!stats) goto erralloc; size = sizeof(*stats) / sizeof(u64); for_each_possible_cpu(cpu) { src = (u64 *)per_cpu_ptr(net->smc.smc_stats, cpu); sum = (u64 *)stats; for (i = 0; i < size; i++) *(sum++) += *(src++); } if (smc_nl_fill_stats_tech_data(skb, stats, SMC_TYPE_D)) goto errattr; if (smc_nl_fill_stats_tech_data(skb, stats, SMC_TYPE_R)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_CLNT_HS_ERR_CNT, stats->clnt_hshake_err_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_SRV_HS_ERR_CNT, stats->srv_hshake_err_cnt, SMC_NLA_STATS_PAD)) goto errattr; nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); cb_ctx->pos[0] = 1; kfree(stats); return skb->len; errattr: kfree(stats); erralloc: nla_nest_cancel(skb, attrs); errnest: genlmsg_cancel(skb, nlh); errmsg: return skb->len; } static int smc_nl_get_fback_details(struct sk_buff *skb, struct netlink_callback *cb, int pos, bool is_srv) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); struct net *net = sock_net(skb->sk); int cnt_reported = cb_ctx->pos[2]; struct smc_stats_fback *trgt_arr; struct nlattr *attrs; int rc = 0; void *nlh; if (is_srv) trgt_arr = &net->smc.fback_rsn->srv[0]; else trgt_arr = &net->smc.fback_rsn->clnt[0]; if (!trgt_arr[pos].fback_code) return -ENODATA; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_FBACK_STATS); if (!nlh) goto errmsg; attrs = nla_nest_start(skb, SMC_GEN_FBACK_STATS); if (!attrs) goto errout; if (nla_put_u8(skb, SMC_NLA_FBACK_STATS_TYPE, is_srv)) goto errattr; if (!cnt_reported) { if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_SRV_CNT, net->smc.fback_rsn->srv_fback_cnt, SMC_NLA_FBACK_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_CLNT_CNT, net->smc.fback_rsn->clnt_fback_cnt, SMC_NLA_FBACK_STATS_PAD)) goto errattr; cnt_reported = 1; } if (nla_put_u32(skb, SMC_NLA_FBACK_STATS_RSN_CODE, trgt_arr[pos].fback_code)) goto errattr; if (nla_put_u16(skb, SMC_NLA_FBACK_STATS_RSN_CNT, trgt_arr[pos].count)) goto errattr; cb_ctx->pos[2] = cnt_reported; nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); return rc; errattr: nla_nest_cancel(skb, attrs); errout: genlmsg_cancel(skb, nlh); errmsg: return -EMSGSIZE; } int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); struct net *net = sock_net(skb->sk); int rc_srv = 0, rc_clnt = 0, k; int skip_serv = cb_ctx->pos[1]; int snum = cb_ctx->pos[0]; bool is_srv = true; mutex_lock(&net->smc.mutex_fback_rsn); for (k = 0; k < SMC_MAX_FBACK_RSN_CNT; k++) { if (k < snum) continue; if (!skip_serv) { rc_srv = smc_nl_get_fback_details(skb, cb, k, is_srv); if (rc_srv && rc_srv != -ENODATA) break; } else { skip_serv = 0; } rc_clnt = smc_nl_get_fback_details(skb, cb, k, !is_srv); if (rc_clnt && rc_clnt != -ENODATA) { skip_serv = 1; break; } if (rc_clnt == -ENODATA && rc_srv == -ENODATA) break; } mutex_unlock(&net->smc.mutex_fback_rsn); cb_ctx->pos[1] = skip_serv; cb_ctx->pos[0] = k; return skb->len; }
131 232 87 87 88 88 84 83 84 82 84 74 84 111 112 55 57 57 110 17 136 136 137 141 145 146 146 146 4 145 145 2 146 113 144 144 1 5 13 7 5 140 131 25 8 3 112 139 130 129 8 7 4 4 4 3 4 4 4 4 88 84 5 61 57 4 33 27 6 114 114 4 4 3 4 4 4 4 4 4 4 8 3 5 8 7 5 5 1 12 147 142 12 12 11 12 146 12 137 6 9 9 77 44 27 27 27 27 27 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 // SPDX-License-Identifier: GPL-2.0-or-later /* * Asynchronous Cryptographic Hash operations. * * This is the implementation of the ahash (asynchronous hash) API. It differs * from shash (synchronous hash) in that ahash supports asynchronous operations, * and it hashes data from scatterlists instead of virtually addressed buffers. * * The ahash API provides access to both ahash and shash algorithms. The shash * API only provides access to shash algorithms. * * Copyright (c) 2008 Loc Ho <lho@amcc.com> */ #include <crypto/scatterwalk.h> #include <linux/cryptouser.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/seq_file.h> #include <linux/string.h> #include <net/netlink.h> #include "hash.h" #define CRYPTO_ALG_TYPE_AHASH_MASK 0x0000000e /* * For an ahash tfm that is using an shash algorithm (instead of an ahash * algorithm), this returns the underlying shash tfm. */ static inline struct crypto_shash *ahash_to_shash(struct crypto_ahash *tfm) { return *(struct crypto_shash **)crypto_ahash_ctx(tfm); } static inline struct shash_desc *prepare_shash_desc(struct ahash_request *req, struct crypto_ahash *tfm) { struct shash_desc *desc = ahash_request_ctx(req); desc->tfm = ahash_to_shash(tfm); return desc; } int shash_ahash_update(struct ahash_request *req, struct shash_desc *desc) { struct crypto_hash_walk walk; int nbytes; for (nbytes = crypto_hash_walk_first(req, &walk); nbytes > 0; nbytes = crypto_hash_walk_done(&walk, nbytes)) nbytes = crypto_shash_update(desc, walk.data, nbytes); return nbytes; } EXPORT_SYMBOL_GPL(shash_ahash_update); int shash_ahash_finup(struct ahash_request *req, struct shash_desc *desc) { struct crypto_hash_walk walk; int nbytes; nbytes = crypto_hash_walk_first(req, &walk); if (!nbytes) return crypto_shash_final(desc, req->result); do { nbytes = crypto_hash_walk_last(&walk) ? crypto_shash_finup(desc, walk.data, nbytes, req->result) : crypto_shash_update(desc, walk.data, nbytes); nbytes = crypto_hash_walk_done(&walk, nbytes); } while (nbytes > 0); return nbytes; } EXPORT_SYMBOL_GPL(shash_ahash_finup); int shash_ahash_digest(struct ahash_request *req, struct shash_desc *desc) { unsigned int nbytes = req->nbytes; struct scatterlist *sg; unsigned int offset; int err; if (nbytes && (sg = req->src, offset = sg->offset, nbytes <= min(sg->length, ((unsigned int)(PAGE_SIZE)) - offset))) { void *data; data = kmap_local_page(sg_page(sg)); err = crypto_shash_digest(desc, data + offset, nbytes, req->result); kunmap_local(data); } else err = crypto_shash_init(desc) ?: shash_ahash_finup(req, desc); return err; } EXPORT_SYMBOL_GPL(shash_ahash_digest); static void crypto_exit_ahash_using_shash(struct crypto_tfm *tfm) { struct crypto_shash **ctx = crypto_tfm_ctx(tfm); crypto_free_shash(*ctx); } static int crypto_init_ahash_using_shash(struct crypto_tfm *tfm) { struct crypto_alg *calg = tfm->__crt_alg; struct crypto_ahash *crt = __crypto_ahash_cast(tfm); struct crypto_shash **ctx = crypto_tfm_ctx(tfm); struct crypto_shash *shash; if (!crypto_mod_get(calg)) return -EAGAIN; shash = crypto_create_tfm(calg, &crypto_shash_type); if (IS_ERR(shash)) { crypto_mod_put(calg); return PTR_ERR(shash); } crt->using_shash = true; *ctx = shash; tfm->exit = crypto_exit_ahash_using_shash; crypto_ahash_set_flags(crt, crypto_shash_get_flags(shash) & CRYPTO_TFM_NEED_KEY); crt->reqsize = sizeof(struct shash_desc) + crypto_shash_descsize(shash); return 0; } static int hash_walk_next(struct crypto_hash_walk *walk) { unsigned int offset = walk->offset; unsigned int nbytes = min(walk->entrylen, ((unsigned int)(PAGE_SIZE)) - offset); walk->data = kmap_local_page(walk->pg); walk->data += offset; walk->entrylen -= nbytes; return nbytes; } static int hash_walk_new_entry(struct crypto_hash_walk *walk) { struct scatterlist *sg; sg = walk->sg; walk->offset = sg->offset; walk->pg = sg_page(walk->sg) + (walk->offset >> PAGE_SHIFT); walk->offset = offset_in_page(walk->offset); walk->entrylen = sg->length; if (walk->entrylen > walk->total) walk->entrylen = walk->total; walk->total -= walk->entrylen; return hash_walk_next(walk); } int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err) { walk->data -= walk->offset; kunmap_local(walk->data); crypto_yield(walk->flags); if (err) return err; if (walk->entrylen) { walk->offset = 0; walk->pg++; return hash_walk_next(walk); } if (!walk->total) return 0; walk->sg = sg_next(walk->sg); return hash_walk_new_entry(walk); } EXPORT_SYMBOL_GPL(crypto_hash_walk_done); int crypto_hash_walk_first(struct ahash_request *req, struct crypto_hash_walk *walk) { walk->total = req->nbytes; if (!walk->total) { walk->entrylen = 0; return 0; } walk->sg = req->src; walk->flags = req->base.flags; return hash_walk_new_entry(walk); } EXPORT_SYMBOL_GPL(crypto_hash_walk_first); static int ahash_nosetkey(struct crypto_ahash *tfm, const u8 *key, unsigned int keylen) { return -ENOSYS; } static void ahash_set_needkey(struct crypto_ahash *tfm, struct ahash_alg *alg) { if (alg->setkey != ahash_nosetkey && !(alg->halg.base.cra_flags & CRYPTO_ALG_OPTIONAL_KEY)) crypto_ahash_set_flags(tfm, CRYPTO_TFM_NEED_KEY); } int crypto_ahash_setkey(struct crypto_ahash *tfm, const u8 *key, unsigned int keylen) { if (likely(tfm->using_shash)) { struct crypto_shash *shash = ahash_to_shash(tfm); int err; err = crypto_shash_setkey(shash, key, keylen); if (unlikely(err)) { crypto_ahash_set_flags(tfm, crypto_shash_get_flags(shash) & CRYPTO_TFM_NEED_KEY); return err; } } else { struct ahash_alg *alg = crypto_ahash_alg(tfm); int err; err = alg->setkey(tfm, key, keylen); if (unlikely(err)) { ahash_set_needkey(tfm, alg); return err; } } crypto_ahash_clear_flags(tfm, CRYPTO_TFM_NEED_KEY); return 0; } EXPORT_SYMBOL_GPL(crypto_ahash_setkey); int crypto_ahash_init(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); if (likely(tfm->using_shash)) return crypto_shash_init(prepare_shash_desc(req, tfm)); if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; return crypto_ahash_alg(tfm)->init(req); } EXPORT_SYMBOL_GPL(crypto_ahash_init); static int ahash_save_req(struct ahash_request *req, crypto_completion_t cplt, bool has_state) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); unsigned int ds = crypto_ahash_digestsize(tfm); struct ahash_request *subreq; unsigned int subreq_size; unsigned int reqsize; u8 *result; gfp_t gfp; u32 flags; subreq_size = sizeof(*subreq); reqsize = crypto_ahash_reqsize(tfm); reqsize = ALIGN(reqsize, crypto_tfm_ctx_alignment()); subreq_size += reqsize; subreq_size += ds; flags = ahash_request_flags(req); gfp = (flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? GFP_KERNEL : GFP_ATOMIC; subreq = kmalloc(subreq_size, gfp); if (!subreq) return -ENOMEM; ahash_request_set_tfm(subreq, tfm); ahash_request_set_callback(subreq, flags, cplt, req); result = (u8 *)(subreq + 1) + reqsize; ahash_request_set_crypt(subreq, req->src, result, req->nbytes); if (has_state) { void *state; state = kmalloc(crypto_ahash_statesize(tfm), gfp); if (!state) { kfree(subreq); return -ENOMEM; } crypto_ahash_export(req, state); crypto_ahash_import(subreq, state); kfree_sensitive(state); } req->priv = subreq; return 0; } static void ahash_restore_req(struct ahash_request *req, int err) { struct ahash_request *subreq = req->priv; if (!err) memcpy(req->result, subreq->result, crypto_ahash_digestsize(crypto_ahash_reqtfm(req))); req->priv = NULL; kfree_sensitive(subreq); } int crypto_ahash_update(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); if (likely(tfm->using_shash)) return shash_ahash_update(req, ahash_request_ctx(req)); return crypto_ahash_alg(tfm)->update(req); } EXPORT_SYMBOL_GPL(crypto_ahash_update); int crypto_ahash_final(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); if (likely(tfm->using_shash)) return crypto_shash_final(ahash_request_ctx(req), req->result); return crypto_ahash_alg(tfm)->final(req); } EXPORT_SYMBOL_GPL(crypto_ahash_final); int crypto_ahash_finup(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); if (likely(tfm->using_shash)) return shash_ahash_finup(req, ahash_request_ctx(req)); return crypto_ahash_alg(tfm)->finup(req); } EXPORT_SYMBOL_GPL(crypto_ahash_finup); int crypto_ahash_digest(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); if (likely(tfm->using_shash)) return shash_ahash_digest(req, prepare_shash_desc(req, tfm)); if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; return crypto_ahash_alg(tfm)->digest(req); } EXPORT_SYMBOL_GPL(crypto_ahash_digest); static void ahash_def_finup_done2(void *data, int err) { struct ahash_request *areq = data; if (err == -EINPROGRESS) return; ahash_restore_req(areq, err); ahash_request_complete(areq, err); } static int ahash_def_finup_finish1(struct ahash_request *req, int err) { struct ahash_request *subreq = req->priv; if (err) goto out; subreq->base.complete = ahash_def_finup_done2; err = crypto_ahash_alg(crypto_ahash_reqtfm(req))->final(subreq); if (err == -EINPROGRESS || err == -EBUSY) return err; out: ahash_restore_req(req, err); return err; } static void ahash_def_finup_done1(void *data, int err) { struct ahash_request *areq = data; struct ahash_request *subreq; if (err == -EINPROGRESS) goto out; subreq = areq->priv; subreq->base.flags &= CRYPTO_TFM_REQ_MAY_BACKLOG; err = ahash_def_finup_finish1(areq, err); if (err == -EINPROGRESS || err == -EBUSY) return; out: ahash_request_complete(areq, err); } static int ahash_def_finup(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); int err; err = ahash_save_req(req, ahash_def_finup_done1, true); if (err) return err; err = crypto_ahash_alg(tfm)->update(req->priv); if (err == -EINPROGRESS || err == -EBUSY) return err; return ahash_def_finup_finish1(req, err); } int crypto_ahash_export(struct ahash_request *req, void *out) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); if (likely(tfm->using_shash)) return crypto_shash_export(ahash_request_ctx(req), out); return crypto_ahash_alg(tfm)->export(req, out); } EXPORT_SYMBOL_GPL(crypto_ahash_export); int crypto_ahash_import(struct ahash_request *req, const void *in) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); if (likely(tfm->using_shash)) return crypto_shash_import(prepare_shash_desc(req, tfm), in); if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; return crypto_ahash_alg(tfm)->import(req, in); } EXPORT_SYMBOL_GPL(crypto_ahash_import); static void crypto_ahash_exit_tfm(struct crypto_tfm *tfm) { struct crypto_ahash *hash = __crypto_ahash_cast(tfm); struct ahash_alg *alg = crypto_ahash_alg(hash); alg->exit_tfm(hash); } static int crypto_ahash_init_tfm(struct crypto_tfm *tfm) { struct crypto_ahash *hash = __crypto_ahash_cast(tfm); struct ahash_alg *alg = crypto_ahash_alg(hash); crypto_ahash_set_statesize(hash, alg->halg.statesize); if (tfm->__crt_alg->cra_type == &crypto_shash_type) return crypto_init_ahash_using_shash(tfm); ahash_set_needkey(hash, alg); if (alg->exit_tfm) tfm->exit = crypto_ahash_exit_tfm; return alg->init_tfm ? alg->init_tfm(hash) : 0; } static unsigned int crypto_ahash_extsize(struct crypto_alg *alg) { if (alg->cra_type == &crypto_shash_type) return sizeof(struct crypto_shash *); return crypto_alg_extsize(alg); } static void crypto_ahash_free_instance(struct crypto_instance *inst) { struct ahash_instance *ahash = ahash_instance(inst); ahash->free(ahash); } static int __maybe_unused crypto_ahash_report( struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_hash rhash; memset(&rhash, 0, sizeof(rhash)); strscpy(rhash.type, "ahash", sizeof(rhash.type)); rhash.blocksize = alg->cra_blocksize; rhash.digestsize = __crypto_hash_alg_common(alg)->digestsize; return nla_put(skb, CRYPTOCFGA_REPORT_HASH, sizeof(rhash), &rhash); } static void crypto_ahash_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_ahash_show(struct seq_file *m, struct crypto_alg *alg) { seq_printf(m, "type : ahash\n"); seq_printf(m, "async : %s\n", alg->cra_flags & CRYPTO_ALG_ASYNC ? "yes" : "no"); seq_printf(m, "blocksize : %u\n", alg->cra_blocksize); seq_printf(m, "digestsize : %u\n", __crypto_hash_alg_common(alg)->digestsize); } static const struct crypto_type crypto_ahash_type = { .extsize = crypto_ahash_extsize, .init_tfm = crypto_ahash_init_tfm, .free = crypto_ahash_free_instance, #ifdef CONFIG_PROC_FS .show = crypto_ahash_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_ahash_report, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_AHASH_MASK, .type = CRYPTO_ALG_TYPE_AHASH, .tfmsize = offsetof(struct crypto_ahash, base), }; int crypto_grab_ahash(struct crypto_ahash_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { spawn->base.frontend = &crypto_ahash_type; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } EXPORT_SYMBOL_GPL(crypto_grab_ahash); struct crypto_ahash *crypto_alloc_ahash(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_ahash_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_ahash); int crypto_has_ahash(const char *alg_name, u32 type, u32 mask) { return crypto_type_has_alg(alg_name, &crypto_ahash_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_has_ahash); static bool crypto_hash_alg_has_setkey(struct hash_alg_common *halg) { struct crypto_alg *alg = &halg->base; if (alg->cra_type == &crypto_shash_type) return crypto_shash_alg_has_setkey(__crypto_shash_alg(alg)); return __crypto_ahash_alg(alg)->setkey != ahash_nosetkey; } struct crypto_ahash *crypto_clone_ahash(struct crypto_ahash *hash) { struct hash_alg_common *halg = crypto_hash_alg_common(hash); struct crypto_tfm *tfm = crypto_ahash_tfm(hash); struct crypto_ahash *nhash; struct ahash_alg *alg; int err; if (!crypto_hash_alg_has_setkey(halg)) { tfm = crypto_tfm_get(tfm); if (IS_ERR(tfm)) return ERR_CAST(tfm); return hash; } nhash = crypto_clone_tfm(&crypto_ahash_type, tfm); if (IS_ERR(nhash)) return nhash; nhash->reqsize = hash->reqsize; nhash->statesize = hash->statesize; if (likely(hash->using_shash)) { struct crypto_shash **nctx = crypto_ahash_ctx(nhash); struct crypto_shash *shash; shash = crypto_clone_shash(ahash_to_shash(hash)); if (IS_ERR(shash)) { err = PTR_ERR(shash); goto out_free_nhash; } nhash->using_shash = true; *nctx = shash; return nhash; } err = -ENOSYS; alg = crypto_ahash_alg(hash); if (!alg->clone_tfm) goto out_free_nhash; err = alg->clone_tfm(nhash, hash); if (err) goto out_free_nhash; return nhash; out_free_nhash: crypto_free_ahash(nhash); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(crypto_clone_ahash); static int ahash_prepare_alg(struct ahash_alg *alg) { struct crypto_alg *base = &alg->halg.base; int err; if (alg->halg.statesize == 0) return -EINVAL; err = hash_prepare_alg(&alg->halg); if (err) return err; base->cra_type = &crypto_ahash_type; base->cra_flags |= CRYPTO_ALG_TYPE_AHASH; if (!alg->finup) alg->finup = ahash_def_finup; if (!alg->setkey) alg->setkey = ahash_nosetkey; return 0; } int crypto_register_ahash(struct ahash_alg *alg) { struct crypto_alg *base = &alg->halg.base; int err; err = ahash_prepare_alg(alg); if (err) return err; return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_ahash); void crypto_unregister_ahash(struct ahash_alg *alg) { crypto_unregister_alg(&alg->halg.base); } EXPORT_SYMBOL_GPL(crypto_unregister_ahash); int crypto_register_ahashes(struct ahash_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_ahash(&algs[i]); if (ret) goto err; } return 0; err: for (--i; i >= 0; --i) crypto_unregister_ahash(&algs[i]); return ret; } EXPORT_SYMBOL_GPL(crypto_register_ahashes); void crypto_unregister_ahashes(struct ahash_alg *algs, int count) { int i; for (i = count - 1; i >= 0; --i) crypto_unregister_ahash(&algs[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_ahashes); int ahash_register_instance(struct crypto_template *tmpl, struct ahash_instance *inst) { int err; if (WARN_ON(!inst->free)) return -EINVAL; err = ahash_prepare_alg(&inst->alg); if (err) return err; return crypto_register_instance(tmpl, ahash_crypto_instance(inst)); } EXPORT_SYMBOL_GPL(ahash_register_instance); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Asynchronous cryptographic hash type");
31 16 16 16 16 15 15 16 16 16 16 16 15 15 15 15 9 9 9 9 9 9 15 15 41 41 41 41 23 31 41 41 10 41 26 41 30 41 42 42 42 42 22 33 42 38 14 38 18 38 32 42 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 16 30 31 31 31 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 /* * Rate conversion Plug-In * Copyright (c) 1999 by Jaroslav Kysela <perex@perex.cz> * * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Library General Public License as * published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ #include <linux/time.h> #include <sound/core.h> #include <sound/pcm.h> #include "pcm_plugin.h" #define SHIFT 11 #define BITS (1<<SHIFT) #define R_MASK (BITS-1) /* * Basic rate conversion plugin */ struct rate_channel { signed short last_S1; signed short last_S2; }; typedef void (*rate_f)(struct snd_pcm_plugin *plugin, const struct snd_pcm_plugin_channel *src_channels, struct snd_pcm_plugin_channel *dst_channels, int src_frames, int dst_frames); struct rate_priv { unsigned int pitch; unsigned int pos; rate_f func; snd_pcm_sframes_t old_src_frames, old_dst_frames; struct rate_channel channels[]; }; static void rate_init(struct snd_pcm_plugin *plugin) { unsigned int channel; struct rate_priv *data = (struct rate_priv *)plugin->extra_data; data->pos = 0; for (channel = 0; channel < plugin->src_format.channels; channel++) { data->channels[channel].last_S1 = 0; data->channels[channel].last_S2 = 0; } } static void resample_expand(struct snd_pcm_plugin *plugin, const struct snd_pcm_plugin_channel *src_channels, struct snd_pcm_plugin_channel *dst_channels, int src_frames, int dst_frames) { unsigned int pos = 0; signed int val; signed short S1, S2; signed short *src, *dst; unsigned int channel; int src_step, dst_step; int src_frames1, dst_frames1; struct rate_priv *data = (struct rate_priv *)plugin->extra_data; struct rate_channel *rchannels = data->channels; for (channel = 0; channel < plugin->src_format.channels; channel++) { pos = data->pos; S1 = rchannels->last_S1; S2 = rchannels->last_S2; if (!src_channels[channel].enabled) { if (dst_channels[channel].wanted) snd_pcm_area_silence(&dst_channels[channel].area, 0, dst_frames, plugin->dst_format.format); dst_channels[channel].enabled = 0; continue; } dst_channels[channel].enabled = 1; src = (signed short *)src_channels[channel].area.addr + src_channels[channel].area.first / 8 / 2; dst = (signed short *)dst_channels[channel].area.addr + dst_channels[channel].area.first / 8 / 2; src_step = src_channels[channel].area.step / 8 / 2; dst_step = dst_channels[channel].area.step / 8 / 2; src_frames1 = src_frames; dst_frames1 = dst_frames; while (dst_frames1-- > 0) { if (pos & ~R_MASK) { pos &= R_MASK; S1 = S2; if (src_frames1-- > 0) { S2 = *src; src += src_step; } } val = S1 + ((S2 - S1) * (signed int)pos) / BITS; if (val < -32768) val = -32768; else if (val > 32767) val = 32767; *dst = val; dst += dst_step; pos += data->pitch; } rchannels->last_S1 = S1; rchannels->last_S2 = S2; rchannels++; } data->pos = pos; } static void resample_shrink(struct snd_pcm_plugin *plugin, const struct snd_pcm_plugin_channel *src_channels, struct snd_pcm_plugin_channel *dst_channels, int src_frames, int dst_frames) { unsigned int pos = 0; signed int val; signed short S1, S2; signed short *src, *dst; unsigned int channel; int src_step, dst_step; int src_frames1, dst_frames1; struct rate_priv *data = (struct rate_priv *)plugin->extra_data; struct rate_channel *rchannels = data->channels; for (channel = 0; channel < plugin->src_format.channels; ++channel) { pos = data->pos; S1 = rchannels->last_S1; S2 = rchannels->last_S2; if (!src_channels[channel].enabled) { if (dst_channels[channel].wanted) snd_pcm_area_silence(&dst_channels[channel].area, 0, dst_frames, plugin->dst_format.format); dst_channels[channel].enabled = 0; continue; } dst_channels[channel].enabled = 1; src = (signed short *)src_channels[channel].area.addr + src_channels[channel].area.first / 8 / 2; dst = (signed short *)dst_channels[channel].area.addr + dst_channels[channel].area.first / 8 / 2; src_step = src_channels[channel].area.step / 8 / 2; dst_step = dst_channels[channel].area.step / 8 / 2; src_frames1 = src_frames; dst_frames1 = dst_frames; while (dst_frames1 > 0) { S1 = S2; if (src_frames1-- > 0) { S2 = *src; src += src_step; } if (pos & ~R_MASK) { pos &= R_MASK; val = S1 + ((S2 - S1) * (signed int)pos) / BITS; if (val < -32768) val = -32768; else if (val > 32767) val = 32767; *dst = val; dst += dst_step; dst_frames1--; } pos += data->pitch; } rchannels->last_S1 = S1; rchannels->last_S2 = S2; rchannels++; } data->pos = pos; } static snd_pcm_sframes_t rate_src_frames(struct snd_pcm_plugin *plugin, snd_pcm_uframes_t frames) { struct rate_priv *data; snd_pcm_sframes_t res; if (snd_BUG_ON(!plugin)) return -ENXIO; if (frames == 0) return 0; data = (struct rate_priv *)plugin->extra_data; if (plugin->src_format.rate < plugin->dst_format.rate) { res = (((frames * data->pitch) + (BITS/2)) >> SHIFT); } else { res = DIV_ROUND_CLOSEST(frames << SHIFT, data->pitch); } if (data->old_src_frames > 0) { snd_pcm_sframes_t frames1 = frames, res1 = data->old_dst_frames; while (data->old_src_frames < frames1) { frames1 >>= 1; res1 <<= 1; } while (data->old_src_frames > frames1) { frames1 <<= 1; res1 >>= 1; } if (data->old_src_frames == frames1) return res1; } data->old_src_frames = frames; data->old_dst_frames = res; return res; } static snd_pcm_sframes_t rate_dst_frames(struct snd_pcm_plugin *plugin, snd_pcm_uframes_t frames) { struct rate_priv *data; snd_pcm_sframes_t res; if (snd_BUG_ON(!plugin)) return -ENXIO; if (frames == 0) return 0; data = (struct rate_priv *)plugin->extra_data; if (plugin->src_format.rate < plugin->dst_format.rate) { res = DIV_ROUND_CLOSEST(frames << SHIFT, data->pitch); } else { res = (((frames * data->pitch) + (BITS/2)) >> SHIFT); } if (data->old_dst_frames > 0) { snd_pcm_sframes_t frames1 = frames, res1 = data->old_src_frames; while (data->old_dst_frames < frames1) { frames1 >>= 1; res1 <<= 1; } while (data->old_dst_frames > frames1) { frames1 <<= 1; res1 >>= 1; } if (data->old_dst_frames == frames1) return res1; } data->old_dst_frames = frames; data->old_src_frames = res; return res; } static snd_pcm_sframes_t rate_transfer(struct snd_pcm_plugin *plugin, const struct snd_pcm_plugin_channel *src_channels, struct snd_pcm_plugin_channel *dst_channels, snd_pcm_uframes_t frames) { snd_pcm_uframes_t dst_frames; struct rate_priv *data; if (snd_BUG_ON(!plugin || !src_channels || !dst_channels)) return -ENXIO; if (frames == 0) return 0; #ifdef CONFIG_SND_DEBUG { unsigned int channel; for (channel = 0; channel < plugin->src_format.channels; channel++) { if (snd_BUG_ON(src_channels[channel].area.first % 8 || src_channels[channel].area.step % 8)) return -ENXIO; if (snd_BUG_ON(dst_channels[channel].area.first % 8 || dst_channels[channel].area.step % 8)) return -ENXIO; } } #endif dst_frames = rate_dst_frames(plugin, frames); if (dst_frames > dst_channels[0].frames) dst_frames = dst_channels[0].frames; data = (struct rate_priv *)plugin->extra_data; data->func(plugin, src_channels, dst_channels, frames, dst_frames); return dst_frames; } static int rate_action(struct snd_pcm_plugin *plugin, enum snd_pcm_plugin_action action, unsigned long udata) { if (snd_BUG_ON(!plugin)) return -ENXIO; switch (action) { case INIT: case PREPARE: rate_init(plugin); break; default: break; } return 0; /* silently ignore other actions */ } int snd_pcm_plugin_build_rate(struct snd_pcm_substream *plug, struct snd_pcm_plugin_format *src_format, struct snd_pcm_plugin_format *dst_format, struct snd_pcm_plugin **r_plugin) { int err; struct rate_priv *data; struct snd_pcm_plugin *plugin; if (snd_BUG_ON(!r_plugin)) return -ENXIO; *r_plugin = NULL; if (snd_BUG_ON(src_format->channels != dst_format->channels)) return -ENXIO; if (snd_BUG_ON(src_format->channels <= 0)) return -ENXIO; if (snd_BUG_ON(src_format->format != SNDRV_PCM_FORMAT_S16)) return -ENXIO; if (snd_BUG_ON(dst_format->format != SNDRV_PCM_FORMAT_S16)) return -ENXIO; if (snd_BUG_ON(src_format->rate == dst_format->rate)) return -ENXIO; err = snd_pcm_plugin_build(plug, "rate conversion", src_format, dst_format, struct_size(data, channels, src_format->channels), &plugin); if (err < 0) return err; data = (struct rate_priv *)plugin->extra_data; if (src_format->rate < dst_format->rate) { data->pitch = ((src_format->rate << SHIFT) + (dst_format->rate >> 1)) / dst_format->rate; data->func = resample_expand; } else { data->pitch = ((dst_format->rate << SHIFT) + (src_format->rate >> 1)) / src_format->rate; data->func = resample_shrink; } data->pos = 0; rate_init(plugin); data->old_src_frames = data->old_dst_frames = 0; plugin->transfer = rate_transfer; plugin->src_frames = rate_src_frames; plugin->dst_frames = rate_dst_frames; plugin->action = rate_action; *r_plugin = plugin; return 0; }
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs */ #ifndef _ASM_X86_STACKTRACE_H #define _ASM_X86_STACKTRACE_H #include <linux/uaccess.h> #include <linux/ptrace.h> #include <asm/cpu_entry_area.h> #include <asm/switch_to.h> enum stack_type { STACK_TYPE_UNKNOWN, STACK_TYPE_TASK, STACK_TYPE_IRQ, STACK_TYPE_SOFTIRQ, STACK_TYPE_ENTRY, STACK_TYPE_EXCEPTION, STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, }; struct stack_info { enum stack_type type; unsigned long *begin, *end, *next_sp; }; bool in_task_stack(unsigned long *stack, struct task_struct *task, struct stack_info *info); bool in_entry_stack(unsigned long *stack, struct stack_info *info); int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask); bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, struct stack_info *info); static __always_inline bool get_stack_guard_info(unsigned long *stack, struct stack_info *info) { /* make sure it's not in the stack proper */ if (get_stack_info_noinstr(stack, current, info)) return false; /* but if it is in the page below it, we hit a guard */ return get_stack_info_noinstr((void *)stack + PAGE_SIZE, current, info); } const char *stack_type_name(enum stack_type type); static inline bool on_stack(struct stack_info *info, void *addr, size_t len) { void *begin = info->begin; void *end = info->end; return (info->type != STACK_TYPE_UNKNOWN && addr >= begin && addr < end && addr + len > begin && addr + len <= end); } #ifdef CONFIG_X86_32 #define STACKSLOTS_PER_LINE 8 #else #define STACKSLOTS_PER_LINE 4 #endif #ifdef CONFIG_FRAME_POINTER static inline unsigned long * get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { if (regs) return (unsigned long *)regs->bp; if (task == current) return __builtin_frame_address(0); return &((struct inactive_task_frame *)task->thread.sp)->bp; } #else static inline unsigned long * get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { return NULL; } #endif /* CONFIG_FRAME_POINTER */ static inline unsigned long * get_stack_pointer(struct task_struct *task, struct pt_regs *regs) { if (regs) return (unsigned long *)regs->sp; if (task == current) return __builtin_frame_address(0); return (unsigned long *)task->thread.sp; } /* The form of the top of the frame on the stack */ struct stack_frame { struct stack_frame *next_frame; unsigned long return_address; }; struct stack_frame_ia32 { u32 next_frame; u32 return_address; }; void show_opcodes(struct pt_regs *regs, const char *loglvl); void show_ip(struct pt_regs *regs, const char *loglvl); #endif /* _ASM_X86_STACKTRACE_H */
298 299 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 // SPDX-License-Identifier: GPL-2.0 /* * tracing clocks * * Copyright (C) 2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * * Implements 3 trace clock variants, with differing scalability/precision * tradeoffs: * * - local: CPU-local trace clock * - medium: scalable global clock with some jitter * - global: globally monotonic, serialized clock * * Tracer plugins will chose a default from these clocks. */ #include <linux/spinlock.h> #include <linux/irqflags.h> #include <linux/hardirq.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/ktime.h> #include <linux/trace_clock.h> /* * trace_clock_local(): the simplest and least coherent tracing clock. * * Useful for tracing that does not cross to other CPUs nor * does it go through idle events. */ u64 notrace trace_clock_local(void) { u64 clock; /* * sched_clock() is an architecture implemented, fast, scalable, * lockless clock. It is not guaranteed to be coherent across * CPUs, nor across CPU idle events. */ preempt_disable_notrace(); clock = sched_clock(); preempt_enable_notrace(); return clock; } EXPORT_SYMBOL_GPL(trace_clock_local); /* * trace_clock(): 'between' trace clock. Not completely serialized, * but not completely incorrect when crossing CPUs either. * * This is based on cpu_clock(), which will allow at most ~1 jiffy of * jitter between CPUs. So it's a pretty scalable clock, but there * can be offsets in the trace data. */ u64 notrace trace_clock(void) { return local_clock(); } EXPORT_SYMBOL_GPL(trace_clock); /* * trace_jiffy_clock(): Simply use jiffies as a clock counter. * Note that this use of jiffies_64 is not completely safe on * 32-bit systems. But the window is tiny, and the effect if * we are affected is that we will have an obviously bogus * timestamp on a trace event - i.e. not life threatening. */ u64 notrace trace_clock_jiffies(void) { return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES); } EXPORT_SYMBOL_GPL(trace_clock_jiffies); /* * trace_clock_global(): special globally coherent trace clock * * It has higher overhead than the other trace clocks but is still * an order of magnitude faster than GTOD derived hardware clocks. * * Used by plugins that need globally coherent timestamps. */ /* keep prev_time and lock in the same cacheline. */ static struct { u64 prev_time; arch_spinlock_t lock; } trace_clock_struct ____cacheline_aligned_in_smp = { .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED, }; u64 notrace trace_clock_global(void) { unsigned long flags; int this_cpu; u64 now, prev_time; raw_local_irq_save(flags); this_cpu = raw_smp_processor_id(); /* * The global clock "guarantees" that the events are ordered * between CPUs. But if two events on two different CPUS call * trace_clock_global at roughly the same time, it really does * not matter which one gets the earlier time. Just make sure * that the same CPU will always show a monotonic clock. * * Use a read memory barrier to get the latest written * time that was recorded. */ smp_rmb(); prev_time = READ_ONCE(trace_clock_struct.prev_time); now = sched_clock_cpu(this_cpu); /* Make sure that now is always greater than or equal to prev_time */ if ((s64)(now - prev_time) < 0) now = prev_time; /* * If in an NMI context then dont risk lockups and simply return * the current time. */ if (unlikely(in_nmi())) goto out; /* Tracing can cause strange recursion, always use a try lock */ if (arch_spin_trylock(&trace_clock_struct.lock)) { /* Reread prev_time in case it was already updated */ prev_time = READ_ONCE(trace_clock_struct.prev_time); if ((s64)(now - prev_time) < 0) now = prev_time; trace_clock_struct.prev_time = now; /* The unlock acts as the wmb for the above rmb */ arch_spin_unlock(&trace_clock_struct.lock); } out: raw_local_irq_restore(flags); return now; } EXPORT_SYMBOL_GPL(trace_clock_global); static atomic64_t trace_counter; /* * trace_clock_counter(): simply an atomic counter. * Use the trace_counter "counter" for cases where you do not care * about timings, but are interested in strict ordering. */ u64 notrace trace_clock_counter(void) { return atomic64_inc_return(&trace_counter); }
4 4 4 3 4 4 3 1 3 25 24 24 23 23 23 23 23 23 23 23 5 25 23 23 23 14 14 14 14 14 14 13 2 2 2 14 2 14 14 8 8 8 8 8 8 8 8 36 36 36 36 39 38 38 38 37 39 14 14 14 7 7 14 1 1 14 2 14 2 14 14 14 52 3 1 1 2 1 1 1 1 3 1 2 5 5 5 4 4 4 2 2 2 4 4 4 5 4 4 1 2 1 4 1 4 4 3 1 3 1 2 2 2 2 3 2 1 1 1 1 7 7 4 1 3 7 7 3 6 1 4 2 2 6 6 6 6 6 6 3 3 3 3 6 6 6 3 6 6 3 2 2 2 2 3 2 2 1 1 1 1 2 11 2 10 8 4 3 4 3 1 1 2 2 4 4 3 3 3 4 11 19 19 19 10 10 10 47 47 13 13 13 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 // SPDX-License-Identifier: GPL-2.0-only /* * VMware VMCI Driver * * Copyright (C) 2012 VMware, Inc. All rights reserved. */ #include <linux/vmw_vmci_defs.h> #include <linux/vmw_vmci_api.h> #include <linux/highmem.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/slab.h> #include "vmci_queue_pair.h" #include "vmci_datagram.h" #include "vmci_doorbell.h" #include "vmci_context.h" #include "vmci_driver.h" #include "vmci_event.h" /* Use a wide upper bound for the maximum contexts. */ #define VMCI_MAX_CONTEXTS 2000 /* * List of current VMCI contexts. Contexts can be added by * vmci_ctx_create() and removed via vmci_ctx_destroy(). * These, along with context lookup, are protected by the * list structure's lock. */ static struct { struct list_head head; spinlock_t lock; /* Spinlock for context list operations */ } ctx_list = { .head = LIST_HEAD_INIT(ctx_list.head), .lock = __SPIN_LOCK_UNLOCKED(ctx_list.lock), }; /* Used by contexts that did not set up notify flag pointers */ static bool ctx_dummy_notify; static void ctx_signal_notify(struct vmci_ctx *context) { *context->notify = true; } static void ctx_clear_notify(struct vmci_ctx *context) { *context->notify = false; } /* * If nothing requires the attention of the guest, clears both * notify flag and call. */ static void ctx_clear_notify_call(struct vmci_ctx *context) { if (context->pending_datagrams == 0 && vmci_handle_arr_get_size(context->pending_doorbell_array) == 0) ctx_clear_notify(context); } /* * Sets the context's notify flag iff datagrams are pending for this * context. Called from vmci_setup_notify(). */ void vmci_ctx_check_signal_notify(struct vmci_ctx *context) { spin_lock(&context->lock); if (context->pending_datagrams) ctx_signal_notify(context); spin_unlock(&context->lock); } /* * Allocates and initializes a VMCI context. */ struct vmci_ctx *vmci_ctx_create(u32 cid, u32 priv_flags, uintptr_t event_hnd, int user_version, const struct cred *cred) { struct vmci_ctx *context; int error; if (cid == VMCI_INVALID_ID) { pr_devel("Invalid context ID for VMCI context\n"); error = -EINVAL; goto err_out; } if (priv_flags & ~VMCI_PRIVILEGE_ALL_FLAGS) { pr_devel("Invalid flag (flags=0x%x) for VMCI context\n", priv_flags); error = -EINVAL; goto err_out; } if (user_version == 0) { pr_devel("Invalid suer_version %d\n", user_version); error = -EINVAL; goto err_out; } context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) { pr_warn("Failed to allocate memory for VMCI context\n"); error = -ENOMEM; goto err_out; } kref_init(&context->kref); spin_lock_init(&context->lock); INIT_LIST_HEAD(&context->list_item); INIT_LIST_HEAD(&context->datagram_queue); INIT_LIST_HEAD(&context->notifier_list); /* Initialize host-specific VMCI context. */ init_waitqueue_head(&context->host_context.wait_queue); context->queue_pair_array = vmci_handle_arr_create(0, VMCI_MAX_GUEST_QP_COUNT); if (!context->queue_pair_array) { error = -ENOMEM; goto err_free_ctx; } context->doorbell_array = vmci_handle_arr_create(0, VMCI_MAX_GUEST_DOORBELL_COUNT); if (!context->doorbell_array) { error = -ENOMEM; goto err_free_qp_array; } context->pending_doorbell_array = vmci_handle_arr_create(0, VMCI_MAX_GUEST_DOORBELL_COUNT); if (!context->pending_doorbell_array) { error = -ENOMEM; goto err_free_db_array; } context->user_version = user_version; context->priv_flags = priv_flags; if (cred) context->cred = get_cred(cred); context->notify = &ctx_dummy_notify; context->notify_page = NULL; /* * If we collide with an existing context we generate a new * and use it instead. The VMX will determine if regeneration * is okay. Since there isn't 4B - 16 VMs running on a given * host, the below loop will terminate. */ spin_lock(&ctx_list.lock); while (vmci_ctx_exists(cid)) { /* We reserve the lowest 16 ids for fixed contexts. */ cid = max(cid, VMCI_RESERVED_CID_LIMIT - 1) + 1; if (cid == VMCI_INVALID_ID) cid = VMCI_RESERVED_CID_LIMIT; } context->cid = cid; list_add_tail_rcu(&context->list_item, &ctx_list.head); spin_unlock(&ctx_list.lock); return context; err_free_db_array: vmci_handle_arr_destroy(context->doorbell_array); err_free_qp_array: vmci_handle_arr_destroy(context->queue_pair_array); err_free_ctx: kfree(context); err_out: return ERR_PTR(error); } /* * Destroy VMCI context. */ void vmci_ctx_destroy(struct vmci_ctx *context) { spin_lock(&ctx_list.lock); list_del_rcu(&context->list_item); spin_unlock(&ctx_list.lock); synchronize_rcu(); vmci_ctx_put(context); } /* * Fire notification for all contexts interested in given cid. */ static int ctx_fire_notification(u32 context_id, u32 priv_flags) { u32 i, array_size; struct vmci_ctx *sub_ctx; struct vmci_handle_arr *subscriber_array; struct vmci_handle context_handle = vmci_make_handle(context_id, VMCI_EVENT_HANDLER); /* * We create an array to hold the subscribers we find when * scanning through all contexts. */ subscriber_array = vmci_handle_arr_create(0, VMCI_MAX_CONTEXTS); if (subscriber_array == NULL) return VMCI_ERROR_NO_MEM; /* * Scan all contexts to find who is interested in being * notified about given contextID. */ rcu_read_lock(); list_for_each_entry_rcu(sub_ctx, &ctx_list.head, list_item) { struct vmci_handle_list *node; /* * We only deliver notifications of the removal of * contexts, if the two contexts are allowed to * interact. */ if (vmci_deny_interaction(priv_flags, sub_ctx->priv_flags)) continue; list_for_each_entry_rcu(node, &sub_ctx->notifier_list, node) { if (!vmci_handle_is_equal(node->handle, context_handle)) continue; vmci_handle_arr_append_entry(&subscriber_array, vmci_make_handle(sub_ctx->cid, VMCI_EVENT_HANDLER)); } } rcu_read_unlock(); /* Fire event to all subscribers. */ array_size = vmci_handle_arr_get_size(subscriber_array); for (i = 0; i < array_size; i++) { int result; struct vmci_event_ctx ev; ev.msg.hdr.dst = vmci_handle_arr_get_entry(subscriber_array, i); ev.msg.hdr.src = vmci_make_handle(VMCI_HYPERVISOR_CONTEXT_ID, VMCI_CONTEXT_RESOURCE_ID); ev.msg.hdr.payload_size = sizeof(ev) - sizeof(ev.msg.hdr); ev.msg.event_data.event = VMCI_EVENT_CTX_REMOVED; ev.payload.context_id = context_id; result = vmci_datagram_dispatch(VMCI_HYPERVISOR_CONTEXT_ID, &ev.msg.hdr, false); if (result < VMCI_SUCCESS) { pr_devel("Failed to enqueue event datagram (type=%d) for context (ID=0x%x)\n", ev.msg.event_data.event, ev.msg.hdr.dst.context); /* We continue to enqueue on next subscriber. */ } } vmci_handle_arr_destroy(subscriber_array); return VMCI_SUCCESS; } /* * Returns the current number of pending datagrams. The call may * also serve as a synchronization point for the datagram queue, * as no enqueue operations can occur concurrently. */ int vmci_ctx_pending_datagrams(u32 cid, u32 *pending) { struct vmci_ctx *context; context = vmci_ctx_get(cid); if (context == NULL) return VMCI_ERROR_INVALID_ARGS; spin_lock(&context->lock); if (pending) *pending = context->pending_datagrams; spin_unlock(&context->lock); vmci_ctx_put(context); return VMCI_SUCCESS; } /* * Queues a VMCI datagram for the appropriate target VM context. */ int vmci_ctx_enqueue_datagram(u32 cid, struct vmci_datagram *dg) { struct vmci_datagram_queue_entry *dq_entry; struct vmci_ctx *context; struct vmci_handle dg_src; size_t vmci_dg_size; vmci_dg_size = VMCI_DG_SIZE(dg); if (vmci_dg_size > VMCI_MAX_DG_SIZE) { pr_devel("Datagram too large (bytes=%zu)\n", vmci_dg_size); return VMCI_ERROR_INVALID_ARGS; } /* Get the target VM's VMCI context. */ context = vmci_ctx_get(cid); if (!context) { pr_devel("Invalid context (ID=0x%x)\n", cid); return VMCI_ERROR_INVALID_ARGS; } /* Allocate guest call entry and add it to the target VM's queue. */ dq_entry = kmalloc(sizeof(*dq_entry), GFP_KERNEL); if (dq_entry == NULL) { pr_warn("Failed to allocate memory for datagram\n"); vmci_ctx_put(context); return VMCI_ERROR_NO_MEM; } dq_entry->dg = dg; dq_entry->dg_size = vmci_dg_size; dg_src = dg->src; INIT_LIST_HEAD(&dq_entry->list_item); spin_lock(&context->lock); /* * We put a higher limit on datagrams from the hypervisor. If * the pending datagram is not from hypervisor, then we check * if enqueueing it would exceed the * VMCI_MAX_DATAGRAM_QUEUE_SIZE limit on the destination. If * the pending datagram is from hypervisor, we allow it to be * queued at the destination side provided we don't reach the * VMCI_MAX_DATAGRAM_AND_EVENT_QUEUE_SIZE limit. */ if (context->datagram_queue_size + vmci_dg_size >= VMCI_MAX_DATAGRAM_QUEUE_SIZE && (!vmci_handle_is_equal(dg_src, vmci_make_handle (VMCI_HYPERVISOR_CONTEXT_ID, VMCI_CONTEXT_RESOURCE_ID)) || context->datagram_queue_size + vmci_dg_size >= VMCI_MAX_DATAGRAM_AND_EVENT_QUEUE_SIZE)) { spin_unlock(&context->lock); vmci_ctx_put(context); kfree(dq_entry); pr_devel("Context (ID=0x%x) receive queue is full\n", cid); return VMCI_ERROR_NO_RESOURCES; } list_add(&dq_entry->list_item, &context->datagram_queue); context->pending_datagrams++; context->datagram_queue_size += vmci_dg_size; ctx_signal_notify(context); wake_up(&context->host_context.wait_queue); spin_unlock(&context->lock); vmci_ctx_put(context); return vmci_dg_size; } /* * Verifies whether a context with the specified context ID exists. * FIXME: utility is dubious as no decisions can be reliably made * using this data as context can appear and disappear at any time. */ bool vmci_ctx_exists(u32 cid) { struct vmci_ctx *context; bool exists = false; rcu_read_lock(); list_for_each_entry_rcu(context, &ctx_list.head, list_item) { if (context->cid == cid) { exists = true; break; } } rcu_read_unlock(); return exists; } /* * Retrieves VMCI context corresponding to the given cid. */ struct vmci_ctx *vmci_ctx_get(u32 cid) { struct vmci_ctx *c, *context = NULL; if (cid == VMCI_INVALID_ID) return NULL; rcu_read_lock(); list_for_each_entry_rcu(c, &ctx_list.head, list_item) { if (c->cid == cid) { /* * The context owner drops its own reference to the * context only after removing it from the list and * waiting for RCU grace period to expire. This * means that we are not about to increase the * reference count of something that is in the * process of being destroyed. */ context = c; kref_get(&context->kref); break; } } rcu_read_unlock(); return context; } /* * Deallocates all parts of a context data structure. This * function doesn't lock the context, because it assumes that * the caller was holding the last reference to context. */ static void ctx_free_ctx(struct kref *kref) { struct vmci_ctx *context = container_of(kref, struct vmci_ctx, kref); struct vmci_datagram_queue_entry *dq_entry, *dq_entry_tmp; struct vmci_handle temp_handle; struct vmci_handle_list *notifier, *tmp; /* * Fire event to all contexts interested in knowing this * context is dying. */ ctx_fire_notification(context->cid, context->priv_flags); /* * Cleanup all queue pair resources attached to context. If * the VM dies without cleaning up, this code will make sure * that no resources are leaked. */ temp_handle = vmci_handle_arr_get_entry(context->queue_pair_array, 0); while (!vmci_handle_is_equal(temp_handle, VMCI_INVALID_HANDLE)) { if (vmci_qp_broker_detach(temp_handle, context) < VMCI_SUCCESS) { /* * When vmci_qp_broker_detach() succeeds it * removes the handle from the array. If * detach fails, we must remove the handle * ourselves. */ vmci_handle_arr_remove_entry(context->queue_pair_array, temp_handle); } temp_handle = vmci_handle_arr_get_entry(context->queue_pair_array, 0); } /* * It is fine to destroy this without locking the callQueue, as * this is the only thread having a reference to the context. */ list_for_each_entry_safe(dq_entry, dq_entry_tmp, &context->datagram_queue, list_item) { WARN_ON(dq_entry->dg_size != VMCI_DG_SIZE(dq_entry->dg)); list_del(&dq_entry->list_item); kfree(dq_entry->dg); kfree(dq_entry); } list_for_each_entry_safe(notifier, tmp, &context->notifier_list, node) { list_del(&notifier->node); kfree(notifier); } vmci_handle_arr_destroy(context->queue_pair_array); vmci_handle_arr_destroy(context->doorbell_array); vmci_handle_arr_destroy(context->pending_doorbell_array); vmci_ctx_unset_notify(context); if (context->cred) put_cred(context->cred); kfree(context); } /* * Drops reference to VMCI context. If this is the last reference to * the context it will be deallocated. A context is created with * a reference count of one, and on destroy, it is removed from * the context list before its reference count is decremented. Thus, * if we reach zero, we are sure that nobody else are about to increment * it (they need the entry in the context list for that), and so there * is no need for locking. */ void vmci_ctx_put(struct vmci_ctx *context) { kref_put(&context->kref, ctx_free_ctx); } /* * Dequeues the next datagram and returns it to caller. * The caller passes in a pointer to the max size datagram * it can handle and the datagram is only unqueued if the * size is less than max_size. If larger max_size is set to * the size of the datagram to give the caller a chance to * set up a larger buffer for the guestcall. */ int vmci_ctx_dequeue_datagram(struct vmci_ctx *context, size_t *max_size, struct vmci_datagram **dg) { struct vmci_datagram_queue_entry *dq_entry; struct list_head *list_item; int rv; /* Dequeue the next datagram entry. */ spin_lock(&context->lock); if (context->pending_datagrams == 0) { ctx_clear_notify_call(context); spin_unlock(&context->lock); pr_devel("No datagrams pending\n"); return VMCI_ERROR_NO_MORE_DATAGRAMS; } list_item = context->datagram_queue.next; dq_entry = list_entry(list_item, struct vmci_datagram_queue_entry, list_item); /* Check size of caller's buffer. */ if (*max_size < dq_entry->dg_size) { *max_size = dq_entry->dg_size; spin_unlock(&context->lock); pr_devel("Caller's buffer should be at least (size=%u bytes)\n", (u32) *max_size); return VMCI_ERROR_NO_MEM; } list_del(list_item); context->pending_datagrams--; context->datagram_queue_size -= dq_entry->dg_size; if (context->pending_datagrams == 0) { ctx_clear_notify_call(context); rv = VMCI_SUCCESS; } else { /* * Return the size of the next datagram. */ struct vmci_datagram_queue_entry *next_entry; list_item = context->datagram_queue.next; next_entry = list_entry(list_item, struct vmci_datagram_queue_entry, list_item); /* * The following size_t -> int truncation is fine as * the maximum size of a (routable) datagram is 68KB. */ rv = (int)next_entry->dg_size; } spin_unlock(&context->lock); /* Caller must free datagram. */ *dg = dq_entry->dg; dq_entry->dg = NULL; kfree(dq_entry); return rv; } /* * Reverts actions set up by vmci_setup_notify(). Unmaps and unlocks the * page mapped/locked by vmci_setup_notify(). */ void vmci_ctx_unset_notify(struct vmci_ctx *context) { struct page *notify_page; spin_lock(&context->lock); notify_page = context->notify_page; context->notify = &ctx_dummy_notify; context->notify_page = NULL; spin_unlock(&context->lock); if (notify_page) { kunmap(notify_page); put_page(notify_page); } } /* * Add remote_cid to list of contexts current contexts wants * notifications from/about. */ int vmci_ctx_add_notification(u32 context_id, u32 remote_cid) { struct vmci_ctx *context; struct vmci_handle_list *notifier, *n; int result; bool exists = false; context = vmci_ctx_get(context_id); if (!context) return VMCI_ERROR_NOT_FOUND; if (VMCI_CONTEXT_IS_VM(context_id) && VMCI_CONTEXT_IS_VM(remote_cid)) { pr_devel("Context removed notifications for other VMs not supported (src=0x%x, remote=0x%x)\n", context_id, remote_cid); result = VMCI_ERROR_DST_UNREACHABLE; goto out; } if (context->priv_flags & VMCI_PRIVILEGE_FLAG_RESTRICTED) { result = VMCI_ERROR_NO_ACCESS; goto out; } notifier = kmalloc(sizeof(struct vmci_handle_list), GFP_KERNEL); if (!notifier) { result = VMCI_ERROR_NO_MEM; goto out; } INIT_LIST_HEAD(&notifier->node); notifier->handle = vmci_make_handle(remote_cid, VMCI_EVENT_HANDLER); spin_lock(&context->lock); if (context->n_notifiers < VMCI_MAX_CONTEXTS) { list_for_each_entry(n, &context->notifier_list, node) { if (vmci_handle_is_equal(n->handle, notifier->handle)) { exists = true; break; } } if (exists) { kfree(notifier); result = VMCI_ERROR_ALREADY_EXISTS; } else { list_add_tail_rcu(&notifier->node, &context->notifier_list); context->n_notifiers++; result = VMCI_SUCCESS; } } else { kfree(notifier); result = VMCI_ERROR_NO_MEM; } spin_unlock(&context->lock); out: vmci_ctx_put(context); return result; } /* * Remove remote_cid from current context's list of contexts it is * interested in getting notifications from/about. */ int vmci_ctx_remove_notification(u32 context_id, u32 remote_cid) { struct vmci_ctx *context; struct vmci_handle_list *notifier = NULL, *iter, *tmp; struct vmci_handle handle; context = vmci_ctx_get(context_id); if (!context) return VMCI_ERROR_NOT_FOUND; handle = vmci_make_handle(remote_cid, VMCI_EVENT_HANDLER); spin_lock(&context->lock); list_for_each_entry_safe(iter, tmp, &context->notifier_list, node) { if (vmci_handle_is_equal(iter->handle, handle)) { list_del_rcu(&iter->node); context->n_notifiers--; notifier = iter; break; } } spin_unlock(&context->lock); if (notifier) kvfree_rcu_mightsleep(notifier); vmci_ctx_put(context); return notifier ? VMCI_SUCCESS : VMCI_ERROR_NOT_FOUND; } static int vmci_ctx_get_chkpt_notifiers(struct vmci_ctx *context, u32 *buf_size, void **pbuf) { u32 *notifiers; size_t data_size; struct vmci_handle_list *entry; int i = 0; if (context->n_notifiers == 0) { *buf_size = 0; *pbuf = NULL; return VMCI_SUCCESS; } data_size = context->n_notifiers * sizeof(*notifiers); if (*buf_size < data_size) { *buf_size = data_size; return VMCI_ERROR_MORE_DATA; } notifiers = kmalloc(data_size, GFP_ATOMIC); /* FIXME: want GFP_KERNEL */ if (!notifiers) return VMCI_ERROR_NO_MEM; list_for_each_entry(entry, &context->notifier_list, node) notifiers[i++] = entry->handle.context; *buf_size = data_size; *pbuf = notifiers; return VMCI_SUCCESS; } static int vmci_ctx_get_chkpt_doorbells(struct vmci_ctx *context, u32 *buf_size, void **pbuf) { struct dbell_cpt_state *dbells; u32 i, n_doorbells; n_doorbells = vmci_handle_arr_get_size(context->doorbell_array); if (n_doorbells > 0) { size_t data_size = n_doorbells * sizeof(*dbells); if (*buf_size < data_size) { *buf_size = data_size; return VMCI_ERROR_MORE_DATA; } dbells = kzalloc(data_size, GFP_ATOMIC); if (!dbells) return VMCI_ERROR_NO_MEM; for (i = 0; i < n_doorbells; i++) dbells[i].handle = vmci_handle_arr_get_entry( context->doorbell_array, i); *buf_size = data_size; *pbuf = dbells; } else { *buf_size = 0; *pbuf = NULL; } return VMCI_SUCCESS; } /* * Get current context's checkpoint state of given type. */ int vmci_ctx_get_chkpt_state(u32 context_id, u32 cpt_type, u32 *buf_size, void **pbuf) { struct vmci_ctx *context; int result; context = vmci_ctx_get(context_id); if (!context) return VMCI_ERROR_NOT_FOUND; spin_lock(&context->lock); switch (cpt_type) { case VMCI_NOTIFICATION_CPT_STATE: result = vmci_ctx_get_chkpt_notifiers(context, buf_size, pbuf); break; case VMCI_WELLKNOWN_CPT_STATE: /* * For compatibility with VMX'en with VM to VM communication, we * always return zero wellknown handles. */ *buf_size = 0; *pbuf = NULL; result = VMCI_SUCCESS; break; case VMCI_DOORBELL_CPT_STATE: result = vmci_ctx_get_chkpt_doorbells(context, buf_size, pbuf); break; default: pr_devel("Invalid cpt state (type=%d)\n", cpt_type); result = VMCI_ERROR_INVALID_ARGS; break; } spin_unlock(&context->lock); vmci_ctx_put(context); return result; } /* * Set current context's checkpoint state of given type. */ int vmci_ctx_set_chkpt_state(u32 context_id, u32 cpt_type, u32 buf_size, void *cpt_buf) { u32 i; u32 current_id; int result = VMCI_SUCCESS; u32 num_ids = buf_size / sizeof(u32); if (cpt_type == VMCI_WELLKNOWN_CPT_STATE && num_ids > 0) { /* * We would end up here if VMX with VM to VM communication * attempts to restore a checkpoint with wellknown handles. */ pr_warn("Attempt to restore checkpoint with obsolete wellknown handles\n"); return VMCI_ERROR_OBSOLETE; } if (cpt_type != VMCI_NOTIFICATION_CPT_STATE) { pr_devel("Invalid cpt state (type=%d)\n", cpt_type); return VMCI_ERROR_INVALID_ARGS; } for (i = 0; i < num_ids && result == VMCI_SUCCESS; i++) { current_id = ((u32 *)cpt_buf)[i]; result = vmci_ctx_add_notification(context_id, current_id); if (result != VMCI_SUCCESS) break; } if (result != VMCI_SUCCESS) pr_devel("Failed to set cpt state (type=%d) (error=%d)\n", cpt_type, result); return result; } /* * Retrieves the specified context's pending notifications in the * form of a handle array. The handle arrays returned are the * actual data - not a copy and should not be modified by the * caller. They must be released using * vmci_ctx_rcv_notifications_release. */ int vmci_ctx_rcv_notifications_get(u32 context_id, struct vmci_handle_arr **db_handle_array, struct vmci_handle_arr **qp_handle_array) { struct vmci_ctx *context; int result = VMCI_SUCCESS; context = vmci_ctx_get(context_id); if (context == NULL) return VMCI_ERROR_NOT_FOUND; spin_lock(&context->lock); *db_handle_array = context->pending_doorbell_array; context->pending_doorbell_array = vmci_handle_arr_create(0, VMCI_MAX_GUEST_DOORBELL_COUNT); if (!context->pending_doorbell_array) { context->pending_doorbell_array = *db_handle_array; *db_handle_array = NULL; result = VMCI_ERROR_NO_MEM; } *qp_handle_array = NULL; spin_unlock(&context->lock); vmci_ctx_put(context); return result; } /* * Releases handle arrays with pending notifications previously * retrieved using vmci_ctx_rcv_notifications_get. If the * notifications were not successfully handed over to the guest, * success must be false. */ void vmci_ctx_rcv_notifications_release(u32 context_id, struct vmci_handle_arr *db_handle_array, struct vmci_handle_arr *qp_handle_array, bool success) { struct vmci_ctx *context = vmci_ctx_get(context_id); spin_lock(&context->lock); if (!success) { struct vmci_handle handle; /* * New notifications may have been added while we were not * holding the context lock, so we transfer any new pending * doorbell notifications to the old array, and reinstate the * old array. */ handle = vmci_handle_arr_remove_tail( context->pending_doorbell_array); while (!vmci_handle_is_invalid(handle)) { if (!vmci_handle_arr_has_entry(db_handle_array, handle)) { vmci_handle_arr_append_entry( &db_handle_array, handle); } handle = vmci_handle_arr_remove_tail( context->pending_doorbell_array); } vmci_handle_arr_destroy(context->pending_doorbell_array); context->pending_doorbell_array = db_handle_array; db_handle_array = NULL; } else { ctx_clear_notify_call(context); } spin_unlock(&context->lock); vmci_ctx_put(context); if (db_handle_array) vmci_handle_arr_destroy(db_handle_array); if (qp_handle_array) vmci_handle_arr_destroy(qp_handle_array); } /* * Registers that a new doorbell handle has been allocated by the * context. Only doorbell handles registered can be notified. */ int vmci_ctx_dbell_create(u32 context_id, struct vmci_handle handle) { struct vmci_ctx *context; int result; if (context_id == VMCI_INVALID_ID || vmci_handle_is_invalid(handle)) return VMCI_ERROR_INVALID_ARGS; context = vmci_ctx_get(context_id); if (context == NULL) return VMCI_ERROR_NOT_FOUND; spin_lock(&context->lock); if (!vmci_handle_arr_has_entry(context->doorbell_array, handle)) result = vmci_handle_arr_append_entry(&context->doorbell_array, handle); else result = VMCI_ERROR_DUPLICATE_ENTRY; spin_unlock(&context->lock); vmci_ctx_put(context); return result; } /* * Unregisters a doorbell handle that was previously registered * with vmci_ctx_dbell_create. */ int vmci_ctx_dbell_destroy(u32 context_id, struct vmci_handle handle) { struct vmci_ctx *context; struct vmci_handle removed_handle; if (context_id == VMCI_INVALID_ID || vmci_handle_is_invalid(handle)) return VMCI_ERROR_INVALID_ARGS; context = vmci_ctx_get(context_id); if (context == NULL) return VMCI_ERROR_NOT_FOUND; spin_lock(&context->lock); removed_handle = vmci_handle_arr_remove_entry(context->doorbell_array, handle); vmci_handle_arr_remove_entry(context->pending_doorbell_array, handle); spin_unlock(&context->lock); vmci_ctx_put(context); return vmci_handle_is_invalid(removed_handle) ? VMCI_ERROR_NOT_FOUND : VMCI_SUCCESS; } /* * Unregisters all doorbell handles that were previously * registered with vmci_ctx_dbell_create. */ int vmci_ctx_dbell_destroy_all(u32 context_id) { struct vmci_ctx *context; struct vmci_handle handle; if (context_id == VMCI_INVALID_ID) return VMCI_ERROR_INVALID_ARGS; context = vmci_ctx_get(context_id); if (context == NULL) return VMCI_ERROR_NOT_FOUND; spin_lock(&context->lock); do { struct vmci_handle_arr *arr = context->doorbell_array; handle = vmci_handle_arr_remove_tail(arr); } while (!vmci_handle_is_invalid(handle)); do { struct vmci_handle_arr *arr = context->pending_doorbell_array; handle = vmci_handle_arr_remove_tail(arr); } while (!vmci_handle_is_invalid(handle)); spin_unlock(&context->lock); vmci_ctx_put(context); return VMCI_SUCCESS; } /* * Registers a notification of a doorbell handle initiated by the * specified source context. The notification of doorbells are * subject to the same isolation rules as datagram delivery. To * allow host side senders of notifications a finer granularity * of sender rights than those assigned to the sending context * itself, the host context is required to specify a different * set of privilege flags that will override the privileges of * the source context. */ int vmci_ctx_notify_dbell(u32 src_cid, struct vmci_handle handle, u32 src_priv_flags) { struct vmci_ctx *dst_context; int result; if (vmci_handle_is_invalid(handle)) return VMCI_ERROR_INVALID_ARGS; /* Get the target VM's VMCI context. */ dst_context = vmci_ctx_get(handle.context); if (!dst_context) { pr_devel("Invalid context (ID=0x%x)\n", handle.context); return VMCI_ERROR_NOT_FOUND; } if (src_cid != handle.context) { u32 dst_priv_flags; if (VMCI_CONTEXT_IS_VM(src_cid) && VMCI_CONTEXT_IS_VM(handle.context)) { pr_devel("Doorbell notification from VM to VM not supported (src=0x%x, dst=0x%x)\n", src_cid, handle.context); result = VMCI_ERROR_DST_UNREACHABLE; goto out; } result = vmci_dbell_get_priv_flags(handle, &dst_priv_flags); if (result < VMCI_SUCCESS) { pr_warn("Failed to get privilege flags for destination (handle=0x%x:0x%x)\n", handle.context, handle.resource); goto out; } if (src_cid != VMCI_HOST_CONTEXT_ID || src_priv_flags == VMCI_NO_PRIVILEGE_FLAGS) { src_priv_flags = vmci_context_get_priv_flags(src_cid); } if (vmci_deny_interaction(src_priv_flags, dst_priv_flags)) { result = VMCI_ERROR_NO_ACCESS; goto out; } } if (handle.context == VMCI_HOST_CONTEXT_ID) { result = vmci_dbell_host_context_notify(src_cid, handle); } else { spin_lock(&dst_context->lock); if (!vmci_handle_arr_has_entry(dst_context->doorbell_array, handle)) { result = VMCI_ERROR_NOT_FOUND; } else { if (!vmci_handle_arr_has_entry( dst_context->pending_doorbell_array, handle)) { result = vmci_handle_arr_append_entry( &dst_context->pending_doorbell_array, handle); if (result == VMCI_SUCCESS) { ctx_signal_notify(dst_context); wake_up(&dst_context->host_context.wait_queue); } } else { result = VMCI_SUCCESS; } } spin_unlock(&dst_context->lock); } out: vmci_ctx_put(dst_context); return result; } bool vmci_ctx_supports_host_qp(struct vmci_ctx *context) { return context && context->user_version >= VMCI_VERSION_HOSTQP; } /* * Registers that a new queue pair handle has been allocated by * the context. */ int vmci_ctx_qp_create(struct vmci_ctx *context, struct vmci_handle handle) { int result; if (context == NULL || vmci_handle_is_invalid(handle)) return VMCI_ERROR_INVALID_ARGS; if (!vmci_handle_arr_has_entry(context->queue_pair_array, handle)) result = vmci_handle_arr_append_entry( &context->queue_pair_array, handle); else result = VMCI_ERROR_DUPLICATE_ENTRY; return result; } /* * Unregisters a queue pair handle that was previously registered * with vmci_ctx_qp_create. */ int vmci_ctx_qp_destroy(struct vmci_ctx *context, struct vmci_handle handle) { struct vmci_handle hndl; if (context == NULL || vmci_handle_is_invalid(handle)) return VMCI_ERROR_INVALID_ARGS; hndl = vmci_handle_arr_remove_entry(context->queue_pair_array, handle); return vmci_handle_is_invalid(hndl) ? VMCI_ERROR_NOT_FOUND : VMCI_SUCCESS; } /* * Determines whether a given queue pair handle is registered * with the given context. */ bool vmci_ctx_qp_exists(struct vmci_ctx *context, struct vmci_handle handle) { if (context == NULL || vmci_handle_is_invalid(handle)) return false; return vmci_handle_arr_has_entry(context->queue_pair_array, handle); } /* * vmci_context_get_priv_flags() - Retrieve privilege flags. * @context_id: The context ID of the VMCI context. * * Retrieves privilege flags of the given VMCI context ID. */ u32 vmci_context_get_priv_flags(u32 context_id) { if (vmci_host_code_active()) { u32 flags; struct vmci_ctx *context; context = vmci_ctx_get(context_id); if (!context) return VMCI_LEAST_PRIVILEGE_FLAGS; flags = context->priv_flags; vmci_ctx_put(context); return flags; } return VMCI_NO_PRIVILEGE_FLAGS; } EXPORT_SYMBOL_GPL(vmci_context_get_priv_flags); /* * vmci_is_context_owner() - Determimnes if user is the context owner * @context_id: The context ID of the VMCI context. * @uid: The host user id (real kernel value). * * Determines whether a given UID is the owner of given VMCI context. */ bool vmci_is_context_owner(u32 context_id, kuid_t uid) { bool is_owner = false; if (vmci_host_code_active()) { struct vmci_ctx *context = vmci_ctx_get(context_id); if (context) { if (context->cred) is_owner = uid_eq(context->cred->uid, uid); vmci_ctx_put(context); } } return is_owner; } EXPORT_SYMBOL_GPL(vmci_is_context_owner);
15 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPVS: Weighted Least-Connection Scheduling module * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Peter Kese <peter.kese@ijs.si> * * Changes: * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest * Wensong Zhang : changed to use the inactconns in scheduling * Wensong Zhang : changed some comestics things for debugging * Wensong Zhang : changed for the d-linked destination list * Wensong Zhang : added the ip_vs_wlc_update_svc * Wensong Zhang : added any dest with weight=0 is quiesced */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <net/ip_vs.h> /* * Weighted Least Connection scheduling */ static struct ip_vs_dest * ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest, *least; int loh, doh; IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); /* * We calculate the load of each dest server as follows: * (dest overhead) / dest->weight * * Remember -- no floats in kernel mode!!! * The comparison of h1*w2 > h2*w1 is equivalent to that of * h1/w1 > h2/w2 * if every weight is larger than zero. * * The server with weight=0 is quiesced and will not receive any * new connections. */ list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && atomic_read(&dest->weight) > 0) { least = dest; loh = ip_vs_dest_conn_overhead(least); goto nextstage; } } ip_vs_scheduler_err(svc, "no destination available"); return NULL; /* * Find the destination with the least load. */ nextstage: list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_dest_conn_overhead(dest); if ((__s64)loh * atomic_read(&dest->weight) > (__s64)doh * atomic_read(&least->weight)) { least = dest; loh = doh; } } IP_VS_DBG_BUF(6, "WLC: server %s:%u " "activeconns %d refcnt %d weight %d overhead %d\n", IP_VS_DBG_ADDR(least->af, &least->addr), ntohs(least->port), atomic_read(&least->activeconns), refcount_read(&least->refcnt), atomic_read(&least->weight), loh); return least; } static struct ip_vs_scheduler ip_vs_wlc_scheduler = { .name = "wlc", .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list), .schedule = ip_vs_wlc_schedule, }; static int __init ip_vs_wlc_init(void) { return register_ip_vs_scheduler(&ip_vs_wlc_scheduler); } static void __exit ip_vs_wlc_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); synchronize_rcu(); } module_init(ip_vs_wlc_init); module_exit(ip_vs_wlc_cleanup); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ipvs weighted least connection scheduler");
248 6 150 1 264 2 92 180 177 237 92 1 164 181 13 181 3 176 176 54 177 57 120 152 1 168 71 169 196 21 6 24 167 152 8 45 24 2 20 77 3 44 44 6 5 6 144 8 3 141 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 /* SPDX-License-Identifier: GPL-2.0-only */ /* * * Copyright (C) 2011 Novell Inc. */ #include <linux/kernel.h> #include <linux/uuid.h> #include <linux/fs.h> #include <linux/fsverity.h> #include <linux/namei.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> #include "ovl_entry.h" #undef pr_fmt #define pr_fmt(fmt) "overlayfs: " fmt enum ovl_path_type { __OVL_PATH_UPPER = (1 << 0), __OVL_PATH_MERGE = (1 << 1), __OVL_PATH_ORIGIN = (1 << 2), }; #define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER) #define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE) #define OVL_TYPE_ORIGIN(type) ((type) & __OVL_PATH_ORIGIN) #define OVL_XATTR_NAMESPACE "overlay." #define OVL_XATTR_TRUSTED_PREFIX XATTR_TRUSTED_PREFIX OVL_XATTR_NAMESPACE #define OVL_XATTR_TRUSTED_PREFIX_LEN (sizeof(OVL_XATTR_TRUSTED_PREFIX) - 1) #define OVL_XATTR_USER_PREFIX XATTR_USER_PREFIX OVL_XATTR_NAMESPACE #define OVL_XATTR_USER_PREFIX_LEN (sizeof(OVL_XATTR_USER_PREFIX) - 1) #define OVL_XATTR_ESCAPE_PREFIX OVL_XATTR_NAMESPACE #define OVL_XATTR_ESCAPE_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_PREFIX) - 1) #define OVL_XATTR_ESCAPE_TRUSTED_PREFIX OVL_XATTR_TRUSTED_PREFIX OVL_XATTR_ESCAPE_PREFIX #define OVL_XATTR_ESCAPE_TRUSTED_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_TRUSTED_PREFIX) - 1) #define OVL_XATTR_ESCAPE_USER_PREFIX OVL_XATTR_USER_PREFIX OVL_XATTR_ESCAPE_PREFIX #define OVL_XATTR_ESCAPE_USER_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_USER_PREFIX) - 1) enum ovl_xattr { OVL_XATTR_OPAQUE, OVL_XATTR_REDIRECT, OVL_XATTR_ORIGIN, OVL_XATTR_IMPURE, OVL_XATTR_NLINK, OVL_XATTR_UPPER, OVL_XATTR_UUID, OVL_XATTR_METACOPY, OVL_XATTR_PROTATTR, OVL_XATTR_XWHITEOUT, }; enum ovl_inode_flag { /* Pure upper dir that may contain non pure upper entries */ OVL_IMPURE, /* Non-merge dir that may contain whiteout entries */ OVL_WHITEOUTS, OVL_INDEX, OVL_UPPERDATA, /* Inode number will remain constant over copy up. */ OVL_CONST_INO, OVL_HAS_DIGEST, OVL_VERIFIED_DIGEST, }; enum ovl_entry_flag { OVL_E_UPPER_ALIAS, OVL_E_OPAQUE, OVL_E_CONNECTED, /* Lower stack may contain xwhiteout entries */ OVL_E_XWHITEOUTS, }; enum { OVL_REDIRECT_OFF, /* "off" mode is never used. In effect */ OVL_REDIRECT_FOLLOW, /* ...it translates to either "follow" */ OVL_REDIRECT_NOFOLLOW, /* ...or "nofollow". */ OVL_REDIRECT_ON, }; enum { OVL_UUID_OFF, OVL_UUID_NULL, OVL_UUID_AUTO, OVL_UUID_ON, }; enum { OVL_XINO_OFF, OVL_XINO_AUTO, OVL_XINO_ON, }; enum { OVL_VERITY_OFF, OVL_VERITY_ON, OVL_VERITY_REQUIRE, }; /* * The tuple (fh,uuid) is a universal unique identifier for a copy up origin, * where: * origin.fh - exported file handle of the lower file * origin.uuid - uuid of the lower filesystem */ #define OVL_FH_VERSION 0 #define OVL_FH_MAGIC 0xfb /* CPU byte order required for fid decoding: */ #define OVL_FH_FLAG_BIG_ENDIAN (1 << 0) #define OVL_FH_FLAG_ANY_ENDIAN (1 << 1) /* Is the real inode encoded in fid an upper inode? */ #define OVL_FH_FLAG_PATH_UPPER (1 << 2) #define OVL_FH_FLAG_ALL (OVL_FH_FLAG_BIG_ENDIAN | OVL_FH_FLAG_ANY_ENDIAN | \ OVL_FH_FLAG_PATH_UPPER) #if defined(__LITTLE_ENDIAN) #define OVL_FH_FLAG_CPU_ENDIAN 0 #elif defined(__BIG_ENDIAN) #define OVL_FH_FLAG_CPU_ENDIAN OVL_FH_FLAG_BIG_ENDIAN #else #error Endianness not defined #endif /* The type used to be returned by overlay exportfs for misaligned fid */ #define OVL_FILEID_V0 0xfb /* The type returned by overlay exportfs for 32bit aligned fid */ #define OVL_FILEID_V1 0xf8 /* On-disk format for "origin" file handle */ struct ovl_fb { u8 version; /* 0 */ u8 magic; /* 0xfb */ u8 len; /* size of this header + size of fid */ u8 flags; /* OVL_FH_FLAG_* */ u8 type; /* fid_type of fid */ uuid_t uuid; /* uuid of filesystem */ u32 fid[]; /* file identifier should be 32bit aligned in-memory */ } __packed; /* In-memory and on-wire format for overlay file handle */ struct ovl_fh { u8 padding[3]; /* make sure fb.fid is 32bit aligned */ union { struct ovl_fb fb; DECLARE_FLEX_ARRAY(u8, buf); }; } __packed; #define OVL_FH_WIRE_OFFSET offsetof(struct ovl_fh, fb) #define OVL_FH_LEN(fh) (OVL_FH_WIRE_OFFSET + (fh)->fb.len) #define OVL_FH_FID_OFFSET (OVL_FH_WIRE_OFFSET + \ offsetof(struct ovl_fb, fid)) /* On-disk format for "metacopy" xattr (if non-zero size) */ struct ovl_metacopy { u8 version; /* 0 */ u8 len; /* size of this header + used digest bytes */ u8 flags; u8 digest_algo; /* FS_VERITY_HASH_ALG_* constant, 0 for no digest */ u8 digest[FS_VERITY_MAX_DIGEST_SIZE]; /* Only the used part on disk */ } __packed; #define OVL_METACOPY_MAX_SIZE (sizeof(struct ovl_metacopy)) #define OVL_METACOPY_MIN_SIZE (OVL_METACOPY_MAX_SIZE - FS_VERITY_MAX_DIGEST_SIZE) #define OVL_METACOPY_INIT { 0, OVL_METACOPY_MIN_SIZE } static inline int ovl_metadata_digest_size(const struct ovl_metacopy *metacopy) { if (metacopy->len < OVL_METACOPY_MIN_SIZE) return 0; return (int)metacopy->len - OVL_METACOPY_MIN_SIZE; } /* No atime modification on underlying */ #define OVL_OPEN_FLAGS (O_NOATIME) extern const char *const ovl_xattr_table[][2]; static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox) { return ovl_xattr_table[ox][ofs->config.userxattr]; } /* * When changing ownership of an upper object map the intended ownership * according to the upper layer's idmapping. When an upper mount idmaps files * that are stored on-disk as owned by id 1001 to id 1000 this means stat on * this object will report it as being owned by id 1000 when calling stat via * the upper mount. * In order to change ownership of an object so stat reports id 1000 when * called on an idmapped upper mount the value written to disk - i.e., the * value stored in ia_*id - must 1001. The mount mapping helper will thus take * care to map 1000 to 1001. * The mnt idmapping helpers are nops if the upper layer isn't idmapped. */ static inline int ovl_do_notify_change(struct ovl_fs *ofs, struct dentry *upperdentry, struct iattr *attr) { return notify_change(ovl_upper_mnt_idmap(ofs), upperdentry, attr, NULL); } static inline int ovl_do_rmdir(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry) { int err = vfs_rmdir(ovl_upper_mnt_idmap(ofs), dir, dentry); pr_debug("rmdir(%pd2) = %i\n", dentry, err); return err; } static inline int ovl_do_unlink(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry) { int err = vfs_unlink(ovl_upper_mnt_idmap(ofs), dir, dentry, NULL); pr_debug("unlink(%pd2) = %i\n", dentry, err); return err; } static inline int ovl_do_link(struct ovl_fs *ofs, struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { int err = vfs_link(old_dentry, ovl_upper_mnt_idmap(ofs), dir, new_dentry, NULL); pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err); return err; } static inline int ovl_do_create(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, umode_t mode) { int err = vfs_create(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, true); pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); return err; } static inline int ovl_do_mkdir(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, umode_t mode) { int err = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode); pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); return err; } static inline int ovl_do_mknod(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { int err = vfs_mknod(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, dev); pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err); return err; } static inline int ovl_do_symlink(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, const char *oldname) { int err = vfs_symlink(ovl_upper_mnt_idmap(ofs), dir, dentry, oldname); pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); return err; } static inline ssize_t ovl_do_getxattr(const struct path *path, const char *name, void *value, size_t size) { int err, len; WARN_ON(path->dentry->d_sb != path->mnt->mnt_sb); err = vfs_getxattr(mnt_idmap(path->mnt), path->dentry, name, value, size); len = (value && err > 0) ? err : 0; pr_debug("getxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n", path->dentry, name, min(len, 48), value, size, err); return err; } static inline ssize_t ovl_getxattr_upper(struct ovl_fs *ofs, struct dentry *upperdentry, enum ovl_xattr ox, void *value, size_t size) { struct path upperpath = { .dentry = upperdentry, .mnt = ovl_upper_mnt(ofs), }; return ovl_do_getxattr(&upperpath, ovl_xattr(ofs, ox), value, size); } static inline ssize_t ovl_path_getxattr(struct ovl_fs *ofs, const struct path *path, enum ovl_xattr ox, void *value, size_t size) { return ovl_do_getxattr(path, ovl_xattr(ofs, ox), value, size); } static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { int err = vfs_setxattr(ovl_upper_mnt_idmap(ofs), dentry, name, value, size, flags); pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n", dentry, name, min((int)size, 48), value, size, flags, err); return err; } static inline int ovl_setxattr(struct ovl_fs *ofs, struct dentry *dentry, enum ovl_xattr ox, const void *value, size_t size) { return ovl_do_setxattr(ofs, dentry, ovl_xattr(ofs, ox), value, size, 0); } static inline int ovl_do_removexattr(struct ovl_fs *ofs, struct dentry *dentry, const char *name) { int err = vfs_removexattr(ovl_upper_mnt_idmap(ofs), dentry, name); pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err); return err; } static inline int ovl_removexattr(struct ovl_fs *ofs, struct dentry *dentry, enum ovl_xattr ox) { return ovl_do_removexattr(ofs, dentry, ovl_xattr(ofs, ox)); } static inline int ovl_do_set_acl(struct ovl_fs *ofs, struct dentry *dentry, const char *acl_name, struct posix_acl *acl) { return vfs_set_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name, acl); } static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry, const char *acl_name) { return vfs_remove_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name); } static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir, struct dentry *olddentry, struct inode *newdir, struct dentry *newdentry, unsigned int flags) { int err; struct renamedata rd = { .old_mnt_idmap = ovl_upper_mnt_idmap(ofs), .old_dir = olddir, .old_dentry = olddentry, .new_mnt_idmap = ovl_upper_mnt_idmap(ofs), .new_dir = newdir, .new_dentry = newdentry, .flags = flags, }; pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags); err = vfs_rename(&rd); if (err) { pr_debug("...rename(%pd2, %pd2, ...) = %i\n", olddentry, newdentry, err); } return err; } static inline int ovl_do_whiteout(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry) { int err = vfs_whiteout(ovl_upper_mnt_idmap(ofs), dir, dentry); pr_debug("whiteout(%pd2) = %i\n", dentry, err); return err; } static inline struct file *ovl_do_tmpfile(struct ovl_fs *ofs, struct dentry *dentry, umode_t mode) { struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = dentry }; struct file *file = kernel_tmpfile_open(ovl_upper_mnt_idmap(ofs), &path, mode, O_LARGEFILE | O_WRONLY, current_cred()); int err = PTR_ERR_OR_ZERO(file); pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err); return file; } static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs, const char *name, struct dentry *base, int len) { return lookup_one(ovl_upper_mnt_idmap(ofs), name, base, len); } static inline bool ovl_open_flags_need_copy_up(int flags) { if (!flags) return false; return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC)); } /* util.c */ int ovl_get_write_access(struct dentry *dentry); void ovl_put_write_access(struct dentry *dentry); void ovl_start_write(struct dentry *dentry); void ovl_end_write(struct dentry *dentry); int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); void ovl_revert_creds(const struct cred *old_cred); static inline const struct cred *ovl_creds(struct super_block *sb) { return OVL_FS(sb)->creator_cred; } int ovl_can_decode_fh(struct super_block *sb); struct dentry *ovl_indexdir(struct super_block *sb); bool ovl_index_all(struct super_block *sb); bool ovl_verify_lower(struct super_block *sb); struct ovl_path *ovl_stack_alloc(unsigned int n); void ovl_stack_cpy(struct ovl_path *dst, struct ovl_path *src, unsigned int n); void ovl_stack_put(struct ovl_path *stack, unsigned int n); void ovl_stack_free(struct ovl_path *stack, unsigned int n); struct ovl_entry *ovl_alloc_entry(unsigned int numlower); void ovl_free_entry(struct ovl_entry *oe); bool ovl_dentry_remote(struct dentry *dentry); void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *realdentry); void ovl_dentry_init_reval(struct dentry *dentry, struct dentry *upperdentry, struct ovl_entry *oe); void ovl_dentry_init_flags(struct dentry *dentry, struct dentry *upperdentry, struct ovl_entry *oe, unsigned int mask); bool ovl_dentry_weird(struct dentry *dentry); enum ovl_path_type ovl_path_type(struct dentry *dentry); void ovl_path_upper(struct dentry *dentry, struct path *path); void ovl_path_lower(struct dentry *dentry, struct path *path); void ovl_path_lowerdata(struct dentry *dentry, struct path *path); struct inode *ovl_i_path_real(struct inode *inode, struct path *path); enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path); struct dentry *ovl_dentry_upper(struct dentry *dentry); struct dentry *ovl_dentry_lower(struct dentry *dentry); struct dentry *ovl_dentry_lowerdata(struct dentry *dentry); int ovl_dentry_set_lowerdata(struct dentry *dentry, struct ovl_path *datapath); const struct ovl_layer *ovl_i_layer_lower(struct inode *inode); const struct ovl_layer *ovl_layer_lower(struct dentry *dentry); struct dentry *ovl_dentry_real(struct dentry *dentry); struct dentry *ovl_i_dentry_upper(struct inode *inode); struct inode *ovl_inode_upper(struct inode *inode); struct inode *ovl_inode_lower(struct inode *inode); struct inode *ovl_inode_lowerdata(struct inode *inode); struct inode *ovl_inode_real(struct inode *inode); struct inode *ovl_inode_realdata(struct inode *inode); const char *ovl_lowerdata_redirect(struct inode *inode); struct ovl_dir_cache *ovl_dir_cache(struct inode *inode); void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache); void ovl_dentry_set_flag(unsigned long flag, struct dentry *dentry); void ovl_dentry_clear_flag(unsigned long flag, struct dentry *dentry); bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry); bool ovl_dentry_is_opaque(struct dentry *dentry); bool ovl_dentry_is_whiteout(struct dentry *dentry); void ovl_dentry_set_opaque(struct dentry *dentry); bool ovl_dentry_has_xwhiteouts(struct dentry *dentry); void ovl_dentry_set_xwhiteouts(struct dentry *dentry); void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs, const struct ovl_layer *layer); bool ovl_dentry_has_upper_alias(struct dentry *dentry); void ovl_dentry_set_upper_alias(struct dentry *dentry); bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags); bool ovl_dentry_needs_data_copy_up_locked(struct dentry *dentry, int flags); bool ovl_has_upperdata(struct inode *inode); void ovl_set_upperdata(struct inode *inode); const char *ovl_dentry_get_redirect(struct dentry *dentry); void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect); void ovl_inode_update(struct inode *inode, struct dentry *upperdentry); void ovl_dir_modified(struct dentry *dentry, bool impurity); u64 ovl_inode_version_get(struct inode *inode); bool ovl_is_whiteout(struct dentry *dentry); bool ovl_path_is_whiteout(struct ovl_fs *ofs, const struct path *path); struct file *ovl_path_open(const struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry, int flags); void ovl_copy_up_end(struct dentry *dentry); bool ovl_already_copied_up(struct dentry *dentry, int flags); char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path, enum ovl_xattr ox); bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path); bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path); bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs, const struct path *upperpath); static inline bool ovl_upper_is_whiteout(struct ovl_fs *ofs, struct dentry *upperdentry) { struct path upperpath = { .dentry = upperdentry, .mnt = ovl_upper_mnt(ofs), }; return ovl_path_is_whiteout(ofs, &upperpath); } static inline bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *upperdentry) { struct path upperpath = { .dentry = upperdentry, .mnt = ovl_upper_mnt(ofs), }; return ovl_path_check_origin_xattr(ofs, &upperpath); } int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry, enum ovl_xattr ox, const void *value, size_t size, int xerr); int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry); bool ovl_inuse_trylock(struct dentry *dentry); void ovl_inuse_unlock(struct dentry *dentry); bool ovl_is_inuse(struct dentry *dentry); bool ovl_need_index(struct dentry *dentry); int ovl_nlink_start(struct dentry *dentry); void ovl_nlink_end(struct dentry *dentry); int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path, struct ovl_metacopy *data); int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d, struct ovl_metacopy *metacopy); bool ovl_is_metacopy_dentry(struct dentry *dentry); char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding); int ovl_ensure_verity_loaded(struct path *path); int ovl_get_verity_xattr(struct ovl_fs *ofs, const struct path *path, u8 *digest_buf, int *buf_length); int ovl_validate_verity(struct ovl_fs *ofs, struct path *metapath, struct path *datapath); int ovl_get_verity_digest(struct ovl_fs *ofs, struct path *src, struct ovl_metacopy *metacopy); int ovl_sync_status(struct ovl_fs *ofs); static inline void ovl_set_flag(unsigned long flag, struct inode *inode) { set_bit(flag, &OVL_I(inode)->flags); } static inline void ovl_clear_flag(unsigned long flag, struct inode *inode) { clear_bit(flag, &OVL_I(inode)->flags); } static inline bool ovl_test_flag(unsigned long flag, struct inode *inode) { return test_bit(flag, &OVL_I(inode)->flags); } static inline bool ovl_is_impuredir(struct super_block *sb, struct dentry *upperdentry) { struct ovl_fs *ofs = OVL_FS(sb); struct path upperpath = { .dentry = upperdentry, .mnt = ovl_upper_mnt(ofs), }; return ovl_get_dir_xattr_val(ofs, &upperpath, OVL_XATTR_IMPURE) == 'y'; } static inline char ovl_get_opaquedir_val(struct ovl_fs *ofs, const struct path *path) { return ovl_get_dir_xattr_val(ofs, path, OVL_XATTR_OPAQUE); } static inline bool ovl_redirect_follow(struct ovl_fs *ofs) { return ofs->config.redirect_mode != OVL_REDIRECT_NOFOLLOW; } static inline bool ovl_redirect_dir(struct ovl_fs *ofs) { return ofs->config.redirect_mode == OVL_REDIRECT_ON; } static inline bool ovl_origin_uuid(struct ovl_fs *ofs) { return ofs->config.uuid != OVL_UUID_OFF; } static inline bool ovl_has_fsid(struct ovl_fs *ofs) { return ofs->config.uuid == OVL_UUID_ON || ofs->config.uuid == OVL_UUID_AUTO; } /* * With xino=auto, we do best effort to keep all inodes on same st_dev and * d_ino consistent with st_ino. * With xino=on, we do the same effort but we warn if we failed. */ static inline bool ovl_xino_warn(struct ovl_fs *ofs) { return ofs->config.xino == OVL_XINO_ON; } /* * To avoid regressions in existing setups with overlay lower offline changes, * we allow lower changes only if none of the new features are used. */ static inline bool ovl_allow_offline_changes(struct ovl_fs *ofs) { return (!ofs->config.index && !ofs->config.metacopy && !ovl_redirect_dir(ofs) && !ovl_xino_warn(ofs)); } /* All layers on same fs? */ static inline bool ovl_same_fs(struct ovl_fs *ofs) { return ofs->xino_mode == 0; } /* All overlay inodes have same st_dev? */ static inline bool ovl_same_dev(struct ovl_fs *ofs) { return ofs->xino_mode >= 0; } static inline unsigned int ovl_xino_bits(struct ovl_fs *ofs) { return ovl_same_dev(ofs) ? ofs->xino_mode : 0; } static inline void ovl_inode_lock(struct inode *inode) { mutex_lock(&OVL_I(inode)->lock); } static inline int ovl_inode_lock_interruptible(struct inode *inode) { return mutex_lock_interruptible(&OVL_I(inode)->lock); } static inline void ovl_inode_unlock(struct inode *inode) { mutex_unlock(&OVL_I(inode)->lock); } /* namei.c */ int ovl_check_fb_len(struct ovl_fb *fb, int fb_len); static inline int ovl_check_fh_len(struct ovl_fh *fh, int fh_len) { if (fh_len < sizeof(struct ovl_fh)) return -EINVAL; return ovl_check_fb_len(&fh->fb, fh_len - OVL_FH_WIRE_OFFSET); } struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh, struct vfsmount *mnt, bool connected); int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, struct dentry *upperdentry, struct ovl_path **stackp); int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, enum ovl_xattr ox, const struct ovl_fh *fh, bool is_upper, bool set); int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry, enum ovl_xattr ox, struct dentry *real, bool is_upper, bool set); struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index, bool connected); int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index); int ovl_get_index_name_fh(const struct ovl_fh *fh, struct qstr *name); int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, struct qstr *name); struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh); struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, struct dentry *origin, bool verify); int ovl_path_next(int idx, struct dentry *dentry, struct path *path, const struct ovl_layer **layer); int ovl_verify_lowerdata(struct dentry *dentry); struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); bool ovl_lower_positive(struct dentry *dentry); static inline int ovl_verify_origin_fh(struct ovl_fs *ofs, struct dentry *upper, const struct ovl_fh *fh, bool set) { return ovl_verify_set_fh(ofs, upper, OVL_XATTR_ORIGIN, fh, false, set); } static inline int ovl_verify_origin(struct ovl_fs *ofs, struct dentry *upper, struct dentry *origin, bool set) { return ovl_verify_origin_xattr(ofs, upper, OVL_XATTR_ORIGIN, origin, false, set); } static inline int ovl_verify_upper(struct ovl_fs *ofs, struct dentry *index, struct dentry *upper, bool set) { return ovl_verify_origin_xattr(ofs, index, OVL_XATTR_UPPER, upper, true, set); } /* readdir.c */ extern const struct file_operations ovl_dir_operations; struct file *ovl_dir_real_file(const struct file *file, bool want_upper); int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list); void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper, struct list_head *list); void ovl_cache_free(struct list_head *list); void ovl_dir_cache_free(struct inode *inode); int ovl_check_d_type_supported(const struct path *realpath); int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir, struct vfsmount *mnt, struct dentry *dentry, int level); int ovl_indexdir_cleanup(struct ovl_fs *ofs); /* * Can we iterate real dir directly? * * Non-merge dir may contain whiteouts from a time it was a merge upper, before * lower dir was removed under it and possibly before it was rotated from upper * to lower layer. */ static inline bool ovl_dir_is_real(struct inode *dir) { return !ovl_test_flag(OVL_WHITEOUTS, dir); } /* inode.c */ int ovl_set_nlink_upper(struct dentry *dentry); int ovl_set_nlink_lower(struct dentry *dentry); unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry, struct dentry *upperdentry, unsigned int fallback); int ovl_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap, struct inode *inode, int type, bool rcu, bool noperm); static inline struct posix_acl *ovl_get_inode_acl(struct inode *inode, int type, bool rcu) { return do_ovl_get_acl(&nop_mnt_idmap, inode, type, rcu, true); } static inline struct posix_acl *ovl_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type) { return do_ovl_get_acl(idmap, d_inode(dentry), type, false, false); } int ovl_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); struct posix_acl *ovl_get_acl_path(const struct path *path, const char *acl_name, bool noperm); #else #define ovl_get_inode_acl NULL #define ovl_get_acl NULL #define ovl_set_acl NULL static inline struct posix_acl *ovl_get_acl_path(const struct path *path, const char *acl_name, bool noperm) { return NULL; } #endif int ovl_update_time(struct inode *inode, int flags); bool ovl_is_private_xattr(struct super_block *sb, const char *name); struct ovl_inode_params { struct inode *newinode; struct dentry *upperdentry; struct ovl_entry *oe; bool index; char *redirect; char *lowerdata_redirect; }; void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip, unsigned long ino, int fsid); struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev); struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real, bool is_upper); bool ovl_lookup_trap_inode(struct super_block *sb, struct dentry *dir); struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir); struct inode *ovl_get_inode(struct super_block *sb, struct ovl_inode_params *oip); void ovl_copyattr(struct inode *to); /* vfs inode flags copied from real to ovl inode */ #define OVL_COPY_I_FLAGS_MASK (S_SYNC | S_NOATIME | S_APPEND | S_IMMUTABLE) /* vfs inode flags read from overlay.protattr xattr to ovl inode */ #define OVL_PROT_I_FLAGS_MASK (S_APPEND | S_IMMUTABLE) /* * fileattr flags copied from lower to upper inode on copy up. * We cannot copy up immutable/append-only flags, because that would prevent * linking temp inode to upper dir, so we store them in xattr instead. */ #define OVL_COPY_FS_FLAGS_MASK (FS_SYNC_FL | FS_NOATIME_FL) #define OVL_COPY_FSX_FLAGS_MASK (FS_XFLAG_SYNC | FS_XFLAG_NOATIME) #define OVL_PROT_FS_FLAGS_MASK (FS_APPEND_FL | FS_IMMUTABLE_FL) #define OVL_PROT_FSX_FLAGS_MASK (FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE) void ovl_check_protattr(struct inode *inode, struct dentry *upper); int ovl_set_protattr(struct inode *inode, struct dentry *upper, struct fileattr *fa); static inline void ovl_copyflags(struct inode *from, struct inode *to) { unsigned int mask = OVL_COPY_I_FLAGS_MASK; inode_set_flags(to, from->i_flags & mask, mask); } /* dir.c */ extern const struct inode_operations ovl_dir_inode_operations; int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry); struct ovl_cattr { dev_t rdev; umode_t mode; const char *link; struct dentry *hardlink; }; #define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) }) int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir, struct dentry **newdentry, umode_t mode); struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir, struct dentry *newdentry, struct ovl_cattr *attr); int ovl_cleanup(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry); struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir); struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, struct ovl_cattr *attr); /* file.c */ extern const struct file_operations ovl_file_operations; int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa); int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa); int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa); int ovl_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); struct ovl_file; struct ovl_file *ovl_file_alloc(struct file *realfile); void ovl_file_free(struct ovl_file *of); /* copy_up.c */ int ovl_copy_up(struct dentry *dentry); int ovl_copy_up_with_data(struct dentry *dentry); int ovl_maybe_copy_up(struct dentry *dentry, int flags); int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentry *new); int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat); struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, bool is_upper); struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin); int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh, struct dentry *upper); /* export.c */ extern const struct export_operations ovl_export_operations; extern const struct export_operations ovl_export_fid_operations; /* super.c */ int ovl_fill_super(struct super_block *sb, struct fs_context *fc); /* Will this overlay be forced to mount/remount ro? */ static inline bool ovl_force_readonly(struct ovl_fs *ofs) { return (!ovl_upper_mnt(ofs) || !ofs->workdir); } /* xattr.c */ const struct xattr_handler * const *ovl_xattr_handlers(struct ovl_fs *ofs); int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
23263 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CPUFEATURE_H #define _ASM_X86_CPUFEATURE_H #include <asm/processor.h> #if defined(__KERNEL__) && !defined(__ASSEMBLY__) #include <asm/asm.h> #include <linux/bitops.h> #include <asm/alternative.h> enum cpuid_leafs { CPUID_1_EDX = 0, CPUID_8000_0001_EDX, CPUID_8086_0001_EDX, CPUID_LNX_1, CPUID_1_ECX, CPUID_C000_0001_EDX, CPUID_8000_0001_ECX, CPUID_LNX_2, CPUID_LNX_3, CPUID_7_0_EBX, CPUID_D_1_EAX, CPUID_LNX_4, CPUID_7_1_EAX, CPUID_8000_0008_EBX, CPUID_6_EAX, CPUID_8000_000A_EDX, CPUID_7_ECX, CPUID_8000_0007_EBX, CPUID_7_EDX, CPUID_8000_001F_EAX, CPUID_8000_0021_EAX, CPUID_LNX_5, NR_CPUID_WORDS, }; #define X86_CAP_FMT_NUM "%d:%d" #define x86_cap_flag_num(flag) ((flag) >> 5), ((flag) & 31) extern const char * const x86_cap_flags[NCAPINTS*32]; extern const char * const x86_power_flags[32]; #define X86_CAP_FMT "%s" #define x86_cap_flag(flag) x86_cap_flags[flag] /* * In order to save room, we index into this array by doing * X86_BUG_<name> - NCAPINTS*32. */ extern const char * const x86_bug_flags[NBUGINTS*32]; #define test_cpu_cap(c, bit) \ arch_test_bit(bit, (unsigned long *)((c)->x86_capability)) /* * There are 32 bits/features in each mask word. The high bits * (selected with (bit>>5) give us the word number and the low 5 * bits give us the bit/feature number inside the word. * (1UL<<((bit)&31) gives us a mask for the feature_bit so we can * see if it is set in the mask word. */ #define CHECK_BIT_IN_MASK_WORD(maskname, word, bit) \ (((bit)>>5)==(word) && (1UL<<((bit)&31) & maskname##word )) /* * {REQUIRED,DISABLED}_MASK_CHECK below may seem duplicated with the * following BUILD_BUG_ON_ZERO() check but when NCAPINTS gets changed, all * header macros which use NCAPINTS need to be changed. The duplicated macro * use causes the compiler to issue errors for all headers so that all usage * sites can be corrected. */ #define REQUIRED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 0, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 1, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 2, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 3, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 4, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 5, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 6, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 7, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 8, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 9, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 10, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 11, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 12, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 13, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 14, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 20, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 21, feature_bit) || \ REQUIRED_MASK_CHECK || \ BUILD_BUG_ON_ZERO(NCAPINTS != 22)) #define DISABLED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 1, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 2, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 3, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 4, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 5, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 6, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 7, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 8, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 9, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 10, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 11, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 12, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 13, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 14, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 20, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 21, feature_bit) || \ DISABLED_MASK_CHECK || \ BUILD_BUG_ON_ZERO(NCAPINTS != 22)) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ test_cpu_cap(c, bit)) #define this_cpu_has(bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ x86_this_cpu_test_bit(bit, cpu_info.x86_capability)) /* * This macro is for detection of features which need kernel * infrastructure to be used. It may *not* directly test the CPU * itself. Use the cpu_has() family if you want true runtime * testing of CPU features, like in hypervisor code where you are * supporting a possible guest feature where host support for it * is not relevant. */ #define cpu_feature_enabled(bit) \ (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit)) #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) extern void setup_clear_cpu_cap(unsigned int bit); extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); #define setup_force_cpu_cap(bit) do { \ \ if (!boot_cpu_has(bit)) \ WARN_ON(alternatives_patched); \ \ set_cpu_cap(&boot_cpu_data, bit); \ set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) /* * Static testing of CPU features. Used the same as boot_cpu_has(). It * statically patches the target code for additional performance. Use * static_cpu_has() only in fast paths, where every cycle counts. Which * means that the boot_cpu_has() variant is already fast enough for the * majority of cases and you should stick to using it as it is generally * only two instructions: a RIP-relative MOV and a TEST. * * Do not use an "m" constraint for [cap_byte] here: gcc doesn't know * that this is only used on a fallback path and will sometimes cause * it to manifest the address of boot_cpu_data in a register, fouling * the mainline (post-initialization) code. */ static __always_inline bool _static_cpu_has(u16 bit) { asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]") ".pushsection .altinstr_aux,\"ax\"\n" "6:\n" " testb %[bitnum], %a[cap_byte]\n" " jnz %l[t_yes]\n" " jmp %l[t_no]\n" ".popsection\n" : : [feature] "i" (bit), [bitnum] "i" (1 << (bit & 7)), [cap_byte] "i" (&((const char *)boot_cpu_data.x86_capability)[bit >> 3]) : : t_yes, t_no); t_yes: return true; t_no: return false; } #define static_cpu_has(bit) \ ( \ __builtin_constant_p(boot_cpu_has(bit)) ? \ boot_cpu_has(bit) : \ _static_cpu_has(bit) \ ) #define cpu_has_bug(c, bit) cpu_has(c, (bit)) #define set_cpu_bug(c, bit) set_cpu_cap(c, (bit)) #define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit)) #define static_cpu_has_bug(bit) static_cpu_has((bit)) #define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) #define boot_cpu_set_bug(bit) set_cpu_cap(&boot_cpu_data, (bit)) #define MAX_CPU_FEATURES (NCAPINTS * 32) #define cpu_have_feature boot_cpu_has #define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X" #define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \ boot_cpu_data.x86_model #endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */ #endif /* _ASM_X86_CPUFEATURE_H */
9 9 9 9 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright(c) 1999 - 2018 Intel Corporation. */ /* Linux PRO/1000 Ethernet Driver main header file */ #ifndef _E1000_H_ #define _E1000_H_ #include <linux/bitops.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/io.h> #include <linux/netdevice.h> #include <linux/pci.h> #include <linux/crc32.h> #include <linux/if_vlan.h> #include <linux/timecounter.h> #include <linux/net_tstamp.h> #include <linux/ptp_clock_kernel.h> #include <linux/ptp_classify.h> #include <linux/mii.h> #include <linux/mdio.h> #include <linux/mutex.h> #include <linux/pm_qos.h> #include "hw.h" struct e1000_info; #define e_dbg(format, arg...) \ netdev_dbg(hw->adapter->netdev, format, ## arg) #define e_err(format, arg...) \ netdev_err(adapter->netdev, format, ## arg) #define e_info(format, arg...) \ netdev_info(adapter->netdev, format, ## arg) #define e_warn(format, arg...) \ netdev_warn(adapter->netdev, format, ## arg) #define e_notice(format, arg...) \ netdev_notice(adapter->netdev, format, ## arg) /* Interrupt modes, as used by the IntMode parameter */ #define E1000E_INT_MODE_LEGACY 0 #define E1000E_INT_MODE_MSI 1 #define E1000E_INT_MODE_MSIX 2 /* Tx/Rx descriptor defines */ #define E1000_DEFAULT_TXD 256 #define E1000_MAX_TXD 4096 #define E1000_MIN_TXD 64 #define E1000_DEFAULT_RXD 256 #define E1000_MAX_RXD 4096 #define E1000_MIN_RXD 64 #define E1000_MIN_ITR_USECS 10 /* 100000 irq/sec */ #define E1000_MAX_ITR_USECS 10000 /* 100 irq/sec */ #define E1000_FC_PAUSE_TIME 0x0680 /* 858 usec */ /* How many Tx Descriptors do we need to call netif_wake_queue ? */ /* How many Rx Buffers do we bundle into one write to the hardware ? */ #define E1000_RX_BUFFER_WRITE 16 /* Must be power of 2 */ #define AUTO_ALL_MODES 0 #define E1000_EEPROM_APME 0x0400 #define E1000_MNG_VLAN_NONE (-1) #define DEFAULT_JUMBO 9234 /* Time to wait before putting the device into D3 if there's no link (in ms). */ #define LINK_TIMEOUT 100 /* Count for polling __E1000_RESET condition every 10-20msec. * Experimentation has shown the reset can take approximately 210msec. */ #define E1000_CHECK_RESET_COUNT 25 #define PCICFG_DESC_RING_STATUS 0xe4 #define FLUSH_DESC_REQUIRED 0x100 /* in the case of WTHRESH, it appears at least the 82571/2 hardware * writes back 4 descriptors when WTHRESH=5, and 3 descriptors when * WTHRESH=4, so a setting of 5 gives the most efficient bus * utilization but to avoid possible Tx stalls, set it to 1 */ #define E1000_TXDCTL_DMA_BURST_ENABLE \ (E1000_TXDCTL_GRAN | /* set descriptor granularity */ \ E1000_TXDCTL_COUNT_DESC | \ (1u << 16) | /* wthresh must be +1 more than desired */\ (1u << 8) | /* hthresh */ \ 0x1f) /* pthresh */ #define E1000_RXDCTL_DMA_BURST_ENABLE \ (0x01000000 | /* set descriptor granularity */ \ (4u << 16) | /* set writeback threshold */ \ (4u << 8) | /* set prefetch threshold */ \ 0x20) /* set hthresh */ #define E1000_TIDV_FPD BIT(31) #define E1000_RDTR_FPD BIT(31) enum e1000_boards { board_82571, board_82572, board_82573, board_82574, board_82583, board_80003es2lan, board_ich8lan, board_ich9lan, board_ich10lan, board_pchlan, board_pch2lan, board_pch_lpt, board_pch_spt, board_pch_cnp, board_pch_tgp, board_pch_adp, board_pch_mtp }; struct e1000_ps_page { struct page *page; u64 dma; /* must be u64 - written to hw */ }; /* wrappers around a pointer to a socket buffer, * so a DMA handle can be stored along with the buffer */ struct e1000_buffer { dma_addr_t dma; struct sk_buff *skb; union { /* Tx */ struct { unsigned long time_stamp; u16 length; u16 next_to_watch; unsigned int segs; unsigned int bytecount; u16 mapped_as_page; }; /* Rx */ struct { /* arrays of page information for packet split */ struct e1000_ps_page *ps_pages; struct page *page; }; }; }; struct e1000_ring { struct e1000_adapter *adapter; /* back pointer to adapter */ void *desc; /* pointer to ring memory */ dma_addr_t dma; /* phys address of ring */ unsigned int size; /* length of ring in bytes */ unsigned int count; /* number of desc. in ring */ u16 next_to_use; u16 next_to_clean; void __iomem *head; void __iomem *tail; /* array of buffer information structs */ struct e1000_buffer *buffer_info; char name[IFNAMSIZ + 5]; u32 ims_val; u32 itr_val; void __iomem *itr_register; int set_itr; struct sk_buff *rx_skb_top; }; /* PHY register snapshot values */ struct e1000_phy_regs { u16 bmcr; /* basic mode control register */ u16 bmsr; /* basic mode status register */ u16 advertise; /* auto-negotiation advertisement */ u16 lpa; /* link partner ability register */ u16 expansion; /* auto-negotiation expansion reg */ u16 ctrl1000; /* 1000BASE-T control register */ u16 stat1000; /* 1000BASE-T status register */ u16 estatus; /* extended status register */ }; /* board specific private data structure */ struct e1000_adapter { struct timer_list watchdog_timer; struct timer_list phy_info_timer; struct timer_list blink_timer; struct work_struct reset_task; struct work_struct watchdog_task; const struct e1000_info *ei; unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)]; u32 bd_number; u32 rx_buffer_len; u16 mng_vlan_id; u16 link_speed; u16 link_duplex; u16 eeprom_vers; /* track device up/down/testing state */ unsigned long state; /* Interrupt Throttle Rate */ u32 itr; u32 itr_setting; u16 tx_itr; u16 rx_itr; /* Tx - one ring per active queue */ struct e1000_ring *tx_ring ____cacheline_aligned_in_smp; u32 tx_fifo_limit; struct napi_struct napi; unsigned int uncorr_errors; /* uncorrectable ECC errors */ unsigned int corr_errors; /* correctable ECC errors */ unsigned int restart_queue; u32 txd_cmd; bool detect_tx_hung; bool tx_hang_recheck; u8 tx_timeout_factor; u32 tx_int_delay; u32 tx_abs_int_delay; unsigned int total_tx_bytes; unsigned int total_tx_packets; unsigned int total_rx_bytes; unsigned int total_rx_packets; /* Tx stats */ u64 tpt_old; u64 colc_old; u32 gotc; u64 gotc_old; u32 tx_timeout_count; u32 tx_fifo_head; u32 tx_head_addr; u32 tx_fifo_size; u32 tx_dma_failed; u32 tx_hwtstamp_timeouts; u32 tx_hwtstamp_skipped; /* Rx */ bool (*clean_rx)(struct e1000_ring *ring, int *work_done, int work_to_do) ____cacheline_aligned_in_smp; void (*alloc_rx_buf)(struct e1000_ring *ring, int cleaned_count, gfp_t gfp); struct e1000_ring *rx_ring; u32 rx_int_delay; u32 rx_abs_int_delay; /* Rx stats */ u64 hw_csum_err; u64 hw_csum_good; u64 rx_hdr_split; u32 gorc; u64 gorc_old; u32 alloc_rx_buff_failed; u32 rx_dma_failed; u32 rx_hwtstamp_cleared; unsigned int rx_ps_pages; u16 rx_ps_bsize0; u32 max_frame_size; u32 min_frame_size; /* OS defined structs */ struct net_device *netdev; struct pci_dev *pdev; /* structs defined in e1000_hw.h */ struct e1000_hw hw; spinlock_t stats64_lock; /* protects statistics counters */ struct e1000_hw_stats stats; struct e1000_phy_info phy_info; struct e1000_phy_stats phy_stats; /* Snapshot of PHY registers */ struct e1000_phy_regs phy_regs; struct e1000_ring test_tx_ring; struct e1000_ring test_rx_ring; u32 test_icr; u32 msg_enable; unsigned int num_vectors; struct msix_entry *msix_entries; int int_mode; u32 eiac_mask; u32 eeprom_wol; u32 wol; u32 pba; u32 max_hw_frame_size; bool fc_autoneg; unsigned int flags; unsigned int flags2; struct work_struct downshift_task; struct work_struct update_phy_task; struct work_struct print_hang_task; int phy_hang_count; u16 tx_ring_count; u16 rx_ring_count; struct hwtstamp_config hwtstamp_config; struct delayed_work systim_overflow_work; struct sk_buff *tx_hwtstamp_skb; unsigned long tx_hwtstamp_start; struct work_struct tx_hwtstamp_work; spinlock_t systim_lock; /* protects SYSTIML/H regsters */ struct cyclecounter cc; struct timecounter tc; struct ptp_clock *ptp_clock; struct ptp_clock_info ptp_clock_info; struct pm_qos_request pm_qos_req; long ptp_delta; u16 eee_advert; }; struct e1000_info { enum e1000_mac_type mac; unsigned int flags; unsigned int flags2; u32 pba; u32 max_hw_frame_size; s32 (*get_variants)(struct e1000_adapter *); const struct e1000_mac_operations *mac_ops; const struct e1000_phy_operations *phy_ops; const struct e1000_nvm_operations *nvm_ops; }; s32 e1000e_get_base_timinca(struct e1000_adapter *adapter, u32 *timinca); /* The system time is maintained by a 64-bit counter comprised of the 32-bit * SYSTIMH and SYSTIML registers. How the counter increments (and therefore * its resolution) is based on the contents of the TIMINCA register - it * increments every incperiod (bits 31:24) clock ticks by incvalue (bits 23:0). * For the best accuracy, the incperiod should be as small as possible. The * incvalue is scaled by a factor as large as possible (while still fitting * in bits 23:0) so that relatively small clock corrections can be made. * * As a result, a shift of INCVALUE_SHIFT_n is used to fit a value of * INCVALUE_n into the TIMINCA register allowing 32+8+(24-INCVALUE_SHIFT_n) * bits to count nanoseconds leaving the rest for fractional nonseconds. * * Any given INCVALUE also has an associated maximum adjustment value. This * maximum adjustment value is the largest increase (or decrease) which can be * safely applied without overflowing the INCVALUE. Since INCVALUE has * a maximum range of 24 bits, its largest value is 0xFFFFFF. * * To understand where the maximum value comes from, consider the following * equation: * * new_incval = base_incval + (base_incval * adjustment) / 1billion * * To avoid overflow that means: * max_incval = base_incval + (base_incval * max_adj) / billion * * Re-arranging: * max_adj = floor(((max_incval - base_incval) * 1billion) / 1billion) */ #define INCVALUE_96MHZ 125 #define INCVALUE_SHIFT_96MHZ 17 #define INCPERIOD_SHIFT_96MHZ 2 #define INCPERIOD_96MHZ (12 >> INCPERIOD_SHIFT_96MHZ) #define MAX_PPB_96MHZ 23999900 /* 23,999,900 ppb */ #define INCVALUE_25MHZ 40 #define INCVALUE_SHIFT_25MHZ 18 #define INCPERIOD_25MHZ 1 #define MAX_PPB_25MHZ 599999900 /* 599,999,900 ppb */ #define INCVALUE_24MHZ 125 #define INCVALUE_SHIFT_24MHZ 14 #define INCPERIOD_24MHZ 3 #define MAX_PPB_24MHZ 999999999 /* 999,999,999 ppb */ #define INCVALUE_38400KHZ 26 #define INCVALUE_SHIFT_38400KHZ 19 #define INCPERIOD_38400KHZ 1 #define MAX_PPB_38400KHZ 230769100 /* 230,769,100 ppb */ /* Another drawback of scaling the incvalue by a large factor is the * 64-bit SYSTIM register overflows more quickly. This is dealt with * by simply reading the clock before it overflows. * * Clock ns bits Overflows after * ~~~~~~ ~~~~~~~ ~~~~~~~~~~~~~~~ * 96MHz 47-bit 2^(47-INCPERIOD_SHIFT_96MHz) / 10^9 / 3600 = 9.77 hrs * 25MHz 46-bit 2^46 / 10^9 / 3600 = 19.55 hours */ #define E1000_SYSTIM_OVERFLOW_PERIOD (HZ * 60 * 60 * 4) #define E1000_MAX_82574_SYSTIM_REREADS 50 #define E1000_82574_SYSTIM_EPSILON (1ULL << 35ULL) /* hardware capability, feature, and workaround flags */ #define FLAG_HAS_AMT BIT(0) #define FLAG_HAS_FLASH BIT(1) #define FLAG_HAS_HW_VLAN_FILTER BIT(2) #define FLAG_HAS_WOL BIT(3) /* reserved BIT(4) */ #define FLAG_HAS_CTRLEXT_ON_LOAD BIT(5) #define FLAG_HAS_SWSM_ON_LOAD BIT(6) #define FLAG_HAS_JUMBO_FRAMES BIT(7) #define FLAG_READ_ONLY_NVM BIT(8) #define FLAG_IS_ICH BIT(9) #define FLAG_HAS_MSIX BIT(10) #define FLAG_HAS_SMART_POWER_DOWN BIT(11) #define FLAG_IS_QUAD_PORT_A BIT(12) #define FLAG_IS_QUAD_PORT BIT(13) #define FLAG_HAS_HW_TIMESTAMP BIT(14) #define FLAG_APME_IN_WUC BIT(15) #define FLAG_APME_IN_CTRL3 BIT(16) #define FLAG_APME_CHECK_PORT_B BIT(17) #define FLAG_DISABLE_FC_PAUSE_TIME BIT(18) #define FLAG_NO_WAKE_UCAST BIT(19) #define FLAG_MNG_PT_ENABLED BIT(20) #define FLAG_RESET_OVERWRITES_LAA BIT(21) #define FLAG_TARC_SPEED_MODE_BIT BIT(22) #define FLAG_TARC_SET_BIT_ZERO BIT(23) #define FLAG_RX_NEEDS_RESTART BIT(24) #define FLAG_LSC_GIG_SPEED_DROP BIT(25) #define FLAG_SMART_POWER_DOWN BIT(26) #define FLAG_MSI_ENABLED BIT(27) /* reserved BIT(28) */ #define FLAG_TSO_FORCE BIT(29) #define FLAG_RESTART_NOW BIT(30) #define FLAG_MSI_TEST_FAILED BIT(31) #define FLAG2_CRC_STRIPPING BIT(0) #define FLAG2_HAS_PHY_WAKEUP BIT(1) #define FLAG2_IS_DISCARDING BIT(2) #define FLAG2_DISABLE_ASPM_L1 BIT(3) #define FLAG2_HAS_PHY_STATS BIT(4) #define FLAG2_HAS_EEE BIT(5) #define FLAG2_DMA_BURST BIT(6) #define FLAG2_DISABLE_ASPM_L0S BIT(7) #define FLAG2_DISABLE_AIM BIT(8) #define FLAG2_CHECK_PHY_HANG BIT(9) #define FLAG2_NO_DISABLE_RX BIT(10) #define FLAG2_PCIM2PCI_ARBITER_WA BIT(11) #define FLAG2_DFLT_CRC_STRIPPING BIT(12) #define FLAG2_CHECK_RX_HWTSTAMP BIT(13) #define FLAG2_CHECK_SYSTIM_OVERFLOW BIT(14) #define FLAG2_ENABLE_S0IX_FLOWS BIT(15) #define E1000_RX_DESC_PS(R, i) \ (&(((union e1000_rx_desc_packet_split *)((R).desc))[i])) #define E1000_RX_DESC_EXT(R, i) \ (&(((union e1000_rx_desc_extended *)((R).desc))[i])) #define E1000_GET_DESC(R, i, type) (&(((struct type *)((R).desc))[i])) #define E1000_TX_DESC(R, i) E1000_GET_DESC(R, i, e1000_tx_desc) #define E1000_CONTEXT_DESC(R, i) E1000_GET_DESC(R, i, e1000_context_desc) enum e1000_state_t { __E1000_TESTING, __E1000_RESETTING, __E1000_ACCESS_SHARED_RESOURCE, __E1000_DOWN }; enum latency_range { lowest_latency = 0, low_latency = 1, bulk_latency = 2, latency_invalid = 255 }; extern char e1000e_driver_name[]; void e1000e_check_options(struct e1000_adapter *adapter); void e1000e_set_ethtool_ops(struct net_device *netdev); int e1000e_open(struct net_device *netdev); int e1000e_close(struct net_device *netdev); void e1000e_up(struct e1000_adapter *adapter); void e1000e_down(struct e1000_adapter *adapter, bool reset); void e1000e_reinit_locked(struct e1000_adapter *adapter); void e1000e_reset(struct e1000_adapter *adapter); void e1000e_power_up_phy(struct e1000_adapter *adapter); int e1000e_setup_rx_resources(struct e1000_ring *ring); int e1000e_setup_tx_resources(struct e1000_ring *ring); void e1000e_free_rx_resources(struct e1000_ring *ring); void e1000e_free_tx_resources(struct e1000_ring *ring); void e1000e_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats); void e1000e_set_interrupt_capability(struct e1000_adapter *adapter); void e1000e_reset_interrupt_capability(struct e1000_adapter *adapter); void e1000e_get_hw_control(struct e1000_adapter *adapter); void e1000e_release_hw_control(struct e1000_adapter *adapter); void e1000e_write_itr(struct e1000_adapter *adapter, u32 itr); extern unsigned int copybreak; extern const struct e1000_info e1000_82571_info; extern const struct e1000_info e1000_82572_info; extern const struct e1000_info e1000_82573_info; extern const struct e1000_info e1000_82574_info; extern const struct e1000_info e1000_82583_info; extern const struct e1000_info e1000_ich8_info; extern const struct e1000_info e1000_ich9_info; extern const struct e1000_info e1000_ich10_info; extern const struct e1000_info e1000_pch_info; extern const struct e1000_info e1000_pch2_info; extern const struct e1000_info e1000_pch_lpt_info; extern const struct e1000_info e1000_pch_spt_info; extern const struct e1000_info e1000_pch_cnp_info; extern const struct e1000_info e1000_pch_tgp_info; extern const struct e1000_info e1000_pch_adp_info; extern const struct e1000_info e1000_pch_mtp_info; extern const struct e1000_info e1000_es2_info; void e1000e_ptp_init(struct e1000_adapter *adapter); void e1000e_ptp_remove(struct e1000_adapter *adapter); u64 e1000e_read_systim(struct e1000_adapter *adapter, struct ptp_system_timestamp *sts); static inline s32 e1000_phy_hw_reset(struct e1000_hw *hw) { return hw->phy.ops.reset(hw); } static inline s32 e1e_rphy(struct e1000_hw *hw, u32 offset, u16 *data) { return hw->phy.ops.read_reg(hw, offset, data); } static inline s32 e1e_rphy_locked(struct e1000_hw *hw, u32 offset, u16 *data) { return hw->phy.ops.read_reg_locked(hw, offset, data); } static inline s32 e1e_wphy(struct e1000_hw *hw, u32 offset, u16 data) { return hw->phy.ops.write_reg(hw, offset, data); } static inline s32 e1e_wphy_locked(struct e1000_hw *hw, u32 offset, u16 data) { return hw->phy.ops.write_reg_locked(hw, offset, data); } void e1000e_reload_nvm_generic(struct e1000_hw *hw); static inline s32 e1000e_read_mac_addr(struct e1000_hw *hw) { if (hw->mac.ops.read_mac_addr) return hw->mac.ops.read_mac_addr(hw); return e1000_read_mac_addr_generic(hw); } static inline s32 e1000_validate_nvm_checksum(struct e1000_hw *hw) { return hw->nvm.ops.validate(hw); } static inline s32 e1000e_update_nvm_checksum(struct e1000_hw *hw) { return hw->nvm.ops.update(hw); } static inline s32 e1000_read_nvm(struct e1000_hw *hw, u16 offset, u16 words, u16 *data) { return hw->nvm.ops.read(hw, offset, words, data); } static inline s32 e1000_write_nvm(struct e1000_hw *hw, u16 offset, u16 words, u16 *data) { return hw->nvm.ops.write(hw, offset, words, data); } static inline s32 e1000_get_phy_info(struct e1000_hw *hw) { return hw->phy.ops.get_info(hw); } static inline u32 __er32(struct e1000_hw *hw, unsigned long reg) { return readl(hw->hw_addr + reg); } #define er32(reg) __er32(hw, E1000_##reg) void __ew32(struct e1000_hw *hw, unsigned long reg, u32 val); #define ew32(reg, val) __ew32(hw, E1000_##reg, (val)) #define e1e_flush() er32(STATUS) #define E1000_WRITE_REG_ARRAY(a, reg, offset, value) \ (__ew32((a), (reg + ((offset) << 2)), (value))) #define E1000_READ_REG_ARRAY(a, reg, offset) \ (readl((a)->hw_addr + reg + ((offset) << 2))) #endif /* _E1000_H_ */
3 3 5 1 1 1 5 5 1 2 1 1 1 1 1 1 1 1 1 1 8 9 15 2 2 6 4 3 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __CFG802154_RDEV_OPS #define __CFG802154_RDEV_OPS #include <net/cfg802154.h> #include "core.h" #include "trace.h" static inline struct net_device * rdev_add_virtual_intf_deprecated(struct cfg802154_registered_device *rdev, const char *name, unsigned char name_assign_type, int type) { return rdev->ops->add_virtual_intf_deprecated(&rdev->wpan_phy, name, name_assign_type, type); } static inline void rdev_del_virtual_intf_deprecated(struct cfg802154_registered_device *rdev, struct net_device *dev) { rdev->ops->del_virtual_intf_deprecated(&rdev->wpan_phy, dev); } static inline int rdev_suspend(struct cfg802154_registered_device *rdev) { int ret; trace_802154_rdev_suspend(&rdev->wpan_phy); ret = rdev->ops->suspend(&rdev->wpan_phy); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_resume(struct cfg802154_registered_device *rdev) { int ret; trace_802154_rdev_resume(&rdev->wpan_phy); ret = rdev->ops->resume(&rdev->wpan_phy); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_add_virtual_intf(struct cfg802154_registered_device *rdev, char *name, unsigned char name_assign_type, enum nl802154_iftype type, __le64 extended_addr) { int ret; trace_802154_rdev_add_virtual_intf(&rdev->wpan_phy, name, type, extended_addr); ret = rdev->ops->add_virtual_intf(&rdev->wpan_phy, name, name_assign_type, type, extended_addr); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_del_virtual_intf(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev) { int ret; trace_802154_rdev_del_virtual_intf(&rdev->wpan_phy, wpan_dev); ret = rdev->ops->del_virtual_intf(&rdev->wpan_phy, wpan_dev); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_set_channel(struct cfg802154_registered_device *rdev, u8 page, u8 channel) { int ret; trace_802154_rdev_set_channel(&rdev->wpan_phy, page, channel); ret = rdev->ops->set_channel(&rdev->wpan_phy, page, channel); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_set_cca_mode(struct cfg802154_registered_device *rdev, const struct wpan_phy_cca *cca) { int ret; trace_802154_rdev_set_cca_mode(&rdev->wpan_phy, cca); ret = rdev->ops->set_cca_mode(&rdev->wpan_phy, cca); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_set_cca_ed_level(struct cfg802154_registered_device *rdev, s32 ed_level) { int ret; trace_802154_rdev_set_cca_ed_level(&rdev->wpan_phy, ed_level); ret = rdev->ops->set_cca_ed_level(&rdev->wpan_phy, ed_level); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_set_tx_power(struct cfg802154_registered_device *rdev, s32 power) { int ret; trace_802154_rdev_set_tx_power(&rdev->wpan_phy, power); ret = rdev->ops->set_tx_power(&rdev->wpan_phy, power); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_set_pan_id(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, __le16 pan_id) { int ret; trace_802154_rdev_set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id); ret = rdev->ops->set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_set_short_addr(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, __le16 short_addr) { int ret; trace_802154_rdev_set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr); ret = rdev->ops->set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_set_backoff_exponent(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, u8 min_be, u8 max_be) { int ret; trace_802154_rdev_set_backoff_exponent(&rdev->wpan_phy, wpan_dev, min_be, max_be); ret = rdev->ops->set_backoff_exponent(&rdev->wpan_phy, wpan_dev, min_be, max_be); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_set_max_csma_backoffs(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, u8 max_csma_backoffs) { int ret; trace_802154_rdev_set_csma_backoffs(&rdev->wpan_phy, wpan_dev, max_csma_backoffs); ret = rdev->ops->set_max_csma_backoffs(&rdev->wpan_phy, wpan_dev, max_csma_backoffs); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_set_max_frame_retries(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, s8 max_frame_retries) { int ret; trace_802154_rdev_set_max_frame_retries(&rdev->wpan_phy, wpan_dev, max_frame_retries); ret = rdev->ops->set_max_frame_retries(&rdev->wpan_phy, wpan_dev, max_frame_retries); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_set_lbt_mode(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, bool mode) { int ret; trace_802154_rdev_set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode); ret = rdev->ops->set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_set_ackreq_default(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, bool ackreq) { int ret; trace_802154_rdev_set_ackreq_default(&rdev->wpan_phy, wpan_dev, ackreq); ret = rdev->ops->set_ackreq_default(&rdev->wpan_phy, wpan_dev, ackreq); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_trigger_scan(struct cfg802154_registered_device *rdev, struct cfg802154_scan_request *request) { int ret; if (!rdev->ops->trigger_scan) return -EOPNOTSUPP; trace_802154_rdev_trigger_scan(&rdev->wpan_phy, request); ret = rdev->ops->trigger_scan(&rdev->wpan_phy, request); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_abort_scan(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev) { int ret; if (!rdev->ops->abort_scan) return -EOPNOTSUPP; trace_802154_rdev_abort_scan(&rdev->wpan_phy, wpan_dev); ret = rdev->ops->abort_scan(&rdev->wpan_phy, wpan_dev); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_send_beacons(struct cfg802154_registered_device *rdev, struct cfg802154_beacon_request *request) { int ret; if (!rdev->ops->send_beacons) return -EOPNOTSUPP; trace_802154_rdev_send_beacons(&rdev->wpan_phy, request); ret = rdev->ops->send_beacons(&rdev->wpan_phy, request); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_stop_beacons(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev) { int ret; if (!rdev->ops->stop_beacons) return -EOPNOTSUPP; trace_802154_rdev_stop_beacons(&rdev->wpan_phy, wpan_dev); ret = rdev->ops->stop_beacons(&rdev->wpan_phy, wpan_dev); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_associate(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, struct ieee802154_addr *coord) { int ret; if (!rdev->ops->associate) return -EOPNOTSUPP; trace_802154_rdev_associate(&rdev->wpan_phy, wpan_dev, coord); ret = rdev->ops->associate(&rdev->wpan_phy, wpan_dev, coord); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_disassociate(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, struct ieee802154_addr *target) { int ret; if (!rdev->ops->disassociate) return -EOPNOTSUPP; trace_802154_rdev_disassociate(&rdev->wpan_phy, wpan_dev, target); ret = rdev->ops->disassociate(&rdev->wpan_phy, wpan_dev, target); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL /* TODO this is already a nl802154, so move into ieee802154 */ static inline void rdev_get_llsec_table(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, struct ieee802154_llsec_table **table) { rdev->ops->get_llsec_table(&rdev->wpan_phy, wpan_dev, table); } static inline void rdev_lock_llsec_table(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev) { rdev->ops->lock_llsec_table(&rdev->wpan_phy, wpan_dev); } static inline void rdev_unlock_llsec_table(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev) { rdev->ops->unlock_llsec_table(&rdev->wpan_phy, wpan_dev); } static inline int rdev_get_llsec_params(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, struct ieee802154_llsec_params *params) { return rdev->ops->get_llsec_params(&rdev->wpan_phy, wpan_dev, params); } static inline int rdev_set_llsec_params(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_params *params, u32 changed) { return rdev->ops->set_llsec_params(&rdev->wpan_phy, wpan_dev, params, changed); } static inline int rdev_add_llsec_key(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_key_id *id, const struct ieee802154_llsec_key *key) { return rdev->ops->add_llsec_key(&rdev->wpan_phy, wpan_dev, id, key); } static inline int rdev_del_llsec_key(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_key_id *id) { return rdev->ops->del_llsec_key(&rdev->wpan_phy, wpan_dev, id); } static inline int rdev_add_seclevel(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_seclevel *sl) { return rdev->ops->add_seclevel(&rdev->wpan_phy, wpan_dev, sl); } static inline int rdev_del_seclevel(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_seclevel *sl) { return rdev->ops->del_seclevel(&rdev->wpan_phy, wpan_dev, sl); } static inline int rdev_add_device(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_device *dev_desc) { return rdev->ops->add_device(&rdev->wpan_phy, wpan_dev, dev_desc); } static inline int rdev_del_device(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, __le64 extended_addr) { return rdev->ops->del_device(&rdev->wpan_phy, wpan_dev, extended_addr); } static inline int rdev_add_devkey(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, __le64 extended_addr, const struct ieee802154_llsec_device_key *devkey) { return rdev->ops->add_devkey(&rdev->wpan_phy, wpan_dev, extended_addr, devkey); } static inline int rdev_del_devkey(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, __le64 extended_addr, const struct ieee802154_llsec_device_key *devkey) { return rdev->ops->del_devkey(&rdev->wpan_phy, wpan_dev, extended_addr, devkey); } #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ #endif /* __CFG802154_RDEV_OPS */
23 23 23 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/seq_file.h> #include <net/ip.h> #include <net/mptcp.h> #include <net/snmp.h> #include <net/net_namespace.h> #include "mib.h" static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPCapableSYNRX", MPTCP_MIB_MPCAPABLEPASSIVE), SNMP_MIB_ITEM("MPCapableSYNTX", MPTCP_MIB_MPCAPABLEACTIVE), SNMP_MIB_ITEM("MPCapableSYNACKRX", MPTCP_MIB_MPCAPABLEACTIVEACK), SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK), SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK), SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK), SNMP_MIB_ITEM("MPCapableSYNTXDrop", MPTCP_MIB_MPCAPABLEACTIVEDROP), SNMP_MIB_ITEM("MPCapableSYNTXDisabled", MPTCP_MIB_MPCAPABLEACTIVEDISABLED), SNMP_MIB_ITEM("MPCapableEndpAttempt", MPTCP_MIB_MPCAPABLEENDPATTEMPT), SNMP_MIB_ITEM("MPFallbackTokenInit", MPTCP_MIB_TOKENFALLBACKINIT), SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS), SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN), SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX), SNMP_MIB_ITEM("MPJoinSynBackupRx", MPTCP_MIB_JOINSYNBACKUPRX), SNMP_MIB_ITEM("MPJoinSynAckRx", MPTCP_MIB_JOINSYNACKRX), SNMP_MIB_ITEM("MPJoinSynAckBackupRx", MPTCP_MIB_JOINSYNACKBACKUPRX), SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC), SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX), SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), SNMP_MIB_ITEM("MPJoinSynTx", MPTCP_MIB_JOINSYNTX), SNMP_MIB_ITEM("MPJoinSynTxCreatSkErr", MPTCP_MIB_JOINSYNTXCREATSKERR), SNMP_MIB_ITEM("MPJoinSynTxBindErr", MPTCP_MIB_JOINSYNTXBINDERR), SNMP_MIB_ITEM("MPJoinSynTxConnectErr", MPTCP_MIB_JOINSYNTXCONNECTERR), SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH), SNMP_MIB_ITEM("DSSCorruptionFallback", MPTCP_MIB_DSSCORRUPTIONFALLBACK), SNMP_MIB_ITEM("DSSCorruptionReset", MPTCP_MIB_DSSCORRUPTIONRESET), SNMP_MIB_ITEM("InfiniteMapTx", MPTCP_MIB_INFINITEMAPTX), SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX), SNMP_MIB_ITEM("DSSNoMatchTCP", MPTCP_MIB_DSSTCPMISMATCH), SNMP_MIB_ITEM("DataCsumErr", MPTCP_MIB_DATACSUMERR), SNMP_MIB_ITEM("OFOQueueTail", MPTCP_MIB_OFOQUEUETAIL), SNMP_MIB_ITEM("OFOQueue", MPTCP_MIB_OFOQUEUE), SNMP_MIB_ITEM("OFOMerge", MPTCP_MIB_OFOMERGE), SNMP_MIB_ITEM("NoDSSInWindow", MPTCP_MIB_NODSSWINDOW), SNMP_MIB_ITEM("DuplicateData", MPTCP_MIB_DUPDATA), SNMP_MIB_ITEM("AddAddr", MPTCP_MIB_ADDADDR), SNMP_MIB_ITEM("AddAddrTx", MPTCP_MIB_ADDADDRTX), SNMP_MIB_ITEM("AddAddrTxDrop", MPTCP_MIB_ADDADDRTXDROP), SNMP_MIB_ITEM("EchoAdd", MPTCP_MIB_ECHOADD), SNMP_MIB_ITEM("EchoAddTx", MPTCP_MIB_ECHOADDTX), SNMP_MIB_ITEM("EchoAddTxDrop", MPTCP_MIB_ECHOADDTXDROP), SNMP_MIB_ITEM("PortAdd", MPTCP_MIB_PORTADD), SNMP_MIB_ITEM("AddAddrDrop", MPTCP_MIB_ADDADDRDROP), SNMP_MIB_ITEM("MPJoinPortSynRx", MPTCP_MIB_JOINPORTSYNRX), SNMP_MIB_ITEM("MPJoinPortSynAckRx", MPTCP_MIB_JOINPORTSYNACKRX), SNMP_MIB_ITEM("MPJoinPortAckRx", MPTCP_MIB_JOINPORTACKRX), SNMP_MIB_ITEM("MismatchPortSynRx", MPTCP_MIB_MISMATCHPORTSYNRX), SNMP_MIB_ITEM("MismatchPortAckRx", MPTCP_MIB_MISMATCHPORTACKRX), SNMP_MIB_ITEM("RmAddr", MPTCP_MIB_RMADDR), SNMP_MIB_ITEM("RmAddrDrop", MPTCP_MIB_RMADDRDROP), SNMP_MIB_ITEM("RmAddrTx", MPTCP_MIB_RMADDRTX), SNMP_MIB_ITEM("RmAddrTxDrop", MPTCP_MIB_RMADDRTXDROP), SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW), SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX), SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX), SNMP_MIB_ITEM("MPFailTx", MPTCP_MIB_MPFAILTX), SNMP_MIB_ITEM("MPFailRx", MPTCP_MIB_MPFAILRX), SNMP_MIB_ITEM("MPFastcloseTx", MPTCP_MIB_MPFASTCLOSETX), SNMP_MIB_ITEM("MPFastcloseRx", MPTCP_MIB_MPFASTCLOSERX), SNMP_MIB_ITEM("MPRstTx", MPTCP_MIB_MPRSTTX), SNMP_MIB_ITEM("MPRstRx", MPTCP_MIB_MPRSTRX), SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED), SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE), SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER), SNMP_MIB_ITEM("SndWndShared", MPTCP_MIB_SNDWNDSHARED), SNMP_MIB_ITEM("RcvWndShared", MPTCP_MIB_RCVWNDSHARED), SNMP_MIB_ITEM("RcvWndConflictUpdate", MPTCP_MIB_RCVWNDCONFLICTUPDATE), SNMP_MIB_ITEM("RcvWndConflict", MPTCP_MIB_RCVWNDCONFLICT), SNMP_MIB_ITEM("MPCurrEstab", MPTCP_MIB_CURRESTAB), SNMP_MIB_ITEM("Blackhole", MPTCP_MIB_BLACKHOLE), SNMP_MIB_SENTINEL }; /* mptcp_mib_alloc - allocate percpu mib counters * * These are allocated when the first mptcp socket is created so * we do not waste percpu memory if mptcp isn't in use. */ bool mptcp_mib_alloc(struct net *net) { struct mptcp_mib __percpu *mib = alloc_percpu(struct mptcp_mib); if (!mib) return false; if (cmpxchg(&net->mib.mptcp_statistics, NULL, mib)) free_percpu(mib); return true; } void mptcp_seq_show(struct seq_file *seq) { unsigned long sum[ARRAY_SIZE(mptcp_snmp_list) - 1]; struct net *net = seq->private; int i; seq_puts(seq, "MPTcpExt:"); for (i = 0; mptcp_snmp_list[i].name; i++) seq_printf(seq, " %s", mptcp_snmp_list[i].name); seq_puts(seq, "\nMPTcpExt:"); memset(sum, 0, sizeof(sum)); if (net->mib.mptcp_statistics) snmp_get_cpu_field_batch(sum, mptcp_snmp_list, net->mib.mptcp_statistics); for (i = 0; mptcp_snmp_list[i].name; i++) seq_printf(seq, " %lu", sum[i]); seq_putc(seq, '\n'); }
24 131 130 45 952 952 951 54 498 496 26 26 26 26 26 26 26 26 26 26 26 25 25 26 26 498 26 90 1 89 90 90 482 356 35 35 35 35 4 465 466 4 465 30 27 24 24 131 129 117 113 250 255 253 252 249 247 245 16 16 16 16 15 15 15 36 25 37 36 37 25 25 2 2 2 2 1 1 1 2 2 2 2 3 1 2 1 3 133 133 8 133 125 124 124 124 124 124 124 124 65 124 36 4 36 32 7 30 31 1 31 3 3 3 3 3 161 161 52 135 161 161 3 161 160 160 132 132 55 52 52 3 3 135 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/cache.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/kernel.h> #include <linux/pid_namespace.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/completion.h> #include <linux/poll.h> #include <linux/printk.h> #include <linux/file.h> #include <linux/limits.h> #include <linux/init.h> #include <linux/module.h> #include <linux/sysctl.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/mount.h> #include <linux/bug.h> #include "internal.h" static void proc_evict_inode(struct inode *inode) { struct ctl_table_header *head; struct proc_inode *ei = PROC_I(inode); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); /* Stop tracking associated processes */ if (ei->pid) proc_pid_evict_inode(ei); head = ei->sysctl; if (head) { RCU_INIT_POINTER(ei->sysctl, NULL); proc_sys_evict_inode(inode, head); } } static struct kmem_cache *proc_inode_cachep __ro_after_init; static struct kmem_cache *pde_opener_cache __ro_after_init; static struct inode *proc_alloc_inode(struct super_block *sb) { struct proc_inode *ei; ei = alloc_inode_sb(sb, proc_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->pid = NULL; ei->fd = 0; ei->op.proc_get_link = NULL; ei->pde = NULL; ei->sysctl = NULL; ei->sysctl_entry = NULL; INIT_HLIST_NODE(&ei->sibling_inodes); ei->ns_ops = NULL; return &ei->vfs_inode; } static void proc_free_inode(struct inode *inode) { struct proc_inode *ei = PROC_I(inode); if (ei->pid) put_pid(ei->pid); /* Let go of any associated proc directory entry */ if (ei->pde) pde_put(ei->pde); kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } static void init_once(void *foo) { struct proc_inode *ei = (struct proc_inode *) foo; inode_init_once(&ei->vfs_inode); } void __init proc_init_kmemcache(void) { proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT| SLAB_PANIC), init_once); pde_opener_cache = kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0, SLAB_ACCOUNT|SLAB_PANIC, NULL); proc_dir_entry_cache = kmem_cache_create_usercopy( "proc_dir_entry", SIZEOF_PDE, 0, SLAB_PANIC, offsetof(struct proc_dir_entry, inline_name), SIZEOF_PDE_INLINE_NAME, NULL); BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE); } void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock) { struct hlist_node *node; struct super_block *old_sb = NULL; rcu_read_lock(); while ((node = hlist_first_rcu(inodes))) { struct proc_inode *ei = hlist_entry(node, struct proc_inode, sibling_inodes); struct super_block *sb; struct inode *inode; spin_lock(lock); hlist_del_init_rcu(&ei->sibling_inodes); spin_unlock(lock); inode = &ei->vfs_inode; sb = inode->i_sb; if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active)) continue; inode = igrab(inode); rcu_read_unlock(); if (sb != old_sb) { if (old_sb) deactivate_super(old_sb); old_sb = sb; } if (unlikely(!inode)) { rcu_read_lock(); continue; } if (S_ISDIR(inode->i_mode)) { struct dentry *dir = d_find_any_alias(inode); if (dir) { d_invalidate(dir); dput(dir); } } else { struct dentry *dentry; while ((dentry = d_find_alias(inode))) { d_invalidate(dentry); dput(dentry); } } iput(inode); rcu_read_lock(); } rcu_read_unlock(); if (old_sb) deactivate_super(old_sb); } static inline const char *hidepid2str(enum proc_hidepid v) { switch (v) { case HIDEPID_OFF: return "off"; case HIDEPID_NO_ACCESS: return "noaccess"; case HIDEPID_INVISIBLE: return "invisible"; case HIDEPID_NOT_PTRACEABLE: return "ptraceable"; } WARN_ONCE(1, "bad hide_pid value: %d\n", v); return "unknown"; } static int proc_show_options(struct seq_file *seq, struct dentry *root) { struct proc_fs_info *fs_info = proc_sb_info(root->d_sb); if (!gid_eq(fs_info->pid_gid, GLOBAL_ROOT_GID)) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, fs_info->pid_gid)); if (fs_info->hide_pid != HIDEPID_OFF) seq_printf(seq, ",hidepid=%s", hidepid2str(fs_info->hide_pid)); if (fs_info->pidonly != PROC_PIDONLY_OFF) seq_printf(seq, ",subset=pid"); return 0; } const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .free_inode = proc_free_inode, .drop_inode = generic_delete_inode, .evict_inode = proc_evict_inode, .statfs = simple_statfs, .show_options = proc_show_options, }; enum {BIAS = -1U<<31}; static inline int use_pde(struct proc_dir_entry *pde) { return likely(atomic_inc_unless_negative(&pde->in_use)); } static void unuse_pde(struct proc_dir_entry *pde) { if (unlikely(atomic_dec_return(&pde->in_use) == BIAS)) complete(pde->pde_unload_completion); } /* * At most 2 contexts can enter this function: the one doing the last * close on the descriptor and whoever is deleting PDE itself. * * First to enter calls ->proc_release hook and signals its completion * to the second one which waits and then does nothing. * * PDE is locked on entry, unlocked on exit. */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) __releases(&pde->pde_unload_lock) { /* * close() (proc_reg_release()) can't delete an entry and proceed: * ->release hook needs to be available at the right moment. * * rmmod (remove_proc_entry() et al) can't delete an entry and proceed: * "struct file" needs to be available at the right moment. */ if (pdeo->closing) { /* somebody else is doing that, just wait */ DECLARE_COMPLETION_ONSTACK(c); pdeo->c = &c; spin_unlock(&pde->pde_unload_lock); wait_for_completion(&c); } else { struct file *file; struct completion *c; pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); file = pdeo->file; pde->proc_ops->proc_release(file_inode(file), file); spin_lock(&pde->pde_unload_lock); /* Strictly after ->proc_release, see above. */ list_del(&pdeo->lh); c = pdeo->c; spin_unlock(&pde->pde_unload_lock); if (unlikely(c)) complete(c); kmem_cache_free(pde_opener_cache, pdeo); } } void proc_entry_rundown(struct proc_dir_entry *de) { DECLARE_COMPLETION_ONSTACK(c); /* Wait until all existing callers into module are done. */ de->pde_unload_completion = &c; if (atomic_add_return(BIAS, &de->in_use) != BIAS) wait_for_completion(&c); /* ->pde_openers list can't grow from now on. */ spin_lock(&de->pde_unload_lock); while (!list_empty(&de->pde_openers)) { struct pde_opener *pdeo; pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); close_pdeo(de, pdeo); spin_lock(&de->pde_unload_lock); } spin_unlock(&de->pde_unload_lock); } static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) { struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; if (pde_is_permanent(pde)) { return pde->proc_ops->proc_lseek(file, offset, whence); } else if (use_pde(pde)) { rv = pde->proc_ops->proc_lseek(file, offset, whence); unuse_pde(pde); } return rv; } static ssize_t proc_reg_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct proc_dir_entry *pde = PDE(file_inode(iocb->ki_filp)); ssize_t ret; if (pde_is_permanent(pde)) return pde->proc_ops->proc_read_iter(iocb, iter); if (!use_pde(pde)) return -EIO; ret = pde->proc_ops->proc_read_iter(iocb, iter); unuse_pde(pde); return ret; } static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos) { __auto_type read = pde->proc_ops->proc_read; if (read) return read(file, buf, count, ppos); return -EIO; } static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; if (pde_is_permanent(pde)) { return pde_read(pde, file, buf, count, ppos); } else if (use_pde(pde)) { rv = pde_read(pde, file, buf, count, ppos); unuse_pde(pde); } return rv; } static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos) { __auto_type write = pde->proc_ops->proc_write; if (write) return write(file, buf, count, ppos); return -EIO; } static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; if (pde_is_permanent(pde)) { return pde_write(pde, file, buf, count, ppos); } else if (use_pde(pde)) { rv = pde_write(pde, file, buf, count, ppos); unuse_pde(pde); } return rv; } static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts) { __auto_type poll = pde->proc_ops->proc_poll; if (poll) return poll(file, pts); return DEFAULT_POLLMASK; } static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts) { struct proc_dir_entry *pde = PDE(file_inode(file)); __poll_t rv = DEFAULT_POLLMASK; if (pde_is_permanent(pde)) { return pde_poll(pde, file, pts); } else if (use_pde(pde)) { rv = pde_poll(pde, file, pts); unuse_pde(pde); } return rv; } static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) { __auto_type ioctl = pde->proc_ops->proc_ioctl; if (ioctl) return ioctl(file, cmd, arg); return -ENOTTY; } static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; if (pde_is_permanent(pde)) { return pde_ioctl(pde, file, cmd, arg); } else if (use_pde(pde)) { rv = pde_ioctl(pde, file, cmd, arg); unuse_pde(pde); } return rv; } #ifdef CONFIG_COMPAT static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) { __auto_type compat_ioctl = pde->proc_ops->proc_compat_ioctl; if (compat_ioctl) return compat_ioctl(file, cmd, arg); return -ENOTTY; } static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; if (pde_is_permanent(pde)) { return pde_compat_ioctl(pde, file, cmd, arg); } else if (use_pde(pde)) { rv = pde_compat_ioctl(pde, file, cmd, arg); unuse_pde(pde); } return rv; } #endif static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma) { __auto_type mmap = pde->proc_ops->proc_mmap; if (mmap) return mmap(file, vma); return -EIO; } static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) { struct proc_dir_entry *pde = PDE(file_inode(file)); int rv = -EIO; if (pde_is_permanent(pde)) { return pde_mmap(pde, file, vma); } else if (use_pde(pde)) { rv = pde_mmap(pde, file, vma); unuse_pde(pde); } return rv; } static unsigned long pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { if (pde->proc_ops->proc_get_unmapped_area) return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags); #ifdef CONFIG_MMU return mm_get_unmapped_area(current->mm, file, orig_addr, len, pgoff, flags); #endif return orig_addr; } static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct proc_dir_entry *pde = PDE(file_inode(file)); unsigned long rv = -EIO; if (pde_is_permanent(pde)) { return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); } else if (use_pde(pde)) { rv = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); unuse_pde(pde); } return rv; } static int proc_reg_open(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); int rv = 0; typeof_member(struct proc_ops, proc_open) open; struct pde_opener *pdeo; if (!pde->proc_ops->proc_lseek) file->f_mode &= ~FMODE_LSEEK; if (pde_is_permanent(pde)) { open = pde->proc_ops->proc_open; if (open) rv = open(inode, file); return rv; } /* * Ensure that * 1) PDE's ->release hook will be called no matter what * either normally by close()/->release, or forcefully by * rmmod/remove_proc_entry. * * 2) rmmod isn't blocked by opening file in /proc and sitting on * the descriptor (including "rmmod foo </proc/foo" scenario). * * Save every "struct file" with custom ->release hook. */ if (!use_pde(pde)) return -ENOENT; __auto_type release = pde->proc_ops->proc_release; if (release) { pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL); if (!pdeo) { rv = -ENOMEM; goto out_unuse; } } open = pde->proc_ops->proc_open; if (open) rv = open(inode, file); if (release) { if (rv == 0) { /* To know what to release. */ pdeo->file = file; pdeo->closing = false; pdeo->c = NULL; spin_lock(&pde->pde_unload_lock); list_add(&pdeo->lh, &pde->pde_openers); spin_unlock(&pde->pde_unload_lock); } else kmem_cache_free(pde_opener_cache, pdeo); } out_unuse: unuse_pde(pde); return rv; } static int proc_reg_release(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); struct pde_opener *pdeo; if (pde_is_permanent(pde)) { __auto_type release = pde->proc_ops->proc_release; if (release) { return release(inode, file); } return 0; } spin_lock(&pde->pde_unload_lock); list_for_each_entry(pdeo, &pde->pde_openers, lh) { if (pdeo->file == file) { close_pdeo(pde, pdeo); return 0; } } spin_unlock(&pde->pde_unload_lock); return 0; } static const struct file_operations proc_reg_file_ops = { .llseek = proc_reg_llseek, .read = proc_reg_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; static const struct file_operations proc_iter_file_ops = { .llseek = proc_reg_llseek, .read_iter = proc_reg_read_iter, .write = proc_reg_write, .splice_read = copy_splice_read, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; #ifdef CONFIG_COMPAT static const struct file_operations proc_reg_file_ops_compat = { .llseek = proc_reg_llseek, .read = proc_reg_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .compat_ioctl = proc_reg_compat_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; static const struct file_operations proc_iter_file_ops_compat = { .llseek = proc_reg_llseek, .read_iter = proc_reg_read_iter, .splice_read = copy_splice_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .compat_ioctl = proc_reg_compat_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; #endif static void proc_put_link(void *p) { unuse_pde(p); } static const char *proc_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct proc_dir_entry *pde = PDE(inode); if (!use_pde(pde)) return ERR_PTR(-EINVAL); set_delayed_call(done, proc_put_link, pde); return pde->data; } const struct inode_operations proc_link_inode_operations = { .get_link = proc_get_link, }; struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) { struct inode *inode = new_inode(sb); if (!inode) { pde_put(de); return NULL; } inode->i_private = de->data; inode->i_ino = de->low_ino; simple_inode_init_ts(inode); PROC_I(inode)->pde = de; if (is_empty_pde(de)) { make_empty_dir_inode(inode); return inode; } if (de->mode) { inode->i_mode = de->mode; inode->i_uid = de->uid; inode->i_gid = de->gid; } if (de->size) inode->i_size = de->size; if (de->nlink) set_nlink(inode, de->nlink); if (S_ISREG(inode->i_mode)) { inode->i_op = de->proc_iops; if (de->proc_ops->proc_read_iter) inode->i_fop = &proc_iter_file_ops; else inode->i_fop = &proc_reg_file_ops; #ifdef CONFIG_COMPAT if (de->proc_ops->proc_compat_ioctl) { if (de->proc_ops->proc_read_iter) inode->i_fop = &proc_iter_file_ops_compat; else inode->i_fop = &proc_reg_file_ops_compat; } #endif } else if (S_ISDIR(inode->i_mode)) { inode->i_op = de->proc_iops; inode->i_fop = de->proc_dir_ops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = de->proc_iops; inode->i_fop = NULL; } else { BUG(); } return inode; }
81 75 17 14 78 78 75 70 70 70 75 67 92 99 92 71 72 71 71 13 9 7 4 4 7 5 4 3 1 5 5 2 8 3 2 2 2 3 138 138 11 135 135 135 133 133 135 3 27 10 8 10 3 8 7 6 2 8 27 4 2 2 2 4 4 8 4 6 8 6 132 124 124 124 124 124 13 112 124 124 124 124 132 21 3 3 39 3 2 3 2 1 28 35 8 7 6 1 5 4 1 8 5 194 10 2 4 6 21 2 158 125 26 26 112 45 41 7 45 45 1 114 114 114 114 2 112 112 112 112 112 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * * Added support for a Unix98-style ptmx device. * -- C. Scott Ananian <cananian@alumni.princeton.edu>, 14-Jan-1998 * */ #include <linux/module.h> #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/tty.h> #include <linux/tty_flip.h> #include <linux/fcntl.h> #include <linux/sched/signal.h> #include <linux/string.h> #include <linux/major.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/device.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/devpts_fs.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/poll.h> #include <linux/mount.h> #include <linux/file.h> #include <linux/ioctl.h> #include <linux/compat.h> #include "tty.h" #undef TTY_DEBUG_HANGUP #ifdef TTY_DEBUG_HANGUP # define tty_debug_hangup(tty, f, args...) tty_debug(tty, f, ##args) #else # define tty_debug_hangup(tty, f, args...) do {} while (0) #endif #ifdef CONFIG_UNIX98_PTYS static struct tty_driver *ptm_driver; static struct tty_driver *pts_driver; static DEFINE_MUTEX(devpts_mutex); #endif static void pty_close(struct tty_struct *tty, struct file *filp) { if (tty->driver->subtype == PTY_TYPE_MASTER) WARN_ON(tty->count > 1); else { if (tty_io_error(tty)) return; if (tty->count > 2) return; } set_bit(TTY_IO_ERROR, &tty->flags); wake_up_interruptible(&tty->read_wait); wake_up_interruptible(&tty->write_wait); spin_lock_irq(&tty->ctrl.lock); tty->ctrl.packet = false; spin_unlock_irq(&tty->ctrl.lock); /* Review - krefs on tty_link ?? */ if (!tty->link) return; set_bit(TTY_OTHER_CLOSED, &tty->link->flags); wake_up_interruptible(&tty->link->read_wait); wake_up_interruptible(&tty->link->write_wait); if (tty->driver->subtype == PTY_TYPE_MASTER) { set_bit(TTY_OTHER_CLOSED, &tty->flags); #ifdef CONFIG_UNIX98_PTYS if (tty->driver == ptm_driver) { mutex_lock(&devpts_mutex); if (tty->link->driver_data) devpts_pty_kill(tty->link->driver_data); mutex_unlock(&devpts_mutex); } #endif tty_vhangup(tty->link); } } /* * The unthrottle routine is called by the line discipline to signal * that it can receive more characters. For PTY's, the TTY_THROTTLED * flag is always set, to force the line discipline to always call the * unthrottle routine when there are fewer than TTY_THRESHOLD_UNTHROTTLE * characters in the queue. This is necessary since each time this * happens, we need to wake up any sleeping processes that could be * (1) trying to send data to the pty, or (2) waiting in wait_until_sent() * for the pty buffer to be drained. */ static void pty_unthrottle(struct tty_struct *tty) { tty_wakeup(tty->link); set_bit(TTY_THROTTLED, &tty->flags); } /** * pty_write - write to a pty * @tty: the tty we write from * @buf: kernel buffer of data * @c: bytes to write * * Our "hardware" write method. Data is coming from the ldisc which * may be in a non sleeping state. We simply throw this at the other * end of the link as if we were an IRQ handler receiving stuff for * the other side of the pty/tty pair. */ static ssize_t pty_write(struct tty_struct *tty, const u8 *buf, size_t c) { struct tty_struct *to = tty->link; if (tty->flow.stopped || !c) return 0; return tty_insert_flip_string_and_push_buffer(to->port, buf, c); } /** * pty_write_room - write space * @tty: tty we are writing from * * Report how many bytes the ldisc can send into the queue for * the other device. */ static unsigned int pty_write_room(struct tty_struct *tty) { if (tty->flow.stopped) return 0; return tty_buffer_space_avail(tty->link->port); } /* Set the lock flag on a pty */ static int pty_set_lock(struct tty_struct *tty, int __user *arg) { int val; if (get_user(val, arg)) return -EFAULT; if (val) set_bit(TTY_PTY_LOCK, &tty->flags); else clear_bit(TTY_PTY_LOCK, &tty->flags); return 0; } static int pty_get_lock(struct tty_struct *tty, int __user *arg) { int locked = test_bit(TTY_PTY_LOCK, &tty->flags); return put_user(locked, arg); } /* Set the packet mode on a pty */ static int pty_set_pktmode(struct tty_struct *tty, int __user *arg) { int pktmode; if (get_user(pktmode, arg)) return -EFAULT; spin_lock_irq(&tty->ctrl.lock); if (pktmode) { if (!tty->ctrl.packet) { tty->link->ctrl.pktstatus = 0; smp_mb(); tty->ctrl.packet = true; } } else tty->ctrl.packet = false; spin_unlock_irq(&tty->ctrl.lock); return 0; } /* Get the packet mode of a pty */ static int pty_get_pktmode(struct tty_struct *tty, int __user *arg) { int pktmode = tty->ctrl.packet; return put_user(pktmode, arg); } /* Send a signal to the slave */ static int pty_signal(struct tty_struct *tty, int sig) { struct pid *pgrp; if (sig != SIGINT && sig != SIGQUIT && sig != SIGTSTP) return -EINVAL; if (tty->link) { pgrp = tty_get_pgrp(tty->link); if (pgrp) kill_pgrp(pgrp, sig, 1); put_pid(pgrp); } return 0; } static void pty_flush_buffer(struct tty_struct *tty) { struct tty_struct *to = tty->link; if (!to) return; tty_buffer_flush(to, NULL); if (to->ctrl.packet) { spin_lock_irq(&tty->ctrl.lock); tty->ctrl.pktstatus |= TIOCPKT_FLUSHWRITE; wake_up_interruptible(&to->read_wait); spin_unlock_irq(&tty->ctrl.lock); } } static int pty_open(struct tty_struct *tty, struct file *filp) { if (!tty || !tty->link) return -ENODEV; if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) goto out; if (test_bit(TTY_PTY_LOCK, &tty->link->flags)) goto out; if (tty->driver->subtype == PTY_TYPE_SLAVE && tty->link->count != 1) goto out; clear_bit(TTY_IO_ERROR, &tty->flags); clear_bit(TTY_OTHER_CLOSED, &tty->link->flags); set_bit(TTY_THROTTLED, &tty->flags); return 0; out: set_bit(TTY_IO_ERROR, &tty->flags); return -EIO; } static void pty_set_termios(struct tty_struct *tty, const struct ktermios *old_termios) { /* See if packet mode change of state. */ if (tty->link && tty->link->ctrl.packet) { int extproc = (old_termios->c_lflag & EXTPROC) | L_EXTPROC(tty); int old_flow = ((old_termios->c_iflag & IXON) && (old_termios->c_cc[VSTOP] == '\023') && (old_termios->c_cc[VSTART] == '\021')); int new_flow = (I_IXON(tty) && STOP_CHAR(tty) == '\023' && START_CHAR(tty) == '\021'); if ((old_flow != new_flow) || extproc) { spin_lock_irq(&tty->ctrl.lock); if (old_flow != new_flow) { tty->ctrl.pktstatus &= ~(TIOCPKT_DOSTOP | TIOCPKT_NOSTOP); if (new_flow) tty->ctrl.pktstatus |= TIOCPKT_DOSTOP; else tty->ctrl.pktstatus |= TIOCPKT_NOSTOP; } if (extproc) tty->ctrl.pktstatus |= TIOCPKT_IOCTL; spin_unlock_irq(&tty->ctrl.lock); wake_up_interruptible(&tty->link->read_wait); } } tty->termios.c_cflag &= ~(CSIZE | PARENB); tty->termios.c_cflag |= (CS8 | CREAD); } /** * pty_resize - resize event * @tty: tty being resized * @ws: window size being set. * * Update the termios variables and send the necessary signals to * peform a terminal resize correctly */ static int pty_resize(struct tty_struct *tty, struct winsize *ws) { struct pid *pgrp, *rpgrp; struct tty_struct *pty = tty->link; /* For a PTY we need to lock the tty side */ mutex_lock(&tty->winsize_mutex); if (!memcmp(ws, &tty->winsize, sizeof(*ws))) goto done; /* Signal the foreground process group of both ptys */ pgrp = tty_get_pgrp(tty); rpgrp = tty_get_pgrp(pty); if (pgrp) kill_pgrp(pgrp, SIGWINCH, 1); if (rpgrp != pgrp && rpgrp) kill_pgrp(rpgrp, SIGWINCH, 1); put_pid(pgrp); put_pid(rpgrp); tty->winsize = *ws; pty->winsize = *ws; /* Never used so will go away soon */ done: mutex_unlock(&tty->winsize_mutex); return 0; } /** * pty_start - start() handler * pty_stop - stop() handler * @tty: tty being flow-controlled * * Propagates the TIOCPKT status to the master pty. * * NB: only the master pty can be in packet mode so only the slave * needs start()/stop() handlers */ static void pty_start(struct tty_struct *tty) { unsigned long flags; if (tty->link && tty->link->ctrl.packet) { spin_lock_irqsave(&tty->ctrl.lock, flags); tty->ctrl.pktstatus &= ~TIOCPKT_STOP; tty->ctrl.pktstatus |= TIOCPKT_START; spin_unlock_irqrestore(&tty->ctrl.lock, flags); wake_up_interruptible_poll(&tty->link->read_wait, EPOLLIN); } } static void pty_stop(struct tty_struct *tty) { unsigned long flags; if (tty->link && tty->link->ctrl.packet) { spin_lock_irqsave(&tty->ctrl.lock, flags); tty->ctrl.pktstatus &= ~TIOCPKT_START; tty->ctrl.pktstatus |= TIOCPKT_STOP; spin_unlock_irqrestore(&tty->ctrl.lock, flags); wake_up_interruptible_poll(&tty->link->read_wait, EPOLLIN); } } /** * pty_common_install - set up the pty pair * @driver: the pty driver * @tty: the tty being instantiated * @legacy: true if this is BSD style * * Perform the initial set up for the tty/pty pair. Called from the * tty layer when the port is first opened. * * Locking: the caller must hold the tty_mutex */ static int pty_common_install(struct tty_driver *driver, struct tty_struct *tty, bool legacy) { struct tty_struct *o_tty; struct tty_port *ports[2]; int idx = tty->index; int retval = -ENOMEM; /* Opening the slave first has always returned -EIO */ if (driver->subtype != PTY_TYPE_MASTER) return -EIO; ports[0] = kmalloc(sizeof **ports, GFP_KERNEL); ports[1] = kmalloc(sizeof **ports, GFP_KERNEL); if (!ports[0] || !ports[1]) goto err; if (!try_module_get(driver->other->owner)) { /* This cannot in fact currently happen */ goto err; } o_tty = alloc_tty_struct(driver->other, idx); if (!o_tty) goto err_put_module; tty_set_lock_subclass(o_tty); lockdep_set_subclass(&o_tty->termios_rwsem, TTY_LOCK_SLAVE); if (legacy) { /* We always use new tty termios data so we can do this the easy way .. */ tty_init_termios(tty); tty_init_termios(o_tty); driver->other->ttys[idx] = o_tty; driver->ttys[idx] = tty; } else { memset(&tty->termios_locked, 0, sizeof(tty->termios_locked)); tty->termios = driver->init_termios; memset(&o_tty->termios_locked, 0, sizeof(tty->termios_locked)); o_tty->termios = driver->other->init_termios; } /* * Everything allocated ... set up the o_tty structure. */ tty_driver_kref_get(driver->other); /* Establish the links in both directions */ tty->link = o_tty; o_tty->link = tty; tty_port_init(ports[0]); tty_port_init(ports[1]); tty_buffer_set_limit(ports[0], 8192); tty_buffer_set_limit(ports[1], 8192); o_tty->port = ports[0]; tty->port = ports[1]; o_tty->port->itty = o_tty; tty_buffer_set_lock_subclass(o_tty->port); tty_driver_kref_get(driver); tty->count++; o_tty->count++; return 0; err_put_module: module_put(driver->other->owner); err: kfree(ports[0]); kfree(ports[1]); return retval; } static void pty_cleanup(struct tty_struct *tty) { tty_port_put(tty->port); } /* Traditional BSD devices */ #ifdef CONFIG_LEGACY_PTYS static int pty_install(struct tty_driver *driver, struct tty_struct *tty) { return pty_common_install(driver, tty, true); } static void pty_remove(struct tty_driver *driver, struct tty_struct *tty) { struct tty_struct *pair = tty->link; driver->ttys[tty->index] = NULL; if (pair) pair->driver->ttys[pair->index] = NULL; } static int pty_bsd_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { switch (cmd) { case TIOCSPTLCK: /* Set PT Lock (disallow slave open) */ return pty_set_lock(tty, (int __user *) arg); case TIOCGPTLCK: /* Get PT Lock status */ return pty_get_lock(tty, (int __user *)arg); case TIOCPKT: /* Set PT packet mode */ return pty_set_pktmode(tty, (int __user *)arg); case TIOCGPKT: /* Get PT packet mode */ return pty_get_pktmode(tty, (int __user *)arg); case TIOCSIG: /* Send signal to other side of pty */ return pty_signal(tty, (int) arg); case TIOCGPTN: /* TTY returns ENOTTY, but glibc expects EINVAL here */ return -EINVAL; } return -ENOIOCTLCMD; } #ifdef CONFIG_COMPAT static long pty_bsd_compat_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { /* * PTY ioctls don't require any special translation between 32-bit and * 64-bit userspace, they are already compatible. */ return pty_bsd_ioctl(tty, cmd, (unsigned long)compat_ptr(arg)); } #else #define pty_bsd_compat_ioctl NULL #endif static int legacy_count = CONFIG_LEGACY_PTY_COUNT; /* * not really modular, but the easiest way to keep compat with existing * bootargs behaviour is to continue using module_param here. */ module_param(legacy_count, int, 0); /* * The master side of a pty can do TIOCSPTLCK and thus * has pty_bsd_ioctl. */ static const struct tty_operations master_pty_ops_bsd = { .install = pty_install, .open = pty_open, .close = pty_close, .write = pty_write, .write_room = pty_write_room, .flush_buffer = pty_flush_buffer, .unthrottle = pty_unthrottle, .ioctl = pty_bsd_ioctl, .compat_ioctl = pty_bsd_compat_ioctl, .cleanup = pty_cleanup, .resize = pty_resize, .remove = pty_remove }; static const struct tty_operations slave_pty_ops_bsd = { .install = pty_install, .open = pty_open, .close = pty_close, .write = pty_write, .write_room = pty_write_room, .flush_buffer = pty_flush_buffer, .unthrottle = pty_unthrottle, .set_termios = pty_set_termios, .cleanup = pty_cleanup, .resize = pty_resize, .start = pty_start, .stop = pty_stop, .remove = pty_remove }; static void __init legacy_pty_init(void) { struct tty_driver *pty_driver, *pty_slave_driver; if (legacy_count <= 0) return; pty_driver = tty_alloc_driver(legacy_count, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_ALLOC); if (IS_ERR(pty_driver)) panic("Couldn't allocate pty driver"); pty_slave_driver = tty_alloc_driver(legacy_count, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_ALLOC); if (IS_ERR(pty_slave_driver)) panic("Couldn't allocate pty slave driver"); pty_driver->driver_name = "pty_master"; pty_driver->name = "pty"; pty_driver->major = PTY_MASTER_MAJOR; pty_driver->minor_start = 0; pty_driver->type = TTY_DRIVER_TYPE_PTY; pty_driver->subtype = PTY_TYPE_MASTER; pty_driver->init_termios = tty_std_termios; pty_driver->init_termios.c_iflag = 0; pty_driver->init_termios.c_oflag = 0; pty_driver->init_termios.c_cflag = B38400 | CS8 | CREAD; pty_driver->init_termios.c_lflag = 0; pty_driver->init_termios.c_ispeed = 38400; pty_driver->init_termios.c_ospeed = 38400; pty_driver->other = pty_slave_driver; tty_set_operations(pty_driver, &master_pty_ops_bsd); pty_slave_driver->driver_name = "pty_slave"; pty_slave_driver->name = "ttyp"; pty_slave_driver->major = PTY_SLAVE_MAJOR; pty_slave_driver->minor_start = 0; pty_slave_driver->type = TTY_DRIVER_TYPE_PTY; pty_slave_driver->subtype = PTY_TYPE_SLAVE; pty_slave_driver->init_termios = tty_std_termios; pty_slave_driver->init_termios.c_cflag = B38400 | CS8 | CREAD; pty_slave_driver->init_termios.c_ispeed = 38400; pty_slave_driver->init_termios.c_ospeed = 38400; pty_slave_driver->other = pty_driver; tty_set_operations(pty_slave_driver, &slave_pty_ops_bsd); if (tty_register_driver(pty_driver)) panic("Couldn't register pty driver"); if (tty_register_driver(pty_slave_driver)) panic("Couldn't register pty slave driver"); } #else static inline void legacy_pty_init(void) { } #endif /* Unix98 devices */ #ifdef CONFIG_UNIX98_PTYS static struct cdev ptmx_cdev; /** * ptm_open_peer - open the peer of a pty * @master: the open struct file of the ptmx device node * @tty: the master of the pty being opened * @flags: the flags for open * * Provide a race free way for userspace to open the slave end of a pty * (where they have the master fd and cannot access or trust the mount * namespace /dev/