Total coverage: 147533 (8%) of 1853345
net/ipv6/fib6_notifier.c:

#include <linux/notifier.h>
#include <linux/socket.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <net/net_namespace.h>
#include <net/fib_notifier.h>
#include <net/netns/ipv6.h>
#include <net/ip6_fib.h>

int call_fib6_notifier(struct notifier_block *nb,
		       enum fib_event_type event_type,
		       struct fib_notifier_info *info)
{
	info->family = AF_INET6;
	return call_fib_notifier(nb, event_type, info);
}

int call_fib6_notifiers(struct net *net, enum fib_event_type event_type,
			struct fib_notifier_info *info)
{
	info->family = AF_INET6;
	return call_fib_notifiers(net, event_type, info);
}

static unsigned int fib6_seq_read(const struct net *net)
{
	return fib6_tables_seq_read(net) + fib6_rules_seq_read(net);
}

static int fib6_dump(struct net *net, struct notifier_block *nb,
		     struct netlink_ext_ack *extack)
{
	int err;

	err = fib6_rules_dump(net, nb, extack);
	if (err)
		return err;

	return fib6_tables_dump(net, nb, extack);
}

static const struct fib_notifier_ops fib6_notifier_ops_template = {
	.family		= AF_INET6,
	.fib_seq_read	= fib6_seq_read,
	.fib_dump	= fib6_dump,
	.owner		= THIS_MODULE,
};

int __net_init fib6_notifier_init(struct net *net)
{
	struct fib_notifier_ops *ops;

	ops = fib_notifier_ops_register(&fib6_notifier_ops_template, net);
	if (IS_ERR(ops))
		return PTR_ERR(ops);
	net->ipv6.notifier_ops = ops;

	return 0;
}

void __net_exit fib6_notifier_exit(struct net *net)
{
	fib_notifier_ops_unregister(net->ipv6.notifier_ops);
}
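These wrappers exist only to stamp info->family before dispatching, so one generic FIB notifier callback can serve several address families. A minimal sketch of the receiving side; the callback name and event handling are hypothetical, and the FIB_EVENT_* values come from enum fib_event_type in <net/fib_notifier.h>:

/* Hypothetical consumer: runs for every FIB event, filters on family. */
static int my_fib6_event(struct notifier_block *nb, unsigned long event,
			 void *ptr)
{
	struct fib_notifier_info *info = ptr;	/* stamped by the wrappers above */

	if (info->family != AF_INET6)		/* only care about IPv6 here */
		return NOTIFY_DONE;

	switch (event) {
	case FIB_EVENT_ENTRY_ADD:
	case FIB_EVENT_ENTRY_DEL:
		/* react to IPv6 route table changes */
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}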
include/net/arp.h:

/* SPDX-License-Identifier: GPL-2.0 */
/* linux/net/inet/arp.h */
#ifndef _ARP_H
#define _ARP_H

#include <linux/if_arp.h>
#include <linux/hash.h>
#include <net/neighbour.h>

extern struct neigh_table arp_tbl;

static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev,
			     u32 *hash_rnd)
{
	u32 key = *(const u32 *)pkey;
	u32 val = key ^ hash32_ptr(dev);

	return val * hash_rnd[0];
}

#ifdef CONFIG_INET
static inline
struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key)
{
	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		key = INADDR_ANY;

	return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn,
				     &key, dev);
}
#else
static inline
struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key)
{
	return NULL;
}
#endif

static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev,
						    u32 key)
{
	struct neighbour *n;

	rcu_read_lock();
	n = __ipv4_neigh_lookup_noref(dev, key);
	if (n && !refcount_inc_not_zero(&n->refcnt))
		n = NULL;
	rcu_read_unlock();

	return n;
}

static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key)
{
	struct neighbour *n;

	rcu_read_lock();
	n = __ipv4_neigh_lookup_noref(dev, key);
	neigh_confirm(n);
	rcu_read_unlock();
}

void arp_init(void);
int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg);
void arp_send(int type, int ptype, __be32 dest_ip,
	      struct net_device *dev, __be32 src_ip,
	      const unsigned char *dest_hw,
	      const unsigned char *src_hw, const unsigned char *th);
int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir);
void arp_ifdown(struct net_device *dev);
int arp_invalidate(struct net_device *dev, __be32 ip, bool force);

struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
			   struct net_device *dev, __be32 src_ip,
			   const unsigned char *dest_hw,
			   const unsigned char *src_hw,
			   const unsigned char *target_hw);
void arp_xmit(struct sk_buff *skb);

#endif	/* _ARP_H */
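The _noref variant is only safe under rcu_read_lock(), while __ipv4_neigh_lookup() returns the entry with its refcount elevated (or NULL). A short caller-side sketch, hypothetical and not part of the header, showing that reference discipline:

/* Hypothetical caller: check whether the ARP entry for a next hop is
 * currently valid. The reference taken by __ipv4_neigh_lookup() must
 * be dropped with neigh_release().
 */
static bool gateway_is_reachable(struct net_device *dev, __be32 gw)
{
	struct neighbour *n = __ipv4_neigh_lookup(dev, (__force u32)gw);
	bool ok = n && (READ_ONCE(n->nud_state) & NUD_VALID);

	if (n)
		neigh_release(n);
	return ok;
}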
net/ipv4/netfilter/ipt_rpfilter.c:

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2011 Florian Westphal <fw@strlen.de>
 *
 * based on fib_frontend.c; Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/inet_dscp.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/ip_fib.h>
#include <net/route.h>

#include <linux/netfilter/xt_rpfilter.h>
#include <linux/netfilter/x_tables.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
MODULE_DESCRIPTION("iptables: ipv4 reverse path filter match");

/* don't try to find route from mcast/bcast/zeronet */
static __be32 rpfilter_get_saddr(__be32 addr)
{
	if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) ||
	    ipv4_is_zeronet(addr))
		return 0;

	return addr;
}

static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4,
				    const struct net_device *dev, u8 flags)
{
	struct fib_result res;

	if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE))
		return false;

	if (res.type != RTN_UNICAST) {
		if (res.type != RTN_LOCAL || !(flags & XT_RPFILTER_ACCEPT_LOCAL))
			return false;
	}

	return fib_info_nh_uses_dev(res.fi, dev) || flags & XT_RPFILTER_LOOSE;
}

static bool
rpfilter_is_loopback(const struct sk_buff *skb, const struct net_device *in)
{
	return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK;
}

static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
	const struct xt_rpfilter_info *info;
	const struct iphdr *iph;
	struct flowi4 flow;
	bool invert;

	info = par->matchinfo;
	invert = info->flags & XT_RPFILTER_INVERT;

	if (rpfilter_is_loopback(skb, xt_in(par)))
		return true ^ invert;

	iph = ip_hdr(skb);
	if (ipv4_is_zeronet(iph->saddr)) {
		if (ipv4_is_lbcast(iph->daddr) ||
		    ipv4_is_local_multicast(iph->daddr))
			return true ^ invert;
	}

	memset(&flow, 0, sizeof(flow));
	flow.flowi4_iif = LOOPBACK_IFINDEX;
	flow.daddr = iph->saddr;
	flow.saddr = rpfilter_get_saddr(iph->daddr);
	flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
	flow.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph));
	flow.flowi4_scope = RT_SCOPE_UNIVERSE;
	flow.flowi4_l3mdev = l3mdev_master_ifindex_rcu(xt_in(par));
	flow.flowi4_uid = sock_net_uid(xt_net(par), NULL);

	return rpfilter_lookup_reverse(xt_net(par), &flow, xt_in(par),
				       info->flags) ^ invert;
}

static int rpfilter_check(const struct xt_mtchk_param *par)
{
	const struct xt_rpfilter_info *info = par->matchinfo;
	unsigned int options = ~XT_RPFILTER_OPTION_MASK;

	if (info->flags & options) {
		pr_info_ratelimited("unknown options\n");
		return -EINVAL;
	}

	if (strcmp(par->table, "mangle") != 0 &&
	    strcmp(par->table, "raw") != 0) {
		pr_info_ratelimited("only valid in \'raw\' or \'mangle\' table, not \'%s\'\n",
				    par->table);
		return -EINVAL;
	}

	return 0;
}

static struct xt_match rpfilter_mt_reg __read_mostly = {
	.name		= "rpfilter",
	.family		= NFPROTO_IPV4,
	.checkentry	= rpfilter_check,
	.match		= rpfilter_mt,
	.matchsize	= sizeof(struct xt_rpfilter_info),
	.hooks		= (1 << NF_INET_PRE_ROUTING),
	.me		= THIS_MODULE
};

static int __init rpfilter_mt_init(void)
{
	return xt_register_match(&rpfilter_mt_reg);
}

static void __exit rpfilter_mt_exit(void)
{
	xt_unregister_match(&rpfilter_mt_reg);
}

module_init(rpfilter_mt_init);
module_exit(rpfilter_mt_exit);
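The core trick in rpfilter_mt() is that it builds the flow key with the addresses swapped (flow.daddr = iph->saddr) and asks the FIB whether the reverse route would leave through the ingress device; XT_RPFILTER_LOOSE accepts any route regardless of device, and XT_RPFILTER_ACCEPT_LOCAL additionally accepts sources that resolve to a local route. From userspace this match is typically used as a strict reverse-path filter in the raw table, e.g. a rule along the lines of:

	iptables -t raw -A PREROUTING -m rpfilter --invert -j DROP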
include/linux/notifier.h:

/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Routines to manage notifier chains for passing status changes to any
 * interested routines. We need this instead of hard coded call lists so
 * that modules can poke their nose into the innards. The network devices
 * needed them so here they are for the rest of you.
 *
 *				Alan Cox <Alan.Cox@linux.org>
 */
#ifndef _LINUX_NOTIFIER_H
#define _LINUX_NOTIFIER_H
#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/srcu.h>

/*
 * Notifier chains are of four types:
 *
 *	Atomic notifier chains: Chain callbacks run in interrupt/atomic
 *		context. Callouts are not allowed to block.
 *	Blocking notifier chains: Chain callbacks run in process context.
 *		Callouts are allowed to block.
 *	Raw notifier chains: There are no restrictions on callbacks,
 *		registration, or unregistration. All locking and protection
 *		must be provided by the caller.
 *	SRCU notifier chains: A variant of blocking notifier chains, with
 *		the same restrictions.
 *
 * atomic_notifier_chain_register() may be called from an atomic context,
 * but blocking_notifier_chain_register() and srcu_notifier_chain_register()
 * must be called from a process context. Ditto for the corresponding
 * _unregister() routines.
 *
 * atomic_notifier_chain_unregister(), blocking_notifier_chain_unregister(),
 * and srcu_notifier_chain_unregister() _must not_ be called from within
 * the call chain.
 *
 * SRCU notifier chains are an alternative form of blocking notifier chains.
 * They use SRCU (Sleepable Read-Copy Update) instead of rw-semaphores for
 * protection of the chain links. This means there is _very_ low overhead
 * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
 * As compensation, srcu_notifier_chain_unregister() is rather expensive.
 * SRCU notifier chains should be used when the chain will be called very
 * often but notifier_blocks will seldom be removed.
 */

struct notifier_block;

typedef int (*notifier_fn_t)(struct notifier_block *nb,
			     unsigned long action, void *data);

struct notifier_block {
	notifier_fn_t notifier_call;
	struct notifier_block __rcu *next;
	int priority;
};

struct atomic_notifier_head {
	spinlock_t lock;
	struct notifier_block __rcu *head;
};

struct blocking_notifier_head {
	struct rw_semaphore rwsem;
	struct notifier_block __rcu *head;
};

struct raw_notifier_head {
	struct notifier_block __rcu *head;
};

struct srcu_notifier_head {
	struct mutex mutex;
	struct srcu_usage srcuu;
	struct srcu_struct srcu;
	struct notifier_block __rcu *head;
};

#define ATOMIC_INIT_NOTIFIER_HEAD(name) do {	\
		spin_lock_init(&(name)->lock);	\
		(name)->head = NULL;		\
	} while (0)
#define BLOCKING_INIT_NOTIFIER_HEAD(name) do {	\
		init_rwsem(&(name)->rwsem);	\
		(name)->head = NULL;		\
	} while (0)
#define RAW_INIT_NOTIFIER_HEAD(name) do {	\
		(name)->head = NULL;		\
	} while (0)

/* srcu_notifier_heads must be cleaned up dynamically */
extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
#define srcu_cleanup_notifier_head(name)	\
		cleanup_srcu_struct(&(name)->srcu);

#define ATOMIC_NOTIFIER_INIT(name) {				\
		.lock = __SPIN_LOCK_UNLOCKED(name.lock),	\
		.head = NULL }
#define BLOCKING_NOTIFIER_INIT(name) {				\
		.rwsem = __RWSEM_INITIALIZER((name).rwsem),	\
		.head = NULL }
#define RAW_NOTIFIER_INIT(name) {				\
		.head = NULL }

#define SRCU_NOTIFIER_INIT(name, pcpu)				\
	{							\
		.mutex = __MUTEX_INITIALIZER(name.mutex),	\
		.head = NULL,					\
		.srcuu = __SRCU_USAGE_INIT(name.srcuu),		\
		.srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \
	}

#define ATOMIC_NOTIFIER_HEAD(name)				\
	struct atomic_notifier_head name =			\
		ATOMIC_NOTIFIER_INIT(name)
#define BLOCKING_NOTIFIER_HEAD(name)				\
	struct blocking_notifier_head name =			\
		BLOCKING_NOTIFIER_INIT(name)
#define RAW_NOTIFIER_HEAD(name)					\
	struct raw_notifier_head name =				\
		RAW_NOTIFIER_INIT(name)

#ifdef CONFIG_TREE_SRCU
#define _SRCU_NOTIFIER_HEAD(name, mod)				\
	static DEFINE_PER_CPU(struct srcu_data, name##_head_srcu_data); \
	mod struct srcu_notifier_head name =			\
			SRCU_NOTIFIER_INIT(name, name##_head_srcu_data)
#else
#define _SRCU_NOTIFIER_HEAD(name, mod)				\
	mod struct srcu_notifier_head name =			\
			SRCU_NOTIFIER_INIT(name, name)
#endif

#define SRCU_NOTIFIER_HEAD(name)				\
	_SRCU_NOTIFIER_HEAD(name, /* not static */)

#define SRCU_NOTIFIER_HEAD_STATIC(name)				\
	_SRCU_NOTIFIER_HEAD(name, static)

#ifdef __KERNEL__

extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
		struct notifier_block *nb);
extern int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
		struct notifier_block *nb);
extern int raw_notifier_chain_register(struct raw_notifier_head *nh,
		struct notifier_block *nb);
extern int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
		struct notifier_block *nb);

extern int atomic_notifier_chain_register_unique_prio(
		struct atomic_notifier_head *nh, struct notifier_block *nb);
extern int blocking_notifier_chain_register_unique_prio(
		struct blocking_notifier_head *nh, struct notifier_block *nb);

extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
		struct notifier_block *nb);
extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
		struct notifier_block *nb);
extern int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
		struct notifier_block *nb);
extern int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
		struct notifier_block *nb);

extern int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
		unsigned long val, void *v);
extern int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
		unsigned long val, void *v);
extern int raw_notifier_call_chain(struct raw_notifier_head *nh,
		unsigned long val, void *v);
extern int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
		unsigned long val, void *v);

extern int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh,
		unsigned long val_up, unsigned long val_down, void *v);
extern int raw_notifier_call_chain_robust(struct raw_notifier_head *nh,
		unsigned long val_up, unsigned long val_down, void *v);

extern bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh);

#define NOTIFY_DONE		0x0000		/* Don't care */
#define NOTIFY_OK		0x0001		/* Suits me */
#define NOTIFY_STOP_MASK	0x8000		/* Don't call further */
#define NOTIFY_BAD		(NOTIFY_STOP_MASK|0x0002)	/* Bad/Veto action */

/*
 * Clean way to return from the notifier and stop further calls.
 */
#define NOTIFY_STOP		(NOTIFY_OK|NOTIFY_STOP_MASK)

/* Encapsulate (negative) errno value (in particular, NOTIFY_BAD <=> EPERM). */
static inline int notifier_from_errno(int err)
{
	if (err)
		return NOTIFY_STOP_MASK | (NOTIFY_OK - err);

	return NOTIFY_OK;
}

/* Restore (negative) errno value from notify return value. */
static inline int notifier_to_errno(int ret)
{
	ret &= ~NOTIFY_STOP_MASK;
	return ret > NOTIFY_OK ? NOTIFY_OK - ret : 0;
}

/*
 * Declared notifiers so far. I can imagine quite a few more chains
 * over time (eg laptop power reset chains, reboot chain (to clean
 * device units up), device [un]mount chain, module load/unload chain,
 * low memory chain, screenblank chain (for plug in modular screenblankers)
 * VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
 */

/* CPU notifiers are defined in include/linux/cpu.h. */

/* netdevice notifiers are defined in include/linux/netdevice.h */

/* reboot notifiers are defined in include/linux/reboot.h. */

/* Hibernation and suspend events are defined in include/linux/suspend.h. */

/* Virtual Terminal events are defined in include/linux/vt.h. */

#define NETLINK_URELEASE	0x0001	/* Unicast netlink socket released */

/* Console keyboard events.
 * Note: KBD_KEYCODE is always sent before KBD_UNBOUND_KEYCODE, KBD_UNICODE and
 * KBD_KEYSYM. */
#define KBD_KEYCODE		0x0001 /* Keyboard keycode, called before any other */
#define KBD_UNBOUND_KEYCODE	0x0002 /* Keyboard keycode which is not bound to any other */
#define KBD_UNICODE		0x0003 /* Keyboard unicode */
#define KBD_KEYSYM		0x0004 /* Keyboard keysym */
#define KBD_POST_KEYSYM		0x0005 /* Called after keyboard keysym interpretation */

#endif /* __KERNEL__ */
#endif /* _LINUX_NOTIFIER_H */
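The header is self-contained enough to sketch a complete round trip. A minimal usage sketch built only from the declarations above; the chain name, callback, and action value are made up for illustration:

/* Hypothetical subsystem event chain and one subscriber. */
static BLOCKING_NOTIFIER_HEAD(demo_chain);

static int demo_callback(struct notifier_block *nb, unsigned long action,
			 void *data)
{
	if (action == 0x1234)			    /* veto this action ... */
		return notifier_from_errno(-EBUSY); /* ... stops the chain */
	return NOTIFY_OK;
}

static struct notifier_block demo_nb = {
	.notifier_call	= demo_callback,
	.priority	= 10,	/* higher priority callbacks run earlier */
};

/* From process context (blocking chains may sleep): */
static void demo(void)
{
	int ret;

	blocking_notifier_chain_register(&demo_chain, &demo_nb);
	ret = blocking_notifier_call_chain(&demo_chain, 0x1234, NULL);
	/* notifier_to_errno(ret) == -EBUSY here */
	blocking_notifier_chain_unregister(&demo_chain, &demo_nb);
}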
3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 // SPDX-License-Identifier: GPL-2.0 /* * cfg80211 scan result handling * * Copyright 2008 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2016 Intel Deutschland GmbH * Copyright (C) 2018-2025 Intel Corporation */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/wireless.h> #include <linux/nl80211.h> #include <linux/etherdevice.h> #include <linux/crc32.h> #include <linux/bitfield.h> #include <net/arp.h> #include <net/cfg80211.h> #include <net/cfg80211-wext.h> #include <net/iw_handler.h> #include <kunit/visibility.h> #include "core.h" #include "nl80211.h" #include "wext-compat.h" #include "rdev-ops.h" /** * DOC: BSS tree/list structure * * At the top level, the BSS list is kept in both a list in each * registered device (@bss_list) as well as an RB-tree for faster * lookup. In the RB-tree, entries can be looked up using their * channel, MESHID, MESHCONF (for MBSSes) or channel, BSSID, SSID * for other BSSes. * * Due to the possibility of hidden SSIDs, there's a second level * structure, the "hidden_list" and "hidden_beacon_bss" pointer. * The hidden_list connects all BSSes belonging to a single AP * that has a hidden SSID, and connects beacon and probe response * entries. For a probe response entry for a hidden SSID, the * hidden_beacon_bss pointer points to the BSS struct holding the * beacon's information. * * Reference counting is done for all these references except for * the hidden_list, so that a beacon BSS struct that is otherwise * not referenced has one reference for being on the bss_list and * one for each probe response entry that points to it using the * hidden_beacon_bss pointer. 
When a BSS struct that has such a * pointer is get/put, the refcount update is also propagated to * the referenced struct, this ensure that it cannot get removed * while somebody is using the probe response version. * * Note that the hidden_beacon_bss pointer never changes, due to * the reference counting. Therefore, no locking is needed for * it. * * Also note that the hidden_beacon_bss pointer is only relevant * if the driver uses something other than the IEs, e.g. private * data stored in the BSS struct, since the beacon IEs are * also linked into the probe response struct. */ /* * Limit the number of BSS entries stored in mac80211. Each one is * a bit over 4k at most, so this limits to roughly 4-5M of memory. * If somebody wants to really attack this though, they'd likely * use small beacons, and only one type of frame, limiting each of * the entries to a much smaller size (in order to generate more * entries in total, so overhead is bigger.) */ static int bss_entries_limit = 1000; module_param(bss_entries_limit, int, 0644); MODULE_PARM_DESC(bss_entries_limit, "limit to number of scan BSS entries (per wiphy, default 1000)"); #define IEEE80211_SCAN_RESULT_EXPIRE (30 * HZ) static void bss_free(struct cfg80211_internal_bss *bss) { struct cfg80211_bss_ies *ies; if (WARN_ON(atomic_read(&bss->hold))) return; ies = (void *)rcu_access_pointer(bss->pub.beacon_ies); if (ies && !bss->pub.hidden_beacon_bss) kfree_rcu(ies, rcu_head); ies = (void *)rcu_access_pointer(bss->pub.proberesp_ies); if (ies) kfree_rcu(ies, rcu_head); /* * This happens when the module is removed, it doesn't * really matter any more save for completeness */ if (!list_empty(&bss->hidden_list)) list_del(&bss->hidden_list); kfree(bss); } static inline void bss_ref_get(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) { lockdep_assert_held(&rdev->bss_lock); bss->refcount++; if (bss->pub.hidden_beacon_bss) bss_from_pub(bss->pub.hidden_beacon_bss)->refcount++; if (bss->pub.transmitted_bss) bss_from_pub(bss->pub.transmitted_bss)->refcount++; } static inline void bss_ref_put(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) { lockdep_assert_held(&rdev->bss_lock); if (bss->pub.hidden_beacon_bss) { struct cfg80211_internal_bss *hbss; hbss = bss_from_pub(bss->pub.hidden_beacon_bss); hbss->refcount--; if (hbss->refcount == 0) bss_free(hbss); } if (bss->pub.transmitted_bss) { struct cfg80211_internal_bss *tbss; tbss = bss_from_pub(bss->pub.transmitted_bss); tbss->refcount--; if (tbss->refcount == 0) bss_free(tbss); } bss->refcount--; if (bss->refcount == 0) bss_free(bss); } static bool __cfg80211_unlink_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) { lockdep_assert_held(&rdev->bss_lock); if (!list_empty(&bss->hidden_list)) { /* * don't remove the beacon entry if it has * probe responses associated with it */ if (!bss->pub.hidden_beacon_bss) return false; /* * if it's a probe response entry break its * link to the other entries in the group */ list_del_init(&bss->hidden_list); } list_del_init(&bss->list); list_del_init(&bss->pub.nontrans_list); rb_erase(&bss->rbn, &rdev->bss_tree); rdev->bss_entries--; WARN_ONCE((rdev->bss_entries == 0) ^ list_empty(&rdev->bss_list), "rdev bss entries[%d]/list[empty:%d] corruption\n", rdev->bss_entries, list_empty(&rdev->bss_list)); bss_ref_put(rdev, bss); return true; } bool cfg80211_is_element_inherited(const struct element *elem, const struct element *non_inherit_elem) { u8 id_len, ext_id_len, i, loop_len, id; const u8 
*list; if (elem->id == WLAN_EID_MULTIPLE_BSSID) return false; if (elem->id == WLAN_EID_EXTENSION && elem->datalen > 1 && elem->data[0] == WLAN_EID_EXT_EHT_MULTI_LINK) return false; if (!non_inherit_elem || non_inherit_elem->datalen < 2) return true; /* * non inheritance element format is: * ext ID (56) | IDs list len | list | extension IDs list len | list * Both lists are optional. Both lengths are mandatory. * This means valid length is: * elem_len = 1 (extension ID) + 2 (list len fields) + list lengths */ id_len = non_inherit_elem->data[1]; if (non_inherit_elem->datalen < 3 + id_len) return true; ext_id_len = non_inherit_elem->data[2 + id_len]; if (non_inherit_elem->datalen < 3 + id_len + ext_id_len) return true; if (elem->id == WLAN_EID_EXTENSION) { if (!ext_id_len) return true; loop_len = ext_id_len; list = &non_inherit_elem->data[3 + id_len]; id = elem->data[0]; } else { if (!id_len) return true; loop_len = id_len; list = &non_inherit_elem->data[2]; id = elem->id; } for (i = 0; i < loop_len; i++) { if (list[i] == id) return false; } return true; } EXPORT_SYMBOL(cfg80211_is_element_inherited); static size_t cfg80211_copy_elem_with_frags(const struct element *elem, const u8 *ie, size_t ie_len, u8 **pos, u8 *buf, size_t buf_len) { if (WARN_ON((u8 *)elem < ie || elem->data > ie + ie_len || elem->data + elem->datalen > ie + ie_len)) return 0; if (elem->datalen + 2 > buf + buf_len - *pos) return 0; memcpy(*pos, elem, elem->datalen + 2); *pos += elem->datalen + 2; /* Finish if it is not fragmented */ if (elem->datalen != 255) return *pos - buf; ie_len = ie + ie_len - elem->data - elem->datalen; ie = (const u8 *)elem->data + elem->datalen; for_each_element(elem, ie, ie_len) { if (elem->id != WLAN_EID_FRAGMENT) break; if (elem->datalen + 2 > buf + buf_len - *pos) return 0; memcpy(*pos, elem, elem->datalen + 2); *pos += elem->datalen + 2; if (elem->datalen != 255) break; } return *pos - buf; } VISIBLE_IF_CFG80211_KUNIT size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen, const u8 *subie, size_t subie_len, u8 *new_ie, size_t new_ie_len) { const struct element *non_inherit_elem, *parent, *sub; u8 *pos = new_ie; const u8 *mbssid_index_ie; u8 id, ext_id, bssid_index = 255; unsigned int match_len; non_inherit_elem = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, subie, subie_len); mbssid_index_ie = cfg80211_find_ie(WLAN_EID_MULTI_BSSID_IDX, subie, subie_len); if (mbssid_index_ie && mbssid_index_ie[1] > 0 && mbssid_index_ie[2] > 0 && mbssid_index_ie[2] <= 46) bssid_index = mbssid_index_ie[2]; /* We copy the elements one by one from the parent to the generated * elements. * If they are not inherited (included in subie or in the non * inheritance element), then we copy all occurrences the first time * we see this element type. 
*/ for_each_element(parent, ie, ielen) { if (parent->id == WLAN_EID_FRAGMENT) continue; if (parent->id == WLAN_EID_EXTENSION) { if (parent->datalen < 1) continue; id = WLAN_EID_EXTENSION; ext_id = parent->data[0]; match_len = 1; } else { id = parent->id; match_len = 0; } /* Find first occurrence in subie */ sub = cfg80211_find_elem_match(id, subie, subie_len, &ext_id, match_len, 0); /* Copy from parent if not in subie and inherited */ if (!sub && cfg80211_is_element_inherited(parent, non_inherit_elem)) { if (!cfg80211_copy_elem_with_frags(parent, ie, ielen, &pos, new_ie, new_ie_len)) return 0; continue; } /* For ML probe response, match the MLE in the frame body with * MLD id being 'bssid_index' */ if (parent->id == WLAN_EID_EXTENSION && parent->datalen > 1 && parent->data[0] == WLAN_EID_EXT_EHT_MULTI_LINK && bssid_index == ieee80211_mle_get_mld_id(parent->data + 1)) { if (!cfg80211_copy_elem_with_frags(parent, ie, ielen, &pos, new_ie, new_ie_len)) return 0; /* Continue here to prevent processing the MLE in * sub-element, which AP MLD should not carry */ continue; } /* Already copied if an earlier element had the same type */ if (cfg80211_find_elem_match(id, ie, (u8 *)parent - ie, &ext_id, match_len, 0)) continue; /* Not inheriting, copy all similar elements from subie */ while (sub) { if (!cfg80211_copy_elem_with_frags(sub, subie, subie_len, &pos, new_ie, new_ie_len)) return 0; sub = cfg80211_find_elem_match(id, sub->data + sub->datalen, subie_len + subie - (sub->data + sub->datalen), &ext_id, match_len, 0); } } /* The above misses elements that are included in subie but not in the * parent, so do a pass over subie and append those. * Skip the non-tx BSSID caps and non-inheritance element. */ for_each_element(sub, subie, subie_len) { if (sub->id == WLAN_EID_NON_TX_BSSID_CAP) continue; if (sub->id == WLAN_EID_FRAGMENT) continue; if (sub->id == WLAN_EID_EXTENSION) { if (sub->datalen < 1) continue; id = WLAN_EID_EXTENSION; ext_id = sub->data[0]; match_len = 1; if (ext_id == WLAN_EID_EXT_NON_INHERITANCE) continue; } else { id = sub->id; match_len = 0; } /* Processed if one was included in the parent */ if (cfg80211_find_elem_match(id, ie, ielen, &ext_id, match_len, 0)) continue; if (!cfg80211_copy_elem_with_frags(sub, subie, subie_len, &pos, new_ie, new_ie_len)) return 0; } return pos - new_ie; } EXPORT_SYMBOL_IF_CFG80211_KUNIT(cfg80211_gen_new_ie); static bool is_bss(struct cfg80211_bss *a, const u8 *bssid, const u8 *ssid, size_t ssid_len) { const struct cfg80211_bss_ies *ies; const struct element *ssid_elem; if (bssid && !ether_addr_equal(a->bssid, bssid)) return false; if (!ssid) return true; ies = rcu_access_pointer(a->ies); if (!ies) return false; ssid_elem = cfg80211_find_elem(WLAN_EID_SSID, ies->data, ies->len); if (!ssid_elem) return false; if (ssid_elem->datalen != ssid_len) return false; return memcmp(ssid_elem->data, ssid, ssid_len) == 0; } static int cfg80211_add_nontrans_list(struct cfg80211_bss *trans_bss, struct cfg80211_bss *nontrans_bss) { const struct element *ssid_elem; struct cfg80211_bss *bss = NULL; rcu_read_lock(); ssid_elem = ieee80211_bss_get_elem(nontrans_bss, WLAN_EID_SSID); if (!ssid_elem) { rcu_read_unlock(); return -EINVAL; } /* check if nontrans_bss is in the list */ list_for_each_entry(bss, &trans_bss->nontrans_list, nontrans_list) { if (is_bss(bss, nontrans_bss->bssid, ssid_elem->data, ssid_elem->datalen)) { rcu_read_unlock(); return 0; } } rcu_read_unlock(); /* * This is a bit weird - it's not on the list, but already on another * one! 
The only way that could happen is if there's some BSSID/SSID * shared by multiple APs in their multi-BSSID profiles, potentially * with hidden SSID mixed in ... ignore it. */ if (!list_empty(&nontrans_bss->nontrans_list)) return -EINVAL; /* add to the list */ list_add_tail(&nontrans_bss->nontrans_list, &trans_bss->nontrans_list); return 0; } static void __cfg80211_bss_expire(struct cfg80211_registered_device *rdev, unsigned long expire_time) { struct cfg80211_internal_bss *bss, *tmp; bool expired = false; lockdep_assert_held(&rdev->bss_lock); list_for_each_entry_safe(bss, tmp, &rdev->bss_list, list) { if (atomic_read(&bss->hold)) continue; if (!time_after(expire_time, bss->ts)) continue; if (__cfg80211_unlink_bss(rdev, bss)) expired = true; } if (expired) rdev->bss_generation++; } static bool cfg80211_bss_expire_oldest(struct cfg80211_registered_device *rdev) { struct cfg80211_internal_bss *bss, *oldest = NULL; bool ret; lockdep_assert_held(&rdev->bss_lock); list_for_each_entry(bss, &rdev->bss_list, list) { if (atomic_read(&bss->hold)) continue; if (!list_empty(&bss->hidden_list) && !bss->pub.hidden_beacon_bss) continue; if (oldest && time_before(oldest->ts, bss->ts)) continue; oldest = bss; } if (WARN_ON(!oldest)) return false; /* * The callers make sure to increase rdev->bss_generation if anything * gets removed (and a new entry added), so there's no need to also do * it here. */ ret = __cfg80211_unlink_bss(rdev, oldest); WARN_ON(!ret); return ret; } static u8 cfg80211_parse_bss_param(u8 data, struct cfg80211_colocated_ap *coloc_ap) { coloc_ap->oct_recommended = u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_OCT_RECOMMENDED); coloc_ap->same_ssid = u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_SAME_SSID); coloc_ap->multi_bss = u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID); coloc_ap->transmitted_bssid = u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_TRANSMITTED_BSSID); coloc_ap->unsolicited_probe = u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_PROBE_ACTIVE); coloc_ap->colocated_ess = u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_COLOC_ESS); return u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_COLOC_AP); } static int cfg80211_calc_short_ssid(const struct cfg80211_bss_ies *ies, const struct element **elem, u32 *s_ssid) { *elem = cfg80211_find_elem(WLAN_EID_SSID, ies->data, ies->len); if (!*elem || (*elem)->datalen > IEEE80211_MAX_SSID_LEN) return -EINVAL; *s_ssid = ~crc32_le(~0, (*elem)->data, (*elem)->datalen); return 0; } VISIBLE_IF_CFG80211_KUNIT void cfg80211_free_coloc_ap_list(struct list_head *coloc_ap_list) { struct cfg80211_colocated_ap *ap, *tmp_ap; list_for_each_entry_safe(ap, tmp_ap, coloc_ap_list, list) { list_del(&ap->list); kfree(ap); } } EXPORT_SYMBOL_IF_CFG80211_KUNIT(cfg80211_free_coloc_ap_list); static int cfg80211_parse_ap_info(struct cfg80211_colocated_ap *entry, const u8 *pos, u8 length, const struct element *ssid_elem, u32 s_ssid_tmp) { u8 bss_params; entry->psd_20 = IEEE80211_RNR_TBTT_PARAMS_PSD_RESERVED; /* The length is already verified by the caller to contain bss_params */ if (length > sizeof(struct ieee80211_tbtt_info_7_8_9)) { struct ieee80211_tbtt_info_ge_11 *tbtt_info = (void *)pos; memcpy(entry->bssid, tbtt_info->bssid, ETH_ALEN); entry->short_ssid = le32_to_cpu(tbtt_info->short_ssid); entry->short_ssid_valid = true; bss_params = tbtt_info->bss_params; /* Ignore disabled links */ if (length >= offsetofend(typeof(*tbtt_info), mld_params)) { if (le16_get_bits(tbtt_info->mld_params.params, IEEE80211_RNR_MLD_PARAMS_DISABLED_LINK)) return -EINVAL; } if (length >= 
offsetofend(struct ieee80211_tbtt_info_ge_11, psd_20)) entry->psd_20 = tbtt_info->psd_20; } else { struct ieee80211_tbtt_info_7_8_9 *tbtt_info = (void *)pos; memcpy(entry->bssid, tbtt_info->bssid, ETH_ALEN); bss_params = tbtt_info->bss_params; if (length == offsetofend(struct ieee80211_tbtt_info_7_8_9, psd_20)) entry->psd_20 = tbtt_info->psd_20; } /* ignore entries with invalid BSSID */ if (!is_valid_ether_addr(entry->bssid)) return -EINVAL; /* skip non colocated APs */ if (!cfg80211_parse_bss_param(bss_params, entry)) return -EINVAL; /* no information about the short ssid. Consider the entry valid * for now. It would later be dropped in case there are explicit * SSIDs that need to be matched */ if (!entry->same_ssid && !entry->short_ssid_valid) return 0; if (entry->same_ssid) { entry->short_ssid = s_ssid_tmp; entry->short_ssid_valid = true; /* * This is safe because we validate datalen in * cfg80211_parse_colocated_ap(), before calling this * function. */ memcpy(&entry->ssid, &ssid_elem->data, ssid_elem->datalen); entry->ssid_len = ssid_elem->datalen; } return 0; } bool cfg80211_iter_rnr(const u8 *elems, size_t elems_len, enum cfg80211_rnr_iter_ret (*iter)(void *data, u8 type, const struct ieee80211_neighbor_ap_info *info, const u8 *tbtt_info, u8 tbtt_info_len), void *iter_data) { const struct element *rnr; const u8 *pos, *end; for_each_element_id(rnr, WLAN_EID_REDUCED_NEIGHBOR_REPORT, elems, elems_len) { const struct ieee80211_neighbor_ap_info *info; pos = rnr->data; end = rnr->data + rnr->datalen; /* RNR IE may contain more than one NEIGHBOR_AP_INFO */ while (sizeof(*info) <= end - pos) { u8 length, i, count; u8 type; info = (void *)pos; count = u8_get_bits(info->tbtt_info_hdr, IEEE80211_AP_INFO_TBTT_HDR_COUNT) + 1; length = info->tbtt_info_len; pos += sizeof(*info); if (count * length > end - pos) return false; type = u8_get_bits(info->tbtt_info_hdr, IEEE80211_AP_INFO_TBTT_HDR_TYPE); for (i = 0; i < count; i++) { switch (iter(iter_data, type, info, pos, length)) { case RNR_ITER_CONTINUE: break; case RNR_ITER_BREAK: return true; case RNR_ITER_ERROR: return false; } pos += length; } } if (pos != end) return false; } return true; } EXPORT_SYMBOL_GPL(cfg80211_iter_rnr); struct colocated_ap_data { const struct element *ssid_elem; struct list_head ap_list; u32 s_ssid_tmp; int n_coloc; }; static enum cfg80211_rnr_iter_ret cfg80211_parse_colocated_ap_iter(void *_data, u8 type, const struct ieee80211_neighbor_ap_info *info, const u8 *tbtt_info, u8 tbtt_info_len) { struct colocated_ap_data *data = _data; struct cfg80211_colocated_ap *entry; enum nl80211_band band; if (type != IEEE80211_TBTT_INFO_TYPE_TBTT) return RNR_ITER_CONTINUE; if (!ieee80211_operating_class_to_band(info->op_class, &band)) return RNR_ITER_CONTINUE; /* TBTT info must include bss param + BSSID + (short SSID or * same_ssid bit to be set). Ignore other options, and move to * the next AP info */ if (band != NL80211_BAND_6GHZ || !(tbtt_info_len == offsetofend(struct ieee80211_tbtt_info_7_8_9, bss_params) || tbtt_info_len == sizeof(struct ieee80211_tbtt_info_7_8_9) || tbtt_info_len >= offsetofend(struct ieee80211_tbtt_info_ge_11, bss_params))) return RNR_ITER_CONTINUE; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return RNR_ITER_ERROR; entry->center_freq = ieee80211_channel_to_frequency(info->channel, band); if (!cfg80211_parse_ap_info(entry, tbtt_info, tbtt_info_len, data->ssid_elem, data->s_ssid_tmp)) { struct cfg80211_colocated_ap *tmp; /* Don't add duplicate BSSIDs on the same channel. 
*/ list_for_each_entry(tmp, &data->ap_list, list) { if (ether_addr_equal(tmp->bssid, entry->bssid) && tmp->center_freq == entry->center_freq) { kfree(entry); return RNR_ITER_CONTINUE; } } data->n_coloc++; list_add_tail(&entry->list, &data->ap_list); } else { kfree(entry); } return RNR_ITER_CONTINUE; } VISIBLE_IF_CFG80211_KUNIT int cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies, struct list_head *list) { struct colocated_ap_data data = {}; int ret; INIT_LIST_HEAD(&data.ap_list); ret = cfg80211_calc_short_ssid(ies, &data.ssid_elem, &data.s_ssid_tmp); if (ret) return 0; if (!cfg80211_iter_rnr(ies->data, ies->len, cfg80211_parse_colocated_ap_iter, &data)) { cfg80211_free_coloc_ap_list(&data.ap_list); return 0; } list_splice_tail(&data.ap_list, list); return data.n_coloc; } EXPORT_SYMBOL_IF_CFG80211_KUNIT(cfg80211_parse_colocated_ap); static void cfg80211_scan_req_add_chan(struct cfg80211_scan_request *request, struct ieee80211_channel *chan, bool add_to_6ghz) { int i; u32 n_channels = request->n_channels; struct cfg80211_scan_6ghz_params *params = &request->scan_6ghz_params[request->n_6ghz_params]; for (i = 0; i < n_channels; i++) { if (request->channels[i] == chan) { if (add_to_6ghz) params->channel_idx = i; return; } } request->n_channels++; request->channels[n_channels] = chan; if (add_to_6ghz) request->scan_6ghz_params[request->n_6ghz_params].channel_idx = n_channels; } static bool cfg80211_find_ssid_match(struct cfg80211_colocated_ap *ap, struct cfg80211_scan_request *request) { int i; u32 s_ssid; for (i = 0; i < request->n_ssids; i++) { /* wildcard ssid in the scan request */ if (!request->ssids[i].ssid_len) { if (ap->multi_bss && !ap->transmitted_bssid) continue; return true; } if (ap->ssid_len && ap->ssid_len == request->ssids[i].ssid_len) { if (!memcmp(request->ssids[i].ssid, ap->ssid, ap->ssid_len)) return true; } else if (ap->short_ssid_valid) { s_ssid = ~crc32_le(~0, request->ssids[i].ssid, request->ssids[i].ssid_len); if (ap->short_ssid == s_ssid) return true; } } return false; } static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) { u8 i; struct cfg80211_colocated_ap *ap; int n_channels, count = 0, err; struct cfg80211_scan_request *request, *rdev_req = rdev->scan_req; LIST_HEAD(coloc_ap_list); bool need_scan_psc = true; const struct ieee80211_sband_iftype_data *iftd; size_t size, offs_ssids, offs_6ghz_params, offs_ies; rdev_req->scan_6ghz = true; if (!rdev->wiphy.bands[NL80211_BAND_6GHZ]) return -EOPNOTSUPP; iftd = ieee80211_get_sband_iftype_data(rdev->wiphy.bands[NL80211_BAND_6GHZ], rdev_req->wdev->iftype); if (!iftd || !iftd->he_cap.has_he) return -EOPNOTSUPP; n_channels = rdev->wiphy.bands[NL80211_BAND_6GHZ]->n_channels; if (rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ) { struct cfg80211_internal_bss *intbss; spin_lock_bh(&rdev->bss_lock); list_for_each_entry(intbss, &rdev->bss_list, list) { struct cfg80211_bss *res = &intbss->pub; const struct cfg80211_bss_ies *ies; const struct element *ssid_elem; struct cfg80211_colocated_ap *entry; u32 s_ssid_tmp; int ret; ies = rcu_access_pointer(res->ies); count += cfg80211_parse_colocated_ap(ies, &coloc_ap_list); /* In case the scan request specified a specific BSSID * and the BSS is found and operating on 6GHz band then * add this AP to the collocated APs list. * This is relevant for ML probe requests when the lower * band APs have not been discovered. 
*/ if (is_broadcast_ether_addr(rdev_req->bssid) || !ether_addr_equal(rdev_req->bssid, res->bssid) || res->channel->band != NL80211_BAND_6GHZ) continue; ret = cfg80211_calc_short_ssid(ies, &ssid_elem, &s_ssid_tmp); if (ret) continue; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; memcpy(entry->bssid, res->bssid, ETH_ALEN); entry->short_ssid = s_ssid_tmp; memcpy(entry->ssid, ssid_elem->data, ssid_elem->datalen); entry->ssid_len = ssid_elem->datalen; entry->short_ssid_valid = true; entry->center_freq = res->channel->center_freq; list_add_tail(&entry->list, &coloc_ap_list); count++; } spin_unlock_bh(&rdev->bss_lock); } size = struct_size(request, channels, n_channels); offs_ssids = size; size += sizeof(*request->ssids) * rdev_req->n_ssids; offs_6ghz_params = size; size += sizeof(*request->scan_6ghz_params) * count; offs_ies = size; size += rdev_req->ie_len; request = kzalloc(size, GFP_KERNEL); if (!request) { cfg80211_free_coloc_ap_list(&coloc_ap_list); return -ENOMEM; } *request = *rdev_req; request->n_channels = 0; request->n_6ghz_params = 0; if (rdev_req->n_ssids) { /* * Add the ssids from the parent scan request to the new * scan request, so the driver would be able to use them * in its probe requests to discover hidden APs on PSC * channels. */ request->ssids = (void *)request + offs_ssids; memcpy(request->ssids, rdev_req->ssids, sizeof(*request->ssids) * request->n_ssids); } request->scan_6ghz_params = (void *)request + offs_6ghz_params; if (rdev_req->ie_len) { void *ie = (void *)request + offs_ies; memcpy(ie, rdev_req->ie, rdev_req->ie_len); request->ie = ie; } /* * PSC channels should not be scanned in case of direct scan with 1 SSID * and at least one of the reported co-located APs with same SSID * indicating that all APs in the same ESS are co-located */ if (count && request->n_ssids == 1 && request->ssids[0].ssid_len) { list_for_each_entry(ap, &coloc_ap_list, list) { if (ap->colocated_ess && cfg80211_find_ssid_match(ap, request)) { need_scan_psc = false; break; } } } /* * add to the scan request the channels that need to be scanned * regardless of the collocated APs (PSC channels or all channels * in case that NL80211_SCAN_FLAG_COLOCATED_6GHZ is not set) */ for (i = 0; i < rdev_req->n_channels; i++) { if (rdev_req->channels[i]->band == NL80211_BAND_6GHZ && ((need_scan_psc && cfg80211_channel_is_psc(rdev_req->channels[i])) || !(rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ))) { cfg80211_scan_req_add_chan(request, rdev_req->channels[i], false); } } if (!(rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ)) goto skip; list_for_each_entry(ap, &coloc_ap_list, list) { bool found = false; struct cfg80211_scan_6ghz_params *scan_6ghz_params = &request->scan_6ghz_params[request->n_6ghz_params]; struct ieee80211_channel *chan = ieee80211_get_channel(&rdev->wiphy, ap->center_freq); if (!chan || chan->flags & IEEE80211_CHAN_DISABLED || !cfg80211_wdev_channel_allowed(rdev_req->wdev, chan)) continue; for (i = 0; i < rdev_req->n_channels; i++) { if (rdev_req->channels[i] == chan) found = true; } if (!found) continue; if (request->n_ssids > 0 && !cfg80211_find_ssid_match(ap, request)) continue; if (!is_broadcast_ether_addr(request->bssid) && !ether_addr_equal(request->bssid, ap->bssid)) continue; if (!request->n_ssids && ap->multi_bss && !ap->transmitted_bssid) continue; cfg80211_scan_req_add_chan(request, chan, true); memcpy(scan_6ghz_params->bssid, ap->bssid, ETH_ALEN); scan_6ghz_params->short_ssid = ap->short_ssid; scan_6ghz_params->short_ssid_valid = 
ap->short_ssid_valid; scan_6ghz_params->unsolicited_probe = ap->unsolicited_probe; scan_6ghz_params->psd_20 = ap->psd_20; /* * If a PSC channel is added to the scan and 'need_scan_psc' is * set to false, then all the APs that the scan logic is * interested with on the channel are collocated and thus there * is no need to perform the initial PSC channel listen. */ if (cfg80211_channel_is_psc(chan) && !need_scan_psc) scan_6ghz_params->psc_no_listen = true; request->n_6ghz_params++; } skip: cfg80211_free_coloc_ap_list(&coloc_ap_list); if (request->n_channels) { struct cfg80211_scan_request *old = rdev->int_scan_req; rdev->int_scan_req = request; /* * If this scan follows a previous scan, save the scan start * info from the first part of the scan */ if (old) rdev->int_scan_req->info = old->info; err = rdev_scan(rdev, request); if (err) { rdev->int_scan_req = old; kfree(request); } else { kfree(old); } return err; } kfree(request); return -EINVAL; } int cfg80211_scan(struct cfg80211_registered_device *rdev) { struct cfg80211_scan_request *request; struct cfg80211_scan_request *rdev_req = rdev->scan_req; u32 n_channels = 0, idx, i; if (!(rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ)) return rdev_scan(rdev, rdev_req); for (i = 0; i < rdev_req->n_channels; i++) { if (rdev_req->channels[i]->band != NL80211_BAND_6GHZ) n_channels++; } if (!n_channels) return cfg80211_scan_6ghz(rdev); request = kzalloc(struct_size(request, channels, n_channels), GFP_KERNEL); if (!request) return -ENOMEM; *request = *rdev_req; request->n_channels = n_channels; for (i = idx = 0; i < rdev_req->n_channels; i++) { if (rdev_req->channels[i]->band != NL80211_BAND_6GHZ) request->channels[idx++] = rdev_req->channels[i]; } rdev_req->scan_6ghz = false; rdev->int_scan_req = request; return rdev_scan(rdev, request); } void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, bool send_message) { struct cfg80211_scan_request *request, *rdev_req; struct wireless_dev *wdev; struct sk_buff *msg; #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif lockdep_assert_held(&rdev->wiphy.mtx); if (rdev->scan_msg) { nl80211_send_scan_msg(rdev, rdev->scan_msg); rdev->scan_msg = NULL; return; } rdev_req = rdev->scan_req; if (!rdev_req) return; wdev = rdev_req->wdev; request = rdev->int_scan_req ? rdev->int_scan_req : rdev_req; if (wdev_running(wdev) && (rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ) && !rdev_req->scan_6ghz && !request->info.aborted && !cfg80211_scan_6ghz(rdev)) return; /* * This must be before sending the other events! * Otherwise, wpa_supplicant gets completely confused with * wext events. 
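* For context: this path runs after the driver has reported
* completion. A typical driver does that with something like the
* sketch below, "req" being the struct cfg80211_scan_request it was
* handed in its ->scan() callback:
*
*	struct cfg80211_scan_info info = {
*		.aborted = false,	// true if the scan was cut short
*	};
*
*	cfg80211_scan_done(req, &info);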
*/ if (wdev->netdev) cfg80211_sme_scan_done(wdev->netdev); if (!request->info.aborted && request->flags & NL80211_SCAN_FLAG_FLUSH) { /* flush entries from previous scans */ spin_lock_bh(&rdev->bss_lock); __cfg80211_bss_expire(rdev, request->scan_start); spin_unlock_bh(&rdev->bss_lock); } msg = nl80211_build_scan_msg(rdev, wdev, request->info.aborted); #ifdef CONFIG_CFG80211_WEXT if (wdev->netdev && !request->info.aborted) { memset(&wrqu, 0, sizeof(wrqu)); wireless_send_event(wdev->netdev, SIOCGIWSCAN, &wrqu, NULL); } #endif dev_put(wdev->netdev); kfree(rdev->int_scan_req); rdev->int_scan_req = NULL; kfree(rdev->scan_req); rdev->scan_req = NULL; if (!send_message) rdev->scan_msg = msg; else nl80211_send_scan_msg(rdev, msg); } void __cfg80211_scan_done(struct wiphy *wiphy, struct wiphy_work *wk) { ___cfg80211_scan_done(wiphy_to_rdev(wiphy), true); } void cfg80211_scan_done(struct cfg80211_scan_request *request, struct cfg80211_scan_info *info) { struct cfg80211_scan_info old_info = request->info; trace_cfg80211_scan_done(request, info); WARN_ON(request != wiphy_to_rdev(request->wiphy)->scan_req && request != wiphy_to_rdev(request->wiphy)->int_scan_req); request->info = *info; /* * In case the scan is split, the scan_start_tsf and tsf_bssid should * be of the first part. In such a case old_info.scan_start_tsf should * be non zero. */ if (request->scan_6ghz && old_info.scan_start_tsf) { request->info.scan_start_tsf = old_info.scan_start_tsf; memcpy(request->info.tsf_bssid, old_info.tsf_bssid, sizeof(request->info.tsf_bssid)); } request->notified = true; wiphy_work_queue(request->wiphy, &wiphy_to_rdev(request->wiphy)->scan_done_wk); } EXPORT_SYMBOL(cfg80211_scan_done); void cfg80211_add_sched_scan_req(struct cfg80211_registered_device *rdev, struct cfg80211_sched_scan_request *req) { lockdep_assert_held(&rdev->wiphy.mtx); list_add_rcu(&req->list, &rdev->sched_scan_req_list); } static void cfg80211_del_sched_scan_req(struct cfg80211_registered_device *rdev, struct cfg80211_sched_scan_request *req) { lockdep_assert_held(&rdev->wiphy.mtx); list_del_rcu(&req->list); kfree_rcu(req, rcu_head); } static struct cfg80211_sched_scan_request * cfg80211_find_sched_scan_req(struct cfg80211_registered_device *rdev, u64 reqid) { struct cfg80211_sched_scan_request *pos; list_for_each_entry_rcu(pos, &rdev->sched_scan_req_list, list, lockdep_is_held(&rdev->wiphy.mtx)) { if (pos->reqid == reqid) return pos; } return NULL; } /* * Determines if a scheduled scan request can be handled. When a legacy * scheduled scan is running no other scheduled scan is allowed regardless * whether the request is for legacy or multi-support scan. When a multi-support * scheduled scan is running a request for legacy scan is not allowed. In this * case a request for multi-support scan can be handled if resources are * available, ie. struct wiphy::max_sched_scan_reqs limit is not yet reached. 
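* Worked example, assuming struct wiphy::max_sched_scan_reqs == 2:
* while a legacy scan (reqid 0) runs, any further request returns
* -EINPROGRESS; while one multi-support scan runs, a legacy request
* returns -EINPROGRESS, a second multi-support request is accepted,
* and a third would return -ENOSPC.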
*/ int cfg80211_sched_scan_req_possible(struct cfg80211_registered_device *rdev, bool want_multi) { struct cfg80211_sched_scan_request *pos; int i = 0; list_for_each_entry(pos, &rdev->sched_scan_req_list, list) { /* request id zero means legacy in progress */ if (!i && !pos->reqid) return -EINPROGRESS; i++; } if (i) { /* no legacy allowed when multi request(s) are active */ if (!want_multi) return -EINPROGRESS; /* resource limit reached */ if (i == rdev->wiphy.max_sched_scan_reqs) return -ENOSPC; } return 0; } void cfg80211_sched_scan_results_wk(struct work_struct *work) { struct cfg80211_registered_device *rdev; struct cfg80211_sched_scan_request *req, *tmp; rdev = container_of(work, struct cfg80211_registered_device, sched_scan_res_wk); guard(wiphy)(&rdev->wiphy); list_for_each_entry_safe(req, tmp, &rdev->sched_scan_req_list, list) { if (req->report_results) { req->report_results = false; if (req->flags & NL80211_SCAN_FLAG_FLUSH) { /* flush entries from previous scans */ spin_lock_bh(&rdev->bss_lock); __cfg80211_bss_expire(rdev, req->scan_start); spin_unlock_bh(&rdev->bss_lock); req->scan_start = jiffies; } nl80211_send_sched_scan(req, NL80211_CMD_SCHED_SCAN_RESULTS); } } } void cfg80211_sched_scan_results(struct wiphy *wiphy, u64 reqid) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_sched_scan_request *request; trace_cfg80211_sched_scan_results(wiphy, reqid); /* ignore if we're not scanning */ rcu_read_lock(); request = cfg80211_find_sched_scan_req(rdev, reqid); if (request) { request->report_results = true; queue_work(cfg80211_wq, &rdev->sched_scan_res_wk); } rcu_read_unlock(); } EXPORT_SYMBOL(cfg80211_sched_scan_results); void cfg80211_sched_scan_stopped_locked(struct wiphy *wiphy, u64 reqid) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); lockdep_assert_held(&wiphy->mtx); trace_cfg80211_sched_scan_stopped(wiphy, reqid); __cfg80211_stop_sched_scan(rdev, reqid, true); } EXPORT_SYMBOL(cfg80211_sched_scan_stopped_locked); void cfg80211_sched_scan_stopped(struct wiphy *wiphy, u64 reqid) { guard(wiphy)(wiphy); cfg80211_sched_scan_stopped_locked(wiphy, reqid); } EXPORT_SYMBOL(cfg80211_sched_scan_stopped); int cfg80211_stop_sched_scan_req(struct cfg80211_registered_device *rdev, struct cfg80211_sched_scan_request *req, bool driver_initiated) { lockdep_assert_held(&rdev->wiphy.mtx); if (!driver_initiated) { int err = rdev_sched_scan_stop(rdev, req->dev, req->reqid); if (err) return err; } nl80211_send_sched_scan(req, NL80211_CMD_SCHED_SCAN_STOPPED); cfg80211_del_sched_scan_req(rdev, req); return 0; } int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev, u64 reqid, bool driver_initiated) { struct cfg80211_sched_scan_request *sched_scan_req; lockdep_assert_held(&rdev->wiphy.mtx); sched_scan_req = cfg80211_find_sched_scan_req(rdev, reqid); if (!sched_scan_req) return -ENOENT; return cfg80211_stop_sched_scan_req(rdev, sched_scan_req, driver_initiated); } void cfg80211_bss_age(struct cfg80211_registered_device *rdev, unsigned long age_secs) { struct cfg80211_internal_bss *bss; unsigned long age_jiffies = secs_to_jiffies(age_secs); spin_lock_bh(&rdev->bss_lock); list_for_each_entry(bss, &rdev->bss_list, list) bss->ts -= age_jiffies; spin_unlock_bh(&rdev->bss_lock); } void cfg80211_bss_expire(struct cfg80211_registered_device *rdev) { __cfg80211_bss_expire(rdev, jiffies - IEEE80211_SCAN_RESULT_EXPIRE); } void cfg80211_bss_flush(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); 
spin_lock_bh(&rdev->bss_lock); __cfg80211_bss_expire(rdev, jiffies); spin_unlock_bh(&rdev->bss_lock); } EXPORT_SYMBOL(cfg80211_bss_flush); const struct element * cfg80211_find_elem_match(u8 eid, const u8 *ies, unsigned int len, const u8 *match, unsigned int match_len, unsigned int match_offset) { const struct element *elem; for_each_element_id(elem, eid, ies, len) { if (elem->datalen >= match_offset + match_len && !memcmp(elem->data + match_offset, match, match_len)) return elem; } return NULL; } EXPORT_SYMBOL(cfg80211_find_elem_match); const struct element *cfg80211_find_vendor_elem(unsigned int oui, int oui_type, const u8 *ies, unsigned int len) { const struct element *elem; u8 match[] = { oui >> 16, oui >> 8, oui, oui_type }; int match_len = (oui_type < 0) ? 3 : sizeof(match); if (WARN_ON(oui_type > 0xff)) return NULL; elem = cfg80211_find_elem_match(WLAN_EID_VENDOR_SPECIFIC, ies, len, match, match_len, 0); if (!elem || elem->datalen < 4) return NULL; return elem; } EXPORT_SYMBOL(cfg80211_find_vendor_elem); /** * enum bss_compare_mode - BSS compare mode * @BSS_CMP_REGULAR: regular compare mode (for insertion and normal find) * @BSS_CMP_HIDE_ZLEN: find hidden SSID with zero-length mode * @BSS_CMP_HIDE_NUL: find hidden SSID with NUL-ed out mode */ enum bss_compare_mode { BSS_CMP_REGULAR, BSS_CMP_HIDE_ZLEN, BSS_CMP_HIDE_NUL, }; static int cmp_bss(struct cfg80211_bss *a, struct cfg80211_bss *b, enum bss_compare_mode mode) { const struct cfg80211_bss_ies *a_ies, *b_ies; const u8 *ie1 = NULL; const u8 *ie2 = NULL; int i, r; if (a->channel != b->channel) return (b->channel->center_freq * 1000 + b->channel->freq_offset) - (a->channel->center_freq * 1000 + a->channel->freq_offset); a_ies = rcu_access_pointer(a->ies); if (!a_ies) return -1; b_ies = rcu_access_pointer(b->ies); if (!b_ies) return 1; if (WLAN_CAPABILITY_IS_STA_BSS(a->capability)) ie1 = cfg80211_find_ie(WLAN_EID_MESH_ID, a_ies->data, a_ies->len); if (WLAN_CAPABILITY_IS_STA_BSS(b->capability)) ie2 = cfg80211_find_ie(WLAN_EID_MESH_ID, b_ies->data, b_ies->len); if (ie1 && ie2) { int mesh_id_cmp; if (ie1[1] == ie2[1]) mesh_id_cmp = memcmp(ie1 + 2, ie2 + 2, ie1[1]); else mesh_id_cmp = ie2[1] - ie1[1]; ie1 = cfg80211_find_ie(WLAN_EID_MESH_CONFIG, a_ies->data, a_ies->len); ie2 = cfg80211_find_ie(WLAN_EID_MESH_CONFIG, b_ies->data, b_ies->len); if (ie1 && ie2) { if (mesh_id_cmp) return mesh_id_cmp; if (ie1[1] != ie2[1]) return ie2[1] - ie1[1]; return memcmp(ie1 + 2, ie2 + 2, ie1[1]); } } r = memcmp(a->bssid, b->bssid, sizeof(a->bssid)); if (r) return r; ie1 = cfg80211_find_ie(WLAN_EID_SSID, a_ies->data, a_ies->len); ie2 = cfg80211_find_ie(WLAN_EID_SSID, b_ies->data, b_ies->len); if (!ie1 && !ie2) return 0; /* * Note that with "hide_ssid", the function returns a match if * the already-present BSS ("b") is a hidden SSID beacon for * the new BSS ("a"). */ /* sort missing IE before (left of) present IE */ if (!ie1) return -1; if (!ie2) return 1; switch (mode) { case BSS_CMP_HIDE_ZLEN: /* * In ZLEN mode we assume the BSS entry we're * looking for has a zero-length SSID. So if * the one we're looking at right now has that, * return 0. Otherwise, return the difference * in length, but since we're looking for the * 0-length it's really equivalent to returning * the length of the one we're looking at. * * No content comparison is needed as we assume * the content length is zero. 
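* Worked example for the three modes, given a probe response
* carrying SSID "linux" (length 5): BSS_CMP_REGULAR only matches an
* entry with the identical SSID; BSS_CMP_HIDE_ZLEN matches a
* hidden-SSID beacon advertising a zero-length SSID element; and
* BSS_CMP_HIDE_NUL matches a hidden-SSID beacon advertising five
* NUL bytes in place of the SSID.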
*/ return ie2[1]; case BSS_CMP_REGULAR: default: /* sort by length first, then by contents */ if (ie1[1] != ie2[1]) return ie2[1] - ie1[1]; return memcmp(ie1 + 2, ie2 + 2, ie1[1]); case BSS_CMP_HIDE_NUL: if (ie1[1] != ie2[1]) return ie2[1] - ie1[1]; /* this is equivalent to memcmp(zeroes, ie2 + 2, len) */ for (i = 0; i < ie2[1]; i++) if (ie2[i + 2]) return -1; return 0; } } static bool cfg80211_bss_type_match(u16 capability, enum nl80211_band band, enum ieee80211_bss_type bss_type) { bool ret = true; u16 mask, val; if (bss_type == IEEE80211_BSS_TYPE_ANY) return ret; if (band == NL80211_BAND_60GHZ) { mask = WLAN_CAPABILITY_DMG_TYPE_MASK; switch (bss_type) { case IEEE80211_BSS_TYPE_ESS: val = WLAN_CAPABILITY_DMG_TYPE_AP; break; case IEEE80211_BSS_TYPE_PBSS: val = WLAN_CAPABILITY_DMG_TYPE_PBSS; break; case IEEE80211_BSS_TYPE_IBSS: val = WLAN_CAPABILITY_DMG_TYPE_IBSS; break; default: return false; } } else { mask = WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS; switch (bss_type) { case IEEE80211_BSS_TYPE_ESS: val = WLAN_CAPABILITY_ESS; break; case IEEE80211_BSS_TYPE_IBSS: val = WLAN_CAPABILITY_IBSS; break; case IEEE80211_BSS_TYPE_MBSS: val = 0; break; default: return false; } } ret = ((capability & mask) == val); return ret; } /* Returned bss is reference counted and must be cleaned up appropriately. */ struct cfg80211_bss *__cfg80211_get_bss(struct wiphy *wiphy, struct ieee80211_channel *channel, const u8 *bssid, const u8 *ssid, size_t ssid_len, enum ieee80211_bss_type bss_type, enum ieee80211_privacy privacy, u32 use_for) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_internal_bss *bss, *res = NULL; unsigned long now = jiffies; int bss_privacy; trace_cfg80211_get_bss(wiphy, channel, bssid, ssid, ssid_len, bss_type, privacy); spin_lock_bh(&rdev->bss_lock); list_for_each_entry(bss, &rdev->bss_list, list) { if (!cfg80211_bss_type_match(bss->pub.capability, bss->pub.channel->band, bss_type)) continue; bss_privacy = (bss->pub.capability & WLAN_CAPABILITY_PRIVACY); if ((privacy == IEEE80211_PRIVACY_ON && !bss_privacy) || (privacy == IEEE80211_PRIVACY_OFF && bss_privacy)) continue; if (channel && bss->pub.channel != channel) continue; if (!is_valid_ether_addr(bss->pub.bssid)) continue; if ((bss->pub.use_for & use_for) != use_for) continue; /* Don't get expired BSS structs */ if (time_after(now, bss->ts + IEEE80211_SCAN_RESULT_EXPIRE) && !atomic_read(&bss->hold)) continue; if (is_bss(&bss->pub, bssid, ssid, ssid_len)) { res = bss; bss_ref_get(rdev, res); break; } } spin_unlock_bh(&rdev->bss_lock); if (!res) return NULL; trace_cfg80211_return_bss(&res->pub); return &res->pub; } EXPORT_SYMBOL(__cfg80211_get_bss); static bool rb_insert_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) { struct rb_node **p = &rdev->bss_tree.rb_node; struct rb_node *parent = NULL; struct cfg80211_internal_bss *tbss; int cmp; while (*p) { parent = *p; tbss = rb_entry(parent, struct cfg80211_internal_bss, rbn); cmp = cmp_bss(&bss->pub, &tbss->pub, BSS_CMP_REGULAR); if (WARN_ON(!cmp)) { /* will sort of leak this BSS */ return false; } if (cmp < 0) p = &(*p)->rb_left; else p = &(*p)->rb_right; } rb_link_node(&bss->rbn, parent, p); rb_insert_color(&bss->rbn, &rdev->bss_tree); return true; } static struct cfg80211_internal_bss * rb_find_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *res, enum bss_compare_mode mode) { struct rb_node *n = rdev->bss_tree.rb_node; struct cfg80211_internal_bss *bss; int r; while (n) { bss = rb_entry(n, struct 
cfg80211_internal_bss, rbn); r = cmp_bss(&res->pub, &bss->pub, mode); if (r == 0) return bss; else if (r < 0) n = n->rb_left; else n = n->rb_right; } return NULL; } static void cfg80211_insert_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) { lockdep_assert_held(&rdev->bss_lock); if (!rb_insert_bss(rdev, bss)) return; list_add_tail(&bss->list, &rdev->bss_list); rdev->bss_entries++; } static void cfg80211_rehash_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) { lockdep_assert_held(&rdev->bss_lock); rb_erase(&bss->rbn, &rdev->bss_tree); if (!rb_insert_bss(rdev, bss)) { list_del(&bss->list); if (!list_empty(&bss->hidden_list)) list_del_init(&bss->hidden_list); if (!list_empty(&bss->pub.nontrans_list)) list_del_init(&bss->pub.nontrans_list); rdev->bss_entries--; } rdev->bss_generation++; } static bool cfg80211_combine_bsses(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *new) { const struct cfg80211_bss_ies *ies; struct cfg80211_internal_bss *bss; const u8 *ie; int i, ssidlen; u8 fold = 0; u32 n_entries = 0; ies = rcu_access_pointer(new->pub.beacon_ies); if (WARN_ON(!ies)) return false; ie = cfg80211_find_ie(WLAN_EID_SSID, ies->data, ies->len); if (!ie) { /* nothing to do */ return true; } ssidlen = ie[1]; for (i = 0; i < ssidlen; i++) fold |= ie[2 + i]; if (fold) { /* not a hidden SSID */ return true; } /* This is the bad part ... */ list_for_each_entry(bss, &rdev->bss_list, list) { /* * we're iterating all the entries anyway, so take the * opportunity to validate the list length accounting */ n_entries++; if (!ether_addr_equal(bss->pub.bssid, new->pub.bssid)) continue; if (bss->pub.channel != new->pub.channel) continue; if (rcu_access_pointer(bss->pub.beacon_ies)) continue; ies = rcu_access_pointer(bss->pub.ies); if (!ies) continue; ie = cfg80211_find_ie(WLAN_EID_SSID, ies->data, ies->len); if (!ie) continue; if (ssidlen && ie[1] != ssidlen) continue; if (WARN_ON_ONCE(bss->pub.hidden_beacon_bss)) continue; if (WARN_ON_ONCE(!list_empty(&bss->hidden_list))) list_del(&bss->hidden_list); /* combine them */ list_add(&bss->hidden_list, &new->hidden_list); bss->pub.hidden_beacon_bss = &new->pub; new->refcount += bss->refcount; rcu_assign_pointer(bss->pub.beacon_ies, new->pub.beacon_ies); } WARN_ONCE(n_entries != rdev->bss_entries, "rdev bss entries[%d]/list[len:%d] corruption\n", rdev->bss_entries, n_entries); return true; } static void cfg80211_update_hidden_bsses(struct cfg80211_internal_bss *known, const struct cfg80211_bss_ies *new_ies, const struct cfg80211_bss_ies *old_ies) { struct cfg80211_internal_bss *bss; /* Assign beacon IEs to all sub entries */ list_for_each_entry(bss, &known->hidden_list, hidden_list) { const struct cfg80211_bss_ies *ies; ies = rcu_access_pointer(bss->pub.beacon_ies); WARN_ON(ies != old_ies); rcu_assign_pointer(bss->pub.beacon_ies, new_ies); } } static void cfg80211_check_stuck_ecsa(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *known, const struct cfg80211_bss_ies *old) { const struct ieee80211_ext_chansw_ie *ecsa; const struct element *elem_new, *elem_old; const struct cfg80211_bss_ies *new, *bcn; if (known->pub.proberesp_ecsa_stuck) return; new = rcu_dereference_protected(known->pub.proberesp_ies, lockdep_is_held(&rdev->bss_lock)); if (WARN_ON(!new)) return; if (new->tsf - old->tsf < USEC_PER_SEC) return; elem_old = cfg80211_find_elem(WLAN_EID_EXT_CHANSWITCH_ANN, old->data, old->len); if (!elem_old) return; elem_new = 
cfg80211_find_elem(WLAN_EID_EXT_CHANSWITCH_ANN, new->data, new->len); if (!elem_new) return; bcn = rcu_dereference_protected(known->pub.beacon_ies, lockdep_is_held(&rdev->bss_lock)); if (bcn && cfg80211_find_elem(WLAN_EID_EXT_CHANSWITCH_ANN, bcn->data, bcn->len)) return; if (elem_new->datalen != elem_old->datalen) return; if (elem_new->datalen < sizeof(struct ieee80211_ext_chansw_ie)) return; if (memcmp(elem_new->data, elem_old->data, elem_new->datalen)) return; ecsa = (void *)elem_new->data; if (!ecsa->mode) return; if (ecsa->new_ch_num != ieee80211_frequency_to_channel(known->pub.channel->center_freq)) return; known->pub.proberesp_ecsa_stuck = 1; } static bool cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *known, struct cfg80211_internal_bss *new, bool signal_valid) { lockdep_assert_held(&rdev->bss_lock); /* Update IEs */ if (rcu_access_pointer(new->pub.proberesp_ies)) { const struct cfg80211_bss_ies *old; old = rcu_access_pointer(known->pub.proberesp_ies); rcu_assign_pointer(known->pub.proberesp_ies, new->pub.proberesp_ies); /* Override possible earlier Beacon frame IEs */ rcu_assign_pointer(known->pub.ies, new->pub.proberesp_ies); if (old) { cfg80211_check_stuck_ecsa(rdev, known, old); kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); } } if (rcu_access_pointer(new->pub.beacon_ies)) { const struct cfg80211_bss_ies *old; if (known->pub.hidden_beacon_bss && !list_empty(&known->hidden_list)) { const struct cfg80211_bss_ies *f; /* The known BSS struct is one of the probe * response members of a group, but we're * receiving a beacon (beacon_ies in the new * bss is used). This can only mean that the * AP changed its beacon from not having an * SSID to showing it, which is confusing so * drop this information. */ f = rcu_access_pointer(new->pub.beacon_ies); kfree_rcu((struct cfg80211_bss_ies *)f, rcu_head); return false; } old = rcu_access_pointer(known->pub.beacon_ies); rcu_assign_pointer(known->pub.beacon_ies, new->pub.beacon_ies); /* Override IEs if they were from a beacon before */ if (old == rcu_access_pointer(known->pub.ies)) rcu_assign_pointer(known->pub.ies, new->pub.beacon_ies); cfg80211_update_hidden_bsses(known, rcu_access_pointer(new->pub.beacon_ies), old); if (old) kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); } known->pub.beacon_interval = new->pub.beacon_interval; /* don't update the signal if beacon was heard on * adjacent channel. */ if (signal_valid) known->pub.signal = new->pub.signal; known->pub.capability = new->pub.capability; known->ts = new->ts; known->pub.ts_boottime = new->pub.ts_boottime; known->parent_tsf = new->parent_tsf; known->pub.chains = new->pub.chains; memcpy(known->pub.chain_signal, new->pub.chain_signal, IEEE80211_MAX_CHAINS); ether_addr_copy(known->parent_bssid, new->parent_bssid); known->pub.max_bssid_indicator = new->pub.max_bssid_indicator; known->pub.bssid_index = new->pub.bssid_index; known->pub.use_for &= new->pub.use_for; known->pub.cannot_use_reasons = new->pub.cannot_use_reasons; known->bss_source = new->bss_source; return true; } /* Returned bss is reference counted and must be cleaned up appropriately. 
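* Typical caller pattern for the locked wrapper cfg80211_bss_update()
* below (illustrative sketch only):
*
*	res = cfg80211_bss_update(rdev, &tmp, signal_valid, ts);
*	if (res) {
*		// ... use res->pub ...
*		cfg80211_put_bss(&rdev->wiphy, &res->pub);
*	}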
*/ static struct cfg80211_internal_bss * __cfg80211_bss_update(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *tmp, bool signal_valid, unsigned long ts) { struct cfg80211_internal_bss *found = NULL; struct cfg80211_bss_ies *ies; if (WARN_ON(!tmp->pub.channel)) goto free_ies; tmp->ts = ts; if (WARN_ON(!rcu_access_pointer(tmp->pub.ies))) goto free_ies; found = rb_find_bss(rdev, tmp, BSS_CMP_REGULAR); if (found) { if (!cfg80211_update_known_bss(rdev, found, tmp, signal_valid)) return NULL; } else { struct cfg80211_internal_bss *new; struct cfg80211_internal_bss *hidden; /* * create a copy -- the "res" variable that is passed in * is allocated on the stack since it's not needed in the * more common case of an update */ new = kzalloc(sizeof(*new) + rdev->wiphy.bss_priv_size, GFP_ATOMIC); if (!new) goto free_ies; memcpy(new, tmp, sizeof(*new)); new->refcount = 1; INIT_LIST_HEAD(&new->hidden_list); INIT_LIST_HEAD(&new->pub.nontrans_list); /* we'll set this later if it was non-NULL */ new->pub.transmitted_bss = NULL; if (rcu_access_pointer(tmp->pub.proberesp_ies)) { hidden = rb_find_bss(rdev, tmp, BSS_CMP_HIDE_ZLEN); if (!hidden) hidden = rb_find_bss(rdev, tmp, BSS_CMP_HIDE_NUL); if (hidden) { new->pub.hidden_beacon_bss = &hidden->pub; list_add(&new->hidden_list, &hidden->hidden_list); hidden->refcount++; ies = (void *)rcu_access_pointer(new->pub.beacon_ies); rcu_assign_pointer(new->pub.beacon_ies, hidden->pub.beacon_ies); if (ies) kfree_rcu(ies, rcu_head); } } else { /* * Ok so we found a beacon, and don't have an entry. If * it's a beacon with hidden SSID, we might be in for an * expensive search for any probe responses that should * be grouped with this beacon for updates ... */ if (!cfg80211_combine_bsses(rdev, new)) { bss_ref_put(rdev, new); return NULL; } } if (rdev->bss_entries >= bss_entries_limit && !cfg80211_bss_expire_oldest(rdev)) { bss_ref_put(rdev, new); return NULL; } /* This must be before the call to bss_ref_get */ if (tmp->pub.transmitted_bss) { new->pub.transmitted_bss = tmp->pub.transmitted_bss; bss_ref_get(rdev, bss_from_pub(tmp->pub.transmitted_bss)); } cfg80211_insert_bss(rdev, new); found = new; } rdev->bss_generation++; bss_ref_get(rdev, found); return found; free_ies: ies = (void *)rcu_access_pointer(tmp->pub.beacon_ies); if (ies) kfree_rcu(ies, rcu_head); ies = (void *)rcu_access_pointer(tmp->pub.proberesp_ies); if (ies) kfree_rcu(ies, rcu_head); return NULL; } struct cfg80211_internal_bss * cfg80211_bss_update(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *tmp, bool signal_valid, unsigned long ts) { struct cfg80211_internal_bss *res; spin_lock_bh(&rdev->bss_lock); res = __cfg80211_bss_update(rdev, tmp, signal_valid, ts); spin_unlock_bh(&rdev->bss_lock); return res; } int cfg80211_get_ies_channel_number(const u8 *ie, size_t ielen, enum nl80211_band band) { const struct element *tmp; if (band == NL80211_BAND_6GHZ) { struct ieee80211_he_operation *he_oper; tmp = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_OPERATION, ie, ielen); if (tmp && tmp->datalen >= sizeof(*he_oper) && tmp->datalen >= ieee80211_he_oper_size(&tmp->data[1])) { const struct ieee80211_he_6ghz_oper *he_6ghz_oper; he_oper = (void *)&tmp->data[1]; he_6ghz_oper = ieee80211_he_6ghz_oper(he_oper); if (!he_6ghz_oper) return -1; return he_6ghz_oper->primary; } } else if (band == NL80211_BAND_S1GHZ) { tmp = cfg80211_find_elem(WLAN_EID_S1G_OPERATION, ie, ielen); if (tmp && tmp->datalen >= sizeof(struct ieee80211_s1g_oper_ie)) { struct ieee80211_s1g_oper_ie *s1gop = (void 
*)tmp->data; return s1gop->oper_ch; } } else { tmp = cfg80211_find_elem(WLAN_EID_DS_PARAMS, ie, ielen); if (tmp && tmp->datalen == 1) return tmp->data[0]; tmp = cfg80211_find_elem(WLAN_EID_HT_OPERATION, ie, ielen); if (tmp && tmp->datalen >= sizeof(struct ieee80211_ht_operation)) { struct ieee80211_ht_operation *htop = (void *)tmp->data; return htop->primary_chan; } } return -1; } EXPORT_SYMBOL(cfg80211_get_ies_channel_number); /* * Update RX channel information based on the available frame payload * information. This is mainly for the 2.4 GHz band where frames can be received * from neighboring channels and the Beacon frames use the DSSS Parameter Set * element to indicate the current (transmitting) channel, but this might also * be needed on other bands if the RX frequency does not match the actual * operating channel of a BSS, or if the AP reports a different primary channel. */ static struct ieee80211_channel * cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, struct ieee80211_channel *channel) { u32 freq; int channel_number; struct ieee80211_channel *alt_channel; channel_number = cfg80211_get_ies_channel_number(ie, ielen, channel->band); if (channel_number < 0) { /* No channel information in frame payload */ return channel; } freq = ieee80211_channel_to_freq_khz(channel_number, channel->band); /* * Frame info (beacon/probe resp) indicates the same channel as the one * the frame was received on; no need for further processing. */ if (freq == ieee80211_channel_to_khz(channel)) return channel; alt_channel = ieee80211_get_channel_khz(wiphy, freq); if (!alt_channel) { if (channel->band == NL80211_BAND_2GHZ || channel->band == NL80211_BAND_6GHZ) { /* * Better not allow unexpected channels when that could * be going beyond the 1-11 range (e.g., discovering * BSS on channel 12 when radio is configured for * channel 11) or beyond the 6 GHz channel range. */ return NULL; } /* No match for the payload channel number - ignore it */ return channel; } /* * Use the channel determined through the payload channel number * instead of the RX channel reported by the driver. 
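* Worked example: on 2.4 GHz, a beacon transmitted on channel 3 can
* often be decoded while the radio listens on channel 1; such a
* frame arrives with an RX channel of 1 while its DSSS Parameter Set
* element says 3, so the BSS is filed under channel 3 rather than
* under the receive channel.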
*/ if (alt_channel->flags & IEEE80211_CHAN_DISABLED) return NULL; return alt_channel; } struct cfg80211_inform_single_bss_data { struct cfg80211_inform_bss *drv_data; enum cfg80211_bss_frame_type ftype; struct ieee80211_channel *channel; u8 bssid[ETH_ALEN]; u64 tsf; u16 capability; u16 beacon_interval; const u8 *ie; size_t ielen; enum bss_source_type bss_source; /* Set if reporting bss_source != BSS_SOURCE_DIRECT */ struct cfg80211_bss *source_bss; u8 max_bssid_indicator; u8 bssid_index; u8 use_for; u64 cannot_use_reasons; }; enum ieee80211_ap_reg_power cfg80211_get_6ghz_power_type(const u8 *elems, size_t elems_len) { const struct ieee80211_he_6ghz_oper *he_6ghz_oper; struct ieee80211_he_operation *he_oper; const struct element *tmp; tmp = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_OPERATION, elems, elems_len); if (!tmp || tmp->datalen < sizeof(*he_oper) + 1 || tmp->datalen < ieee80211_he_oper_size(tmp->data + 1)) return IEEE80211_REG_UNSET_AP; he_oper = (void *)&tmp->data[1]; he_6ghz_oper = ieee80211_he_6ghz_oper(he_oper); if (!he_6ghz_oper) return IEEE80211_REG_UNSET_AP; switch (u8_get_bits(he_6ghz_oper->control, IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { case IEEE80211_6GHZ_CTRL_REG_LPI_AP: case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP: return IEEE80211_REG_LPI_AP; case IEEE80211_6GHZ_CTRL_REG_SP_AP: case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP: return IEEE80211_REG_SP_AP; case IEEE80211_6GHZ_CTRL_REG_VLP_AP: return IEEE80211_REG_VLP_AP; default: return IEEE80211_REG_UNSET_AP; } } static bool cfg80211_6ghz_power_type_valid(const u8 *elems, size_t elems_len, const u32 flags) { switch (cfg80211_get_6ghz_power_type(elems, elems_len)) { case IEEE80211_REG_LPI_AP: return true; case IEEE80211_REG_SP_AP: return !(flags & IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT); case IEEE80211_REG_VLP_AP: return !(flags & IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT); default: return false; } } /* Returned bss is reference counted and must be cleaned up appropriately. 
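* On the preceding 6 GHz power-type helpers: an LPI AP is always
* acceptable, an SP AP only if the channel lacks
* IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT, and a VLP AP only if it lacks
* IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT; any other (or missing) power
* type makes the BSS unusable, reported below via
* NL80211_BSS_CANNOT_USE_6GHZ_PWR_MISMATCH.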
*/ static struct cfg80211_bss * cfg80211_inform_single_bss_data(struct wiphy *wiphy, struct cfg80211_inform_single_bss_data *data, gfp_t gfp) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_inform_bss *drv_data = data->drv_data; struct cfg80211_bss_ies *ies; struct ieee80211_channel *channel; struct cfg80211_internal_bss tmp = {}, *res; int bss_type; bool signal_valid; unsigned long ts; if (WARN_ON(!wiphy)) return NULL; if (WARN_ON(wiphy->signal_type == CFG80211_SIGNAL_TYPE_UNSPEC && (drv_data->signal < 0 || drv_data->signal > 100))) return NULL; if (WARN_ON(data->bss_source != BSS_SOURCE_DIRECT && !data->source_bss)) return NULL; channel = data->channel; if (!channel) channel = cfg80211_get_bss_channel(wiphy, data->ie, data->ielen, drv_data->chan); if (!channel) return NULL; if (channel->band == NL80211_BAND_6GHZ && !cfg80211_6ghz_power_type_valid(data->ie, data->ielen, channel->flags)) { data->use_for = 0; data->cannot_use_reasons = NL80211_BSS_CANNOT_USE_6GHZ_PWR_MISMATCH; } memcpy(tmp.pub.bssid, data->bssid, ETH_ALEN); tmp.pub.channel = channel; if (data->bss_source != BSS_SOURCE_STA_PROFILE) tmp.pub.signal = drv_data->signal; else tmp.pub.signal = 0; tmp.pub.beacon_interval = data->beacon_interval; tmp.pub.capability = data->capability; tmp.pub.ts_boottime = drv_data->boottime_ns; tmp.parent_tsf = drv_data->parent_tsf; ether_addr_copy(tmp.parent_bssid, drv_data->parent_bssid); tmp.pub.chains = drv_data->chains; memcpy(tmp.pub.chain_signal, drv_data->chain_signal, IEEE80211_MAX_CHAINS); tmp.pub.use_for = data->use_for; tmp.pub.cannot_use_reasons = data->cannot_use_reasons; tmp.bss_source = data->bss_source; switch (data->bss_source) { case BSS_SOURCE_MBSSID: tmp.pub.transmitted_bss = data->source_bss; fallthrough; case BSS_SOURCE_STA_PROFILE: ts = bss_from_pub(data->source_bss)->ts; tmp.pub.bssid_index = data->bssid_index; tmp.pub.max_bssid_indicator = data->max_bssid_indicator; break; case BSS_SOURCE_DIRECT: ts = jiffies; if (channel->band == NL80211_BAND_60GHZ) { bss_type = data->capability & WLAN_CAPABILITY_DMG_TYPE_MASK; if (bss_type == WLAN_CAPABILITY_DMG_TYPE_AP || bss_type == WLAN_CAPABILITY_DMG_TYPE_PBSS) regulatory_hint_found_beacon(wiphy, channel, gfp); } else { if (data->capability & WLAN_CAPABILITY_ESS) regulatory_hint_found_beacon(wiphy, channel, gfp); } break; } /* * If we do not know here whether the IEs are from a Beacon or Probe * Response frame, we need to pick one of the options and only use it * with the driver that does not provide the full Beacon/Probe Response * frame. Use Beacon frame pointer to avoid indicating that this should * override the IEs pointer should we have received an earlier * indication of Probe Response data. 
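* In table form, the frame type handled below selects the RCU
* pointers as follows:
*
*	ftype			pointers set
*	BEACON, S1G_BEACON	beacon_ies (from_beacon = true), ies
*	UNKNOWN			beacon_ies, ies
*	PRESP			proberesp_ies, ies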
*/ ies = kzalloc(sizeof(*ies) + data->ielen, gfp); if (!ies) return NULL; ies->len = data->ielen; ies->tsf = data->tsf; ies->from_beacon = false; memcpy(ies->data, data->ie, data->ielen); switch (data->ftype) { case CFG80211_BSS_FTYPE_BEACON: case CFG80211_BSS_FTYPE_S1G_BEACON: ies->from_beacon = true; fallthrough; case CFG80211_BSS_FTYPE_UNKNOWN: rcu_assign_pointer(tmp.pub.beacon_ies, ies); break; case CFG80211_BSS_FTYPE_PRESP: rcu_assign_pointer(tmp.pub.proberesp_ies, ies); break; } rcu_assign_pointer(tmp.pub.ies, ies); signal_valid = drv_data->chan == channel; spin_lock_bh(&rdev->bss_lock); res = __cfg80211_bss_update(rdev, &tmp, signal_valid, ts); if (!res) goto drop; rdev_inform_bss(rdev, &res->pub, ies, drv_data->drv_data); if (data->bss_source == BSS_SOURCE_MBSSID) { /* this is a nontransmitting bss, we need to add it to * transmitting bss' list if it is not there */ if (cfg80211_add_nontrans_list(data->source_bss, &res->pub)) { if (__cfg80211_unlink_bss(rdev, res)) { rdev->bss_generation++; res = NULL; } } if (!res) goto drop; } spin_unlock_bh(&rdev->bss_lock); trace_cfg80211_return_bss(&res->pub); /* __cfg80211_bss_update gives us a referenced result */ return &res->pub; drop: spin_unlock_bh(&rdev->bss_lock); return NULL; } static const struct element *cfg80211_get_profile_continuation(const u8 *ie, size_t ielen, const struct element *mbssid_elem, const struct element *sub_elem) { const u8 *mbssid_end = mbssid_elem->data + mbssid_elem->datalen; const struct element *next_mbssid; const struct element *next_sub; next_mbssid = cfg80211_find_elem(WLAN_EID_MULTIPLE_BSSID, mbssid_end, ielen - (mbssid_end - ie)); /* * If it is not the last subelement in current MBSSID IE or there isn't * a next MBSSID IE - profile is complete. */ if ((sub_elem->data + sub_elem->datalen < mbssid_end - 1) || !next_mbssid) return NULL; /* For any length error, just return NULL */ if (next_mbssid->datalen < 4) return NULL; next_sub = (void *)&next_mbssid->data[1]; if (next_mbssid->data + next_mbssid->datalen < next_sub->data + next_sub->datalen) return NULL; if (next_sub->id != 0 || next_sub->datalen < 2) return NULL; /* * Check if the first element in the next sub element is a start * of a new profile */ return next_sub->data[0] == WLAN_EID_NON_TX_BSSID_CAP ? 
NULL : next_mbssid; } size_t cfg80211_merge_profile(const u8 *ie, size_t ielen, const struct element *mbssid_elem, const struct element *sub_elem, u8 *merged_ie, size_t max_copy_len) { size_t copied_len = sub_elem->datalen; const struct element *next_mbssid; if (sub_elem->datalen > max_copy_len) return 0; memcpy(merged_ie, sub_elem->data, sub_elem->datalen); while ((next_mbssid = cfg80211_get_profile_continuation(ie, ielen, mbssid_elem, sub_elem))) { const struct element *next_sub = (void *)&next_mbssid->data[1]; if (copied_len + next_sub->datalen > max_copy_len) break; memcpy(merged_ie + copied_len, next_sub->data, next_sub->datalen); copied_len += next_sub->datalen; } return copied_len; } EXPORT_SYMBOL(cfg80211_merge_profile); static void cfg80211_parse_mbssid_data(struct wiphy *wiphy, struct cfg80211_inform_single_bss_data *tx_data, struct cfg80211_bss *source_bss, gfp_t gfp) { struct cfg80211_inform_single_bss_data data = { .drv_data = tx_data->drv_data, .ftype = tx_data->ftype, .tsf = tx_data->tsf, .beacon_interval = tx_data->beacon_interval, .source_bss = source_bss, .bss_source = BSS_SOURCE_MBSSID, .use_for = tx_data->use_for, .cannot_use_reasons = tx_data->cannot_use_reasons, }; const u8 *mbssid_index_ie; const struct element *elem, *sub; u8 *new_ie, *profile; u64 seen_indices = 0; struct cfg80211_bss *bss; if (!source_bss) return; if (!cfg80211_find_elem(WLAN_EID_MULTIPLE_BSSID, tx_data->ie, tx_data->ielen)) return; if (!wiphy->support_mbssid) return; if (wiphy->support_only_he_mbssid && !cfg80211_find_ext_elem(WLAN_EID_EXT_HE_CAPABILITY, tx_data->ie, tx_data->ielen)) return; new_ie = kmalloc(IEEE80211_MAX_DATA_LEN, gfp); if (!new_ie) return; profile = kmalloc(tx_data->ielen, gfp); if (!profile) goto out; for_each_element_id(elem, WLAN_EID_MULTIPLE_BSSID, tx_data->ie, tx_data->ielen) { if (elem->datalen < 4) continue; if (elem->data[0] < 1 || (int)elem->data[0] > 8) continue; for_each_element(sub, elem->data + 1, elem->datalen - 1) { u8 profile_len; if (sub->id != 0 || sub->datalen < 4) { /* not a valid BSS profile */ continue; } if (sub->data[0] != WLAN_EID_NON_TX_BSSID_CAP || sub->data[1] != 2) { /* The first element within the Nontransmitted * BSSID Profile is not the Nontransmitted * BSSID Capability element. 
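* (For valid profiles handled below, the reference BSSID computed by
* cfg80211_gen_new_bssid() keeps all but the low "MaxBSSID indicator"
* bits of the transmitted BSSID and adds the BSSID index within that
* low bit-field, modulo its width: e.g. transmitted BSSID
* 00:11:22:33:44:40 with indicator 2 -- a set of up to 4 BSSIDs --
* and index 1 yields nontransmitted BSSID 00:11:22:33:44:41.)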
*/ continue; } memset(profile, 0, tx_data->ielen); profile_len = cfg80211_merge_profile(tx_data->ie, tx_data->ielen, elem, sub, profile, tx_data->ielen); /* found a Nontransmitted BSSID Profile */ mbssid_index_ie = cfg80211_find_ie (WLAN_EID_MULTI_BSSID_IDX, profile, profile_len); if (!mbssid_index_ie || mbssid_index_ie[1] < 1 || mbssid_index_ie[2] == 0 || mbssid_index_ie[2] > 46 || mbssid_index_ie[2] >= (1 << elem->data[0])) { /* No valid Multiple BSSID-Index element */ continue; } if (seen_indices & BIT_ULL(mbssid_index_ie[2])) /* We don't support legacy split of a profile */ net_dbg_ratelimited("Partial info for BSSID index %d\n", mbssid_index_ie[2]); seen_indices |= BIT_ULL(mbssid_index_ie[2]); data.bssid_index = mbssid_index_ie[2]; data.max_bssid_indicator = elem->data[0]; cfg80211_gen_new_bssid(tx_data->bssid, data.max_bssid_indicator, data.bssid_index, data.bssid); memset(new_ie, 0, IEEE80211_MAX_DATA_LEN); data.ie = new_ie; data.ielen = cfg80211_gen_new_ie(tx_data->ie, tx_data->ielen, profile, profile_len, new_ie, IEEE80211_MAX_DATA_LEN); if (!data.ielen) continue; data.capability = get_unaligned_le16(profile + 2); bss = cfg80211_inform_single_bss_data(wiphy, &data, gfp); if (!bss) break; cfg80211_put_bss(wiphy, bss); } } out: kfree(new_ie); kfree(profile); } ssize_t cfg80211_defragment_element(const struct element *elem, const u8 *ies, size_t ieslen, u8 *data, size_t data_len, u8 frag_id) { const struct element *next; ssize_t copied; u8 elem_datalen; if (!elem) return -EINVAL; /* elem might be invalid after the memmove */ next = (void *)(elem->data + elem->datalen); elem_datalen = elem->datalen; if (elem->id == WLAN_EID_EXTENSION) { copied = elem->datalen - 1; if (data) { if (copied > data_len) return -ENOSPC; memmove(data, elem->data + 1, copied); } } else { copied = elem->datalen; if (data) { if (copied > data_len) return -ENOSPC; memmove(data, elem->data, copied); } } /* Fragmented elements must have 255 bytes */ if (elem_datalen < 255) return copied; for (elem = next; elem->data < ies + ieslen && elem->data + elem->datalen <= ies + ieslen; elem = next) { /* elem might be invalid after the memmove */ next = (void *)(elem->data + elem->datalen); if (elem->id != frag_id) break; elem_datalen = elem->datalen; if (data) { if (copied + elem_datalen > data_len) return -ENOSPC; memmove(data + copied, elem->data, elem_datalen); } copied += elem_datalen; /* Only the last fragment may be short */ if (elem_datalen != 255) break; } return copied; } EXPORT_SYMBOL(cfg80211_defragment_element); struct cfg80211_mle { struct ieee80211_multi_link_elem *mle; struct ieee80211_mle_per_sta_profile *sta_prof[IEEE80211_MLD_MAX_NUM_LINKS]; ssize_t sta_prof_len[IEEE80211_MLD_MAX_NUM_LINKS]; u8 data[]; }; static struct cfg80211_mle * cfg80211_defrag_mle(const struct element *mle, const u8 *ie, size_t ielen, gfp_t gfp) { const struct element *elem; struct cfg80211_mle *res; size_t buf_len; ssize_t mle_len; u8 common_size, idx; if (!mle || !ieee80211_mle_size_ok(mle->data + 1, mle->datalen - 1)) return NULL; /* Required length for first defragmentation */ buf_len = mle->datalen - 1; for_each_element(elem, mle->data + mle->datalen, ie + ielen - mle->data - mle->datalen) { if (elem->id != WLAN_EID_FRAGMENT) break; buf_len += elem->datalen; } res = kzalloc(struct_size(res, data, buf_len), gfp); if (!res) return NULL; mle_len = cfg80211_defragment_element(mle, ie, ielen, res->data, buf_len, WLAN_EID_FRAGMENT); if (mle_len < 0) goto error; res->mle = (void *)res->data; /* Find the sub-element area in the buffer 
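* For reference, cfg80211_defragment_element() used above is the
* generic exported helper; a standalone caller follows the same
* pattern -- sketch, with "buf"/"buf_len" being a caller-provided
* output buffer:
*
*	ssize_t copied;
*
*	copied = cfg80211_defragment_element(elem, ies, ieslen,
*					     buf, buf_len,
*					     WLAN_EID_FRAGMENT);
*	if (copied < 0)
*		return copied;	// -EINVAL or -ENOSPC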
*/ common_size = ieee80211_mle_common_size((u8 *)res->mle); ie = res->data + common_size; ielen = mle_len - common_size; idx = 0; for_each_element_id(elem, IEEE80211_MLE_SUBELEM_PER_STA_PROFILE, ie, ielen) { res->sta_prof[idx] = (void *)elem->data; res->sta_prof_len[idx] = elem->datalen; idx++; if (idx >= IEEE80211_MLD_MAX_NUM_LINKS) break; } if (!for_each_element_completed(elem, ie, ielen)) goto error; /* Defragment sta_info in-place */ for (idx = 0; idx < IEEE80211_MLD_MAX_NUM_LINKS && res->sta_prof[idx]; idx++) { if (res->sta_prof_len[idx] < 255) continue; elem = (void *)res->sta_prof[idx] - 2; if (idx + 1 < ARRAY_SIZE(res->sta_prof) && res->sta_prof[idx + 1]) buf_len = (u8 *)res->sta_prof[idx + 1] - (u8 *)res->sta_prof[idx]; else buf_len = ielen + ie - (u8 *)elem; res->sta_prof_len[idx] = cfg80211_defragment_element(elem, (u8 *)elem, buf_len, (u8 *)res->sta_prof[idx], buf_len, IEEE80211_MLE_SUBELEM_FRAGMENT); if (res->sta_prof_len[idx] < 0) goto error; } return res; error: kfree(res); return NULL; } struct tbtt_info_iter_data { const struct ieee80211_neighbor_ap_info *ap_info; u8 param_ch_count; u32 use_for; u8 mld_id, link_id; bool non_tx; }; static enum cfg80211_rnr_iter_ret cfg80211_mld_ap_rnr_iter(void *_data, u8 type, const struct ieee80211_neighbor_ap_info *info, const u8 *tbtt_info, u8 tbtt_info_len) { const struct ieee80211_rnr_mld_params *mld_params; struct tbtt_info_iter_data *data = _data; u8 link_id; bool non_tx = false; if (type == IEEE80211_TBTT_INFO_TYPE_TBTT && tbtt_info_len >= offsetofend(struct ieee80211_tbtt_info_ge_11, mld_params)) { const struct ieee80211_tbtt_info_ge_11 *tbtt_info_ge_11 = (void *)tbtt_info; non_tx = (tbtt_info_ge_11->bss_params & (IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID | IEEE80211_RNR_TBTT_PARAMS_TRANSMITTED_BSSID)) == IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID; mld_params = &tbtt_info_ge_11->mld_params; } else if (type == IEEE80211_TBTT_INFO_TYPE_MLD && tbtt_info_len >= sizeof(struct ieee80211_rnr_mld_params)) mld_params = (void *)tbtt_info; else return RNR_ITER_CONTINUE; link_id = le16_get_bits(mld_params->params, IEEE80211_RNR_MLD_PARAMS_LINK_ID); if (data->mld_id != mld_params->mld_id) return RNR_ITER_CONTINUE; if (data->link_id != link_id) return RNR_ITER_CONTINUE; data->ap_info = info; data->param_ch_count = le16_get_bits(mld_params->params, IEEE80211_RNR_MLD_PARAMS_BSS_CHANGE_COUNT); data->non_tx = non_tx; if (type == IEEE80211_TBTT_INFO_TYPE_TBTT) data->use_for = NL80211_BSS_USE_FOR_ALL; else data->use_for = NL80211_BSS_USE_FOR_MLD_LINK; return RNR_ITER_BREAK; } static u8 cfg80211_rnr_info_for_mld_ap(const u8 *ie, size_t ielen, u8 mld_id, u8 link_id, const struct ieee80211_neighbor_ap_info **ap_info, u8 *param_ch_count, bool *non_tx) { struct tbtt_info_iter_data data = { .mld_id = mld_id, .link_id = link_id, }; cfg80211_iter_rnr(ie, ielen, cfg80211_mld_ap_rnr_iter, &data); *ap_info = data.ap_info; *param_ch_count = data.param_ch_count; *non_tx = data.non_tx; return data.use_for; } static struct element * cfg80211_gen_reporter_rnr(struct cfg80211_bss *source_bss, bool is_mbssid, bool same_mld, u8 link_id, u8 bss_change_count, gfp_t gfp) { const struct cfg80211_bss_ies *ies; struct ieee80211_neighbor_ap_info ap_info; struct ieee80211_tbtt_info_ge_11 tbtt_info; u32 short_ssid; const struct element *elem; struct element *res; /* * We only generate the RNR to permit ML lookups. For that we do not * need an entry for the corresponding transmitting BSS, let's just skip * it even though it would be easy to add. 
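* The element generated below thus has a fixed layout: the
* WLAN_EID_REDUCED_NEIGHBOR_REPORT header, one struct
* ieee80211_neighbor_ap_info describing a single TBTT entry, and one
* struct ieee80211_tbtt_info_ge_11 truncated right after its
* mld_params field.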
*/ if (!same_mld) return NULL; /* We could use tx_data->ies if we change cfg80211_calc_short_ssid */ rcu_read_lock(); ies = rcu_dereference(source_bss->ies); ap_info.tbtt_info_len = offsetofend(typeof(tbtt_info), mld_params); ap_info.tbtt_info_hdr = u8_encode_bits(IEEE80211_TBTT_INFO_TYPE_TBTT, IEEE80211_AP_INFO_TBTT_HDR_TYPE) | u8_encode_bits(0, IEEE80211_AP_INFO_TBTT_HDR_COUNT); ap_info.channel = ieee80211_frequency_to_channel(source_bss->channel->center_freq); /* operating class */ elem = cfg80211_find_elem(WLAN_EID_SUPPORTED_REGULATORY_CLASSES, ies->data, ies->len); if (elem && elem->datalen >= 1) { ap_info.op_class = elem->data[0]; } else { struct cfg80211_chan_def chandef; /* The AP is not providing us with anything to work with. So * make up a somewhat reasonable operating class, but don't * bother with it too much as no one will ever use the * information. */ cfg80211_chandef_create(&chandef, source_bss->channel, NL80211_CHAN_NO_HT); if (!ieee80211_chandef_to_operating_class(&chandef, &ap_info.op_class)) goto out_unlock; } /* Just set TBTT offset and PSD 20 to invalid/unknown */ tbtt_info.tbtt_offset = 255; tbtt_info.psd_20 = IEEE80211_RNR_TBTT_PARAMS_PSD_RESERVED; memcpy(tbtt_info.bssid, source_bss->bssid, ETH_ALEN); if (cfg80211_calc_short_ssid(ies, &elem, &short_ssid)) goto out_unlock; rcu_read_unlock(); tbtt_info.short_ssid = cpu_to_le32(short_ssid); tbtt_info.bss_params = IEEE80211_RNR_TBTT_PARAMS_SAME_SSID; if (is_mbssid) { tbtt_info.bss_params |= IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID; tbtt_info.bss_params |= IEEE80211_RNR_TBTT_PARAMS_TRANSMITTED_BSSID; } tbtt_info.mld_params.mld_id = 0; tbtt_info.mld_params.params = le16_encode_bits(link_id, IEEE80211_RNR_MLD_PARAMS_LINK_ID) | le16_encode_bits(bss_change_count, IEEE80211_RNR_MLD_PARAMS_BSS_CHANGE_COUNT); res = kzalloc(struct_size(res, data, sizeof(ap_info) + ap_info.tbtt_info_len), gfp); if (!res) return NULL; /* Copy the data */ res->id = WLAN_EID_REDUCED_NEIGHBOR_REPORT; res->datalen = sizeof(ap_info) + ap_info.tbtt_info_len; memcpy(res->data, &ap_info, sizeof(ap_info)); memcpy(res->data + sizeof(ap_info), &tbtt_info, ap_info.tbtt_info_len); return res; out_unlock: rcu_read_unlock(); return NULL; } static void cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy, struct cfg80211_inform_single_bss_data *tx_data, struct cfg80211_bss *source_bss, const struct element *elem, gfp_t gfp) { struct cfg80211_inform_single_bss_data data = { .drv_data = tx_data->drv_data, .ftype = tx_data->ftype, .source_bss = source_bss, .bss_source = BSS_SOURCE_STA_PROFILE, }; struct element *reporter_rnr = NULL; struct ieee80211_multi_link_elem *ml_elem; struct cfg80211_mle *mle; const struct element *ssid_elem; const u8 *ssid = NULL; size_t ssid_len = 0; u16 control; u8 ml_common_len; u8 *new_ie = NULL; struct cfg80211_bss *bss; u8 mld_id, reporter_link_id, bss_change_count; u16 seen_links = 0; u8 i; if (!ieee80211_mle_type_ok(elem->data + 1, IEEE80211_ML_CONTROL_TYPE_BASIC, elem->datalen - 1)) return; ml_elem = (void *)(elem->data + 1); control = le16_to_cpu(ml_elem->control); ml_common_len = ml_elem->variable[0]; /* Must be present when transmitted by an AP (in a probe response) */ if (!(control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) || !(control & IEEE80211_MLC_BASIC_PRES_LINK_ID) || !(control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP)) return; reporter_link_id = ieee80211_mle_get_link_id(elem->data + 1); bss_change_count = ieee80211_mle_get_bss_param_ch_cnt(elem->data + 1); /* * The MLD ID of the reporting AP is always zero. 
It is set if the AP * is part of an MBSSID set and will be non-zero for ML Elements * relating to a nontransmitted BSS (matching the Multi-BSSID Index, * Draft P802.11be_D3.2, 35.3.4.2) */ mld_id = ieee80211_mle_get_mld_id(elem->data + 1); /* Fully defrag the ML element for sta information/profile iteration */ mle = cfg80211_defrag_mle(elem, tx_data->ie, tx_data->ielen, gfp); if (!mle) return; /* No point in doing anything if there is no per-STA profile */ if (!mle->sta_prof[0]) goto out; new_ie = kmalloc(IEEE80211_MAX_DATA_LEN, gfp); if (!new_ie) goto out; reporter_rnr = cfg80211_gen_reporter_rnr(source_bss, u16_get_bits(control, IEEE80211_MLC_BASIC_PRES_MLD_ID), mld_id == 0, reporter_link_id, bss_change_count, gfp); ssid_elem = cfg80211_find_elem(WLAN_EID_SSID, tx_data->ie, tx_data->ielen); if (ssid_elem) { ssid = ssid_elem->data; ssid_len = ssid_elem->datalen; } for (i = 0; i < ARRAY_SIZE(mle->sta_prof) && mle->sta_prof[i]; i++) { const struct ieee80211_neighbor_ap_info *ap_info; enum nl80211_band band; u32 freq; const u8 *profile; ssize_t profile_len; u8 param_ch_count; u8 link_id, use_for; bool non_tx; if (!ieee80211_mle_basic_sta_prof_size_ok((u8 *)mle->sta_prof[i], mle->sta_prof_len[i])) continue; control = le16_to_cpu(mle->sta_prof[i]->control); if (!(control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE)) continue; link_id = u16_get_bits(control, IEEE80211_MLE_STA_CONTROL_LINK_ID); if (seen_links & BIT(link_id)) break; seen_links |= BIT(link_id); if (!(control & IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT) || !(control & IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT) || !(control & IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT)) continue; memcpy(data.bssid, mle->sta_prof[i]->variable, ETH_ALEN); data.beacon_interval = get_unaligned_le16(mle->sta_prof[i]->variable + 6); data.tsf = tx_data->tsf + get_unaligned_le64(mle->sta_prof[i]->variable + 8); /* sta_info_len counts itself */ profile = mle->sta_prof[i]->variable + mle->sta_prof[i]->sta_info_len - 1; profile_len = (u8 *)mle->sta_prof[i] + mle->sta_prof_len[i] - profile; if (profile_len < 2) continue; data.capability = get_unaligned_le16(profile); profile += 2; profile_len -= 2; /* Find in RNR to look up channel information */ use_for = cfg80211_rnr_info_for_mld_ap(tx_data->ie, tx_data->ielen, mld_id, link_id, &ap_info, &param_ch_count, &non_tx); if (!use_for) continue; /* * As of 802.11be_D5.0, the specification does not give us any * way of discovering both the MaxBSSID and the Multiple-BSSID * Index. It does seem like the Multiple-BSSID Index element * may be provided, but section 9.4.2.45 explicitly forbids * including a Multiple-BSSID Element (in this case without any * subelements). * Without both pieces of information we cannot calculate the * reference BSSID, so simply ignore the BSS. */ if (non_tx) continue; /* We could sanity check the BSSID is included */ if (!ieee80211_operating_class_to_band(ap_info->op_class, &band)) continue; freq = ieee80211_channel_to_freq_khz(ap_info->channel, band); data.channel = ieee80211_get_channel_khz(wiphy, freq); /* Skip if RNR element specifies an unsupported channel */ if (!data.channel) continue; /* Skip if BSS entry generated from MBSSID or DIRECT source * frame data available already. 
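* Note that cfg80211_get_bss() returns a referenced entry, which is
* why both branches below drop it again with cfg80211_put_bss().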
*/ bss = cfg80211_get_bss(wiphy, data.channel, data.bssid, ssid, ssid_len, IEEE80211_BSS_TYPE_ANY, IEEE80211_PRIVACY_ANY); if (bss) { struct cfg80211_internal_bss *ibss = bss_from_pub(bss); if (data.capability == bss->capability && ibss->bss_source != BSS_SOURCE_STA_PROFILE) { cfg80211_put_bss(wiphy, bss); continue; } cfg80211_put_bss(wiphy, bss); } if (use_for == NL80211_BSS_USE_FOR_MLD_LINK && !(wiphy->flags & WIPHY_FLAG_SUPPORTS_NSTR_NONPRIMARY)) { use_for = 0; data.cannot_use_reasons = NL80211_BSS_CANNOT_USE_NSTR_NONPRIMARY; } data.use_for = use_for; /* Generate new elements */ memset(new_ie, 0, IEEE80211_MAX_DATA_LEN); data.ie = new_ie; data.ielen = cfg80211_gen_new_ie(tx_data->ie, tx_data->ielen, profile, profile_len, new_ie, IEEE80211_MAX_DATA_LEN); if (!data.ielen) continue; /* The generated elements do not contain: * - Basic ML element * - A TBTT entry in the RNR for the transmitting AP * * This information is needed both internally and in userspace * as such, we should append it here. */ if (data.ielen + 3 + sizeof(*ml_elem) + ml_common_len > IEEE80211_MAX_DATA_LEN) continue; /* Copy the Basic Multi-Link element including the common * information, and then fix up the link ID and BSS param * change count. * Note that the ML element length has been verified and we * also checked that it contains the link ID. */ new_ie[data.ielen++] = WLAN_EID_EXTENSION; new_ie[data.ielen++] = 1 + sizeof(*ml_elem) + ml_common_len; new_ie[data.ielen++] = WLAN_EID_EXT_EHT_MULTI_LINK; memcpy(new_ie + data.ielen, ml_elem, sizeof(*ml_elem) + ml_common_len); new_ie[data.ielen + sizeof(*ml_elem) + 1 + ETH_ALEN] = link_id; new_ie[data.ielen + sizeof(*ml_elem) + 1 + ETH_ALEN + 1] = param_ch_count; data.ielen += sizeof(*ml_elem) + ml_common_len; if (reporter_rnr && (use_for & NL80211_BSS_USE_FOR_NORMAL)) { if (data.ielen + sizeof(struct element) + reporter_rnr->datalen > IEEE80211_MAX_DATA_LEN) continue; memcpy(new_ie + data.ielen, reporter_rnr, sizeof(struct element) + reporter_rnr->datalen); data.ielen += sizeof(struct element) + reporter_rnr->datalen; } bss = cfg80211_inform_single_bss_data(wiphy, &data, gfp); if (!bss) break; cfg80211_put_bss(wiphy, bss); } out: kfree(reporter_rnr); kfree(new_ie); kfree(mle); } static void cfg80211_parse_ml_sta_data(struct wiphy *wiphy, struct cfg80211_inform_single_bss_data *tx_data, struct cfg80211_bss *source_bss, gfp_t gfp) { const struct element *elem; if (!source_bss) return; if (tx_data->ftype != CFG80211_BSS_FTYPE_PRESP) return; for_each_element_extid(elem, WLAN_EID_EXT_EHT_MULTI_LINK, tx_data->ie, tx_data->ielen) cfg80211_parse_ml_elem_sta_data(wiphy, tx_data, source_bss, elem, gfp); } struct cfg80211_bss * cfg80211_inform_bss_data(struct wiphy *wiphy, struct cfg80211_inform_bss *data, enum cfg80211_bss_frame_type ftype, const u8 *bssid, u64 tsf, u16 capability, u16 beacon_interval, const u8 *ie, size_t ielen, gfp_t gfp) { struct cfg80211_inform_single_bss_data inform_data = { .drv_data = data, .ftype = ftype, .tsf = tsf, .capability = capability, .beacon_interval = beacon_interval, .ie = ie, .ielen = ielen, .use_for = data->restrict_use ? 
data->use_for : NL80211_BSS_USE_FOR_ALL, .cannot_use_reasons = data->cannot_use_reasons, }; struct cfg80211_bss *res; memcpy(inform_data.bssid, bssid, ETH_ALEN); res = cfg80211_inform_single_bss_data(wiphy, &inform_data, gfp); if (!res) return NULL; /* don't do any further MBSSID/ML handling for S1G */ if (ftype == CFG80211_BSS_FTYPE_S1G_BEACON) return res; cfg80211_parse_mbssid_data(wiphy, &inform_data, res, gfp); cfg80211_parse_ml_sta_data(wiphy, &inform_data, res, gfp); return res; } EXPORT_SYMBOL(cfg80211_inform_bss_data); struct cfg80211_bss * cfg80211_inform_bss_frame_data(struct wiphy *wiphy, struct cfg80211_inform_bss *data, struct ieee80211_mgmt *mgmt, size_t len, gfp_t gfp) { size_t min_hdr_len; struct ieee80211_ext *ext = NULL; enum cfg80211_bss_frame_type ftype; u16 beacon_interval; const u8 *bssid; u16 capability; const u8 *ie; size_t ielen; u64 tsf; if (WARN_ON(!mgmt)) return NULL; if (WARN_ON(!wiphy)) return NULL; BUILD_BUG_ON(offsetof(struct ieee80211_mgmt, u.probe_resp.variable) != offsetof(struct ieee80211_mgmt, u.beacon.variable)); trace_cfg80211_inform_bss_frame(wiphy, data, mgmt, len); if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { ext = (void *) mgmt; if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_short_beacon.variable); else min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_beacon.variable); } else { /* same for beacons */ min_hdr_len = offsetof(struct ieee80211_mgmt, u.probe_resp.variable); } if (WARN_ON(len < min_hdr_len)) return NULL; ielen = len - min_hdr_len; ie = mgmt->u.probe_resp.variable; if (ext) { const struct ieee80211_s1g_bcn_compat_ie *compat; const struct element *elem; if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) ie = ext->u.s1g_short_beacon.variable; else ie = ext->u.s1g_beacon.variable; elem = cfg80211_find_elem(WLAN_EID_S1G_BCN_COMPAT, ie, ielen); if (!elem) return NULL; if (elem->datalen < sizeof(*compat)) return NULL; compat = (void *)elem->data; bssid = ext->u.s1g_beacon.sa; capability = le16_to_cpu(compat->compat_info); beacon_interval = le16_to_cpu(compat->beacon_int); } else { bssid = mgmt->bssid; beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int); capability = le16_to_cpu(mgmt->u.probe_resp.capab_info); } tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp); if (ieee80211_is_probe_resp(mgmt->frame_control)) ftype = CFG80211_BSS_FTYPE_PRESP; else if (ext) ftype = CFG80211_BSS_FTYPE_S1G_BEACON; else ftype = CFG80211_BSS_FTYPE_BEACON; return cfg80211_inform_bss_data(wiphy, data, ftype, bssid, tsf, capability, beacon_interval, ie, ielen, gfp); } EXPORT_SYMBOL(cfg80211_inform_bss_frame_data); void cfg80211_ref_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); if (!pub) return; spin_lock_bh(&rdev->bss_lock); bss_ref_get(rdev, bss_from_pub(pub)); spin_unlock_bh(&rdev->bss_lock); } EXPORT_SYMBOL(cfg80211_ref_bss); void cfg80211_put_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); if (!pub) return; spin_lock_bh(&rdev->bss_lock); bss_ref_put(rdev, bss_from_pub(pub)); spin_unlock_bh(&rdev->bss_lock); } EXPORT_SYMBOL(cfg80211_put_bss); void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_internal_bss *bss, *tmp1; struct cfg80211_bss *nontrans_bss, *tmp; if (WARN_ON(!pub)) return; bss = bss_from_pub(pub); spin_lock_bh(&rdev->bss_lock); if 
(list_empty(&bss->list)) goto out; list_for_each_entry_safe(nontrans_bss, tmp, &pub->nontrans_list, nontrans_list) { tmp1 = bss_from_pub(nontrans_bss); if (__cfg80211_unlink_bss(rdev, tmp1)) rdev->bss_generation++; } if (__cfg80211_unlink_bss(rdev, bss)) rdev->bss_generation++; out: spin_unlock_bh(&rdev->bss_lock); } EXPORT_SYMBOL(cfg80211_unlink_bss); void cfg80211_bss_iter(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, void (*iter)(struct wiphy *wiphy, struct cfg80211_bss *bss, void *data), void *iter_data) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_internal_bss *bss; spin_lock_bh(&rdev->bss_lock); list_for_each_entry(bss, &rdev->bss_list, list) { if (!chandef || cfg80211_is_sub_chan(chandef, bss->pub.channel, false)) iter(wiphy, &bss->pub, iter_data); } spin_unlock_bh(&rdev->bss_lock); } EXPORT_SYMBOL(cfg80211_bss_iter); void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev, unsigned int link_id, struct ieee80211_channel *chan) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_internal_bss *cbss = wdev->links[link_id].client.current_bss; struct cfg80211_internal_bss *new = NULL; struct cfg80211_internal_bss *bss; struct cfg80211_bss *nontrans_bss; struct cfg80211_bss *tmp; spin_lock_bh(&rdev->bss_lock); /* * Some APs use CSA also for bandwidth changes, i.e., without actually * changing the control channel, so no need to update in such a case. */ if (cbss->pub.channel == chan) goto done; /* use transmitting bss */ if (cbss->pub.transmitted_bss) cbss = bss_from_pub(cbss->pub.transmitted_bss); cbss->pub.channel = chan; list_for_each_entry(bss, &rdev->bss_list, list) { if (!cfg80211_bss_type_match(bss->pub.capability, bss->pub.channel->band, wdev->conn_bss_type)) continue; if (bss == cbss) continue; if (!cmp_bss(&bss->pub, &cbss->pub, BSS_CMP_REGULAR)) { new = bss; break; } } if (new) { /* to save time, update IEs for transmitting bss only */ cfg80211_update_known_bss(rdev, cbss, new, false); new->pub.proberesp_ies = NULL; new->pub.beacon_ies = NULL; list_for_each_entry_safe(nontrans_bss, tmp, &new->pub.nontrans_list, nontrans_list) { bss = bss_from_pub(nontrans_bss); if (__cfg80211_unlink_bss(rdev, bss)) rdev->bss_generation++; } WARN_ON(atomic_read(&new->hold)); if (!WARN_ON(!__cfg80211_unlink_bss(rdev, new))) rdev->bss_generation++; } cfg80211_rehash_bss(rdev, cbss); list_for_each_entry_safe(nontrans_bss, tmp, &cbss->pub.nontrans_list, nontrans_list) { bss = bss_from_pub(nontrans_bss); bss->pub.channel = chan; cfg80211_rehash_bss(rdev, bss); } done: spin_unlock_bh(&rdev->bss_lock); } #ifdef CONFIG_CFG80211_WEXT static struct cfg80211_registered_device * cfg80211_get_dev_from_ifindex(struct net *net, int ifindex) { struct cfg80211_registered_device *rdev; struct net_device *dev; ASSERT_RTNL(); dev = dev_get_by_index(net, ifindex); if (!dev) return ERR_PTR(-ENODEV); if (dev->ieee80211_ptr) rdev = wiphy_to_rdev(dev->ieee80211_ptr->wiphy); else rdev = ERR_PTR(-ENODEV); dev_put(dev); return rdev; } int cfg80211_wext_siwscan(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct cfg80211_registered_device *rdev; struct wiphy *wiphy; struct iw_scan_req *wreq = NULL; struct cfg80211_scan_request *creq; int i, err, n_channels = 0; enum nl80211_band band; if (!netif_running(dev)) return -ENETDOWN; if (wrqu->data.length == sizeof(struct iw_scan_req)) wreq = (struct iw_scan_req *)extra; rdev = 
cfg80211_get_dev_from_ifindex(dev_net(dev), dev->ifindex); if (IS_ERR(rdev)) return PTR_ERR(rdev); if (rdev->scan_req || rdev->scan_msg) return -EBUSY; wiphy = &rdev->wiphy; /* Determine number of channels, needed to allocate creq */ if (wreq && wreq->num_channels) { /* Passed from userspace so should be checked */ if (unlikely(wreq->num_channels > IW_MAX_FREQUENCIES)) return -EINVAL; n_channels = wreq->num_channels; } else { n_channels = ieee80211_get_num_supported_channels(wiphy); } creq = kzalloc(struct_size(creq, channels, n_channels) + sizeof(struct cfg80211_ssid), GFP_ATOMIC); if (!creq) return -ENOMEM; creq->wiphy = wiphy; creq->wdev = dev->ieee80211_ptr; /* SSIDs come after channels */ creq->ssids = (void *)creq + struct_size(creq, channels, n_channels); creq->n_channels = n_channels; creq->n_ssids = 1; creq->scan_start = jiffies; /* translate "Scan on frequencies" request */ i = 0; for (band = 0; band < NUM_NL80211_BANDS; band++) { int j; if (!wiphy->bands[band]) continue; for (j = 0; j < wiphy->bands[band]->n_channels; j++) { struct ieee80211_channel *chan; /* ignore disabled channels */ chan = &wiphy->bands[band]->channels[j]; if (chan->flags & IEEE80211_CHAN_DISABLED || !cfg80211_wdev_channel_allowed(creq->wdev, chan)) continue; /* If we have a wireless request structure and the * wireless request specifies frequencies, then search * for the matching hardware channel. */ if (wreq && wreq->num_channels) { int k; int wiphy_freq = wiphy->bands[band]->channels[j].center_freq; for (k = 0; k < wreq->num_channels; k++) { struct iw_freq *freq = &wreq->channel_list[k]; int wext_freq = cfg80211_wext_freq(freq); if (wext_freq == wiphy_freq) goto wext_freq_found; } goto wext_freq_not_found; } wext_freq_found: creq->channels[i] = &wiphy->bands[band]->channels[j]; i++; wext_freq_not_found: ; } } /* No channels found? */ if (!i) { err = -EINVAL; goto out; } /* Set real number of channels specified in creq->channels[] */ creq->n_channels = i; /* translate "Scan for SSID" request */ if (wreq) { if (wrqu->data.flags & IW_SCAN_THIS_ESSID) { if (wreq->essid_len > IEEE80211_MAX_SSID_LEN) return -EINVAL; memcpy(creq->ssids[0].ssid, wreq->essid, wreq->essid_len); creq->ssids[0].ssid_len = wreq->essid_len; } if (wreq->scan_type == IW_SCAN_TYPE_PASSIVE) { creq->ssids = NULL; creq->n_ssids = 0; } } for (i = 0; i < NUM_NL80211_BANDS; i++) if (wiphy->bands[i]) creq->rates[i] = (1 << wiphy->bands[i]->n_bitrates) - 1; eth_broadcast_addr(creq->bssid); scoped_guard(wiphy, &rdev->wiphy) { rdev->scan_req = creq; err = rdev_scan(rdev, creq); if (err) { rdev->scan_req = NULL; /* creq will be freed below */ } else { nl80211_send_scan_start(rdev, dev->ieee80211_ptr); /* creq now owned by driver */ creq = NULL; dev_hold(dev); } } out: kfree(creq); return err; } static char *ieee80211_scan_add_ies(struct iw_request_info *info, const struct cfg80211_bss_ies *ies, char *current_ev, char *end_buf) { const u8 *pos, *end, *next; struct iw_event iwe; if (!ies) return current_ev; /* * If needed, fragment the IEs buffer (at IE boundaries) into short * enough fragments to fit into IW_GENERIC_IE_MAX octet messages. 
*/ pos = ies->data; end = pos + ies->len; while (end - pos > IW_GENERIC_IE_MAX) { next = pos + 2 + pos[1]; while (next + 2 + next[1] - pos < IW_GENERIC_IE_MAX) next = next + 2 + next[1]; memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVGENIE; iwe.u.data.length = next - pos; current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, (void *)pos); if (IS_ERR(current_ev)) return current_ev; pos = next; } if (end > pos) { memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVGENIE; iwe.u.data.length = end - pos; current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, (void *)pos); if (IS_ERR(current_ev)) return current_ev; } return current_ev; } static char * ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info, struct cfg80211_internal_bss *bss, char *current_ev, char *end_buf) { const struct cfg80211_bss_ies *ies; struct iw_event iwe; const u8 *ie; u8 buf[50]; u8 *cfg, *p, *tmp; int rem, i, sig; bool ismesh = false; memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWAP; iwe.u.ap_addr.sa_family = ARPHRD_ETHER; memcpy(iwe.u.ap_addr.sa_data, bss->pub.bssid, ETH_ALEN); current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe, IW_EV_ADDR_LEN); if (IS_ERR(current_ev)) return current_ev; memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWFREQ; iwe.u.freq.m = ieee80211_frequency_to_channel(bss->pub.channel->center_freq); iwe.u.freq.e = 0; current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe, IW_EV_FREQ_LEN); if (IS_ERR(current_ev)) return current_ev; memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWFREQ; iwe.u.freq.m = bss->pub.channel->center_freq; iwe.u.freq.e = 6; current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe, IW_EV_FREQ_LEN); if (IS_ERR(current_ev)) return current_ev; if (wiphy->signal_type != CFG80211_SIGNAL_TYPE_NONE) { memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVQUAL; iwe.u.qual.updated = IW_QUAL_LEVEL_UPDATED | IW_QUAL_NOISE_INVALID | IW_QUAL_QUAL_UPDATED; switch (wiphy->signal_type) { case CFG80211_SIGNAL_TYPE_MBM: sig = bss->pub.signal / 100; iwe.u.qual.level = sig; iwe.u.qual.updated |= IW_QUAL_DBM; if (sig < -110) /* rather bad */ sig = -110; else if (sig > -40) /* perfect */ sig = -40; /* will give a range of 0 .. 70 */ iwe.u.qual.qual = sig + 110; break; case CFG80211_SIGNAL_TYPE_UNSPEC: iwe.u.qual.level = bss->pub.signal; /* will give range 0 .. 
100 */ iwe.u.qual.qual = bss->pub.signal; break; default: /* not reached */ break; } current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe, IW_EV_QUAL_LEN); if (IS_ERR(current_ev)) return current_ev; } memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWENCODE; if (bss->pub.capability & WLAN_CAPABILITY_PRIVACY) iwe.u.data.flags = IW_ENCODE_ENABLED | IW_ENCODE_NOKEY; else iwe.u.data.flags = IW_ENCODE_DISABLED; iwe.u.data.length = 0; current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, ""); if (IS_ERR(current_ev)) return current_ev; rcu_read_lock(); ies = rcu_dereference(bss->pub.ies); rem = ies->len; ie = ies->data; while (rem >= 2) { /* invalid data */ if (ie[1] > rem - 2) break; switch (ie[0]) { case WLAN_EID_SSID: memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWESSID; iwe.u.data.length = ie[1]; iwe.u.data.flags = 1; current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, (u8 *)ie + 2); if (IS_ERR(current_ev)) goto unlock; break; case WLAN_EID_MESH_ID: memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWESSID; iwe.u.data.length = ie[1]; iwe.u.data.flags = 1; current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, (u8 *)ie + 2); if (IS_ERR(current_ev)) goto unlock; break; case WLAN_EID_MESH_CONFIG: ismesh = true; if (ie[1] != sizeof(struct ieee80211_meshconf_ie)) break; cfg = (u8 *)ie + 2; memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVCUSTOM; iwe.u.data.length = sprintf(buf, "Mesh Network Path Selection Protocol ID: 0x%02X", cfg[0]); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; iwe.u.data.length = sprintf(buf, "Path Selection Metric ID: 0x%02X", cfg[1]); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; iwe.u.data.length = sprintf(buf, "Congestion Control Mode ID: 0x%02X", cfg[2]); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; iwe.u.data.length = sprintf(buf, "Synchronization ID: 0x%02X", cfg[3]); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; iwe.u.data.length = sprintf(buf, "Authentication ID: 0x%02X", cfg[4]); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; iwe.u.data.length = sprintf(buf, "Formation Info: 0x%02X", cfg[5]); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; iwe.u.data.length = sprintf(buf, "Capabilities: 0x%02X", cfg[6]); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; break; case WLAN_EID_SUPP_RATES: case WLAN_EID_EXT_SUPP_RATES: /* display all supported rates in readable format */ p = current_ev + iwe_stream_lcp_len(info); memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWRATE; /* Those two flags are ignored... 
*/ iwe.u.bitrate.fixed = iwe.u.bitrate.disabled = 0; for (i = 0; i < ie[1]; i++) { iwe.u.bitrate.value = ((ie[i + 2] & 0x7f) * 500000); tmp = p; p = iwe_stream_add_value(info, current_ev, p, end_buf, &iwe, IW_EV_PARAM_LEN); if (p == tmp) { current_ev = ERR_PTR(-E2BIG); goto unlock; } } current_ev = p; break; } rem -= ie[1] + 2; ie += ie[1] + 2; } if (bss->pub.capability & (WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS) || ismesh) { memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWMODE; if (ismesh) iwe.u.mode = IW_MODE_MESH; else if (bss->pub.capability & WLAN_CAPABILITY_ESS) iwe.u.mode = IW_MODE_MASTER; else iwe.u.mode = IW_MODE_ADHOC; current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe, IW_EV_UINT_LEN); if (IS_ERR(current_ev)) goto unlock; } memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVCUSTOM; iwe.u.data.length = sprintf(buf, "tsf=%016llx", (unsigned long long)(ies->tsf)); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVCUSTOM; iwe.u.data.length = sprintf(buf, " Last beacon: %ums ago", elapsed_jiffies_msecs(bss->ts)); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; current_ev = ieee80211_scan_add_ies(info, ies, current_ev, end_buf); unlock: rcu_read_unlock(); return current_ev; } static int ieee80211_scan_results(struct cfg80211_registered_device *rdev, struct iw_request_info *info, char *buf, size_t len) { char *current_ev = buf; char *end_buf = buf + len; struct cfg80211_internal_bss *bss; int err = 0; spin_lock_bh(&rdev->bss_lock); cfg80211_bss_expire(rdev); list_for_each_entry(bss, &rdev->bss_list, list) { if (buf + len - current_ev <= IW_EV_ADDR_LEN) { err = -E2BIG; break; } current_ev = ieee80211_bss(&rdev->wiphy, info, bss, current_ev, end_buf); if (IS_ERR(current_ev)) { err = PTR_ERR(current_ev); break; } } spin_unlock_bh(&rdev->bss_lock); if (err) return err; return current_ev - buf; } int cfg80211_wext_giwscan(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_point *data = &wrqu->data; struct cfg80211_registered_device *rdev; int res; if (!netif_running(dev)) return -ENETDOWN; rdev = cfg80211_get_dev_from_ifindex(dev_net(dev), dev->ifindex); if (IS_ERR(rdev)) return PTR_ERR(rdev); if (rdev->scan_req || rdev->scan_msg) return -EAGAIN; res = ieee80211_scan_results(rdev, info, extra, data->length); data->length = 0; if (res >= 0) { data->length = res; res = 0; } return res; } #endif
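/*
 * Illustrative sketch (not part of the kernel sources above): the
 * dBm-to-"quality" conversion that ieee80211_bss() performs for
 * CFG80211_SIGNAL_TYPE_MBM signals, re-derived as a standalone
 * userspace program.  The signal arrives in mBm (100 * dBm), is
 * scaled to dBm, clamped to the [-110, -40] dBm window, and shifted
 * to yield a Wireless Extensions quality value in 0..70.
 */
#include <stdio.h>

static int mbm_to_wext_qual(int signal_mbm)
{
	int sig = signal_mbm / 100;	/* mBm -> dBm */

	if (sig < -110)			/* rather bad */
		sig = -110;
	else if (sig > -40)		/* perfect */
		sig = -40;

	return sig + 110;		/* quality in 0..70 */
}

int main(void)
{
	printf("%d\n", mbm_to_wext_qual(-6500));	/* -65 dBm -> 45 */
	return 0;
}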
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGTABLE_64_H #define _ASM_X86_PGTABLE_64_H #include <linux/const.h> #include <asm/pgtable_64_types.h> #ifndef __ASSEMBLER__ /* * This file contains the functions and defines necessary to modify and use * the x86-64 page table tree. */ #include <asm/processor.h> #include <linux/bitops.h> #include <linux/threads.h> #include <asm/fixmap.h> extern p4d_t level4_kernel_pgt[512]; extern p4d_t level4_ident_pgt[512]; extern pud_t level3_kernel_pgt[512]; extern pud_t level3_ident_pgt[512]; extern pmd_t level2_kernel_pgt[512]; extern pmd_t level2_fixmap_pgt[512]; extern pmd_t level2_ident_pgt[512]; extern pte_t level1_fixmap_pgt[512 * FIXMAP_PMD_NUM]; extern pgd_t init_top_pgt[]; #define swapper_pg_dir init_top_pgt extern void paging_init(void); static inline void sync_initial_page_table(void) { } #define pte_ERROR(e) \ pr_err("%s:%d: bad pte %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pte_val(e)) #define pmd_ERROR(e) \ pr_err("%s:%d: bad pmd %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pmd_val(e)) #define pud_ERROR(e) \ pr_err("%s:%d: bad pud %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pud_val(e)) #if CONFIG_PGTABLE_LEVELS >= 5 #define p4d_ERROR(e) \ pr_err("%s:%d: bad p4d %p(%016lx)\n", \ __FILE__, __LINE__, &(e), p4d_val(e)) #endif #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pgd_val(e)) struct mm_struct; #define mm_p4d_folded mm_p4d_folded static inline bool mm_p4d_folded(struct mm_struct *mm) { return !pgtable_l5_enabled(); } void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte); void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); static inline void native_set_pte(pte_t *ptep, pte_t pte) { WRITE_ONCE(*ptep, pte); } static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { native_set_pte(ptep, native_make_pte(0)); } static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) { native_set_pte(ptep, pte); } static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) { WRITE_ONCE(*pmdp, pmd); } static inline void native_pmd_clear(pmd_t *pmd) { native_set_pmd(pmd, native_make_pmd(0)); } static inline pte_t native_ptep_get_and_clear(pte_t *xp) { #ifdef CONFIG_SMP return native_make_pte(xchg(&xp->pte, 0)); #else /* native_local_ptep_get_and_clear, but duplicated because of cyclic dependency */ pte_t ret = *xp; native_pte_clear(NULL, 0, xp); return ret; #endif } static
inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) { #ifdef CONFIG_SMP return native_make_pmd(xchg(&xp->pmd, 0)); #else /* native_local_pmdp_get_and_clear, but duplicated because of cyclic dependency */ pmd_t ret = *xp; native_pmd_clear(xp); return ret; #endif } static inline void native_set_pud(pud_t *pudp, pud_t pud) { WRITE_ONCE(*pudp, pud); } static inline void native_pud_clear(pud_t *pud) { native_set_pud(pud, native_make_pud(0)); } static inline pud_t native_pudp_get_and_clear(pud_t *xp) { #ifdef CONFIG_SMP return native_make_pud(xchg(&xp->pud, 0)); #else /* native_local_pudp_get_and_clear, * but duplicated because of cyclic dependency */ pud_t ret = *xp; native_pud_clear(xp); return ret; #endif } static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) { pgd_t pgd; if (pgtable_l5_enabled() || !IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) { WRITE_ONCE(*p4dp, p4d); return; } pgd = native_make_pgd(native_p4d_val(p4d)); pgd = pti_set_user_pgtbl((pgd_t *)p4dp, pgd); WRITE_ONCE(*p4dp, native_make_p4d(native_pgd_val(pgd))); } static inline void native_p4d_clear(p4d_t *p4d) { native_set_p4d(p4d, native_make_p4d(0)); } static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { WRITE_ONCE(*pgdp, pti_set_user_pgtbl(pgdp, pgd)); } static inline void native_pgd_clear(pgd_t *pgd) { native_set_pgd(pgd, native_make_pgd(0)); } /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. */ /* PGD - Level 4 access */ /* PUD - Level 3 access */ /* PMD - Level 2 access */ /* PTE - Level 1 access */ /* * Encode and de-code a swap entry * * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| E|F|SD|0| <- swp entry * * G (8) is aliased and used as a PROT_NONE indicator for * !present ptes. We need to start storing swap entries above * there. We also need to avoid using A and D because of an * erratum where they can be incorrectly set by hardware on * non-present PTEs. * * SD Bits 1-4 are not used in non-present format and available for * special use described below: * * SD (1) in swp entry is used to store soft dirty bit, which helps us * remember soft dirty over page migration * * F (2) in swp entry is used to record when a pagetable is * writeprotected by userfaultfd WP support. * * E (3) in swp entry is used to remember PG_anon_exclusive. * * Bit 7 in swp entry should be 0 because pmd_present checks not only P, * but also L and G. * * The offset is inverted by a binary not operation to make the high * physical bits set. */ #define SWP_TYPE_BITS 5 #define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) /* We always extract/encode the offset by shifting it all the way up, and then down again */ #define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) /* Extract the high bits for type */ #define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS)) /* Shift up (to get rid of type), then down to get value */ #define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT) /* * Shift the offset up "too far" by TYPE bits, then down again * The offset is inverted by a binary not operation to make the high * physical bits set. 
*/ #define __swp_entry(type, offset) ((swp_entry_t) { \ (~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \ | ((unsigned long)(type) << (64-SWP_TYPE_BITS)) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) #define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val((pmd)) }) #define __swp_entry_to_pte(x) (__pte((x).val)) #define __swp_entry_to_pmd(x) (__pmd((x).val)) extern void cleanup_highmap(void); #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN #define PAGE_AGP PAGE_KERNEL_NOCACHE #define HAVE_PAGE_AGP 1 /* fs/proc/kcore.c */ #define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK) #define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) #define __HAVE_ARCH_PTE_SAME #define vmemmap ((struct page *)VMEMMAP_START) extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); #define gup_fast_permitted gup_fast_permitted static inline bool gup_fast_permitted(unsigned long start, unsigned long end) { if (end >> __VIRTUAL_MASK_SHIFT) return false; return true; } #include <asm/pgtable-invert.h> #else /* __ASSEMBLER__ */ #define l4_index(x) (((x) >> 39) & 511) #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4) L4_START_KERNEL = l4_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) #define SYM_DATA_START_PAGE_ALIGNED(name) \ SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE) /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ i = 0 ; \ .rept (COUNT) ; \ .quad (START) + (i << PMD_SHIFT) + (PERM) ; \ i = i + 1 ; \ .endr #endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_PGTABLE_64_H */
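/*
 * Standalone sketch (not part of this header) that re-implements the
 * swap-entry encode/decode macros above so the bit-layout comment can
 * be checked in userspace.  Assumptions of this sketch: unsigned long
 * is 64 bits, and _PAGE_BIT_PROTNONE is 8 (the Global bit, which
 * PROT_NONE aliases on x86-64) -- the header takes that value from
 * pgtable_64_types.h rather than spelling it out.
 */
#include <assert.h>
#include <stdio.h>

#define _PAGE_BIT_PROTNONE	8	/* assumed value, see above */
#define SWP_TYPE_BITS		5
#define SWP_OFFSET_FIRST_BIT	(_PAGE_BIT_PROTNONE + 1)
#define SWP_OFFSET_SHIFT	(SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS)

typedef struct { unsigned long val; } swp_entry_t;

static swp_entry_t swp_entry(unsigned long type, unsigned long offset)
{
	/* Invert the offset so the high "physical" bits end up set */
	return (swp_entry_t){
		(~offset << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) |
		(type << (64 - SWP_TYPE_BITS))
	};
}

static unsigned long swp_type(swp_entry_t e)
{
	return e.val >> (64 - SWP_TYPE_BITS);	/* type lives in bits 59-63 */
}

static unsigned long swp_offset(swp_entry_t e)
{
	/* Undo the inversion: shift out the type, shift down, re-invert */
	return ~e.val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT;
}

int main(void)
{
	swp_entry_t e = swp_entry(3, 0x12345);

	assert(swp_type(e) == 3);
	assert(swp_offset(e) == 0x12345);
	printf("entry %#lx -> type %lu offset %#lx\n",
	       e.val, swp_type(e), swp_offset(e));
	return 0;
}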
// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2022 Christian Brauner <brauner@kernel.org> */ #include <linux/cred.h> #include <linux/fs.h> #include <linux/mnt_idmapping.h> #include <linux/slab.h> #include <linux/user_namespace.h> #include <linux/seq_file.h> #include "internal.h" /* * Outside of this file vfs{g,u}id_t are always created from k{g,u}id_t, * never from raw values. These are just internal helpers. */ #define VFSUIDT_INIT_RAW(val) (vfsuid_t){ val } #define VFSGIDT_INIT_RAW(val) (vfsgid_t){ val } struct mnt_idmap { struct uid_gid_map uid_map; struct uid_gid_map gid_map; refcount_t count; }; /* * Carries the initial idmapping of 0:0:4294967295 which is an identity * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. */ struct mnt_idmap nop_mnt_idmap = { .count = REFCOUNT_INIT(1), }; EXPORT_SYMBOL_GPL(nop_mnt_idmap); /* * Carries the invalid idmapping of a full 0-4294967295 {g,u}id range. * This means that all {g,u}ids are mapped to INVALID_VFS{G,U}ID. */ struct mnt_idmap invalid_mnt_idmap = { .count = REFCOUNT_INIT(1), }; EXPORT_SYMBOL_GPL(invalid_mnt_idmap); /** * initial_idmapping - check whether this is the initial mapping * @ns: idmapping to check * * Check whether this is the initial mapping, mapping 0 to 0, 1 to 1, * [...], 1000 to 1000 [...]. * * Return: true if this is the initial mapping, false if not. */ static inline bool initial_idmapping(const struct user_namespace *ns) { return ns == &init_user_ns; } /** * make_vfsuid - map a filesystem kuid according to an idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @kuid : kuid to be mapped * * Take a @kuid and remap it from @fs_userns into @idmap. Use this * function when preparing a @kuid to be reported to userspace. * * If initial_idmapping() determines that this is not an idmapped mount * we can simply return @kuid unchanged. 
* If initial_idmapping() tells us that the filesystem is not mounted with an * idmapping we know the value of @kuid won't change when calling * from_kuid() so we can simply retrieve the value via __kuid_val() * directly. * * Return: @kuid mapped according to @idmap. * If @kuid has no mapping in either @idmap or @fs_userns INVALID_UID is * returned. */ vfsuid_t make_vfsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, kuid_t kuid) { uid_t uid; if (idmap == &nop_mnt_idmap) return VFSUIDT_INIT(kuid); if (idmap == &invalid_mnt_idmap) return INVALID_VFSUID; if (initial_idmapping(fs_userns)) uid = __kuid_val(kuid); else uid = from_kuid(fs_userns, kuid); if (uid == (uid_t)-1) return INVALID_VFSUID; return VFSUIDT_INIT_RAW(map_id_down(&idmap->uid_map, uid)); } EXPORT_SYMBOL_GPL(make_vfsuid); /** * make_vfsgid - map a filesystem kgid according to an idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @kgid : kgid to be mapped * * Take a @kgid and remap it from @fs_userns into @idmap. Use this * function when preparing a @kgid to be reported to userspace. * * If initial_idmapping() determines that this is not an idmapped mount * we can simply return @kgid unchanged. * If initial_idmapping() tells us that the filesystem is not mounted with an * idmapping we know the value of @kgid won't change when calling * from_kgid() so we can simply retrieve the value via __kgid_val() * directly. * * Return: @kgid mapped according to @idmap. * If @kgid has no mapping in either @idmap or @fs_userns INVALID_GID is * returned. */ vfsgid_t make_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, kgid_t kgid) { gid_t gid; if (idmap == &nop_mnt_idmap) return VFSGIDT_INIT(kgid); if (idmap == &invalid_mnt_idmap) return INVALID_VFSGID; if (initial_idmapping(fs_userns)) gid = __kgid_val(kgid); else gid = from_kgid(fs_userns, kgid); if (gid == (gid_t)-1) return INVALID_VFSGID; return VFSGIDT_INIT_RAW(map_id_down(&idmap->gid_map, gid)); } EXPORT_SYMBOL_GPL(make_vfsgid); /** * from_vfsuid - map a vfsuid into the filesystem idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @vfsuid : vfsuid to be mapped * * Map @vfsuid into the filesystem idmapping. This function has to be used in * order to e.g. write @vfsuid to inode->i_uid. * * Return: @vfsuid mapped into the filesystem idmapping */ kuid_t from_vfsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsuid_t vfsuid) { uid_t uid; if (idmap == &nop_mnt_idmap) return AS_KUIDT(vfsuid); if (idmap == &invalid_mnt_idmap) return INVALID_UID; uid = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid)); if (uid == (uid_t)-1) return INVALID_UID; if (initial_idmapping(fs_userns)) return KUIDT_INIT(uid); return make_kuid(fs_userns, uid); } EXPORT_SYMBOL_GPL(from_vfsuid); /** * from_vfsgid - map a vfsgid into the filesystem idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @vfsgid : vfsgid to be mapped * * Map @vfsgid into the filesystem idmapping. This function has to be used in * order to e.g. write @vfsgid to inode->i_gid. 
* * Return: @vfsgid mapped into the filesystem idmapping */ kgid_t from_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsgid_t vfsgid) { gid_t gid; if (idmap == &nop_mnt_idmap) return AS_KGIDT(vfsgid); if (idmap == &invalid_mnt_idmap) return INVALID_GID; gid = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid)); if (gid == (gid_t)-1) return INVALID_GID; if (initial_idmapping(fs_userns)) return KGIDT_INIT(gid); return make_kgid(fs_userns, gid); } EXPORT_SYMBOL_GPL(from_vfsgid); #ifdef CONFIG_MULTIUSER /** * vfsgid_in_group_p() - check whether a vfsuid matches the caller's groups * @vfsgid: the mnt gid to match * * This function can be used to determine whether @vfsuid matches any of the * caller's groups. * * Return: 1 if vfsuid matches caller's groups, 0 if not. */ int vfsgid_in_group_p(vfsgid_t vfsgid) { return in_group_p(AS_KGIDT(vfsgid)); } #else int vfsgid_in_group_p(vfsgid_t vfsgid) { return 1; } #endif EXPORT_SYMBOL_GPL(vfsgid_in_group_p); static int copy_mnt_idmap(struct uid_gid_map *map_from, struct uid_gid_map *map_to) { struct uid_gid_extent *forward, *reverse; u32 nr_extents = READ_ONCE(map_from->nr_extents); /* Pairs with smp_wmb() when writing the idmapping. */ smp_rmb(); /* * Don't blindly copy @map_to into @map_from if nr_extents is * smaller or equal to UID_GID_MAP_MAX_BASE_EXTENTS. Since we * read @nr_extents someone could have written an idmapping and * then we might end up with inconsistent data. So just don't do * anything at all. */ if (nr_extents == 0) return -EINVAL; /* * Here we know that nr_extents is greater than zero which means * a map has been written. Since idmappings can't be changed * once they have been written we know that we can safely copy * from @map_to into @map_from. */ if (nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { *map_to = *map_from; return 0; } forward = kmemdup_array(map_from->forward, nr_extents, sizeof(struct uid_gid_extent), GFP_KERNEL_ACCOUNT); if (!forward) return -ENOMEM; reverse = kmemdup_array(map_from->reverse, nr_extents, sizeof(struct uid_gid_extent), GFP_KERNEL_ACCOUNT); if (!reverse) { kfree(forward); return -ENOMEM; } /* * The idmapping isn't exposed anywhere so we don't need to care * about ordering between extent pointers and @nr_extents * initialization. */ map_to->forward = forward; map_to->reverse = reverse; map_to->nr_extents = nr_extents; return 0; } static void free_mnt_idmap(struct mnt_idmap *idmap) { if (idmap->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(idmap->uid_map.forward); kfree(idmap->uid_map.reverse); } if (idmap->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(idmap->gid_map.forward); kfree(idmap->gid_map.reverse); } kfree(idmap); } struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) { struct mnt_idmap *idmap; int ret; idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT); if (!idmap) return ERR_PTR(-ENOMEM); refcount_set(&idmap->count, 1); ret = copy_mnt_idmap(&mnt_userns->uid_map, &idmap->uid_map); if (!ret) ret = copy_mnt_idmap(&mnt_userns->gid_map, &idmap->gid_map); if (ret) { free_mnt_idmap(idmap); idmap = ERR_PTR(ret); } return idmap; } /** * mnt_idmap_get - get a reference to an idmapping * @idmap: the idmap to bump the reference on * * If @idmap is not the @nop_mnt_idmap bump the reference count. * * Return: @idmap with reference count bumped if @not_mnt_idmap isn't passed. 
*/ struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap) { if (idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap) refcount_inc(&idmap->count); return idmap; } EXPORT_SYMBOL_GPL(mnt_idmap_get); /** * mnt_idmap_put - put a reference to an idmapping * @idmap: the idmap to put the reference on * * If this is a non-initial idmapping, put the reference count when a mount is * released and free it if we're the last user. */ void mnt_idmap_put(struct mnt_idmap *idmap) { if (idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap && refcount_dec_and_test(&idmap->count)) free_mnt_idmap(idmap); } EXPORT_SYMBOL_GPL(mnt_idmap_put); int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map) { struct uid_gid_map *map, *map_up; u32 idx, nr_mappings; if (!is_valid_mnt_idmap(idmap)) return 0; /* * Idmappings are shown relative to the caller's idmapping. * This is both the most intuitive and most useful solution. */ if (uid_map) { map = &idmap->uid_map; map_up = &current_user_ns()->uid_map; } else { map = &idmap->gid_map; map_up = &current_user_ns()->gid_map; } for (idx = 0, nr_mappings = 0; idx < map->nr_extents; idx++) { uid_t lower; struct uid_gid_extent *extent; if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = &map->extent[idx]; else extent = &map->forward[idx]; /* * Verify that the whole range of the mapping can be * resolved in the caller's idmapping. If it cannot be * resolved skip the mapping. */ lower = map_id_range_up(map_up, extent->lower_first, extent->count); if (lower == (uid_t) -1) continue; seq_printf(seq, "%u %u %u", extent->first, lower, extent->count); seq->count++; /* mappings are separated by \0 */ if (seq_has_overflowed(seq)) return -EAGAIN; nr_mappings++; } return nr_mappings; }
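/*
 * Toy model (not from the kernel tree) of the extent translation that
 * make_vfsuid()/from_vfsuid() above delegate to map_id_down() and
 * map_id_up().  Assumptions: a single extent with no binary search --
 * the real helpers live in kernel/user_namespace.c and walk extent
 * arrays -- and the "0 1000 1" mapping below is only an example.
 * An extent maps [first, first+count) on the filesystem side onto
 * [lower_first, lower_first+count) on the caller-visible side.
 */
#include <stdio.h>

struct extent { unsigned first, lower_first, count; };

/* make_vfsuid() direction: filesystem id -> caller-visible id */
static unsigned map_id_down(const struct extent *e, unsigned id)
{
	if (id >= e->first && id - e->first < e->count)
		return e->lower_first + (id - e->first);
	return (unsigned)-1;	/* unmapped */
}

/* from_vfsuid() direction: caller-visible id -> filesystem id */
static unsigned map_id_up(const struct extent *e, unsigned id)
{
	if (id >= e->lower_first && id - e->lower_first < e->count)
		return e->first + (id - e->lower_first);
	return (unsigned)-1;	/* unmapped */
}

int main(void)
{
	/* mapping "0 1000 1": filesystem uid 0 is shown as uid 1000 */
	struct extent e = { .first = 0, .lower_first = 1000, .count = 1 };

	printf("down(0)    = %d\n", (int)map_id_down(&e, 0));	/* 1000 */
	printf("up(1000)   = %d\n", (int)map_id_up(&e, 1000));	/* 0 */
	printf("down(1)    = %d\n", (int)map_id_down(&e, 1));	/* -1 */
	return 0;
}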
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __SHMEM_FS_H #define __SHMEM_FS_H #include <linux/file.h> #include <linux/swap.h> #include <linux/mempolicy.h> #include <linux/pagemap.h> #include <linux/percpu_counter.h> #include <linux/xattr.h> #include <linux/fs_parser.h> #include <linux/userfaultfd_k.h> /* inode in-kernel data */ #ifdef CONFIG_TMPFS_QUOTA #define SHMEM_MAXQUOTAS 2 #endif struct shmem_inode_info { spinlock_t lock; unsigned int seals; /* shmem seals */ unsigned long flags; unsigned long alloced; /* data pages alloced to file */ unsigned long swapped; /* subtotal assigned to swap */ union { struct offset_ctx dir_offsets; /* stable directory offsets */ struct { struct list_head shrinklist; /* shrinkable hpage inodes */ struct list_head swaplist; /* chain of maybes on swap */ }; }; struct timespec64 i_crtime; /* file creation time */ struct shared_policy policy; /* NUMA memory alloc policy */ struct simple_xattrs xattrs; /* list of xattrs */ pgoff_t fallocend; /* highest fallocate endindex */ unsigned int fsflags; /* for FS_IOC_[SG]ETFLAGS */ atomic_t stop_eviction; /* hold when working on inode */ #ifdef CONFIG_TMPFS_QUOTA struct dquot __rcu *i_dquot[MAXQUOTAS]; #endif struct inode vfs_inode; }; #define SHMEM_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | FS_CASEFOLD_FL) #define SHMEM_FL_USER_MODIFIABLE \ (FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL | FS_CASEFOLD_FL) #define SHMEM_FL_INHERITED (FS_NODUMP_FL | FS_NOATIME_FL | FS_CASEFOLD_FL) struct shmem_quota_limits { qsize_t usrquota_bhardlimit; /* Default user quota block hard limit */ qsize_t usrquota_ihardlimit; /* Default user quota inode hard limit */ qsize_t grpquota_bhardlimit; /* Default group quota block hard limit */ qsize_t grpquota_ihardlimit; /* Default group quota inode hard limit */ }; struct shmem_sb_info { unsigned long max_blocks; /* How many blocks are allowed */ struct percpu_counter used_blocks; /* How many are allocated */ unsigned long max_inodes; /* How many inodes are allowed */ unsigned long free_ispace; /* How much ispace left for allocation */ raw_spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ umode_t mode; /* Mount mode for root directory */ unsigned char huge; /* Whether to try for hugepages */ kuid_t uid; /* Mount uid for root directory */ kgid_t gid; /* Mount gid for root directory */ bool full_inums; /* If i_ino should be uint or ino_t */ bool noswap; /* ignores VM reclaim / swap requests */ ino_t next_ino; /* The next per-sb inode number to use */ ino_t __percpu *ino_batch; /* The next per-cpu inode number to use */ struct mempolicy *mpol; /* default memory policy for mappings */ spinlock_t shrinklist_lock; /* Protects shrinklist */ struct list_head shrinklist; /* List of shrinkable inodes */ unsigned long 
shrinklist_len; /* Length of shrinklist */ struct shmem_quota_limits qlimits; /* Default quota limits */ }; static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) { return container_of(inode, struct shmem_inode_info, vfs_inode); } /* * Functions in mm/shmem.c called directly from elsewhere: */ extern const struct fs_parameter_spec shmem_fs_parameters[]; extern void shmem_init(void); extern int shmem_init_fs_context(struct fs_context *fc); extern struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags); extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, loff_t size, unsigned long flags); extern int shmem_zero_setup(struct vm_area_struct *); extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); #ifdef CONFIG_SHMEM bool shmem_mapping(struct address_space *mapping); #else static inline bool shmem_mapping(struct address_space *mapping) { return false; } #endif /* CONFIG_SHMEM */ extern void shmem_unlock_mapping(struct address_space *mapping); extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, loff_t write_end, bool shmem_huge_force); bool shmem_hpage_pmd_enabled(void); #else static inline unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, loff_t write_end, bool shmem_huge_force) { return 0; } static inline bool shmem_hpage_pmd_enabled(void) { return false; } #endif #ifdef CONFIG_SHMEM extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); #else static inline unsigned long shmem_swap_usage(struct vm_area_struct *vma) { return 0; } #endif extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); /* Flag allocation requirements to shmem_get_folio */ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end, struct folio **foliop, enum sgp_type sgp); struct folio *shmem_read_folio_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp); static inline struct folio *shmem_read_folio(struct address_space *mapping, pgoff_t index) { return shmem_read_folio_gfp(mapping, index, mapping_gfp_mask(mapping)); } static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) { return shmem_read_mapping_page_gfp(mapping, index, mapping_gfp_mask(mapping)); } static inline bool shmem_file(struct file *file) { if (!IS_ENABLED(CONFIG_SHMEM)) return false; if (!file || !file->f_mapping) return false; return shmem_mapping(file->f_mapping); } /* * If fallocate(FALLOC_FL_KEEP_SIZE) has been used, there may be pages * beyond i_size's notion of EOF, which fallocate has committed to 
reserving: * which split_huge_page() must therefore not delete. This use of a single * "fallocend" per inode errs on the side of not deleting a reservation when * in doubt: there are plenty of cases when it preserves unreserved pages. */ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof) { return max(eof, SHMEM_I(inode)->fallocend); } extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages); #ifdef CONFIG_USERFAULTFD #ifdef CONFIG_SHMEM extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop); #else /* !CONFIG_SHMEM */ #define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \ src_addr, flags, foliop) ({ BUG(); 0; }) #endif /* CONFIG_SHMEM */ #endif /* CONFIG_USERFAULTFD */ /* * Used space is stored as unsigned 64-bit value in bytes but * quota core supports only signed 64-bit values so use that * as a limit */ #define SHMEM_QUOTA_MAX_SPC_LIMIT 0x7fffffffffffffffLL /* 2^63-1 */ #define SHMEM_QUOTA_MAX_INO_LIMIT 0x7fffffffffffffffLL #ifdef CONFIG_TMPFS_QUOTA extern const struct dquot_operations shmem_quota_operations; extern struct quota_format_type shmem_quota_format; #endif /* CONFIG_TMPFS_QUOTA */ #endif
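/*
 * Hedged kernel-side usage sketch (not part of this header): how a
 * kernel user might obtain an anonymous tmpfs-backed file with the
 * helpers declared above and fault in its first page.  The "example"
 * name and the VM_NORESERVE flag are placeholder choices, and the
 * function name is hypothetical.
 */
#include <linux/err.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/shmem_fs.h>

static int shmem_usage_sketch(void)
{
	struct file *file;
	struct page *page;

	/* One-page anonymous tmpfs file; ERR_PTR() convention on failure */
	file = shmem_file_setup("example", PAGE_SIZE, VM_NORESERVE);
	if (IS_ERR(file))
		return PTR_ERR(file);

	/* Allocates/faults in page 0 of the new file */
	page = shmem_read_mapping_page(file->f_mapping, 0);
	if (IS_ERR(page)) {
		fput(file);
		return PTR_ERR(page);
	}

	put_page(page);
	fput(file);
	return 0;
}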
// SPDX-License-Identifier: GPL-2.0 /* * This file contains functions which manage clock event devices. 
* * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ #include <linux/clockchips.h> #include <linux/hrtimer.h> #include <linux/init.h> #include <linux/module.h> #include <linux/smp.h> #include <linux/device.h> #include "tick-internal.h" /* The registered clock event devices */ static LIST_HEAD(clockevent_devices); static LIST_HEAD(clockevents_released); /* Protection for the above */ static DEFINE_RAW_SPINLOCK(clockevents_lock); /* Protection for unbind operations */ static DEFINE_MUTEX(clockevents_mutex); struct ce_unbind { struct clock_event_device *ce; int res; }; static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, bool ismax) { u64 clc = (u64) latch << evt->shift; u64 rnd; if (WARN_ON(!evt->mult)) evt->mult = 1; rnd = (u64) evt->mult - 1; /* * Upper bound sanity check. If the backwards conversion is * not equal latch, we know that the above shift overflowed. */ if ((clc >> evt->shift) != (u64)latch) clc = ~0ULL; /* * Scaled math oddities: * * For mult <= (1 << shift) we can safely add mult - 1 to * prevent integer rounding loss. So the backwards conversion * from nsec to device ticks will be correct. * * For mult > (1 << shift), i.e. device frequency is > 1GHz we * need to be careful. Adding mult - 1 will result in a value * which when converted back to device ticks can be larger * than latch by up to (mult - 1) >> shift. For the min_delta * calculation we still want to apply this in order to stay * above the minimum device ticks limit. For the upper limit * we would end up with a latch value larger than the upper * limit of the device, so we omit the add to stay below the * device upper boundary. * * Also omit the add if it would overflow the u64 boundary. */ if ((~0ULL - clc > rnd) && (!ismax || evt->mult <= (1ULL << evt->shift))) clc += rnd; do_div(clc, evt->mult); /* Deltas less than 1usec are pointless noise */ return clc > 1000 ? clc : 1000; } /** * clockevent_delta2ns - Convert a latch value (device ticks) to nanoseconds * @latch: value to convert * @evt: pointer to clock event device descriptor * * Math helper, returns latch value converted to nanoseconds (bound checked) */ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) { return cev_delta2ns(latch, evt, false); } EXPORT_SYMBOL_GPL(clockevent_delta2ns); static int __clockevents_switch_state(struct clock_event_device *dev, enum clock_event_state state) { if (dev->features & CLOCK_EVT_FEAT_DUMMY) return 0; /* Transition with new state-specific callbacks */ switch (state) { case CLOCK_EVT_STATE_DETACHED: /* The clockevent device is getting replaced. Shut it down. 
*/ case CLOCK_EVT_STATE_SHUTDOWN: if (dev->set_state_shutdown) return dev->set_state_shutdown(dev); return 0; case CLOCK_EVT_STATE_PERIODIC: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) return -ENOSYS; if (dev->set_state_periodic) return dev->set_state_periodic(dev); return 0; case CLOCK_EVT_STATE_ONESHOT: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return -ENOSYS; if (dev->set_state_oneshot) return dev->set_state_oneshot(dev); return 0; case CLOCK_EVT_STATE_ONESHOT_STOPPED: /* Core internal bug */ if (WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", clockevent_get_state(dev))) return -EINVAL; if (dev->set_state_oneshot_stopped) return dev->set_state_oneshot_stopped(dev); else return -ENOSYS; default: return -ENOSYS; } } /** * clockevents_switch_state - set the operating state of a clock event device * @dev: device to modify * @state: new state * * Must be called with interrupts disabled ! */ void clockevents_switch_state(struct clock_event_device *dev, enum clock_event_state state) { if (clockevent_get_state(dev) != state) { if (__clockevents_switch_state(dev, state)) return; clockevent_set_state(dev, state); /* * A nsec2cyc multiplicator of 0 is invalid and we'd crash * on it, so fix it up and emit a warning: */ if (clockevent_state_oneshot(dev)) { if (WARN_ON(!dev->mult)) dev->mult = 1; } } } /** * clockevents_shutdown - shutdown the device and clear next_event * @dev: device to shutdown */ void clockevents_shutdown(struct clock_event_device *dev) { clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); dev->next_event = KTIME_MAX; } /** * clockevents_tick_resume - Resume the tick device before using it again * @dev: device to resume */ int clockevents_tick_resume(struct clock_event_device *dev) { int ret = 0; if (dev->tick_resume) ret = dev->tick_resume(dev); return ret; } #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST /* Limit min_delta to a jiffy */ #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) /** * clockevents_increase_min_delta - raise minimum delta of a clock event device * @dev: device to increase the minimum delta * * Returns 0 on success, -ETIME when the minimum delta reached the limit. */ static int clockevents_increase_min_delta(struct clock_event_device *dev) { /* Nothing to do if we already reached the limit */ if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { printk_deferred(KERN_WARNING "CE: Reprogramming failure. Giving up\n"); dev->next_event = KTIME_MAX; return -ETIME; } if (dev->min_delta_ns < 5000) dev->min_delta_ns = 5000; else dev->min_delta_ns += dev->min_delta_ns >> 1; if (dev->min_delta_ns > MIN_DELTA_LIMIT) dev->min_delta_ns = MIN_DELTA_LIMIT; printk_deferred(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", dev->name ? dev->name : "?", (unsigned long long) dev->min_delta_ns); return 0; } /** * clockevents_program_min_delta - Set clock event device to the minimum delay. * @dev: device to program * * Returns 0 on success, -ETIME when the retry loop failed. */ static int clockevents_program_min_delta(struct clock_event_device *dev) { unsigned long long clc; int64_t delta; int i; for (i = 0;;) { delta = dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); if (clockevent_state_shutdown(dev)) return 0; dev->retries++; clc = ((unsigned long long) delta * dev->mult) >> dev->shift; if (dev->set_next_event((unsigned long) clc, dev) == 0) return 0; if (++i > 2) { /* * We tried 3 times to program the device with the * given min_delta_ns. 
Try to increase the minimum * delta, if that fails as well get out of here. */ if (clockevents_increase_min_delta(dev)) return -ETIME; i = 0; } } } #else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ /** * clockevents_program_min_delta - Set clock event device to the minimum delay. * @dev: device to program * * Returns 0 on success, -ETIME when the retry loop failed. */ static int clockevents_program_min_delta(struct clock_event_device *dev) { unsigned long long clc; int64_t delta = 0; int i; for (i = 0; i < 10; i++) { delta += dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); if (clockevent_state_shutdown(dev)) return 0; dev->retries++; clc = ((unsigned long long) delta * dev->mult) >> dev->shift; if (dev->set_next_event((unsigned long) clc, dev) == 0) return 0; } return -ETIME; } #endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ /** * clockevents_program_event - Reprogram the clock event device. * @dev: device to program * @expires: absolute expiry time (monotonic clock) * @force: program minimum delay if expires can not be set * * Returns 0 on success, -ETIME when the event is in the past. */ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force) { unsigned long long clc; int64_t delta; int rc; if (WARN_ON_ONCE(expires < 0)) return -ETIME; dev->next_event = expires; if (clockevent_state_shutdown(dev)) return 0; /* We must be in ONESHOT state here */ WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", clockevent_get_state(dev)); /* Shortcut for clockevent devices that can deal with ktime. */ if (dev->features & CLOCK_EVT_FEAT_KTIME) return dev->set_next_ktime(expires, dev); delta = ktime_to_ns(ktime_sub(expires, ktime_get())); if (delta <= 0) return force ? clockevents_program_min_delta(dev) : -ETIME; delta = min(delta, (int64_t) dev->max_delta_ns); delta = max(delta, (int64_t) dev->min_delta_ns); clc = ((unsigned long long) delta * dev->mult) >> dev->shift; rc = dev->set_next_event((unsigned long) clc, dev); return (rc && force) ? clockevents_program_min_delta(dev) : rc; } /* * Called after a clockevent has been added which might * have replaced a current regular or broadcast device. A * released normal device might be a suitable replacement * for the current broadcast device. Similarly a released * broadcast device might be a suitable replacement for a * normal device. */ static void clockevents_notify_released(void) { struct clock_event_device *dev; /* * Keep iterating as long as tick_check_new_device() * replaces a device. */ while (!list_empty(&clockevents_released)) { dev = list_entry(clockevents_released.next, struct clock_event_device, list); list_move(&dev->list, &clockevent_devices); tick_check_new_device(dev); } } /* * Try to install a replacement clock event device */ static int clockevents_replace(struct clock_event_device *ced) { struct clock_event_device *dev, *newdev = NULL; list_for_each_entry(dev, &clockevent_devices, list) { if (dev == ced || !clockevent_state_detached(dev)) continue; if (!tick_check_replacement(newdev, dev)) continue; if (!try_module_get(dev->owner)) continue; if (newdev) module_put(newdev->owner); newdev = dev; } if (newdev) { tick_install_replacement(newdev); list_del_init(&ced->list); } return newdev ? 0 : -EBUSY; } /* * Called with clockevents_mutex and clockevents_lock held */ static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) { /* Fast track. 
Device is unused */ if (clockevent_state_detached(ced)) { list_del_init(&ced->list); return 0; } return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY; } /* * SMP function call to unbind a device */ static void __clockevents_unbind(void *arg) { struct ce_unbind *cu = arg; int res; raw_spin_lock(&clockevents_lock); res = __clockevents_try_unbind(cu->ce, smp_processor_id()); if (res == -EAGAIN) res = clockevents_replace(cu->ce); cu->res = res; raw_spin_unlock(&clockevents_lock); } /* * Issues smp function call to unbind a per cpu device. Called with * clockevents_mutex held. */ static int clockevents_unbind(struct clock_event_device *ced, int cpu) { struct ce_unbind cu = { .ce = ced, .res = -ENODEV }; smp_call_function_single(cpu, __clockevents_unbind, &cu, 1); return cu.res; } /* * Unbind a clockevents device. */ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) { int ret; mutex_lock(&clockevents_mutex); ret = clockevents_unbind(ced, cpu); mutex_unlock(&clockevents_mutex); return ret; } EXPORT_SYMBOL_GPL(clockevents_unbind_device); /** * clockevents_register_device - register a clock event device * @dev: device to register */ void clockevents_register_device(struct clock_event_device *dev) { unsigned long flags; /* Initialize state to DETACHED */ clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); if (!dev->cpumask) { WARN_ON(num_possible_cpus() > 1); dev->cpumask = cpumask_of(smp_processor_id()); } if (dev->cpumask == cpu_all_mask) { WARN(1, "%s cpumask == cpu_all_mask, using cpu_possible_mask instead\n", dev->name); dev->cpumask = cpu_possible_mask; } raw_spin_lock_irqsave(&clockevents_lock, flags); list_add(&dev->list, &clockevent_devices); tick_check_new_device(dev); clockevents_notify_released(); raw_spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_register_device); static void clockevents_config(struct clock_event_device *dev, u32 freq) { u64 sec; if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return; /* * Calculate the maximum number of seconds we can sleep. Limit * to 10 minutes for hardware which can program more than * 32bit ticks so we still get reasonable conversion values. */ sec = dev->max_delta_ticks; do_div(sec, freq); if (!sec) sec = 1; else if (sec > 600 && dev->max_delta_ticks > UINT_MAX) sec = 600; clockevents_calc_mult_shift(dev, freq, sec); dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false); dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true); } /** * clockevents_config_and_register - Configure and register a clock event device * @dev: device to register * @freq: The clock frequency * @min_delta: The minimum clock ticks to program in oneshot mode * @max_delta: The maximum clock ticks to program in oneshot mode * * min/max_delta can be 0 for devices which do not support oneshot mode. 
*/ void clockevents_config_and_register(struct clock_event_device *dev, u32 freq, unsigned long min_delta, unsigned long max_delta) { dev->min_delta_ticks = min_delta; dev->max_delta_ticks = max_delta; clockevents_config(dev, freq); clockevents_register_device(dev); } EXPORT_SYMBOL_GPL(clockevents_config_and_register); int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) { clockevents_config(dev, freq); if (clockevent_state_oneshot(dev)) return clockevents_program_event(dev, dev->next_event, false); if (clockevent_state_periodic(dev)) return __clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); return 0; } /** * clockevents_update_freq - Update frequency and reprogram a clock event device. * @dev: device to modify * @freq: new device frequency * * Reconfigure and reprogram a clock event device in oneshot * mode. Must be called on the cpu for which the device delivers per * cpu timer events. If called for the broadcast device the core takes * care of serialization. * * Returns 0 on success, -ETIME when the event is in the past. */ int clockevents_update_freq(struct clock_event_device *dev, u32 freq) { unsigned long flags; int ret; local_irq_save(flags); ret = tick_broadcast_update_freq(dev, freq); if (ret == -ENODEV) ret = __clockevents_update_freq(dev, freq); local_irq_restore(flags); return ret; } /* * Noop handler when we shut down an event device */ void clockevents_handle_noop(struct clock_event_device *dev) { } /** * clockevents_exchange_device - release and request clock devices * @old: device to release (can be NULL) * @new: device to request (can be NULL) * * Called from various tick functions with clockevents_lock held and * interrupts disabled. */ void clockevents_exchange_device(struct clock_event_device *old, struct clock_event_device *new) { /* * Caller releases a clock event device. We queue it into the * released list and do a notify add later. */ if (old) { module_put(old->owner); clockevents_switch_state(old, CLOCK_EVT_STATE_DETACHED); list_move(&old->list, &clockevents_released); } if (new) { BUG_ON(!clockevent_state_detached(new)); clockevents_shutdown(new); } } /** * clockevents_suspend - suspend clock devices */ void clockevents_suspend(void) { struct clock_event_device *dev; list_for_each_entry_reverse(dev, &clockevent_devices, list) if (dev->suspend && !clockevent_state_detached(dev)) dev->suspend(dev); } /** * clockevents_resume - resume clock devices */ void clockevents_resume(void) { struct clock_event_device *dev; list_for_each_entry(dev, &clockevent_devices, list) if (dev->resume && !clockevent_state_detached(dev)) dev->resume(dev); } #ifdef CONFIG_HOTPLUG_CPU /** * tick_offline_cpu - Shutdown all clock events related * to this CPU and take it out of the * broadcast mechanism. * @cpu: The outgoing CPU * * Called by the dying CPU during teardown. */ void tick_offline_cpu(unsigned int cpu) { struct clock_event_device *dev, *tmp; raw_spin_lock(&clockevents_lock); tick_broadcast_offline(cpu); tick_shutdown(cpu); /* * Unregister the clock event devices which were * released above. 
*/ list_for_each_entry_safe(dev, tmp, &clockevents_released, list) list_del(&dev->list); /* * Now check whether the CPU has left unused per cpu devices */ list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { if (cpumask_test_cpu(cpu, dev->cpumask) && cpumask_weight(dev->cpumask) == 1 && !tick_is_broadcast_device(dev)) { BUG_ON(!clockevent_state_detached(dev)); list_del(&dev->list); } } raw_spin_unlock(&clockevents_lock); } #endif #ifdef CONFIG_SYSFS static const struct bus_type clockevents_subsys = { .name = "clockevents", .dev_name = "clockevent", }; static DEFINE_PER_CPU(struct device, tick_percpu_dev); static struct tick_device *tick_get_tick_dev(struct device *dev); static ssize_t current_device_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tick_device *td; ssize_t count = 0; raw_spin_lock_irq(&clockevents_lock); td = tick_get_tick_dev(dev); if (td && td->evtdev) count = sysfs_emit(buf, "%s\n", td->evtdev->name); raw_spin_unlock_irq(&clockevents_lock); return count; } static DEVICE_ATTR_RO(current_device); /* We don't support the abomination of removable broadcast devices */ static ssize_t unbind_device_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { char name[CS_NAME_LEN]; ssize_t ret = sysfs_get_uname(buf, name, count); struct clock_event_device *ce = NULL, *iter; if (ret < 0) return ret; ret = -ENODEV; mutex_lock(&clockevents_mutex); raw_spin_lock_irq(&clockevents_lock); list_for_each_entry(iter, &clockevent_devices, list) { if (!strcmp(iter->name, name)) { ret = __clockevents_try_unbind(iter, dev->id); ce = iter; break; } } raw_spin_unlock_irq(&clockevents_lock); /* * We hold clockevents_mutex, so ce can't go away */ if (ret == -EAGAIN) ret = clockevents_unbind(ce, dev->id); mutex_unlock(&clockevents_mutex); return ret ? ret : count; } static DEVICE_ATTR_WO(unbind_device); #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST static struct device tick_bc_dev = { .init_name = "broadcast", .id = 0, .bus = &clockevents_subsys, }; static struct tick_device *tick_get_tick_dev(struct device *dev) { return dev == &tick_bc_dev ? tick_get_broadcast_device() : &per_cpu(tick_cpu_device, dev->id); } static __init int tick_broadcast_init_sysfs(void) { int err = device_register(&tick_bc_dev); if (!err) err = device_create_file(&tick_bc_dev, &dev_attr_current_device); return err; } #else static struct tick_device *tick_get_tick_dev(struct device *dev) { return &per_cpu(tick_cpu_device, dev->id); } static inline int tick_broadcast_init_sysfs(void) { return 0; } #endif static int __init tick_init_sysfs(void) { int cpu; for_each_possible_cpu(cpu) { struct device *dev = &per_cpu(tick_percpu_dev, cpu); int err; dev->id = cpu; dev->bus = &clockevents_subsys; err = device_register(dev); if (!err) err = device_create_file(dev, &dev_attr_current_device); if (!err) err = device_create_file(dev, &dev_attr_unbind_device); if (err) return err; } return tick_broadcast_init_sysfs(); } static int __init clockevents_init_sysfs(void) { int err = subsys_system_register(&clockevents_subsys, NULL); if (!err) err = tick_init_sysfs(); return err; } device_initcall(clockevents_init_sysfs); #endif /* SYSFS */
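To make the registration path above concrete, here is a minimal driver-side sketch. It is not part of this file: the hyp_* names and the stubbed hardware accessors are invented for illustration; only clockevents_config_and_register() and the clock_event_device fields are the real interfaces implemented above.

#include <linux/clockchips.h>
#include <linux/cpumask.h>
#include <linux/smp.h>

/* Stand-ins for MMIO writes on a hypothetical 1 MHz down-counting timer. */
static void hyp_timer_arm(unsigned long ticks) { }
static void hyp_timer_stop(void) { }

static int hyp_set_next_event(unsigned long ticks,
                              struct clock_event_device *ce)
{
        hyp_timer_arm(ticks);   /* ticks = delta_ns * mult >> shift, scaled by the core */
        return 0;               /* non-zero sends the core down the min_delta retry path */
}

static int hyp_shutdown(struct clock_event_device *ce)
{
        hyp_timer_stop();
        return 0;
}

static struct clock_event_device hyp_clockevent = {
        .name                   = "hyp-timer",
        .features               = CLOCK_EVT_FEAT_ONESHOT,
        .rating                 = 300,
        .set_next_event         = hyp_set_next_event,
        .set_state_shutdown     = hyp_shutdown,
        /* .set_state_oneshot may stay NULL; the state switch then just returns 0 */
};

static void __init hyp_timer_init(void)
{
        hyp_clockevent.cpumask = cpumask_of(smp_processor_id());
        /*
         * 1 MHz clock, minimum 2 ticks, 32-bit counter: this computes
         * mult/shift and min/max_delta_ns via clockevents_config()
         * before registering the device.
         */
        clockevents_config_and_register(&hyp_clockevent, 1000000, 2, 0xffffffff);
}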
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM notifier

#if !defined(_TRACE_NOTIFIERS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_NOTIFIERS_H

#include <linux/tracepoint.h>

DECLARE_EVENT_CLASS(notifier_info,

	TP_PROTO(void *cb),

	TP_ARGS(cb),

	TP_STRUCT__entry(
		__field(void *, cb)
	),

	TP_fast_assign(
		__entry->cb = cb;
	),

	TP_printk("%ps", __entry->cb)
);

/*
 * notifier_register - called upon notifier callback registration
 *
 * @cb: callback pointer
 *
 */
DEFINE_EVENT(notifier_info, notifier_register,
	TP_PROTO(void *cb),
	TP_ARGS(cb)
);

/*
 * notifier_unregister - called upon notifier callback unregistration
 *
 * @cb: callback pointer
 *
 */
DEFINE_EVENT(notifier_info, notifier_unregister,
	TP_PROTO(void *cb),
	TP_ARGS(cb)
);

/*
 * notifier_run - called upon notifier callback execution
 *
 * @cb: callback pointer
 *
 */
DEFINE_EVENT(notifier_info, notifier_run,
	TP_PROTO(void *cb),
	TP_ARGS(cb)
);

#endif /* _TRACE_NOTIFIERS_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
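These three events are emitted from the generic notifier-chain helpers, so any chain exercises them once tracing is enabled (e.g. via tracefs under events/notifier/). A minimal sketch, assuming an invented module and chain (the demo_* names) around the real notifier API:

#include <linux/module.h>
#include <linux/notifier.h>

static ATOMIC_NOTIFIER_HEAD(demo_chain);        /* invented chain */

static int demo_cb(struct notifier_block *nb, unsigned long action, void *data)
{
        return NOTIFY_OK;       /* printed as "demo_cb" by the %ps format above */
}

static struct notifier_block demo_nb = {
        .notifier_call  = demo_cb,
};

static int __init demo_init(void)
{
        /* hits the notifier_register tracepoint */
        atomic_notifier_chain_register(&demo_chain, &demo_nb);
        /* hits notifier_run once per callback on the chain */
        atomic_notifier_call_chain(&demo_chain, 0, NULL);
        return 0;
}

static void __exit demo_exit(void)
{
        /* hits the notifier_unregister tracepoint */
        atomic_notifier_chain_unregister(&demo_chain, &demo_nb);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");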
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/read_write.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 */

#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/sched/xacct.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/fs.h>
#include "internal.h"

#include <linux/uaccess.h>
#include <asm/unistd.h>

const struct file_operations generic_ro_fops = {
        .llseek         = generic_file_llseek,
        .read_iter      = generic_file_read_iter,
        .mmap           = generic_file_readonly_mmap,
        .splice_read    = filemap_splice_read,
};

EXPORT_SYMBOL(generic_ro_fops);

static inline bool unsigned_offsets(struct file *file)
{
        return file->f_op->fop_flags & FOP_UNSIGNED_OFFSET;
}

/**
 * vfs_setpos_cookie - update the file offset for lseek and reset cookie
 * @file: file structure in question
 * @offset: file offset to seek to
 * @maxsize: maximum file size
 * @cookie: cookie to reset
 *
 * Update the file offset to the value specified by @offset if the given
 * offset is valid and it is not equal to the current file offset and
 * reset the specified cookie to indicate that a seek happened.
 *
 * Return the specified offset on success and -EINVAL on invalid offset.
 */
static loff_t vfs_setpos_cookie(struct file *file, loff_t offset,
                                loff_t maxsize, u64 *cookie)
{
        if (offset < 0 && !unsigned_offsets(file))
                return -EINVAL;
        if (offset > maxsize)
                return -EINVAL;

        if (offset != file->f_pos) {
                file->f_pos = offset;
                if (cookie)
                        *cookie = 0;
        }
        return offset;
}

/**
 * vfs_setpos - update the file offset for lseek
 * @file: file structure in question
 * @offset: file offset to seek to
 * @maxsize: maximum file size
 *
 * This is a low-level filesystem helper for updating the file offset to
 * the value specified by @offset if the given offset is valid and it is
 * not equal to the current file offset.
 *
 * Return the specified offset on success and -EINVAL on invalid offset.
*/ loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) { return vfs_setpos_cookie(file, offset, maxsize, NULL); } EXPORT_SYMBOL(vfs_setpos); /** * must_set_pos - check whether f_pos has to be updated * @file: file to seek on * @offset: offset to use * @whence: type of seek operation * @eof: end of file * * Check whether f_pos needs to be updated and update @offset according * to @whence. * * Return: 0 if f_pos doesn't need to be updated, 1 if f_pos has to be * updated, and negative error code on failure. */ static int must_set_pos(struct file *file, loff_t *offset, int whence, loff_t eof) { switch (whence) { case SEEK_END: *offset += eof; break; case SEEK_CUR: /* * Here we special-case the lseek(fd, 0, SEEK_CUR) * position-querying operation. Avoid rewriting the "same" * f_pos value back to the file because a concurrent read(), * write() or lseek() might have altered it */ if (*offset == 0) { *offset = file->f_pos; return 0; } break; case SEEK_DATA: /* * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ if ((unsigned long long)*offset >= eof) return -ENXIO; break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ if ((unsigned long long)*offset >= eof) return -ENXIO; *offset = eof; break; } return 1; } /** * generic_file_llseek_size - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @maxsize: max size of this file in file system * @eof: offset used for SEEK_END position * * This is a variant of generic_file_llseek that allows passing in a custom * maximum file size and a custom EOF position, for e.g. hashed directories * * Synchronization: * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. * read/writes behave like SEEK_SET against seeks. */ loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof) { int ret; ret = must_set_pos(file, &offset, whence, eof); if (ret < 0) return ret; if (ret == 0) return offset; if (whence == SEEK_CUR) { /* * If the file requires locking via f_pos_lock we know * that mutual exclusion for SEEK_CUR on the same file * is guaranteed. If the file isn't locked, we take * f_lock to protect against f_pos races with other * SEEK_CURs. */ if (file_seek_cur_needs_f_lock(file)) { guard(spinlock)(&file->f_lock); return vfs_setpos(file, file->f_pos + offset, maxsize); } return vfs_setpos(file, file->f_pos + offset, maxsize); } return vfs_setpos(file, offset, maxsize); } EXPORT_SYMBOL(generic_file_llseek_size); /** * generic_llseek_cookie - versioned llseek implementation * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @cookie: cookie to update * * See generic_file_llseek for a general description and locking assumptions. * * In contrast to generic_file_llseek, this function also resets a * specified cookie to indicate a seek took place. 
 */
loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence,
                             u64 *cookie)
{
        struct inode *inode = file->f_mapping->host;
        loff_t maxsize = inode->i_sb->s_maxbytes;
        loff_t eof = i_size_read(inode);
        int ret;

        if (WARN_ON_ONCE(!cookie))
                return -EINVAL;

        /*
         * Require that this is only used for directories that guarantee
         * synchronization between readdir and seek so that an update to
         * @cookie is correctly synchronized with concurrent readdir.
         */
        if (WARN_ON_ONCE(!(file->f_mode & FMODE_ATOMIC_POS)))
                return -EINVAL;

        ret = must_set_pos(file, &offset, whence, eof);
        if (ret < 0)
                return ret;
        if (ret == 0)
                return offset;

        /* No need to hold f_lock because we know that f_pos_lock is held. */
        if (whence == SEEK_CUR)
                return vfs_setpos_cookie(file, file->f_pos + offset, maxsize, cookie);

        return vfs_setpos_cookie(file, offset, maxsize, cookie);
}
EXPORT_SYMBOL(generic_llseek_cookie);

/**
 * generic_file_llseek - generic llseek implementation for regular files
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 *
 * This is a generic implementation of ->llseek usable for all normal local
 * filesystems. It just updates the file offset to the value specified by
 * @offset and @whence.
 */
loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *inode = file->f_mapping->host;

        return generic_file_llseek_size(file, offset, whence,
                                        inode->i_sb->s_maxbytes,
                                        i_size_read(inode));
}
EXPORT_SYMBOL(generic_file_llseek);

/**
 * fixed_size_llseek - llseek implementation for fixed-sized devices
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 * @size: size of the file
 *
 */
loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
{
        switch (whence) {
        case SEEK_SET: case SEEK_CUR: case SEEK_END:
                return generic_file_llseek_size(file, offset, whence,
                                                size, size);
        default:
                return -EINVAL;
        }
}
EXPORT_SYMBOL(fixed_size_llseek);

/**
 * no_seek_end_llseek - llseek implementation for fixed-sized devices
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 *
 */
loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
{
        switch (whence) {
        case SEEK_SET: case SEEK_CUR:
                return generic_file_llseek_size(file, offset, whence,
                                                OFFSET_MAX, 0);
        default:
                return -EINVAL;
        }
}
EXPORT_SYMBOL(no_seek_end_llseek);

/**
 * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 * @size: maximal offset allowed
 *
 */
loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence,
                               loff_t size)
{
        switch (whence) {
        case SEEK_SET: case SEEK_CUR:
                return generic_file_llseek_size(file, offset, whence,
                                                size, 0);
        default:
                return -EINVAL;
        }
}
EXPORT_SYMBOL(no_seek_end_llseek_size);

/**
 * noop_llseek - No Operation Performed llseek implementation
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 *
 * This is an implementation of ->llseek usable for the rare special case when
 * userspace expects the seek to succeed but the (device) file is actually not
 * able to perform the seek. In this case you use noop_llseek() instead of
 * falling back to the default implementation of ->llseek.
*/ loff_t noop_llseek(struct file *file, loff_t offset, int whence) { return file->f_pos; } EXPORT_SYMBOL(noop_llseek); loff_t default_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file_inode(file); loff_t retval; inode_lock(inode); switch (whence) { case SEEK_END: offset += i_size_read(inode); break; case SEEK_CUR: if (offset == 0) { retval = file->f_pos; goto out; } offset += file->f_pos; break; case SEEK_DATA: /* * In the generic case the entire file is data, so as * long as offset isn't at the end of the file then the * offset is data. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so * as long as offset isn't i_size or larger, return * i_size. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } offset = inode->i_size; break; } retval = -EINVAL; if (offset >= 0 || unsigned_offsets(file)) { if (offset != file->f_pos) file->f_pos = offset; retval = offset; } out: inode_unlock(inode); return retval; } EXPORT_SYMBOL(default_llseek); loff_t vfs_llseek(struct file *file, loff_t offset, int whence) { if (!(file->f_mode & FMODE_LSEEK)) return -ESPIPE; return file->f_op->llseek(file, offset, whence); } EXPORT_SYMBOL(vfs_llseek); static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence) { off_t retval; CLASS(fd_pos, f)(fd); if (fd_empty(f)) return -EBADF; retval = -EINVAL; if (whence <= SEEK_MAX) { loff_t res = vfs_llseek(fd_file(f), offset, whence); retval = res; if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ } return retval; } SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #endif #if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \ defined(__ARCH_WANT_SYS_LLSEEK) SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned long, offset_low, loff_t __user *, result, unsigned int, whence) { int retval; CLASS(fd_pos, f)(fd); loff_t offset; if (fd_empty(f)) return -EBADF; if (whence > SEEK_MAX) return -EINVAL; offset = vfs_llseek(fd_file(f), ((loff_t) offset_high << 32) | offset_low, whence); retval = (int)offset; if (offset >= 0) { retval = -EFAULT; if (!copy_to_user(result, &offset, sizeof(offset))) retval = 0; } return retval; } #endif int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { int mask = read_write == READ ? MAY_READ : MAY_WRITE; int ret; if (unlikely((ssize_t) count < 0)) return -EINVAL; if (ppos) { loff_t pos = *ppos; if (unlikely(pos < 0)) { if (!unsigned_offsets(file)) return -EINVAL; if (count >= -pos) /* both values are in 0..LLONG_MAX */ return -EOVERFLOW; } else if (unlikely((loff_t) (pos + count) < 0)) { if (!unsigned_offsets(file)) return -EINVAL; } } ret = security_file_permission(file, mask); if (ret) return ret; return fsnotify_file_area_perm(file, mask, ppos, count); } EXPORT_SYMBOL(rw_verify_area); static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? 
*ppos : 0); iov_iter_ubuf(&iter, ITER_DEST, buf, len); ret = filp->f_op->read_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } static int warn_unsupported(struct file *file, const char *op) { pr_warn_ratelimited( "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n", op, file, current->pid, current->comm); return -EINVAL; } ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ))) return -EINVAL; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; /* * Also fail if ->read_iter and ->read are both wired up as that * implies very convoluted semantics. */ if (unlikely(!file->f_op->read_iter || file->f_op->read)) return warn_unsupported(file, "read"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len); ret = file->f_op->read_iter(&kiocb, &iter); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; return __kernel_read(file, buf, count, pos); } EXPORT_SYMBOL(kernel_read); ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; if (file->f_op->read) ret = file->f_op->read(file, buf, count, pos); else if (file->f_op->read_iter) ret = new_sync_read(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len); ret = filp->f_op->write_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ret > 0 && ppos) *ppos = kiocb.ki_pos; return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos) { struct kiocb kiocb; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; /* * Also fail if ->write_iter and ->write are both wired up as that * implies very convoluted semantics. */ if (unlikely(!file->f_op->write_iter || file->f_op->write)) return warn_unsupported(file, "write"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? 
*pos : 0; ret = file->f_op->write_iter(&kiocb, from); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = (void *)buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct iov_iter iter; iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len); return __kernel_write_iter(file, &iter, pos); } /* * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", * but autofs is one of the few internal kernel users that actually * wants this _and_ can be built as a module. So we need to export * this symbol for autofs, even though it really isn't appropriate * for any other kernel modules. */ EXPORT_SYMBOL_GPL(__kernel_write); ssize_t kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; file_start_write(file); ret = __kernel_write(file, buf, count, pos); file_end_write(file); return ret; } EXPORT_SYMBOL(kernel_write); ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; file_start_write(file); if (file->f_op->write) ret = file->f_op->write(file, buf, count, pos); else if (file->f_op->write_iter) ret = new_sync_write(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); file_end_write(file); return ret; } /* file_ppos returns &file->f_pos or NULL if file is stream */ static inline loff_t *file_ppos(struct file *file) { return file->f_mode & FMODE_STREAM ? 
NULL : &file->f_pos; } ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_read(fd_file(f), buf, count, ppos); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } return ret; } SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { return ksys_read(fd, buf, count); } ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_write(fd_file(f), buf, count, ppos); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } return ret; } SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) { return ksys_write(fd, buf, count); } ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos) { if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_mode & FMODE_PREAD) return vfs_read(fd_file(f), buf, count, &pos); return -ESPIPE; } SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, size_t, count, loff_t, pos) { return ksys_pread64(fd, buf, count, pos); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64) COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf, size_t, count, compat_arg_u64_dual(pos)) { return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos)); } #endif ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos) { if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_mode & FMODE_PWRITE) return vfs_write(fd_file(f), buf, count, &pos); return -ESPIPE; } SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, loff_t, pos) { return ksys_pwrite64(fd, buf, count, pos); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64) COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, compat_arg_u64_dual(pos)) { return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos)); } #endif static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { struct kiocb kiocb; ssize_t ret; init_sync_kiocb(&kiocb, filp); ret = kiocb_set_rw_flags(&kiocb, flags, type); if (ret) return ret; kiocb.ki_pos = (ppos ? 
*ppos : 0); if (type == READ) ret = filp->f_op->read_iter(&kiocb, iter); else ret = filp->f_op->write_iter(&kiocb, iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } /* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { ssize_t ret = 0; if (flags & ~RWF_HIPRI) return -EOPNOTSUPP; while (iov_iter_count(iter)) { ssize_t nr; if (type == READ) { nr = filp->f_op->read(filp, iter_iov_addr(iter), iter_iov_len(iter), ppos); } else { nr = filp->f_op->write(filp, iter_iov_addr(iter), iter_iov_len(iter), ppos); } if (nr < 0) { if (!ret) ret = nr; break; } ret += nr; if (nr != iter_iov_len(iter)) break; iov_iter_advance(iter, nr); } return ret; } ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->read_iter) return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; ret = file->f_op->read_iter(iocb, iter); out: if (ret >= 0) fsnotify_access(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_read); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->read_iter) return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, ppos, tot_len); if (ret < 0) return ret; ret = do_iter_readv_writev(file, iter, ppos, READ, flags); out: if (ret >= 0) fsnotify_access(file); return ret; } EXPORT_SYMBOL(vfs_iter_read); /* * Caller is responsible for calling kiocb_end_write() on completion * if async iocb was queued. 
*/ ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->write_iter) return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; kiocb_start_write(iocb); ret = file->f_op->write_iter(iocb, iter); if (ret != -EIOCBQUEUED) kiocb_end_write(iocb); if (ret > 0) fsnotify_modify(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_write); ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { size_t tot_len; ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (!file->f_op->write_iter) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, ppos, tot_len); if (ret < 0) return ret; file_start_write(file); ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags); if (ret > 0) fsnotify_modify(file); file_end_write(file); return ret; } EXPORT_SYMBOL(vfs_iter_write); static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) return ret; tot_len = iov_iter_count(&iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, pos, tot_len); if (ret < 0) goto out; if (file->f_op->read_iter) ret = do_iter_readv_writev(file, &iter, pos, READ, flags); else ret = do_loop_readv_writev(file, &iter, pos, READ, flags); out: if (ret >= 0) fsnotify_access(file); kfree(iov); return ret; } static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) return ret; tot_len = iov_iter_count(&iter); if (!tot_len) goto out; ret = rw_verify_area(WRITE, file, pos, tot_len); if (ret < 0) goto out; file_start_write(file); if (file->f_op->write_iter) ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags); else ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags); if (ret > 0) fsnotify_modify(file); file_end_write(file); out: kfree(iov); return ret; } static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_readv(fd_file(f), vec, vlen, ppos, flags); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = 
vfs_writev(fd_file(f), vec, vlen, ppos, flags); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) { #define HALF_LONG_BITS (BITS_PER_LONG / 2) return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; } static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (!fd_empty(f)) { ret = -ESPIPE; if (fd_file(f)->f_mode & FMODE_PREAD) ret = vfs_readv(fd_file(f), vec, vlen, &pos, flags); } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (!fd_empty(f)) { ret = -ESPIPE; if (fd_file(f)->f_mode & FMODE_PWRITE) ret = vfs_writev(fd_file(f), vec, vlen, &pos, flags); } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_readv(fd, vec, vlen, 0); } SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_writev(fd, vec, vlen, 0); } SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_preadv(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_pwritev(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } /* * Various compat syscalls. Note that they all pretend to take a native * iovec - import_iovec will properly treat those as compat_iovecs based on * in_compat_syscall(). 
*/ #ifdef CONFIG_COMPAT #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_preadv(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_preadv(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_pwritev(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_pwritev(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif /* CONFIG_COMPAT */ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) { struct inode *in_inode, *out_inode; struct pipe_inode_info *opipe; loff_t pos; loff_t out_pos; ssize_t retval; int fl; /* * Get input file, and verify that it is ok.. */ CLASS(fd, in)(in_fd); if (fd_empty(in)) return -EBADF; if (!(fd_file(in)->f_mode & FMODE_READ)) return -EBADF; if (!ppos) { pos = fd_file(in)->f_pos; } else { pos = *ppos; if (!(fd_file(in)->f_mode & FMODE_PREAD)) return -ESPIPE; } retval = rw_verify_area(READ, fd_file(in), &pos, count); if (retval < 0) return retval; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; /* * Get output file, and verify that it is ok.. */ CLASS(fd, out)(out_fd); if (fd_empty(out)) return -EBADF; if (!(fd_file(out)->f_mode & FMODE_WRITE)) return -EBADF; in_inode = file_inode(fd_file(in)); out_inode = file_inode(fd_file(out)); out_pos = fd_file(out)->f_pos; if (!max) max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); if (unlikely(pos + count > max)) { if (pos >= max) return -EOVERFLOW; count = max - pos; } fl = 0; #if 0 /* * We need to debate whether we can enable this or not. The * man page documents EAGAIN return for the output at least, * and the application is arguably buggy if it doesn't expect * EAGAIN on a non-blocking file descriptor. 
*/ if (fd_file(in)->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif opipe = get_pipe_info(fd_file(out), true); if (!opipe) { retval = rw_verify_area(WRITE, fd_file(out), &out_pos, count); if (retval < 0) return retval; retval = do_splice_direct(fd_file(in), &pos, fd_file(out), &out_pos, count, fl); } else { if (fd_file(out)->f_flags & O_NONBLOCK) fl |= SPLICE_F_NONBLOCK; retval = splice_file_to_pipe(fd_file(in), opipe, &pos, count, fl); } if (retval > 0) { add_rchar(current, retval); add_wchar(current, retval); fsnotify_access(fd_file(in)); fsnotify_modify(fd_file(out)); fd_file(out)->f_pos = out_pos; if (ppos) *ppos = pos; else fd_file(in)->f_pos = pos; } inc_syscr(current); inc_syscw(current); if (pos > max) retval = -EOVERFLOW; return retval; } SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, compat_off_t __user *, offset, compat_size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, compat_loff_t __user *, offset, compat_size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #endif /* * Performs necessary checks before doing a file copy * * Can adjust amount of bytes to copy via @req_count argument. * Returns appropriate error code that caller should return or * zero in case the copy should be allowed. */ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t *req_count, unsigned int flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); uint64_t count = *req_count; loff_t size_in; int ret; ret = generic_file_rw_checks(file_in, file_out); if (ret) return ret; /* * We allow some filesystems to handle cross sb copy, but passing * a file of the wrong filesystem type to filesystem driver can result * in an attempt to dereference the wrong type of ->private_data, so * avoid doing that until we really have a good reason. * * nfs and cifs define several different file_system_type structures * and several different sets of file_operations, but they all end up * using the same ->copy_file_range() function pointer. 
*/ if (flags & COPY_FILE_SPLICE) { /* cross sb splice is allowed */ } else if (file_out->f_op->copy_file_range) { if (file_in->f_op->copy_file_range != file_out->f_op->copy_file_range) return -EXDEV; } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) { return -EXDEV; } /* Don't touch certain kinds of inodes */ if (IS_IMMUTABLE(inode_out)) return -EPERM; if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) return -ETXTBSY; /* Ensure offsets don't wrap. */ if (pos_in + count < pos_in || pos_out + count < pos_out) return -EOVERFLOW; /* Shorten the copy to EOF */ size_in = i_size_read(inode_in); if (pos_in >= size_in) count = 0; else count = min(count, size_in - (uint64_t)pos_in); ret = generic_write_check_limits(file_out, pos_out, &count); if (ret) return ret; /* Don't allow overlapped copying within the same file. */ if (inode_in == inode_out && pos_out + count > pos_in && pos_out < pos_in + count) return -EINVAL; *req_count = count; return 0; } /* * copy_file_range() differs from regular file read and write in that it * specifically allows return partial success. When it does so is up to * the copy_file_range method. */ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { ssize_t ret; bool splice = flags & COPY_FILE_SPLICE; bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb; if (flags & ~COPY_FILE_SPLICE) return -EINVAL; ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len, flags); if (unlikely(ret)) return ret; ret = rw_verify_area(READ, file_in, &pos_in, len); if (unlikely(ret)) return ret; ret = rw_verify_area(WRITE, file_out, &pos_out, len); if (unlikely(ret)) return ret; if (len == 0) return 0; file_start_write(file_out); /* * Cloning is supported by more file systems, so we implement copy on * same sb using clone, but for filesystems where both clone and copy * are supported (e.g. nfs,cifs), we only call the copy method. */ if (!splice && file_out->f_op->copy_file_range) { ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); } else if (!splice && file_in->f_op->remap_file_range && samesb) { ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, min_t(loff_t, MAX_RW_COUNT, len), REMAP_FILE_CAN_SHORTEN); /* fallback to splice */ if (ret <= 0) splice = true; } else if (samesb) { /* Fallback to splice for same sb copy for backward compat */ splice = true; } file_end_write(file_out); if (!splice) goto done; /* * We can get here for same sb copy of filesystems that do not implement * ->copy_file_range() in case filesystem does not support clone or in * case filesystem supports clone but rejected the clone request (e.g. * because it was not block aligned). * * In both cases, fall back to kernel copy so we are able to maintain a * consistent story about which filesystems support copy_file_range() * and which filesystems do not, that will allow userspace tools to * make consistent desicions w.r.t using copy_file_range(). * * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE * for server-side-copy between any two sb. * * In any case, we call do_splice_direct() and not splice_file_range(), * without file_start_write() held, to avoid possible deadlocks related * to splicing from input file, while file_start_write() is held on * the output file on a different sb. 
*/ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, min_t(size_t, len, MAX_RW_COUNT), 0); done: if (ret > 0) { fsnotify_access(file_in); add_rchar(current, ret); fsnotify_modify(file_out); add_wchar(current, ret); } inc_syscr(current); inc_syscw(current); return ret; } EXPORT_SYMBOL(vfs_copy_file_range); SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) { loff_t pos_in; loff_t pos_out; ssize_t ret = -EBADF; CLASS(fd, f_in)(fd_in); if (fd_empty(f_in)) return -EBADF; CLASS(fd, f_out)(fd_out); if (fd_empty(f_out)) return -EBADF; if (off_in) { if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) return -EFAULT; } else { pos_in = fd_file(f_in)->f_pos; } if (off_out) { if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) return -EFAULT; } else { pos_out = fd_file(f_out)->f_pos; } if (flags != 0) return -EINVAL; ret = vfs_copy_file_range(fd_file(f_in), pos_in, fd_file(f_out), pos_out, len, flags); if (ret > 0) { pos_in += ret; pos_out += ret; if (off_in) { if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) ret = -EFAULT; } else { fd_file(f_in)->f_pos = pos_in; } if (off_out) { if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) ret = -EFAULT; } else { fd_file(f_out)->f_pos = pos_out; } } return ret; } /* * Don't operate on ranges the page cache doesn't support, and don't exceed the * LFS limits. If pos is under the limit it becomes a short access. If it * exceeds the limit we return -EFBIG. */ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) { struct inode *inode = file->f_mapping->host; loff_t max_size = inode->i_sb->s_maxbytes; loff_t limit = rlimit(RLIMIT_FSIZE); if (limit != RLIM_INFINITY) { if (pos >= limit) { send_sig(SIGXFSZ, current, 0); return -EFBIG; } *count = min(*count, limit - pos); } if (!(file->f_flags & O_LARGEFILE)) max_size = MAX_NON_LFS; if (unlikely(pos >= max_size)) return -EFBIG; *count = min(*count, max_size - pos); return 0; } EXPORT_SYMBOL_GPL(generic_write_check_limits); /* Like generic_write_checks(), but takes size of write instead of iter. */ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; if (IS_SWAPFILE(inode)) return -ETXTBSY; if (!*count) return 0; if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); if ((iocb->ki_flags & IOCB_NOWAIT) && !((iocb->ki_flags & IOCB_DIRECT) || (file->f_op->fop_flags & FOP_BUFFER_WASYNC))) return -EINVAL; return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); } EXPORT_SYMBOL(generic_write_checks_count); /* * Performs necessary checks before doing a write * * Can adjust writing position or amount of bytes to write. * Returns appropriate error code that caller should return or * zero in case that write should be allowed. */ ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { loff_t count = iov_iter_count(from); int ret; ret = generic_write_checks_count(iocb, &count); if (ret) return ret; iov_iter_truncate(from, count); return iov_iter_count(from); } EXPORT_SYMBOL(generic_write_checks); /* * Performs common checks before doing a file copy/clone * from @file_in to @file_out. */ int generic_file_rw_checks(struct file *file_in, struct file *file_out) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); /* Don't copy dirs, pipes, sockets... 
*/ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; if (!(file_in->f_mode & FMODE_READ) || !(file_out->f_mode & FMODE_WRITE) || (file_out->f_flags & O_APPEND)) return -EBADF; return 0; } int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) { size_t len = iov_iter_count(iter); if (!iter_is_ubuf(iter)) return -EINVAL; if (!is_power_of_2(len)) return -EINVAL; if (!IS_ALIGNED(iocb->ki_pos, len)) return -EINVAL; if (!(iocb->ki_flags & IOCB_DIRECT)) return -EOPNOTSUPP; return 0; } EXPORT_SYMBOL_GPL(generic_atomic_write_valid);
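generic_atomic_write_valid() above encodes the untorn-write rules: a single user buffer, a power-of-two length, a file position aligned to that length, and direct I/O. A minimal userspace sketch of a write that satisfies those rules (an illustration, not part of this file; atomic_write is a hypothetical helper, and the RWF_ATOMIC fallback value is an assumption taken from the uapi headers, requiring Linux >= 6.11):

#define _GNU_SOURCE
#include <sys/uio.h>
#include <fcntl.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC 0x00000040	/* uapi <linux/fs.h> value, for older userspace headers */
#endif

/* Mirror the kernel-side checks: one buffer, power-of-two length,
 * position aligned to the length.  The fd must be opened with
 * O_DIRECT (and the buffer suitably aligned), since the kernel
 * rejects non-direct atomic writes with -EOPNOTSUPP. */
static ssize_t atomic_write(int fd, const void *buf, size_t len, off_t pos)
{
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };

	if (len == 0 || (len & (len - 1)) != 0 || pos % (off_t)len != 0)
		return -1;	/* the kernel would return -EINVAL */

	return pwritev2(fd, &iov, 1, pos, RWF_ATOMIC);
}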
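As the comment above vfs_copy_file_range() notes, copy_file_range() is specifically allowed to return partial success, and with NULL offset pointers the syscall advances both file positions by whatever was copied. A minimal userspace sketch of the retry loop that contract implies (illustration only; copy_all is a hypothetical helper, glibc >= 2.27 assumed for the wrapper):

#define _GNU_SOURCE
#include <unistd.h>
#include <errno.h>

/* Copy len bytes from fd_in to fd_out, restarting after a short copy. */
static int copy_all(int fd_in, int fd_out, size_t len)
{
	while (len > 0) {
		ssize_t n = copy_file_range(fd_in, NULL, fd_out, NULL, len, 0);

		if (n < 0) {
			if (errno == EINTR)
				continue;
			return -1;	/* e.g. EXDEV across filesystems on old kernels */
		}
		if (n == 0)
			break;		/* input hit EOF before len bytes */
		len -= (size_t)n;
	}
	return 0;
}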
// SPDX-License-Identifier: GPL-2.0 /* * Wireless utility functions * * Copyright 2007-2009 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile 
Communications GmbH * Copyright 2017 Intel Deutschland GmbH * Copyright (C) 2018-2023, 2025 Intel Corporation */ #include <linux/export.h> #include <linux/bitops.h> #include <linux/etherdevice.h> #include <linux/slab.h> #include <linux/ieee80211.h> #include <net/cfg80211.h> #include <net/ip.h> #include <net/dsfield.h> #include <linux/if_vlan.h> #include <linux/mpls.h> #include <linux/gcd.h> #include <linux/bitfield.h> #include <linux/nospec.h> #include "core.h" #include "rdev-ops.h" const struct ieee80211_rate * ieee80211_get_response_rate(struct ieee80211_supported_band *sband, u32 basic_rates, int bitrate) { struct ieee80211_rate *result = &sband->bitrates[0]; int i; for (i = 0; i < sband->n_bitrates; i++) { if (!(basic_rates & BIT(i))) continue; if (sband->bitrates[i].bitrate > bitrate) continue; result = &sband->bitrates[i]; } return result; } EXPORT_SYMBOL(ieee80211_get_response_rate); u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband) { struct ieee80211_rate *bitrates; u32 mandatory_rates = 0; enum ieee80211_rate_flags mandatory_flag; int i; if (WARN_ON(!sband)) return 1; if (sband->band == NL80211_BAND_2GHZ) mandatory_flag = IEEE80211_RATE_MANDATORY_B; else mandatory_flag = IEEE80211_RATE_MANDATORY_A; bitrates = sband->bitrates; for (i = 0; i < sband->n_bitrates; i++) if (bitrates[i].flags & mandatory_flag) mandatory_rates |= BIT(i); return mandatory_rates; } EXPORT_SYMBOL(ieee80211_mandatory_rates); u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) { /* see 802.11 17.3.8.3.2 and Annex J * there are overlapping channel numbers in 5GHz and 2GHz bands */ if (chan <= 0) return 0; /* not supported */ switch (band) { case NL80211_BAND_2GHZ: case NL80211_BAND_LC: if (chan == 14) return MHZ_TO_KHZ(2484); else if (chan < 14) return MHZ_TO_KHZ(2407 + chan * 5); break; case NL80211_BAND_5GHZ: if (chan >= 182 && chan <= 196) return MHZ_TO_KHZ(4000 + chan * 5); else return MHZ_TO_KHZ(5000 + chan * 5); break; case NL80211_BAND_6GHZ: /* see 802.11ax D6.1 27.3.23.2 */ if (chan == 2) return MHZ_TO_KHZ(5935); if (chan <= 233) return MHZ_TO_KHZ(5950 + chan * 5); break; case NL80211_BAND_60GHZ: if (chan < 7) return MHZ_TO_KHZ(56160 + chan * 2160); break; case NL80211_BAND_S1GHZ: return 902000 + chan * 500; default: ; } return 0; /* not supported */ } EXPORT_SYMBOL(ieee80211_channel_to_freq_khz); enum nl80211_chan_width ieee80211_s1g_channel_width(const struct ieee80211_channel *chan) { if (WARN_ON(!chan || chan->band != NL80211_BAND_S1GHZ)) return NL80211_CHAN_WIDTH_20_NOHT; /* S1G defines a single allowed channel width per channel. * Extract that width here. 
*/ if (chan->flags & IEEE80211_CHAN_1MHZ) return NL80211_CHAN_WIDTH_1; else if (chan->flags & IEEE80211_CHAN_2MHZ) return NL80211_CHAN_WIDTH_2; else if (chan->flags & IEEE80211_CHAN_4MHZ) return NL80211_CHAN_WIDTH_4; else if (chan->flags & IEEE80211_CHAN_8MHZ) return NL80211_CHAN_WIDTH_8; else if (chan->flags & IEEE80211_CHAN_16MHZ) return NL80211_CHAN_WIDTH_16; pr_err("unknown channel width for channel at %dKHz?\n", ieee80211_channel_to_khz(chan)); return NL80211_CHAN_WIDTH_1; } EXPORT_SYMBOL(ieee80211_s1g_channel_width); int ieee80211_freq_khz_to_channel(u32 freq) { /* TODO: just handle MHz for now */ freq = KHZ_TO_MHZ(freq); /* see 802.11 17.3.8.3.2 and Annex J */ if (freq == 2484) return 14; else if (freq < 2484) return (freq - 2407) / 5; else if (freq >= 4910 && freq <= 4980) return (freq - 4000) / 5; else if (freq < 5925) return (freq - 5000) / 5; else if (freq == 5935) return 2; else if (freq <= 45000) /* DMG band lower limit */ /* see 802.11ax D6.1 27.3.22.2 */ return (freq - 5950) / 5; else if (freq >= 58320 && freq <= 70200) return (freq - 56160) / 2160; else return 0; } EXPORT_SYMBOL(ieee80211_freq_khz_to_channel); struct ieee80211_channel *ieee80211_get_channel_khz(struct wiphy *wiphy, u32 freq) { enum nl80211_band band; struct ieee80211_supported_band *sband; int i; for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wiphy->bands[band]; if (!sband) continue; for (i = 0; i < sband->n_channels; i++) { struct ieee80211_channel *chan = &sband->channels[i]; if (ieee80211_channel_to_khz(chan) == freq) return chan; } } return NULL; } EXPORT_SYMBOL(ieee80211_get_channel_khz); static void set_mandatory_flags_band(struct ieee80211_supported_band *sband) { int i, want; switch (sband->band) { case NL80211_BAND_5GHZ: case NL80211_BAND_6GHZ: want = 3; for (i = 0; i < sband->n_bitrates; i++) { if (sband->bitrates[i].bitrate == 60 || sband->bitrates[i].bitrate == 120 || sband->bitrates[i].bitrate == 240) { sband->bitrates[i].flags |= IEEE80211_RATE_MANDATORY_A; want--; } } WARN_ON(want); break; case NL80211_BAND_2GHZ: case NL80211_BAND_LC: want = 7; for (i = 0; i < sband->n_bitrates; i++) { switch (sband->bitrates[i].bitrate) { case 10: case 20: case 55: case 110: sband->bitrates[i].flags |= IEEE80211_RATE_MANDATORY_B | IEEE80211_RATE_MANDATORY_G; want--; break; case 60: case 120: case 240: sband->bitrates[i].flags |= IEEE80211_RATE_MANDATORY_G; want--; fallthrough; default: sband->bitrates[i].flags |= IEEE80211_RATE_ERP_G; break; } } WARN_ON(want != 0 && want != 3); break; case NL80211_BAND_60GHZ: /* check for mandatory HT MCS 1..4 */ WARN_ON(!sband->ht_cap.ht_supported); WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e); break; case NL80211_BAND_S1GHZ: /* Figure 9-589bd: 3 means unsupported, so != 3 means at least * mandatory is ok. 
*/ WARN_ON((sband->s1g_cap.nss_mcs[0] & 0x3) == 0x3); break; case NUM_NL80211_BANDS: default: WARN_ON(1); break; } } void ieee80211_set_bitrate_flags(struct wiphy *wiphy) { enum nl80211_band band; for (band = 0; band < NUM_NL80211_BANDS; band++) if (wiphy->bands[band]) set_mandatory_flags_band(wiphy->bands[band]); } bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher) { int i; for (i = 0; i < wiphy->n_cipher_suites; i++) if (cipher == wiphy->cipher_suites[i]) return true; return false; } static bool cfg80211_igtk_cipher_supported(struct cfg80211_registered_device *rdev) { struct wiphy *wiphy = &rdev->wiphy; int i; for (i = 0; i < wiphy->n_cipher_suites; i++) { switch (wiphy->cipher_suites[i]) { case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: return true; } } return false; } bool cfg80211_valid_key_idx(struct cfg80211_registered_device *rdev, int key_idx, bool pairwise) { int max_key_idx; if (pairwise) max_key_idx = 3; else if (wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION) || wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) max_key_idx = 7; else if (cfg80211_igtk_cipher_supported(rdev)) max_key_idx = 5; else max_key_idx = 3; if (key_idx < 0 || key_idx > max_key_idx) return false; return true; } int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, struct key_params *params, int key_idx, bool pairwise, const u8 *mac_addr) { if (!cfg80211_valid_key_idx(rdev, key_idx, pairwise)) return -EINVAL; if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) return -EINVAL; if (pairwise && !mac_addr) return -EINVAL; switch (params->cipher) { case WLAN_CIPHER_SUITE_TKIP: /* Extended Key ID can only be used with CCMP/GCMP ciphers */ if ((pairwise && key_idx) || params->mode != NL80211_KEY_RX_TX) return -EINVAL; break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: /* IEEE802.11-2016 allows only key index 0 - and, when the * driver supports Extended Key ID, key index 1 - for * pairwise keys. * @NL80211_KEY_NO_TX is only allowed for pairwise keys when * the driver supports Extended Key ID. * @NL80211_KEY_SET_TX can't be set when installing and * validating a key. 
*/ if ((params->mode == NL80211_KEY_NO_TX && !pairwise) || params->mode == NL80211_KEY_SET_TX) return -EINVAL; if (wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_EXT_KEY_ID)) { if (pairwise && (key_idx < 0 || key_idx > 1)) return -EINVAL; } else if (pairwise && key_idx) { return -EINVAL; } break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: /* Disallow BIP (group-only) cipher as pairwise cipher */ if (pairwise) return -EINVAL; if (key_idx < 4) return -EINVAL; break; case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: if (key_idx > 3) return -EINVAL; break; default: break; } switch (params->cipher) { case WLAN_CIPHER_SUITE_WEP40: if (params->key_len != WLAN_KEY_LEN_WEP40) return -EINVAL; break; case WLAN_CIPHER_SUITE_TKIP: if (params->key_len != WLAN_KEY_LEN_TKIP) return -EINVAL; break; case WLAN_CIPHER_SUITE_CCMP: if (params->key_len != WLAN_KEY_LEN_CCMP) return -EINVAL; break; case WLAN_CIPHER_SUITE_CCMP_256: if (params->key_len != WLAN_KEY_LEN_CCMP_256) return -EINVAL; break; case WLAN_CIPHER_SUITE_GCMP: if (params->key_len != WLAN_KEY_LEN_GCMP) return -EINVAL; break; case WLAN_CIPHER_SUITE_GCMP_256: if (params->key_len != WLAN_KEY_LEN_GCMP_256) return -EINVAL; break; case WLAN_CIPHER_SUITE_WEP104: if (params->key_len != WLAN_KEY_LEN_WEP104) return -EINVAL; break; case WLAN_CIPHER_SUITE_AES_CMAC: if (params->key_len != WLAN_KEY_LEN_AES_CMAC) return -EINVAL; break; case WLAN_CIPHER_SUITE_BIP_CMAC_256: if (params->key_len != WLAN_KEY_LEN_BIP_CMAC_256) return -EINVAL; break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: if (params->key_len != WLAN_KEY_LEN_BIP_GMAC_128) return -EINVAL; break; case WLAN_CIPHER_SUITE_BIP_GMAC_256: if (params->key_len != WLAN_KEY_LEN_BIP_GMAC_256) return -EINVAL; break; default: /* * We don't know anything about this algorithm, * allow using it -- but the driver must check * all parameters! We still check below whether * or not the driver supports this algorithm, * of course. */ break; } if (params->seq) { switch (params->cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: /* These ciphers do not use key sequence */ return -EINVAL; case WLAN_CIPHER_SUITE_TKIP: case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: if (params->seq_len != 6) return -EINVAL; break; } } if (!cfg80211_supported_cipher_suite(&rdev->wiphy, params->cipher)) return -EINVAL; return 0; } unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc) { unsigned int hdrlen = 24; if (ieee80211_is_ext(fc)) { hdrlen = 4; goto out; } if (ieee80211_is_data(fc)) { if (ieee80211_has_a4(fc)) hdrlen = 30; if (ieee80211_is_data_qos(fc)) { hdrlen += IEEE80211_QOS_CTL_LEN; if (ieee80211_has_order(fc)) hdrlen += IEEE80211_HT_CTL_LEN; } goto out; } if (ieee80211_is_mgmt(fc)) { if (ieee80211_has_order(fc)) hdrlen += IEEE80211_HT_CTL_LEN; goto out; } if (ieee80211_is_ctl(fc)) { /* * ACK and CTS are 10 bytes, all others 16. 
To see how * to get this condition consider * subtype mask: 0b0000000011110000 (0x00F0) * ACK subtype: 0b0000000011010000 (0x00D0) * CTS subtype: 0b0000000011000000 (0x00C0) * bits that matter: ^^^ (0x00E0) * value of those: 0b0000000011000000 (0x00C0) */ if ((fc & cpu_to_le16(0x00E0)) == cpu_to_le16(0x00C0)) hdrlen = 10; else hdrlen = 16; } out: return hdrlen; } EXPORT_SYMBOL(ieee80211_hdrlen); unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb) { const struct ieee80211_hdr *hdr = (const struct ieee80211_hdr *)skb->data; unsigned int hdrlen; if (unlikely(skb->len < 10)) return 0; hdrlen = ieee80211_hdrlen(hdr->frame_control); if (unlikely(hdrlen > skb->len)) return 0; return hdrlen; } EXPORT_SYMBOL(ieee80211_get_hdrlen_from_skb); static unsigned int __ieee80211_get_mesh_hdrlen(u8 flags) { int ae = flags & MESH_FLAGS_AE; /* 802.11-2012, 8.2.4.7.3 */ switch (ae) { default: case 0: return 6; case MESH_FLAGS_AE_A4: return 12; case MESH_FLAGS_AE_A5_A6: return 18; } } unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) { return __ieee80211_get_mesh_hdrlen(meshhdr->flags); } EXPORT_SYMBOL(ieee80211_get_mesh_hdrlen); bool ieee80211_get_8023_tunnel_proto(const void *hdr, __be16 *proto) { const __be16 *hdr_proto = hdr + ETH_ALEN; if (!(ether_addr_equal(hdr, rfc1042_header) && *hdr_proto != htons(ETH_P_AARP) && *hdr_proto != htons(ETH_P_IPX)) && !ether_addr_equal(hdr, bridge_tunnel_header)) return false; *proto = *hdr_proto; return true; } EXPORT_SYMBOL(ieee80211_get_8023_tunnel_proto); int ieee80211_strip_8023_mesh_hdr(struct sk_buff *skb) { const void *mesh_addr; struct { struct ethhdr eth; u8 flags; } payload; int hdrlen; int ret; ret = skb_copy_bits(skb, 0, &payload, sizeof(payload)); if (ret) return ret; hdrlen = sizeof(payload.eth) + __ieee80211_get_mesh_hdrlen(payload.flags); if (likely(pskb_may_pull(skb, hdrlen + 8) && ieee80211_get_8023_tunnel_proto(skb->data + hdrlen, &payload.eth.h_proto))) hdrlen += ETH_ALEN + 2; else if (!pskb_may_pull(skb, hdrlen)) return -EINVAL; else payload.eth.h_proto = htons(skb->len - hdrlen); mesh_addr = skb->data + sizeof(payload.eth) + ETH_ALEN; switch (payload.flags & MESH_FLAGS_AE) { case MESH_FLAGS_AE_A4: memcpy(&payload.eth.h_source, mesh_addr, ETH_ALEN); break; case MESH_FLAGS_AE_A5_A6: memcpy(&payload.eth, mesh_addr, 2 * ETH_ALEN); break; default: break; } pskb_pull(skb, hdrlen - sizeof(payload.eth)); memcpy(skb->data, &payload.eth, sizeof(payload.eth)); return 0; } EXPORT_SYMBOL(ieee80211_strip_8023_mesh_hdr); int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, const u8 *addr, enum nl80211_iftype iftype, u8 data_offset, bool is_amsdu) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct { u8 hdr[ETH_ALEN] __aligned(2); __be16 proto; } payload; struct ethhdr tmp; u16 hdrlen; if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) return -1; hdrlen = ieee80211_hdrlen(hdr->frame_control) + data_offset; if (skb->len < hdrlen) return -1; /* convert IEEE 802.11 header + possible LLC headers into Ethernet * header * IEEE 802.11 address fields: * ToDS FromDS Addr1 Addr2 Addr3 Addr4 * 0 0 DA SA BSSID n/a * 0 1 DA BSSID SA n/a * 1 0 BSSID SA DA n/a * 1 1 RA TA DA SA */ memcpy(tmp.h_dest, ieee80211_get_DA(hdr), ETH_ALEN); memcpy(tmp.h_source, ieee80211_get_SA(hdr), ETH_ALEN); switch (hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) { case cpu_to_le16(IEEE80211_FCTL_TODS): if (unlikely(iftype != NL80211_IFTYPE_AP && iftype != NL80211_IFTYPE_AP_VLAN 
&& iftype != NL80211_IFTYPE_P2P_GO)) return -1; break; case cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS): if (unlikely(iftype != NL80211_IFTYPE_MESH_POINT && iftype != NL80211_IFTYPE_AP_VLAN && iftype != NL80211_IFTYPE_STATION)) return -1; break; case cpu_to_le16(IEEE80211_FCTL_FROMDS): if ((iftype != NL80211_IFTYPE_STATION && iftype != NL80211_IFTYPE_P2P_CLIENT && iftype != NL80211_IFTYPE_MESH_POINT) || (is_multicast_ether_addr(tmp.h_dest) && ether_addr_equal(tmp.h_source, addr))) return -1; break; case cpu_to_le16(0): if (iftype != NL80211_IFTYPE_ADHOC && iftype != NL80211_IFTYPE_STATION && iftype != NL80211_IFTYPE_OCB) return -1; break; } if (likely(!is_amsdu && iftype != NL80211_IFTYPE_MESH_POINT && skb_copy_bits(skb, hdrlen, &payload, sizeof(payload)) == 0 && ieee80211_get_8023_tunnel_proto(&payload, &tmp.h_proto))) { /* remove RFC1042 or Bridge-Tunnel encapsulation */ hdrlen += ETH_ALEN + 2; skb_postpull_rcsum(skb, &payload, ETH_ALEN + 2); } else { tmp.h_proto = htons(skb->len - hdrlen); } pskb_pull(skb, hdrlen); if (!ehdr) ehdr = skb_push(skb, sizeof(struct ethhdr)); memcpy(ehdr, &tmp, sizeof(tmp)); return 0; } EXPORT_SYMBOL(ieee80211_data_to_8023_exthdr); static void __frame_add_frag(struct sk_buff *skb, struct page *page, void *ptr, int len, int size) { struct skb_shared_info *sh = skb_shinfo(skb); int page_offset; get_page(page); page_offset = ptr - page_address(page); skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size); } static void __ieee80211_amsdu_copy_frag(struct sk_buff *skb, struct sk_buff *frame, int offset, int len) { struct skb_shared_info *sh = skb_shinfo(skb); const skb_frag_t *frag = &sh->frags[0]; struct page *frag_page; void *frag_ptr; int frag_len, frag_size; int head_size = skb->len - skb->data_len; int cur_len; frag_page = virt_to_head_page(skb->head); frag_ptr = skb->data; frag_size = head_size; while (offset >= frag_size) { offset -= frag_size; frag_page = skb_frag_page(frag); frag_ptr = skb_frag_address(frag); frag_size = skb_frag_size(frag); frag++; } frag_ptr += offset; frag_len = frag_size - offset; cur_len = min(len, frag_len); __frame_add_frag(frame, frag_page, frag_ptr, cur_len, frag_size); len -= cur_len; while (len > 0) { frag_len = skb_frag_size(frag); cur_len = min(len, frag_len); __frame_add_frag(frame, skb_frag_page(frag), skb_frag_address(frag), cur_len, frag_len); len -= cur_len; frag++; } } static struct sk_buff * __ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen, int offset, int len, bool reuse_frag, int min_len) { struct sk_buff *frame; int cur_len = len; if (skb->len - offset < len) return NULL; /* * When reusing fragments, copy some data to the head to simplify * ethernet header handling and speed up protocol header processing * in the stack later. */ if (reuse_frag) cur_len = min_t(int, len, min_len); /* * Allocate and reserve two bytes more for payload * alignment since sizeof(struct ethhdr) is 14. 
*/ frame = dev_alloc_skb(hlen + sizeof(struct ethhdr) + 2 + cur_len); if (!frame) return NULL; frame->priority = skb->priority; skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2); skb_copy_bits(skb, offset, skb_put(frame, cur_len), cur_len); len -= cur_len; if (!len) return frame; offset += cur_len; __ieee80211_amsdu_copy_frag(skb, frame, offset, len); return frame; } static u16 ieee80211_amsdu_subframe_length(void *field, u8 mesh_flags, u8 hdr_type) { __le16 *field_le = field; __be16 *field_be = field; u16 len; if (hdr_type >= 2) len = le16_to_cpu(*field_le); else len = be16_to_cpu(*field_be); if (hdr_type) len += __ieee80211_get_mesh_hdrlen(mesh_flags); return len; } bool ieee80211_is_valid_amsdu(struct sk_buff *skb, u8 mesh_hdr) { int offset = 0, subframe_len, padding; for (offset = 0; offset < skb->len; offset += subframe_len + padding) { int remaining = skb->len - offset; struct { __be16 len; u8 mesh_flags; } hdr; u16 len; if (sizeof(hdr) > remaining) return false; if (skb_copy_bits(skb, offset + 2 * ETH_ALEN, &hdr, sizeof(hdr)) < 0) return false; len = ieee80211_amsdu_subframe_length(&hdr.len, hdr.mesh_flags, mesh_hdr); subframe_len = sizeof(struct ethhdr) + len; padding = (4 - subframe_len) & 0x3; if (subframe_len > remaining) return false; } return true; } EXPORT_SYMBOL(ieee80211_is_valid_amsdu); void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, const u8 *addr, enum nl80211_iftype iftype, const unsigned int extra_headroom, const u8 *check_da, const u8 *check_sa, u8 mesh_control) { unsigned int hlen = ALIGN(extra_headroom, 4); struct sk_buff *frame = NULL; int offset = 0; struct { struct ethhdr eth; uint8_t flags; } hdr; bool reuse_frag = skb->head_frag && !skb_has_frag_list(skb); bool reuse_skb = false; bool last = false; int copy_len = sizeof(hdr.eth); if (iftype == NL80211_IFTYPE_MESH_POINT) copy_len = sizeof(hdr); while (!last) { int remaining = skb->len - offset; unsigned int subframe_len; int len, mesh_len = 0; u8 padding; if (copy_len > remaining) goto purge; skb_copy_bits(skb, offset, &hdr, copy_len); if (iftype == NL80211_IFTYPE_MESH_POINT) mesh_len = __ieee80211_get_mesh_hdrlen(hdr.flags); len = ieee80211_amsdu_subframe_length(&hdr.eth.h_proto, hdr.flags, mesh_control); subframe_len = sizeof(struct ethhdr) + len; padding = (4 - subframe_len) & 0x3; /* the last MSDU has no padding */ if (subframe_len > remaining) goto purge; /* mitigate A-MSDU aggregation injection attacks */ if (ether_addr_equal(hdr.eth.h_dest, rfc1042_header)) goto purge; offset += sizeof(struct ethhdr); last = remaining <= subframe_len + padding; /* FIXME: should we really accept multicast DA? 
*/ if ((check_da && !is_multicast_ether_addr(hdr.eth.h_dest) && !ether_addr_equal(check_da, hdr.eth.h_dest)) || (check_sa && !ether_addr_equal(check_sa, hdr.eth.h_source))) { offset += len + padding; continue; } /* reuse skb for the last subframe */ if (!skb_is_nonlinear(skb) && !reuse_frag && last) { skb_pull(skb, offset); frame = skb; reuse_skb = true; } else { frame = __ieee80211_amsdu_copy(skb, hlen, offset, len, reuse_frag, 32 + mesh_len); if (!frame) goto purge; offset += len + padding; } skb_reset_network_header(frame); frame->dev = skb->dev; frame->priority = skb->priority; if (likely(iftype != NL80211_IFTYPE_MESH_POINT && ieee80211_get_8023_tunnel_proto(frame->data, &hdr.eth.h_proto))) skb_pull(frame, ETH_ALEN + 2); memcpy(skb_push(frame, sizeof(hdr.eth)), &hdr.eth, sizeof(hdr.eth)); __skb_queue_tail(list, frame); } if (!reuse_skb) dev_kfree_skb(skb); return; purge: __skb_queue_purge(list); dev_kfree_skb(skb); } EXPORT_SYMBOL(ieee80211_amsdu_to_8023s); /* Given a data frame determine the 802.1p/1d tag to use. */ unsigned int cfg80211_classify8021d(struct sk_buff *skb, struct cfg80211_qos_map *qos_map) { unsigned int dscp; unsigned char vlan_priority; unsigned int ret; /* skb->priority values from 256->263 are magic values to * directly indicate a specific 802.1d priority. This is used * to allow 802.1d priority to be passed directly in from VLAN * tags, etc. */ if (skb->priority >= 256 && skb->priority <= 263) { ret = skb->priority - 256; goto out; } if (skb_vlan_tag_present(skb)) { vlan_priority = (skb_vlan_tag_get(skb) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; if (vlan_priority > 0) { ret = vlan_priority; goto out; } } switch (skb->protocol) { case htons(ETH_P_IP): dscp = ipv4_get_dsfield(ip_hdr(skb)) & 0xfc; break; case htons(ETH_P_IPV6): dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & 0xfc; break; case htons(ETH_P_MPLS_UC): case htons(ETH_P_MPLS_MC): { struct mpls_label mpls_tmp, *mpls; mpls = skb_header_pointer(skb, sizeof(struct ethhdr), sizeof(*mpls), &mpls_tmp); if (!mpls) return 0; ret = (ntohl(mpls->entry) & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; goto out; } case htons(ETH_P_80221): /* 802.21 is always network control traffic */ return 7; default: return 0; } if (qos_map) { unsigned int i, tmp_dscp = dscp >> 2; for (i = 0; i < qos_map->num_des; i++) { if (tmp_dscp == qos_map->dscp_exception[i].dscp) { ret = qos_map->dscp_exception[i].up; goto out; } } for (i = 0; i < 8; i++) { if (tmp_dscp >= qos_map->up[i].low && tmp_dscp <= qos_map->up[i].high) { ret = i; goto out; } } } /* The default mapping as defined in Section 2.3 of RFC8325: The three * Most Significant Bits (MSBs) of the DSCP are used as the * corresponding L2 markings. */ ret = dscp >> 5; /* Handle specific DSCP values for which the default mapping (as * described above) doesn't adhere to the intended usage of the DSCP * value. See Section 4 of RFC8325. 
Specifically, for the following * Diffserv Service Classes no update is needed: * - Standard: DF * - Low Priority Data: CS1 * - Multimedia Conferencing: AF41, AF42, AF43 * - Network Control Traffic: CS7 * - Real-Time Interactive: CS4 * - Signaling: CS5 */ switch (dscp >> 2) { case 10: case 12: case 14: /* High throughput data: AF11, AF12, AF13 */ ret = 0; break; case 16: /* Operations, Administration, and Maintenance and Provisioning: * CS2 */ ret = 0; break; case 18: case 20: case 22: /* Low latency data: AF21, AF22, AF23 */ ret = 3; break; case 24: /* Broadcasting video: CS3 */ ret = 4; break; case 26: case 28: case 30: /* Multimedia Streaming: AF31, AF32, AF33 */ ret = 4; break; case 44: /* Voice Admit: VA */ ret = 6; break; case 46: /* Telephony traffic: EF */ ret = 6; break; case 48: /* Network Control Traffic: CS6 */ ret = 7; break; } out: return array_index_nospec(ret, IEEE80211_NUM_TIDS); } EXPORT_SYMBOL(cfg80211_classify8021d); const struct element *ieee80211_bss_get_elem(struct cfg80211_bss *bss, u8 id) { const struct cfg80211_bss_ies *ies; ies = rcu_dereference(bss->ies); if (!ies) return NULL; return cfg80211_find_elem(id, ies->data, ies->len); } EXPORT_SYMBOL(ieee80211_bss_get_elem); void cfg80211_upload_connect_keys(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct net_device *dev = wdev->netdev; int i; if (!wdev->connect_keys) return; for (i = 0; i < 4; i++) { if (!wdev->connect_keys->params[i].cipher) continue; if (rdev_add_key(rdev, dev, -1, i, false, NULL, &wdev->connect_keys->params[i])) { netdev_err(dev, "failed to set key %d\n", i); continue; } if (wdev->connect_keys->def == i && rdev_set_default_key(rdev, dev, -1, i, true, true)) { netdev_err(dev, "failed to set defkey %d\n", i); continue; } } kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; } void cfg80211_process_wdev_events(struct wireless_dev *wdev) { struct cfg80211_event *ev; unsigned long flags; spin_lock_irqsave(&wdev->event_lock, flags); while (!list_empty(&wdev->event_list)) { ev = list_first_entry(&wdev->event_list, struct cfg80211_event, list); list_del(&ev->list); spin_unlock_irqrestore(&wdev->event_lock, flags); switch (ev->type) { case EVENT_CONNECT_RESULT: __cfg80211_connect_result( wdev->netdev, &ev->cr, ev->cr.status == WLAN_STATUS_SUCCESS); break; case EVENT_ROAMED: __cfg80211_roamed(wdev, &ev->rm); break; case EVENT_DISCONNECTED: __cfg80211_disconnected(wdev->netdev, ev->dc.ie, ev->dc.ie_len, ev->dc.reason, !ev->dc.locally_generated); break; case EVENT_IBSS_JOINED: __cfg80211_ibss_joined(wdev->netdev, ev->ij.bssid, ev->ij.channel); break; case EVENT_STOPPED: cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev); break; case EVENT_PORT_AUTHORIZED: __cfg80211_port_authorized(wdev, ev->pa.peer_addr, ev->pa.td_bitmap, ev->pa.td_bitmap_len); break; } kfree(ev); spin_lock_irqsave(&wdev->event_lock, flags); } spin_unlock_irqrestore(&wdev->event_lock, flags); } void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev) { struct wireless_dev *wdev; lockdep_assert_held(&rdev->wiphy.mtx); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) cfg80211_process_wdev_events(wdev); } int cfg80211_change_iface(struct cfg80211_registered_device *rdev, struct net_device *dev, enum nl80211_iftype ntype, struct vif_params *params) { int err; enum nl80211_iftype otype = dev->ieee80211_ptr->iftype; lockdep_assert_held(&rdev->wiphy.mtx); /* don't support changing VLANs, you just re-create them */ if (otype == NL80211_IFTYPE_AP_VLAN) return 
-EOPNOTSUPP; /* cannot change into P2P device or NAN */ if (ntype == NL80211_IFTYPE_P2P_DEVICE || ntype == NL80211_IFTYPE_NAN) return -EOPNOTSUPP; if (!rdev->ops->change_virtual_intf || !(rdev->wiphy.interface_modes & (1 << ntype))) return -EOPNOTSUPP; if (ntype != otype) { /* if it's part of a bridge, reject changing type to station/ibss */ if (netif_is_bridge_port(dev) && (ntype == NL80211_IFTYPE_ADHOC || ntype == NL80211_IFTYPE_STATION || ntype == NL80211_IFTYPE_P2P_CLIENT)) return -EBUSY; dev->ieee80211_ptr->use_4addr = false; rdev_set_qos_map(rdev, dev, NULL); switch (otype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: cfg80211_stop_ap(rdev, dev, -1, true); break; case NL80211_IFTYPE_ADHOC: cfg80211_leave_ibss(rdev, dev, false); break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, true); break; case NL80211_IFTYPE_MESH_POINT: /* mesh should be handled? */ break; case NL80211_IFTYPE_OCB: cfg80211_leave_ocb(rdev, dev); break; default: break; } cfg80211_process_rdev_events(rdev); cfg80211_mlme_purge_registrations(dev->ieee80211_ptr); memset(&dev->ieee80211_ptr->u, 0, sizeof(dev->ieee80211_ptr->u)); memset(&dev->ieee80211_ptr->links, 0, sizeof(dev->ieee80211_ptr->links)); } err = rdev_change_virtual_intf(rdev, dev, ntype, params); WARN_ON(!err && dev->ieee80211_ptr->iftype != ntype); if (!err && params && params->use_4addr != -1) dev->ieee80211_ptr->use_4addr = params->use_4addr; if (!err) { dev->priv_flags &= ~IFF_DONT_BRIDGE; switch (ntype) { case NL80211_IFTYPE_STATION: if (dev->ieee80211_ptr->use_4addr) break; fallthrough; case NL80211_IFTYPE_OCB: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_ADHOC: dev->priv_flags |= IFF_DONT_BRIDGE; break; case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MESH_POINT: /* bridging OK */ break; case NL80211_IFTYPE_MONITOR: /* monitor can't bridge anyway */ break; case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: /* not happening */ break; case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_NAN: WARN_ON(1); break; } } if (!err && ntype != otype && netif_running(dev)) { cfg80211_update_iface_num(rdev, ntype, 1); cfg80211_update_iface_num(rdev, otype, -1); } return err; } static u32 cfg80211_calculate_bitrate_ht(struct rate_info *rate) { int modulation, streams, bitrate; /* the formula below does only work for MCS values smaller than 32 */ if (WARN_ON_ONCE(rate->mcs >= 32)) return 0; modulation = rate->mcs & 7; streams = (rate->mcs >> 3) + 1; bitrate = (rate->bw == RATE_INFO_BW_40) ? 
13500000 : 6500000; if (modulation < 4) bitrate *= (modulation + 1); else if (modulation == 4) bitrate *= (modulation + 2); else bitrate *= (modulation + 3); bitrate *= streams; if (rate->flags & RATE_INFO_FLAGS_SHORT_GI) bitrate = (bitrate / 9) * 10; /* do NOT round down here */ return (bitrate + 50000) / 100000; } static u32 cfg80211_calculate_bitrate_dmg(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { /* control PHY */ [0] = 275, /* SC PHY */ [1] = 3850, [2] = 7700, [3] = 9625, [4] = 11550, [5] = 12512, /* 1251.25 mbps */ [6] = 15400, [7] = 19250, [8] = 23100, [9] = 25025, [10] = 30800, [11] = 38500, [12] = 46200, /* OFDM PHY */ [13] = 6930, [14] = 8662, /* 866.25 mbps */ [15] = 13860, [16] = 17325, [17] = 20790, [18] = 27720, [19] = 34650, [20] = 41580, [21] = 45045, [22] = 51975, [23] = 62370, [24] = 67568, /* 6756.75 mbps */ /* LP-SC PHY */ [25] = 6260, [26] = 8340, [27] = 11120, [28] = 12510, [29] = 16680, [30] = 22240, [31] = 25030, }; if (WARN_ON_ONCE(rate->mcs >= ARRAY_SIZE(__mcs2bitrate))) return 0; return __mcs2bitrate[rate->mcs]; } static u32 cfg80211_calculate_bitrate_extended_sc_dmg(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { [6 - 6] = 26950, /* MCS 9.1 : 2695.0 mbps */ [7 - 6] = 50050, /* MCS 12.1 */ [8 - 6] = 53900, [9 - 6] = 57750, [10 - 6] = 63900, [11 - 6] = 75075, [12 - 6] = 80850, }; /* Extended SC MCS not defined for base MCS below 6 or above 12 */ if (WARN_ON_ONCE(rate->mcs < 6 || rate->mcs > 12)) return 0; return __mcs2bitrate[rate->mcs - 6]; } static u32 cfg80211_calculate_bitrate_edmg(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { /* control PHY */ [0] = 275, /* SC PHY */ [1] = 3850, [2] = 7700, [3] = 9625, [4] = 11550, [5] = 12512, /* 1251.25 mbps */ [6] = 13475, [7] = 15400, [8] = 19250, [9] = 23100, [10] = 25025, [11] = 26950, [12] = 30800, [13] = 38500, [14] = 46200, [15] = 50050, [16] = 53900, [17] = 57750, [18] = 69300, [19] = 75075, [20] = 80850, }; if (WARN_ON_ONCE(rate->mcs >= ARRAY_SIZE(__mcs2bitrate))) return 0; return __mcs2bitrate[rate->mcs] * rate->n_bonded_ch; } static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate) { static const u32 base[4][12] = { { 6500000, 13000000, 19500000, 26000000, 39000000, 52000000, 58500000, 65000000, 78000000, /* not in the spec, but some devices use this: */ 86700000, 97500000, 108300000, }, { 13500000, 27000000, 40500000, 54000000, 81000000, 108000000, 121500000, 135000000, 162000000, 180000000, 202500000, 225000000, }, { 29300000, 58500000, 87800000, 117000000, 175500000, 234000000, 263300000, 292500000, 351000000, 390000000, 438800000, 487500000, }, { 58500000, 117000000, 175500000, 234000000, 351000000, 468000000, 526500000, 585000000, 702000000, 780000000, 877500000, 975000000, }, }; u32 bitrate; int idx; if (rate->mcs > 11) goto warn; switch (rate->bw) { case RATE_INFO_BW_160: idx = 3; break; case RATE_INFO_BW_80: idx = 2; break; case RATE_INFO_BW_40: idx = 1; break; case RATE_INFO_BW_5: case RATE_INFO_BW_10: default: goto warn; case RATE_INFO_BW_20: idx = 0; } bitrate = base[idx][rate->mcs]; bitrate *= rate->nss; if (rate->flags & RATE_INFO_FLAGS_SHORT_GI) bitrate = (bitrate / 9) * 10; /* do NOT round down here */ return (bitrate + 50000) / 100000; warn: WARN_ONCE(1, "invalid rate bw=%d, mcs=%d, nss=%d\n", rate->bw, rate->mcs, rate->nss); return 0; } static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate) { #define SCALE 6144 u32 mcs_divisors[14] = { 102399, /* 16.666666... */ 51201, /* 8.333333... */ 34134, /* 5.555555... 
*/ 25599, /* 4.166666... */ 17067, /* 2.777777... */ 12801, /* 2.083333... */ 11377, /* 1.851725... */ 10239, /* 1.666666... */ 8532, /* 1.388888... */ 7680, /* 1.250000... */ 6828, /* 1.111111... */ 6144, /* 1.000000... */ 5690, /* 0.926106... */ 5120, /* 0.833333... */ }; u32 rates_160M[3] = { 960777777, 907400000, 816666666 }; u32 rates_996[3] = { 480388888, 453700000, 408333333 }; u32 rates_484[3] = { 229411111, 216666666, 195000000 }; u32 rates_242[3] = { 114711111, 108333333, 97500000 }; u32 rates_106[3] = { 40000000, 37777777, 34000000 }; u32 rates_52[3] = { 18820000, 17777777, 16000000 }; u32 rates_26[3] = { 9411111, 8888888, 8000000 }; u64 tmp; u32 result; if (WARN_ON_ONCE(rate->mcs > 13)) return 0; if (WARN_ON_ONCE(rate->he_gi > NL80211_RATE_INFO_HE_GI_3_2)) return 0; if (WARN_ON_ONCE(rate->he_ru_alloc > NL80211_RATE_INFO_HE_RU_ALLOC_2x996)) return 0; if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8)) return 0; if (rate->bw == RATE_INFO_BW_160 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_2x996)) result = rates_160M[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_80 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_996)) result = rates_996[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_40 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_484)) result = rates_484[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_20 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_242)) result = rates_242[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_106) result = rates_106[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_52) result = rates_52[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_26) result = rates_26[rate->he_gi]; else { WARN(1, "invalid HE MCS: bw:%d, ru:%d\n", rate->bw, rate->he_ru_alloc); return 0; } /* now scale to the appropriate MCS */ tmp = result; tmp *= SCALE; do_div(tmp, mcs_divisors[rate->mcs]); result = tmp; /* and take NSS, DCM into account */ result = (result * rate->nss) / 8; if (rate->he_dcm) result /= 2; return result / 10000; } static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate) { #define SCALE 6144 static const u32 mcs_divisors[16] = { 102399, /* 16.666666... */ 51201, /* 8.333333... */ 34134, /* 5.555555... */ 25599, /* 4.166666... */ 17067, /* 2.777777... */ 12801, /* 2.083333... */ 11377, /* 1.851725... */ 10239, /* 1.666666... */ 8532, /* 1.388888... */ 7680, /* 1.250000... */ 6828, /* 1.111111... */ 6144, /* 1.000000... */ 5690, /* 0.926106... */ 5120, /* 0.833333... */ 409600, /* 66.666666... */ 204800, /* 33.333333... 
*/ }; static const u32 rates_996[3] = { 480388888, 453700000, 408333333 }; static const u32 rates_484[3] = { 229411111, 216666666, 195000000 }; static const u32 rates_242[3] = { 114711111, 108333333, 97500000 }; static const u32 rates_106[3] = { 40000000, 37777777, 34000000 }; static const u32 rates_52[3] = { 18820000, 17777777, 16000000 }; static const u32 rates_26[3] = { 9411111, 8888888, 8000000 }; u64 tmp; u32 result; if (WARN_ON_ONCE(rate->mcs > 15)) return 0; if (WARN_ON_ONCE(rate->eht_gi > NL80211_RATE_INFO_EHT_GI_3_2)) return 0; if (WARN_ON_ONCE(rate->eht_ru_alloc > NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) return 0; if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8)) return 0; /* Bandwidth checks for MCS 14 */ if (rate->mcs == 14) { if ((rate->bw != RATE_INFO_BW_EHT_RU && rate->bw != RATE_INFO_BW_80 && rate->bw != RATE_INFO_BW_160 && rate->bw != RATE_INFO_BW_320) || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_996 && rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_2x996 && rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) { WARN(1, "invalid EHT BW for MCS 14: bw:%d, ru:%d\n", rate->bw, rate->eht_ru_alloc); return 0; } } if (rate->bw == RATE_INFO_BW_320 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) result = 4 * rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_3x996P484) result = 3 * rates_996[rate->eht_gi] + rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_3x996) result = 3 * rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_2x996P484) result = 2 * rates_996[rate->eht_gi] + rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_160 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_2x996)) result = 2 * rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996P484P242) result = rates_996[rate->eht_gi] + rates_484[rate->eht_gi] + rates_242[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996P484) result = rates_996[rate->eht_gi] + rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_80 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996)) result = rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_484P242) result = rates_484[rate->eht_gi] + rates_242[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_40 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_484)) result = rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_20 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_242)) result = rates_242[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_106P26) result = rates_106[rate->eht_gi] + rates_26[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_106) result = rates_106[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_52P26) result = rates_52[rate->eht_gi] + rates_26[rate->eht_gi]; else if (rate->bw == 
RATE_INFO_BW_EHT_RU &&
		 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_52)
		result = rates_52[rate->eht_gi];
	else if (rate->bw == RATE_INFO_BW_EHT_RU &&
		 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_26)
		result = rates_26[rate->eht_gi];
	else {
		WARN(1, "invalid EHT MCS: bw:%d, ru:%d\n",
		     rate->bw, rate->eht_ru_alloc);
		return 0;
	}

	/* now scale to the appropriate MCS */
	tmp = result;
	tmp *= SCALE;
	do_div(tmp, mcs_divisors[rate->mcs]);

	/* and take NSS */
	tmp *= rate->nss;
	do_div(tmp, 8);

	result = tmp;

	return result / 10000;
}

static u32 cfg80211_calculate_bitrate_s1g(struct rate_info *rate)
{
	/* For 1, 2, 4, 8 and 16 MHz channels */
	static const u32 base[5][11] = {
		{  300000,  600000,  900000, 1200000, 1800000, 2400000,
		  2700000, 3000000, 3600000, 4000000,
		  /* MCS 10 supported in 1 MHz only */
		   150000, },
		{  650000, 1300000, 1950000, 2600000, 3900000, 5200000,
		  5850000, 6500000, 7800000,
		  /* MCS 9 not valid */ },
		{ 1350000, 2700000, 4050000, 5400000, 8100000, 10800000,
		  12150000, 13500000, 16200000, 18000000, },
		{ 2925000, 5850000, 8775000, 11700000, 17550000, 23400000,
		  26325000, 29250000, 35100000, 39000000, },
		/* 16 MHz MCS 0 is 5850000 (2x the 8 MHz rate), not 8580000 */
		{ 5850000, 11700000, 17550000, 23400000, 35100000, 46800000,
		  52650000, 58500000, 70200000, 78000000, },
	};
	u32 bitrate;
	/* default is 1 MHz index */
	int idx = 0;

	if (rate->mcs >= 11)
		goto warn;

	switch (rate->bw) {
	case RATE_INFO_BW_16:
		idx = 4;
		break;
	case RATE_INFO_BW_8:
		idx = 3;
		break;
	case RATE_INFO_BW_4:
		idx = 2;
		break;
	case RATE_INFO_BW_2:
		idx = 1;
		break;
	case RATE_INFO_BW_1:
		idx = 0;
		break;
	case RATE_INFO_BW_5:
	case RATE_INFO_BW_10:
	case RATE_INFO_BW_20:
	case RATE_INFO_BW_40:
	case RATE_INFO_BW_80:
	case RATE_INFO_BW_160:
	default:
		goto warn;
	}

	bitrate = base[idx][rate->mcs];
	bitrate *= rate->nss;

	if (rate->flags & RATE_INFO_FLAGS_SHORT_GI)
		bitrate = (bitrate / 9) * 10;

	/* do NOT round down here */
	return (bitrate + 50000) / 100000;
warn:
	WARN_ONCE(1, "invalid rate bw=%d, mcs=%d, nss=%d\n",
		  rate->bw, rate->mcs, rate->nss);
	return 0;
}

u32 cfg80211_calculate_bitrate(struct rate_info *rate)
{
	if (rate->flags & RATE_INFO_FLAGS_MCS)
		return cfg80211_calculate_bitrate_ht(rate);
	if (rate->flags & RATE_INFO_FLAGS_DMG)
		return cfg80211_calculate_bitrate_dmg(rate);
	if (rate->flags & RATE_INFO_FLAGS_EXTENDED_SC_DMG)
		return cfg80211_calculate_bitrate_extended_sc_dmg(rate);
	if (rate->flags & RATE_INFO_FLAGS_EDMG)
		return cfg80211_calculate_bitrate_edmg(rate);
	if (rate->flags & RATE_INFO_FLAGS_VHT_MCS)
		return cfg80211_calculate_bitrate_vht(rate);
	if (rate->flags & RATE_INFO_FLAGS_HE_MCS)
		return cfg80211_calculate_bitrate_he(rate);
	if (rate->flags & RATE_INFO_FLAGS_EHT_MCS)
		return cfg80211_calculate_bitrate_eht(rate);
	if (rate->flags & RATE_INFO_FLAGS_S1G_MCS)
		return cfg80211_calculate_bitrate_s1g(rate);
	return rate->legacy;
}
EXPORT_SYMBOL(cfg80211_calculate_bitrate);

int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len,
			  enum ieee80211_p2p_attr_id attr,
			  u8 *buf, unsigned int bufsize)
{
	u8 *out = buf;
	u16 attr_remaining = 0;
	bool desired_attr = false;
	u16 desired_len = 0;

	while (len > 0) {
		unsigned int iedatalen;
		unsigned int copy;
		const u8 *iedata;

		if (len < 2)
			return -EILSEQ;
		iedatalen = ies[1];
		if (iedatalen + 2 > len)
			return -EILSEQ;

		if (ies[0] != WLAN_EID_VENDOR_SPECIFIC)
			goto cont;

		if (iedatalen < 4)
			goto cont;

		iedata = ies + 2;

		/* check WFA OUI, P2P subtype */
		if (iedata[0] != 0x50 || iedata[1] != 0x6f ||
		    iedata[2] != 0x9a || iedata[3] != 0x09)
			goto cont;

		iedatalen -= 4;
		iedata += 4;

		/* check attribute continuation into this IE */
		copy = min_t(unsigned int, attr_remaining, iedatalen);
		if
(copy && desired_attr) { desired_len += copy; if (out) { memcpy(out, iedata, min(bufsize, copy)); out += min(bufsize, copy); bufsize -= min(bufsize, copy); } if (copy == attr_remaining) return desired_len; } attr_remaining -= copy; if (attr_remaining) goto cont; iedatalen -= copy; iedata += copy; while (iedatalen > 0) { u16 attr_len; /* P2P attribute ID & size must fit */ if (iedatalen < 3) return -EILSEQ; desired_attr = iedata[0] == attr; attr_len = get_unaligned_le16(iedata + 1); iedatalen -= 3; iedata += 3; copy = min_t(unsigned int, attr_len, iedatalen); if (desired_attr) { desired_len += copy; if (out) { memcpy(out, iedata, min(bufsize, copy)); out += min(bufsize, copy); bufsize -= min(bufsize, copy); } if (copy == attr_len) return desired_len; } iedata += copy; iedatalen -= copy; attr_remaining = attr_len - copy; } cont: len -= ies[1] + 2; ies += ies[1] + 2; } if (attr_remaining && desired_attr) return -EILSEQ; return -ENOENT; } EXPORT_SYMBOL(cfg80211_get_p2p_attr); static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id, bool id_ext) { int i; /* Make sure array values are legal */ if (WARN_ON(ids[n_ids - 1] == WLAN_EID_EXTENSION)) return false; i = 0; while (i < n_ids) { if (ids[i] == WLAN_EID_EXTENSION) { if (id_ext && (ids[i + 1] == id)) return true; i += 2; continue; } if (ids[i] == id && !id_ext) return true; i++; } return false; } static size_t skip_ie(const u8 *ies, size_t ielen, size_t pos) { /* we assume a validly formed IEs buffer */ u8 len = ies[pos + 1]; pos += 2 + len; /* the IE itself must have 255 bytes for fragments to follow */ if (len < 255) return pos; while (pos < ielen && ies[pos] == WLAN_EID_FRAGMENT) { len = ies[pos + 1]; pos += 2 + len; } return pos; } size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, const u8 *ids, int n_ids, const u8 *after_ric, int n_after_ric, size_t offset) { size_t pos = offset; while (pos < ielen) { u8 ext = 0; if (ies[pos] == WLAN_EID_EXTENSION) ext = 2; if ((pos + ext) >= ielen) break; if (!ieee80211_id_in_list(ids, n_ids, ies[pos + ext], ies[pos] == WLAN_EID_EXTENSION)) break; if (ies[pos] == WLAN_EID_RIC_DATA && n_after_ric) { pos = skip_ie(ies, ielen, pos); while (pos < ielen) { if (ies[pos] == WLAN_EID_EXTENSION) ext = 2; else ext = 0; if ((pos + ext) >= ielen) break; if (!ieee80211_id_in_list(after_ric, n_after_ric, ies[pos + ext], ext == 2)) pos = skip_ie(ies, ielen, pos); else break; } } else { pos = skip_ie(ies, ielen, pos); } } return pos; } EXPORT_SYMBOL(ieee80211_ie_split_ric); void ieee80211_fragment_element(struct sk_buff *skb, u8 *len_pos, u8 frag_id) { unsigned int elem_len; if (!len_pos) return; elem_len = skb->data + skb->len - len_pos - 1; while (elem_len > 255) { /* this one is 255 */ *len_pos = 255; /* remaining data gets smaller */ elem_len -= 255; /* make space for the fragment ID/len in SKB */ skb_put(skb, 2); /* shift back the remaining data to place fragment ID/len */ memmove(len_pos + 255 + 3, len_pos + 255 + 1, elem_len); /* place the fragment ID */ len_pos += 255 + 1; *len_pos = frag_id; /* and point to fragment length to update later */ len_pos++; } *len_pos = elem_len; } EXPORT_SYMBOL(ieee80211_fragment_element); bool ieee80211_operating_class_to_band(u8 operating_class, enum nl80211_band *band) { switch (operating_class) { case 112: case 115 ... 127: case 128 ... 130: *band = NL80211_BAND_5GHZ; return true; case 131 ... 
135: case 137: *band = NL80211_BAND_6GHZ; return true; case 81: case 82: case 83: case 84: *band = NL80211_BAND_2GHZ; return true; case 180: *band = NL80211_BAND_60GHZ; return true; } return false; } EXPORT_SYMBOL(ieee80211_operating_class_to_band); bool ieee80211_operating_class_to_chandef(u8 operating_class, struct ieee80211_channel *chan, struct cfg80211_chan_def *chandef) { u32 control_freq, offset = 0; enum nl80211_band band; if (!ieee80211_operating_class_to_band(operating_class, &band) || !chan || band != chan->band) return false; control_freq = chan->center_freq; chandef->chan = chan; if (control_freq >= 5955) offset = control_freq - 5955; else if (control_freq >= 5745) offset = control_freq - 5745; else if (control_freq >= 5180) offset = control_freq - 5180; offset /= 20; switch (operating_class) { case 81: /* 2 GHz band; 20 MHz; channels 1..13 */ case 82: /* 2 GHz band; 20 MHz; channel 14 */ case 115: /* 5 GHz band; 20 MHz; channels 36,40,44,48 */ case 118: /* 5 GHz band; 20 MHz; channels 52,56,60,64 */ case 121: /* 5 GHz band; 20 MHz; channels 100..144 */ case 124: /* 5 GHz band; 20 MHz; channels 149,153,157,161 */ case 125: /* 5 GHz band; 20 MHz; channels 149..177 */ case 131: /* 6 GHz band; 20 MHz; channels 1..233*/ case 136: /* 6 GHz band; 20 MHz; channel 2 */ chandef->center_freq1 = control_freq; chandef->width = NL80211_CHAN_WIDTH_20; return true; case 83: /* 2 GHz band; 40 MHz; channels 1..9 */ case 116: /* 5 GHz band; 40 MHz; channels 36,44 */ case 119: /* 5 GHz band; 40 MHz; channels 52,60 */ case 122: /* 5 GHz band; 40 MHz; channels 100,108,116,124,132,140 */ case 126: /* 5 GHz band; 40 MHz; channels 149,157,165,173 */ chandef->center_freq1 = control_freq + 10; chandef->width = NL80211_CHAN_WIDTH_40; return true; case 84: /* 2 GHz band; 40 MHz; channels 5..13 */ case 117: /* 5 GHz band; 40 MHz; channels 40,48 */ case 120: /* 5 GHz band; 40 MHz; channels 56,64 */ case 123: /* 5 GHz band; 40 MHz; channels 104,112,120,128,136,144 */ case 127: /* 5 GHz band; 40 MHz; channels 153,161,169,177 */ chandef->center_freq1 = control_freq - 10; chandef->width = NL80211_CHAN_WIDTH_40; return true; case 132: /* 6 GHz band; 40 MHz; channels 1,5,..,229*/ chandef->center_freq1 = control_freq + 10 - (offset & 1) * 20; chandef->width = NL80211_CHAN_WIDTH_40; return true; case 128: /* 5 GHz band; 80 MHz; channels 36..64,100..144,149..177 */ case 133: /* 6 GHz band; 80 MHz; channels 1,5,..,229 */ chandef->center_freq1 = control_freq + 30 - (offset & 3) * 20; chandef->width = NL80211_CHAN_WIDTH_80; return true; case 129: /* 5 GHz band; 160 MHz; channels 36..64,100..144,149..177 */ case 134: /* 6 GHz band; 160 MHz; channels 1,5,..,229 */ chandef->center_freq1 = control_freq + 70 - (offset & 7) * 20; chandef->width = NL80211_CHAN_WIDTH_160; return true; case 130: /* 5 GHz band; 80+80 MHz; channels 36..64,100..144,149..177 */ case 135: /* 6 GHz band; 80+80 MHz; channels 1,5,..,229 */ /* The center_freq2 of 80+80 MHz is unknown */ case 137: /* 6 GHz band; 320 MHz; channels 1,5,..,229 */ /* 320-1 or 320-2 channelization is unknown */ default: return false; } } EXPORT_SYMBOL(ieee80211_operating_class_to_chandef); bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, u8 *op_class) { u8 vht_opclass; u32 freq = chandef->center_freq1; if (freq >= 2412 && freq <= 2472) { if (chandef->width > NL80211_CHAN_WIDTH_40) return false; /* 2.407 GHz, channels 1..13 */ if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 83; /* HT40+ */ else 
*op_class = 84; /* HT40- */ } else { *op_class = 81; } return true; } if (freq == 2484) { /* channel 14 is only for IEEE 802.11b */ if (chandef->width != NL80211_CHAN_WIDTH_20_NOHT) return false; *op_class = 82; /* channel 14 */ return true; } switch (chandef->width) { case NL80211_CHAN_WIDTH_80: vht_opclass = 128; break; case NL80211_CHAN_WIDTH_160: vht_opclass = 129; break; case NL80211_CHAN_WIDTH_80P80: vht_opclass = 130; break; case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_5: return false; /* unsupported for now */ default: vht_opclass = 0; break; } /* 5 GHz, channels 36..48 */ if (freq >= 5180 && freq <= 5240) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 116; else *op_class = 117; } else { *op_class = 115; } return true; } /* 5 GHz, channels 52..64 */ if (freq >= 5260 && freq <= 5320) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 119; else *op_class = 120; } else { *op_class = 118; } return true; } /* 5 GHz, channels 100..144 */ if (freq >= 5500 && freq <= 5720) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 122; else *op_class = 123; } else { *op_class = 121; } return true; } /* 5 GHz, channels 149..169 */ if (freq >= 5745 && freq <= 5845) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 126; else *op_class = 127; } else if (freq <= 5805) { *op_class = 124; } else { *op_class = 125; } return true; } /* 56.16 GHz, channel 1..4 */ if (freq >= 56160 + 2160 * 1 && freq <= 56160 + 2160 * 6) { if (chandef->width >= NL80211_CHAN_WIDTH_40) return false; *op_class = 180; return true; } /* not supported yet */ return false; } EXPORT_SYMBOL(ieee80211_chandef_to_operating_class); static int cfg80211_wdev_bi(struct wireless_dev *wdev) { switch (wdev->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: WARN_ON(wdev->valid_links); return wdev->links[0].ap.beacon_interval; case NL80211_IFTYPE_MESH_POINT: return wdev->u.mesh.beacon_interval; case NL80211_IFTYPE_ADHOC: return wdev->u.ibss.beacon_interval; default: break; } return 0; } static void cfg80211_calculate_bi_data(struct wiphy *wiphy, u32 new_beacon_int, u32 *beacon_int_gcd, bool *beacon_int_different, int radio_idx) { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; *beacon_int_gcd = 0; *beacon_int_different = false; rdev = wiphy_to_rdev(wiphy); list_for_each_entry(wdev, &wiphy->wdev_list, list) { int wdev_bi; /* this feature isn't supported with MLO */ if (wdev->valid_links) continue; /* skip wdevs not active on the given wiphy radio */ if (radio_idx >= 0 && !(rdev_get_radio_mask(rdev, wdev->netdev) & BIT(radio_idx))) continue; wdev_bi = cfg80211_wdev_bi(wdev); if (!wdev_bi) continue; if (!*beacon_int_gcd) { *beacon_int_gcd = wdev_bi; continue; } if (wdev_bi == *beacon_int_gcd) continue; *beacon_int_different = true; *beacon_int_gcd = gcd(*beacon_int_gcd, wdev_bi); } if (new_beacon_int && *beacon_int_gcd != new_beacon_int) { if (*beacon_int_gcd) *beacon_int_different = true; *beacon_int_gcd = gcd(*beacon_int_gcd, new_beacon_int); } } int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev, enum nl80211_iftype iftype, u32 beacon_int) { /* * This is just a basic pre-condition check; 
if interface combinations * are possible the driver must already be checking those with a call * to cfg80211_check_combinations(), in which case we'll validate more * through the cfg80211_calculate_bi_data() call and code in * cfg80211_iter_combinations(). */ if (beacon_int < 10 || beacon_int > 10000) return -EINVAL; return 0; } int cfg80211_iter_combinations(struct wiphy *wiphy, struct iface_combination_params *params, void (*iter)(const struct ieee80211_iface_combination *c, void *data), void *data) { const struct wiphy_radio *radio = NULL; const struct ieee80211_iface_combination *c, *cs; const struct ieee80211_regdomain *regdom; enum nl80211_dfs_regions region = 0; int i, j, n, iftype; int num_interfaces = 0; u32 used_iftypes = 0; u32 beacon_int_gcd; bool beacon_int_different; if (params->radio_idx >= 0) radio = &wiphy->radio[params->radio_idx]; /* * This is a bit strange, since the iteration used to rely only on * the data given by the driver, but here it now relies on context, * in form of the currently operating interfaces. * This is OK for all current users, and saves us from having to * push the GCD calculations into all the drivers. * In the future, this should probably rely more on data that's in * cfg80211 already - the only thing not would appear to be any new * interfaces (while being brought up) and channel/radar data. */ cfg80211_calculate_bi_data(wiphy, params->new_beacon_int, &beacon_int_gcd, &beacon_int_different, params->radio_idx); if (params->radar_detect) { rcu_read_lock(); regdom = rcu_dereference(cfg80211_regdomain); if (regdom) region = regdom->dfs_region; rcu_read_unlock(); } for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) { num_interfaces += params->iftype_num[iftype]; if (params->iftype_num[iftype] > 0 && !cfg80211_iftype_allowed(wiphy, iftype, 0, 1)) used_iftypes |= BIT(iftype); } if (radio) { cs = radio->iface_combinations; n = radio->n_iface_combinations; } else { cs = wiphy->iface_combinations; n = wiphy->n_iface_combinations; } for (i = 0; i < n; i++) { struct ieee80211_iface_limit *limits; u32 all_iftypes = 0; c = &cs[i]; if (num_interfaces > c->max_interfaces) continue; if (params->num_different_channels > c->num_different_channels) continue; limits = kmemdup_array(c->limits, c->n_limits, sizeof(*limits), GFP_KERNEL); if (!limits) return -ENOMEM; for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) { if (cfg80211_iftype_allowed(wiphy, iftype, 0, 1)) continue; for (j = 0; j < c->n_limits; j++) { all_iftypes |= limits[j].types; if (!(limits[j].types & BIT(iftype))) continue; if (limits[j].max < params->iftype_num[iftype]) goto cont; limits[j].max -= params->iftype_num[iftype]; } } if (params->radar_detect != (c->radar_detect_widths & params->radar_detect)) goto cont; if (params->radar_detect && c->radar_detect_regions && !(c->radar_detect_regions & BIT(region))) goto cont; /* Finally check that all iftypes that we're currently * using are actually part of this combination. If they * aren't then we can't use this combination and have * to continue to the next. */ if ((all_iftypes & used_iftypes) != used_iftypes) goto cont; if (beacon_int_gcd) { if (c->beacon_int_min_gcd && beacon_int_gcd < c->beacon_int_min_gcd) goto cont; if (!c->beacon_int_min_gcd && beacon_int_different) goto cont; } /* This combination covered all interface types and * supported the requested numbers, so we're good. 
*/ (*iter)(c, data); cont: kfree(limits); } return 0; } EXPORT_SYMBOL(cfg80211_iter_combinations); static void cfg80211_iter_sum_ifcombs(const struct ieee80211_iface_combination *c, void *data) { int *num = data; (*num)++; } int cfg80211_check_combinations(struct wiphy *wiphy, struct iface_combination_params *params) { int err, num = 0; err = cfg80211_iter_combinations(wiphy, params, cfg80211_iter_sum_ifcombs, &num); if (err) return err; if (num == 0) return -EBUSY; return 0; } EXPORT_SYMBOL(cfg80211_check_combinations); int ieee80211_get_ratemask(struct ieee80211_supported_band *sband, const u8 *rates, unsigned int n_rates, u32 *mask) { int i, j; if (!sband) return -EINVAL; if (n_rates == 0 || n_rates > NL80211_MAX_SUPP_RATES) return -EINVAL; *mask = 0; for (i = 0; i < n_rates; i++) { int rate = (rates[i] & 0x7f) * 5; bool found = false; for (j = 0; j < sband->n_bitrates; j++) { if (sband->bitrates[j].bitrate == rate) { found = true; *mask |= BIT(j); break; } } if (!found) return -EINVAL; } /* * mask must have at least one bit set here since we * didn't accept a 0-length rates array nor allowed * entries in the array that didn't exist */ return 0; } unsigned int ieee80211_get_num_supported_channels(struct wiphy *wiphy) { enum nl80211_band band; unsigned int n_channels = 0; for (band = 0; band < NUM_NL80211_BANDS; band++) if (wiphy->bands[band]) n_channels += wiphy->bands[band]->n_channels; return n_channels; } EXPORT_SYMBOL(ieee80211_get_num_supported_channels); int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr, struct station_info *sinfo) { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; wdev = dev->ieee80211_ptr; if (!wdev) return -EOPNOTSUPP; rdev = wiphy_to_rdev(wdev->wiphy); if (!rdev->ops->get_station) return -EOPNOTSUPP; memset(sinfo, 0, sizeof(*sinfo)); guard(wiphy)(&rdev->wiphy); return rdev_get_station(rdev, dev, mac_addr, sinfo); } EXPORT_SYMBOL(cfg80211_get_station); void cfg80211_free_nan_func(struct cfg80211_nan_func *f) { int i; if (!f) return; kfree(f->serv_spec_info); kfree(f->srf_bf); kfree(f->srf_macs); for (i = 0; i < f->num_rx_filters; i++) kfree(f->rx_filters[i].filter); for (i = 0; i < f->num_tx_filters; i++) kfree(f->tx_filters[i].filter); kfree(f->rx_filters); kfree(f->tx_filters); kfree(f); } EXPORT_SYMBOL(cfg80211_free_nan_func); bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, u32 center_freq_khz, u32 bw_khz) { u32 start_freq_khz, end_freq_khz; start_freq_khz = center_freq_khz - (bw_khz / 2); end_freq_khz = center_freq_khz + (bw_khz / 2); if (start_freq_khz >= freq_range->start_freq_khz && end_freq_khz <= freq_range->end_freq_khz) return true; return false; } int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp) { sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1, sizeof(*(sinfo->pertid)), gfp); if (!sinfo->pertid) return -ENOMEM; return 0; } EXPORT_SYMBOL(cfg80211_sinfo_alloc_tid_stats); /* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */ /* Ethernet-II snap header (RFC1042 for most EtherTypes) */ const unsigned char rfc1042_header[] __aligned(2) = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00 }; EXPORT_SYMBOL(rfc1042_header); /* Bridge-Tunnel header (for EtherTypes ETH_P_AARP and ETH_P_IPX) */ const unsigned char bridge_tunnel_header[] __aligned(2) = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 }; EXPORT_SYMBOL(bridge_tunnel_header); /* Layer 2 Update frame (802.2 Type 1 LLC XID Update response) */ struct iapp_layer2_update { u8 da[ETH_ALEN]; /* broadcast */ u8 sa[ETH_ALEN]; /* STA 
addr */ __be16 len; /* 6 */ u8 dsap; /* 0 */ u8 ssap; /* 0 */ u8 control; u8 xid_info[3]; } __packed; void cfg80211_send_layer2_update(struct net_device *dev, const u8 *addr) { struct iapp_layer2_update *msg; struct sk_buff *skb; /* Send Level 2 Update Frame to update forwarding tables in layer 2 * bridge devices */ skb = dev_alloc_skb(sizeof(*msg)); if (!skb) return; msg = skb_put(skb, sizeof(*msg)); /* 802.2 Type 1 Logical Link Control (LLC) Exchange Identifier (XID) * Update response frame; IEEE Std 802.2-1998, 5.4.1.2.1 */ eth_broadcast_addr(msg->da); ether_addr_copy(msg->sa, addr); msg->len = htons(6); msg->dsap = 0; msg->ssap = 0x01; /* NULL LSAP, CR Bit: Response */ msg->control = 0xaf; /* XID response lsb.1111F101. * F=0 (no poll command; unsolicited frame) */ msg->xid_info[0] = 0x81; /* XID format identifier */ msg->xid_info[1] = 1; /* LLC types/classes: Type 1 LLC */ msg->xid_info[2] = 0; /* XID sender's receive window size (RW) */ skb->dev = dev; skb->protocol = eth_type_trans(skb, dev); memset(skb->cb, 0, sizeof(skb->cb)); netif_rx(skb); } EXPORT_SYMBOL(cfg80211_send_layer2_update); int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, enum ieee80211_vht_chanwidth bw, int mcs, bool ext_nss_bw_capable, unsigned int max_vht_nss) { u16 map = le16_to_cpu(cap->supp_mcs.rx_mcs_map); int ext_nss_bw; int supp_width; int i, mcs_encoding; if (map == 0xffff) return 0; if (WARN_ON(mcs > 9 || max_vht_nss > 8)) return 0; if (mcs <= 7) mcs_encoding = 0; else if (mcs == 8) mcs_encoding = 1; else mcs_encoding = 2; if (!max_vht_nss) { /* find max_vht_nss for the given MCS */ for (i = 7; i >= 0; i--) { int supp = (map >> (2 * i)) & 3; if (supp == 3) continue; if (supp >= mcs_encoding) { max_vht_nss = i + 1; break; } } } if (!(cap->supp_mcs.tx_mcs_map & cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE))) return max_vht_nss; ext_nss_bw = le32_get_bits(cap->vht_cap_info, IEEE80211_VHT_CAP_EXT_NSS_BW_MASK); supp_width = le32_get_bits(cap->vht_cap_info, IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK); /* if not capable, treat ext_nss_bw as 0 */ if (!ext_nss_bw_capable) ext_nss_bw = 0; /* This is invalid */ if (supp_width == 3) return 0; /* This is an invalid combination so pretend nothing is supported */ if (supp_width == 2 && (ext_nss_bw == 1 || ext_nss_bw == 2)) return 0; /* * Cover all the special cases according to IEEE 802.11-2016 * Table 9-250. All other cases are either factor of 1 or not * valid/supported. 
*/ switch (bw) { case IEEE80211_VHT_CHANWIDTH_USE_HT: case IEEE80211_VHT_CHANWIDTH_80MHZ: if ((supp_width == 1 || supp_width == 2) && ext_nss_bw == 3) return 2 * max_vht_nss; break; case IEEE80211_VHT_CHANWIDTH_160MHZ: if (supp_width == 0 && (ext_nss_bw == 1 || ext_nss_bw == 2)) return max_vht_nss / 2; if (supp_width == 0 && ext_nss_bw == 3) return (3 * max_vht_nss) / 4; if (supp_width == 1 && ext_nss_bw == 3) return 2 * max_vht_nss; break; case IEEE80211_VHT_CHANWIDTH_80P80MHZ: if (supp_width == 0 && ext_nss_bw == 1) return 0; /* not possible */ if (supp_width == 0 && ext_nss_bw == 2) return max_vht_nss / 2; if (supp_width == 0 && ext_nss_bw == 3) return (3 * max_vht_nss) / 4; if (supp_width == 1 && ext_nss_bw == 0) return 0; /* not possible */ if (supp_width == 1 && ext_nss_bw == 1) return max_vht_nss / 2; if (supp_width == 1 && ext_nss_bw == 2) return (3 * max_vht_nss) / 4; break; } /* not covered or invalid combination received */ return max_vht_nss; } EXPORT_SYMBOL(ieee80211_get_vht_max_nss); bool cfg80211_iftype_allowed(struct wiphy *wiphy, enum nl80211_iftype iftype, bool is_4addr, u8 check_swif) { bool is_vlan = iftype == NL80211_IFTYPE_AP_VLAN; switch (check_swif) { case 0: if (is_vlan && is_4addr) return wiphy->flags & WIPHY_FLAG_4ADDR_AP; return wiphy->interface_modes & BIT(iftype); case 1: if (!(wiphy->software_iftypes & BIT(iftype)) && is_vlan) return wiphy->flags & WIPHY_FLAG_4ADDR_AP; return wiphy->software_iftypes & BIT(iftype); default: break; } return false; } EXPORT_SYMBOL(cfg80211_iftype_allowed); void cfg80211_remove_link(struct wireless_dev *wdev, unsigned int link_id) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); lockdep_assert_wiphy(wdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: cfg80211_stop_ap(rdev, wdev->netdev, link_id, true); break; default: /* per-link not relevant */ break; } rdev_del_intf_link(rdev, wdev, link_id); wdev->valid_links &= ~BIT(link_id); eth_zero_addr(wdev->links[link_id].addr); } void cfg80211_remove_links(struct wireless_dev *wdev) { unsigned int link_id; /* * links are controlled by upper layers (userspace/cfg) * only for AP mode, so only remove them here for AP */ if (wdev->iftype != NL80211_IFTYPE_AP) return; if (wdev->valid_links) { for_each_valid_link(wdev, link_id) cfg80211_remove_link(wdev, link_id); } } int cfg80211_remove_virtual_intf(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { cfg80211_remove_links(wdev); return rdev_del_virtual_intf(rdev, wdev); } const struct wiphy_iftype_ext_capab * cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type) { int i; for (i = 0; i < wiphy->num_iftype_ext_capab; i++) { if (wiphy->iftype_ext_capab[i].iftype == type) return &wiphy->iftype_ext_capab[i]; } return NULL; } EXPORT_SYMBOL(cfg80211_get_iftype_ext_capa); static bool ieee80211_radio_freq_range_valid(const struct wiphy_radio *radio, u32 freq, u32 width) { const struct wiphy_radio_freq_range *r; int i; for (i = 0; i < radio->n_freq_range; i++) { r = &radio->freq_range[i]; if (freq - width / 2 >= r->start_freq && freq + width / 2 <= r->end_freq) return true; } return false; } bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio, const struct cfg80211_chan_def *chandef) { u32 freq, width; freq = ieee80211_chandef_to_khz(chandef); width = cfg80211_chandef_get_width(chandef); if (!ieee80211_radio_freq_range_valid(radio, freq, width)) return false; freq = MHZ_TO_KHZ(chandef->center_freq2); if (freq && 
!ieee80211_radio_freq_range_valid(radio, freq, width)) return false; return true; } EXPORT_SYMBOL(cfg80211_radio_chandef_valid); bool cfg80211_wdev_channel_allowed(struct wireless_dev *wdev, struct ieee80211_channel *chan) { struct wiphy *wiphy = wdev->wiphy; const struct wiphy_radio *radio; struct cfg80211_chan_def chandef; u32 radio_mask; int i; radio_mask = wdev->radio_mask; if (!wiphy->n_radio || radio_mask == BIT(wiphy->n_radio) - 1) return true; cfg80211_chandef_create(&chandef, chan, NL80211_CHAN_HT20); for (i = 0; i < wiphy->n_radio; i++) { if (!(radio_mask & BIT(i))) continue; radio = &wiphy->radio[i]; if (!cfg80211_radio_chandef_valid(radio, &chandef)) continue; return true; } return false; } EXPORT_SYMBOL(cfg80211_wdev_channel_allowed);
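/*
 * Illustrative sketch (not part of the original file): a standalone,
 * userspace rendering of the fixed-point trick used by
 * cfg80211_calculate_bitrate_he() above. Each mcs_divisors[] entry is
 * SCALE (6144) divided by the per-MCS rate factor, so multiplying the
 * 8-stream base rate by SCALE and dividing by the divisor applies the
 * MCS factor with integer math only. The constants are copied from the
 * tables above; the helper name is invented for the example.
 */
#include <stdint.h>
#include <stdio.h>

#define SCALE 6144

/* returns the rate in units of 100 kbps, like cfg80211_calculate_bitrate() */
static uint32_t he_rate_100kbps(uint32_t base, uint32_t mcs_divisor,
				unsigned int nss, int dcm)
{
	uint64_t tmp = (uint64_t)base * SCALE / mcs_divisor;	/* MCS factor */

	tmp = tmp * nss / 8;	/* base rates are quoted for 8 spatial streams */
	if (dcm)
		tmp /= 2;	/* dual carrier modulation halves the rate */
	return (uint32_t)(tmp / 10000);
}

int main(void)
{
	/* 80 MHz, 0.8 us GI base (rates_996[0]) at MCS 11 (divisor 6144), 2 NSS */
	uint32_t r = he_rate_100kbps(480388888, 6144, 2, 0);

	printf("%u.%u Mbps\n", r / 10, r % 10);	/* prints 1200.9 Mbps */
	return 0;
}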
// SPDX-License-Identifier: GPL-2.0-only
/*
 *	IEEE 802.1D Generic Attribute Registration Protocol (GARP)
 *
 *	Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
 */
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
#include <linux/llc.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <net/llc.h>
#include <net/llc_pdu.h>
#include <net/garp.h>
#include <linux/unaligned.h>

static unsigned int garp_join_time __read_mostly = 200;
module_param(garp_join_time, uint, 0644);
MODULE_PARM_DESC(garp_join_time, "Join time in ms (default 200ms)");
MODULE_DESCRIPTION("IEEE 802.1D Generic Attribute Registration Protocol (GARP)");
MODULE_LICENSE("GPL");

static const struct garp_state_trans {
	u8 state;
	u8 action;
} garp_applicant_state_table[GARP_APPLICANT_MAX + 1][GARP_EVENT_MAX + 1] =
{ [GARP_APPLICANT_VA] = { [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_AA, .action = GARP_ACTION_S_JOIN_IN }, [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_AA }, [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VA }, [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VA }, [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VA }, [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID }, [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_LA }, }, [GARP_APPLICANT_AA] = { [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_QA, .action = GARP_ACTION_S_JOIN_IN }, [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QA }, [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VA }, [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VA }, [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VA }, [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID }, [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_LA }, }, [GARP_APPLICANT_QA] = { [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID }, [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QA }, [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VA }, [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VA }, [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID }, [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_LA }, }, [GARP_APPLICANT_LA] = { [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_VO, .action = GARP_ACTION_S_LEAVE_EMPTY }, [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_LA }, [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_LA }, [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_LA }, [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_VA }, [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_INVALID }, }, [GARP_APPLICANT_VP] = { [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_AA, .action = GARP_ACTION_S_JOIN_IN }, [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_AP }, [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID }, [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_VO }, }, [GARP_APPLICANT_AP] = { [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_QA, .action = GARP_ACTION_S_JOIN_IN }, [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QP }, [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID }, [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_AO }, }, [GARP_APPLICANT_QP] = { [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID }, [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QP }, [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_REQ_JOIN] 
= { .state = GARP_APPLICANT_INVALID }, [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_QO }, }, [GARP_APPLICANT_VO] = { [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID }, [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_AO }, [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_INVALID }, }, [GARP_APPLICANT_AO] = { [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID }, [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QO }, [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_AP }, [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_INVALID }, }, [GARP_APPLICANT_QO] = { [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID }, [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QO }, [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_QP }, [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_INVALID }, }, }; static int garp_attr_cmp(const struct garp_attr *attr, const void *data, u8 len, u8 type) { if (attr->type != type) return attr->type - type; if (attr->dlen != len) return attr->dlen - len; return memcmp(attr->data, data, len); } static struct garp_attr *garp_attr_lookup(const struct garp_applicant *app, const void *data, u8 len, u8 type) { struct rb_node *parent = app->gid.rb_node; struct garp_attr *attr; int d; while (parent) { attr = rb_entry(parent, struct garp_attr, node); d = garp_attr_cmp(attr, data, len, type); if (d > 0) parent = parent->rb_left; else if (d < 0) parent = parent->rb_right; else return attr; } return NULL; } static struct garp_attr *garp_attr_create(struct garp_applicant *app, const void *data, u8 len, u8 type) { struct rb_node *parent = NULL, **p = &app->gid.rb_node; struct garp_attr *attr; int d; while (*p) { parent = *p; attr = rb_entry(parent, struct garp_attr, node); d = garp_attr_cmp(attr, data, len, type); if (d > 0) p = &parent->rb_left; else if (d < 0) p = &parent->rb_right; else { /* The attribute already exists; re-use it. */ return attr; } } attr = kmalloc(sizeof(*attr) + len, GFP_ATOMIC); if (!attr) return attr; attr->state = GARP_APPLICANT_VO; attr->type = type; attr->dlen = len; memcpy(attr->data, data, len); rb_link_node(&attr->node, parent, p); rb_insert_color(&attr->node, &app->gid); return attr; } static void garp_attr_destroy(struct garp_applicant *app, struct garp_attr *attr) { rb_erase(&attr->node, &app->gid); kfree(attr); } static void garp_attr_destroy_all(struct garp_applicant *app) { struct rb_node *node, *next; struct garp_attr *attr; for (node = rb_first(&app->gid); next = node ? 
rb_next(node) : NULL, node != NULL; node = next) { attr = rb_entry(node, struct garp_attr, node); garp_attr_destroy(app, attr); } } static int garp_pdu_init(struct garp_applicant *app) { struct sk_buff *skb; struct garp_pdu_hdr *gp; #define LLC_RESERVE sizeof(struct llc_pdu_un) skb = alloc_skb(app->dev->mtu + LL_RESERVED_SPACE(app->dev), GFP_ATOMIC); if (!skb) return -ENOMEM; skb->dev = app->dev; skb->protocol = htons(ETH_P_802_2); skb_reserve(skb, LL_RESERVED_SPACE(app->dev) + LLC_RESERVE); gp = __skb_put(skb, sizeof(*gp)); put_unaligned(htons(GARP_PROTOCOL_ID), &gp->protocol); app->pdu = skb; return 0; } static int garp_pdu_append_end_mark(struct garp_applicant *app) { if (skb_tailroom(app->pdu) < sizeof(u8)) return -1; __skb_put_u8(app->pdu, GARP_END_MARK); return 0; } static void garp_pdu_queue(struct garp_applicant *app) { if (!app->pdu) return; garp_pdu_append_end_mark(app); garp_pdu_append_end_mark(app); llc_pdu_header_init(app->pdu, LLC_PDU_TYPE_U, LLC_SAP_BSPAN, LLC_SAP_BSPAN, LLC_PDU_CMD); llc_pdu_init_as_ui_cmd(app->pdu); llc_mac_hdr_init(app->pdu, app->dev->dev_addr, app->app->proto.group_address); skb_queue_tail(&app->queue, app->pdu); app->pdu = NULL; } static void garp_queue_xmit(struct garp_applicant *app) { struct sk_buff *skb; while ((skb = skb_dequeue(&app->queue))) dev_queue_xmit(skb); } static int garp_pdu_append_msg(struct garp_applicant *app, u8 attrtype) { struct garp_msg_hdr *gm; if (skb_tailroom(app->pdu) < sizeof(*gm)) return -1; gm = __skb_put(app->pdu, sizeof(*gm)); gm->attrtype = attrtype; garp_cb(app->pdu)->cur_type = attrtype; return 0; } static int garp_pdu_append_attr(struct garp_applicant *app, const struct garp_attr *attr, enum garp_attr_event event) { struct garp_attr_hdr *ga; unsigned int len; int err; again: if (!app->pdu) { err = garp_pdu_init(app); if (err < 0) return err; } if (garp_cb(app->pdu)->cur_type != attr->type) { if (garp_cb(app->pdu)->cur_type && garp_pdu_append_end_mark(app) < 0) goto queue; if (garp_pdu_append_msg(app, attr->type) < 0) goto queue; } len = sizeof(*ga) + attr->dlen; if (skb_tailroom(app->pdu) < len) goto queue; ga = __skb_put(app->pdu, len); ga->len = len; ga->event = event; memcpy(ga->data, attr->data, attr->dlen); return 0; queue: garp_pdu_queue(app); goto again; } static void garp_attr_event(struct garp_applicant *app, struct garp_attr *attr, enum garp_event event) { enum garp_applicant_state state; state = garp_applicant_state_table[attr->state][event].state; if (state == GARP_APPLICANT_INVALID) return; switch (garp_applicant_state_table[attr->state][event].action) { case GARP_ACTION_NONE: break; case GARP_ACTION_S_JOIN_IN: /* When appending the attribute fails, don't update state in * order to retry on next TRANSMIT_PDU event. */ if (garp_pdu_append_attr(app, attr, GARP_JOIN_IN) < 0) return; break; case GARP_ACTION_S_LEAVE_EMPTY: garp_pdu_append_attr(app, attr, GARP_LEAVE_EMPTY); /* As a pure applicant, sending a leave message implies that * the attribute was unregistered and can be destroyed. 
 */
		garp_attr_destroy(app, attr);
		return;
	default:
		WARN_ON(1);
	}

	attr->state = state;
}

int garp_request_join(const struct net_device *dev,
		      const struct garp_application *appl,
		      const void *data, u8 len, u8 type)
{
	struct garp_port *port = rtnl_dereference(dev->garp_port);
	struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]);
	struct garp_attr *attr;

	spin_lock_bh(&app->lock);
	attr = garp_attr_create(app, data, len, type);
	if (!attr) {
		spin_unlock_bh(&app->lock);
		return -ENOMEM;
	}
	garp_attr_event(app, attr, GARP_EVENT_REQ_JOIN);
	spin_unlock_bh(&app->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(garp_request_join);

void garp_request_leave(const struct net_device *dev,
			const struct garp_application *appl,
			const void *data, u8 len, u8 type)
{
	struct garp_port *port = rtnl_dereference(dev->garp_port);
	struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]);
	struct garp_attr *attr;

	spin_lock_bh(&app->lock);
	attr = garp_attr_lookup(app, data, len, type);
	if (!attr) {
		spin_unlock_bh(&app->lock);
		return;
	}
	garp_attr_event(app, attr, GARP_EVENT_REQ_LEAVE);
	spin_unlock_bh(&app->lock);
}
EXPORT_SYMBOL_GPL(garp_request_leave);

static void garp_gid_event(struct garp_applicant *app, enum garp_event event)
{
	struct rb_node *node, *next;
	struct garp_attr *attr;

	for (node = rb_first(&app->gid);
	     next = node ? rb_next(node) : NULL, node != NULL;
	     node = next) {
		attr = rb_entry(node, struct garp_attr, node);
		garp_attr_event(app, attr, event);
	}
}

static void garp_join_timer_arm(struct garp_applicant *app)
{
	unsigned long delay;

	delay = get_random_u32_below(msecs_to_jiffies(garp_join_time));
	mod_timer(&app->join_timer, jiffies + delay);
}

static void garp_join_timer(struct timer_list *t)
{
	struct garp_applicant *app = from_timer(app, t, join_timer);

	spin_lock(&app->lock);
	garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU);
	garp_pdu_queue(app);
	spin_unlock(&app->lock);

	garp_queue_xmit(app);
	garp_join_timer_arm(app);
}

static int garp_pdu_parse_end_mark(struct sk_buff *skb)
{
	if (!pskb_may_pull(skb, sizeof(u8)))
		return -1;
	if (*skb->data == GARP_END_MARK) {
		skb_pull(skb, sizeof(u8));
		return -1;
	}
	return 0;
}

static int garp_pdu_parse_attr(struct garp_applicant *app, struct sk_buff *skb,
			       u8 attrtype)
{
	const struct garp_attr_hdr *ga;
	struct garp_attr *attr;
	enum garp_event event;
	unsigned int dlen;

	if (!pskb_may_pull(skb, sizeof(*ga)))
		return -1;
	ga = (struct garp_attr_hdr *)skb->data;
	if (ga->len < sizeof(*ga))
		return -1;

	if (!pskb_may_pull(skb, ga->len))
		return -1;
	skb_pull(skb, ga->len);
	/* payload length is the attribute length minus its header,
	 * not the other way around (which would underflow)
	 */
	dlen = ga->len - sizeof(*ga);

	if (attrtype > app->app->maxattr)
		return 0;

	switch (ga->event) {
	case GARP_LEAVE_ALL:
		if (dlen != 0)
			return -1;
		garp_gid_event(app, GARP_EVENT_R_LEAVE_EMPTY);
		return 0;
	case GARP_JOIN_EMPTY:
		event = GARP_EVENT_R_JOIN_EMPTY;
		break;
	case GARP_JOIN_IN:
		event = GARP_EVENT_R_JOIN_IN;
		break;
	case GARP_LEAVE_EMPTY:
		event = GARP_EVENT_R_LEAVE_EMPTY;
		break;
	case GARP_EMPTY:
		event = GARP_EVENT_R_EMPTY;
		break;
	default:
		return 0;
	}

	if (dlen == 0)
		return -1;
	attr = garp_attr_lookup(app, ga->data, dlen, attrtype);
	if (attr == NULL)
		return 0;
	garp_attr_event(app, attr, event);
	return 0;
}

static int garp_pdu_parse_msg(struct garp_applicant *app, struct sk_buff *skb)
{
	const struct garp_msg_hdr *gm;

	if (!pskb_may_pull(skb, sizeof(*gm)))
		return -1;
	gm = (struct garp_msg_hdr *)skb->data;
	if (gm->attrtype == 0)
		return -1;
	skb_pull(skb, sizeof(*gm));

	while (skb->len > 0) {
		if (garp_pdu_parse_attr(app, skb, gm->attrtype) < 0)
			return -1;
		if (garp_pdu_parse_end_mark(skb) < 0)
			break;
	}
	return 0;
}
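/*
 * Worked example (added for illustration) of the applicant state machine
 * defined in garp_applicant_state_table above: a new attribute starts in
 * GARP_APPLICANT_VO (see garp_attr_create()). GARP_EVENT_REQ_JOIN moves it
 * to VP; the next join-timer GARP_EVENT_TRANSMIT_PDU sends a JoinIn
 * (GARP_ACTION_S_JOIN_IN) and moves it to AA, the one after that sends a
 * second JoinIn and moves it to QA. In QA, TRANSMIT_PDU maps to
 * GARP_APPLICANT_INVALID, so nothing more is sent until a peer event such
 * as R_LEAVE_EMPTY (QA -> VP) restarts the declarations.
 */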
static void garp_pdu_rcv(const struct stp_proto *proto, struct sk_buff *skb, struct net_device *dev) { struct garp_application *appl = proto->data; struct garp_port *port; struct garp_applicant *app; const struct garp_pdu_hdr *gp; port = rcu_dereference(dev->garp_port); if (!port) goto err; app = rcu_dereference(port->applicants[appl->type]); if (!app) goto err; if (!pskb_may_pull(skb, sizeof(*gp))) goto err; gp = (struct garp_pdu_hdr *)skb->data; if (get_unaligned(&gp->protocol) != htons(GARP_PROTOCOL_ID)) goto err; skb_pull(skb, sizeof(*gp)); spin_lock(&app->lock); while (skb->len > 0) { if (garp_pdu_parse_msg(app, skb) < 0) break; if (garp_pdu_parse_end_mark(skb) < 0) break; } spin_unlock(&app->lock); err: kfree_skb(skb); } static int garp_init_port(struct net_device *dev) { struct garp_port *port; port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) return -ENOMEM; rcu_assign_pointer(dev->garp_port, port); return 0; } static void garp_release_port(struct net_device *dev) { struct garp_port *port = rtnl_dereference(dev->garp_port); unsigned int i; for (i = 0; i <= GARP_APPLICATION_MAX; i++) { if (rtnl_dereference(port->applicants[i])) return; } RCU_INIT_POINTER(dev->garp_port, NULL); kfree_rcu(port, rcu); } int garp_init_applicant(struct net_device *dev, struct garp_application *appl) { struct garp_applicant *app; int err; ASSERT_RTNL(); if (!rtnl_dereference(dev->garp_port)) { err = garp_init_port(dev); if (err < 0) goto err1; } err = -ENOMEM; app = kzalloc(sizeof(*app), GFP_KERNEL); if (!app) goto err2; err = dev_mc_add(dev, appl->proto.group_address); if (err < 0) goto err3; app->dev = dev; app->app = appl; app->gid = RB_ROOT; spin_lock_init(&app->lock); skb_queue_head_init(&app->queue); rcu_assign_pointer(dev->garp_port->applicants[appl->type], app); timer_setup(&app->join_timer, garp_join_timer, 0); garp_join_timer_arm(app); return 0; err3: kfree(app); err2: garp_release_port(dev); err1: return err; } EXPORT_SYMBOL_GPL(garp_init_applicant); void garp_uninit_applicant(struct net_device *dev, struct garp_application *appl) { struct garp_port *port = rtnl_dereference(dev->garp_port); struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]); ASSERT_RTNL(); RCU_INIT_POINTER(port->applicants[appl->type], NULL); /* Delete timer and generate a final TRANSMIT_PDU event to flush out * all pending messages before the applicant is gone. */ timer_shutdown_sync(&app->join_timer); spin_lock_bh(&app->lock); garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU); garp_attr_destroy_all(app); garp_pdu_queue(app); spin_unlock_bh(&app->lock); garp_queue_xmit(app); dev_mc_del(dev, appl->proto.group_address); kfree_rcu(app, rcu); garp_release_port(dev); } EXPORT_SYMBOL_GPL(garp_uninit_applicant); int garp_register_application(struct garp_application *appl) { appl->proto.rcv = garp_pdu_rcv; appl->proto.data = appl; return stp_proto_register(&appl->proto); } EXPORT_SYMBOL_GPL(garp_register_application); void garp_unregister_application(struct garp_application *appl) { stp_proto_unregister(&appl->proto); } EXPORT_SYMBOL_GPL(garp_unregister_application);
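/*
 * Illustrative sketch (not part of the original file): how a protocol
 * plugs into the GARP core above, loosely modeled on the kernel's GVRP
 * user. The application struct, attribute type name and helper functions
 * here are invented for the example; only the garp_* calls are the real
 * interface.
 */
#include <net/garp.h>

#define MY_GARP_ATTR_VID	1	/* example attribute type */

static struct garp_application my_garp_app __read_mostly = {
	/* group address and type are assumptions for the sketch */
	.proto.group_address	= { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x21 },
	.maxattr		= MY_GARP_ATTR_VID,
	.type			= GARP_APPLICATION_GVRP,
};

static int __init my_garp_init(void)
{
	/* hook garp_pdu_rcv() up via the STP demux */
	return garp_register_application(&my_garp_app);
}

/* Per-device setup, called under RTNL. */
static int my_garp_enable(struct net_device *dev)
{
	return garp_init_applicant(dev, &my_garp_app);
}

/* Declare an attribute; the join timer then emits JoinIn PDUs for it. */
static int my_garp_join_vid(struct net_device *dev, u16 vid)
{
	__be16 vid_be = htons(vid);

	return garp_request_join(dev, &my_garp_app, &vid_be,
				 sizeof(vid_be), MY_GARP_ATTR_VID);
}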
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2002 Richard Henderson
 * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
 * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
 * Copyright (C) 2024 Mike Rapoport IBM.
*/ #define pr_fmt(fmt) "execmem: " fmt #include <linux/mm.h> #include <linux/mutex.h> #include <linux/vmalloc.h> #include <linux/execmem.h> #include <linux/maple_tree.h> #include <linux/set_memory.h> #include <linux/moduleloader.h> #include <linux/text-patching.h> #include <asm/tlbflush.h> #include "internal.h" static struct execmem_info *execmem_info __ro_after_init; static struct execmem_info default_execmem_info __ro_after_init; #ifdef CONFIG_MMU static void *execmem_vmalloc(struct execmem_range *range, size_t size, pgprot_t pgprot, unsigned long vm_flags) { bool kasan = range->flags & EXECMEM_KASAN_SHADOW; gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN; unsigned int align = range->alignment; unsigned long start = range->start; unsigned long end = range->end; void *p; if (kasan) vm_flags |= VM_DEFER_KMEMLEAK; if (vm_flags & VM_ALLOW_HUGE_VMAP) align = PMD_SIZE; p = __vmalloc_node_range(size, align, start, end, gfp_flags, pgprot, vm_flags, NUMA_NO_NODE, __builtin_return_address(0)); if (!p && range->fallback_start) { start = range->fallback_start; end = range->fallback_end; p = __vmalloc_node_range(size, align, start, end, gfp_flags, pgprot, vm_flags, NUMA_NO_NODE, __builtin_return_address(0)); } if (!p) { pr_warn_ratelimited("unable to allocate memory\n"); return NULL; } if (kasan && (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) { vfree(p); return NULL; } return p; } struct vm_struct *execmem_vmap(size_t size) { struct execmem_range *range = &execmem_info->ranges[EXECMEM_MODULE_DATA]; struct vm_struct *area; area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC, range->start, range->end, NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0)); if (!area && range->fallback_start) area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC, range->fallback_start, range->fallback_end, NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0)); return area; } #else static void *execmem_vmalloc(struct execmem_range *range, size_t size, pgprot_t pgprot, unsigned long vm_flags) { return vmalloc(size); } #endif /* CONFIG_MMU */ #ifdef CONFIG_ARCH_HAS_EXECMEM_ROX struct execmem_cache { struct mutex mutex; struct maple_tree busy_areas; struct maple_tree free_areas; }; static struct execmem_cache execmem_cache = { .mutex = __MUTEX_INITIALIZER(execmem_cache.mutex), .busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN, execmem_cache.mutex), .free_areas = MTREE_INIT_EXT(free_areas, MT_FLAGS_LOCK_EXTERN, execmem_cache.mutex), }; static inline unsigned long mas_range_len(struct ma_state *mas) { return mas->last - mas->index + 1; } static int execmem_set_direct_map_valid(struct vm_struct *vm, bool valid) { unsigned int nr = (1 << get_vm_area_page_order(vm)); unsigned int updated = 0; int err = 0; for (int i = 0; i < vm->nr_pages; i += nr) { err = set_direct_map_valid_noflush(vm->pages[i], nr, valid); if (err) goto err_restore; updated += nr; } return 0; err_restore: for (int i = 0; i < updated; i += nr) set_direct_map_valid_noflush(vm->pages[i], nr, !valid); return err; } static void execmem_cache_clean(struct work_struct *work) { struct maple_tree *free_areas = &execmem_cache.free_areas; struct mutex *mutex = &execmem_cache.mutex; MA_STATE(mas, free_areas, 0, ULONG_MAX); void *area; mutex_lock(mutex); mas_for_each(&mas, area, ULONG_MAX) { size_t size = mas_range_len(&mas); if (IS_ALIGNED(size, PMD_SIZE) && IS_ALIGNED(mas.index, PMD_SIZE)) { struct vm_struct *vm = find_vm_area(area); execmem_set_direct_map_valid(vm, true); mas_store_gfp(&mas, NULL, GFP_KERNEL); 
vfree(area); } } mutex_unlock(mutex); } static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean); static int execmem_cache_add(void *ptr, size_t size) { struct maple_tree *free_areas = &execmem_cache.free_areas; struct mutex *mutex = &execmem_cache.mutex; unsigned long addr = (unsigned long)ptr; MA_STATE(mas, free_areas, addr - 1, addr + 1); unsigned long lower, upper; void *area = NULL; int err; lower = addr; upper = addr + size - 1; mutex_lock(mutex); area = mas_walk(&mas); if (area && mas.last == addr - 1) lower = mas.index; area = mas_next(&mas, ULONG_MAX); if (area && mas.index == addr + size) upper = mas.last; mas_set_range(&mas, lower, upper); err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL); mutex_unlock(mutex); if (err) return err; return 0; } static bool within_range(struct execmem_range *range, struct ma_state *mas, size_t size) { unsigned long addr = mas->index; if (addr >= range->start && addr + size < range->end) return true; if (range->fallback_start && addr >= range->fallback_start && addr + size < range->fallback_end) return true; return false; } static void *__execmem_cache_alloc(struct execmem_range *range, size_t size) { struct maple_tree *free_areas = &execmem_cache.free_areas; struct maple_tree *busy_areas = &execmem_cache.busy_areas; MA_STATE(mas_free, free_areas, 0, ULONG_MAX); MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX); struct mutex *mutex = &execmem_cache.mutex; unsigned long addr, last, area_size = 0; void *area, *ptr = NULL; int err; mutex_lock(mutex); mas_for_each(&mas_free, area, ULONG_MAX) { area_size = mas_range_len(&mas_free); if (area_size >= size && within_range(range, &mas_free, size)) break; } if (area_size < size) goto out_unlock; addr = mas_free.index; last = mas_free.last; /* insert allocated size to busy_areas at range [addr, addr + size) */ mas_set_range(&mas_busy, addr, addr + size - 1); err = mas_store_gfp(&mas_busy, (void *)addr, GFP_KERNEL); if (err) goto out_unlock; mas_store_gfp(&mas_free, NULL, GFP_KERNEL); if (area_size > size) { void *ptr = (void *)(addr + size); /* * re-insert remaining free size to free_areas at range * [addr + size, last] */ mas_set_range(&mas_free, addr + size, last); err = mas_store_gfp(&mas_free, ptr, GFP_KERNEL); if (err) { mas_store_gfp(&mas_busy, NULL, GFP_KERNEL); goto out_unlock; } } ptr = (void *)addr; out_unlock: mutex_unlock(mutex); return ptr; } static bool execmem_cache_rox = false; void execmem_cache_make_ro(void) { struct maple_tree *free_areas = &execmem_cache.free_areas; struct maple_tree *busy_areas = &execmem_cache.busy_areas; MA_STATE(mas_free, free_areas, 0, ULONG_MAX); MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX); struct mutex *mutex = &execmem_cache.mutex; void *area; execmem_cache_rox = true; mutex_lock(mutex); mas_for_each(&mas_free, area, ULONG_MAX) { unsigned long pages = mas_range_len(&mas_free) >> PAGE_SHIFT; set_memory_ro(mas_free.index, pages); } mas_for_each(&mas_busy, area, ULONG_MAX) { unsigned long pages = mas_range_len(&mas_busy) >> PAGE_SHIFT; set_memory_ro(mas_busy.index, pages); } mutex_unlock(mutex); } static int execmem_cache_populate(struct execmem_range *range, size_t size) { unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; struct vm_struct *vm; size_t alloc_size; int err = -ENOMEM; void *p; alloc_size = round_up(size, PMD_SIZE); p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags); if (!p) return err; vm = find_vm_area(p); if (!vm) goto err_free_mem; /* fill memory with instructions that will trap */ execmem_fill_trapping_insns(p, alloc_size, /* writable = 
*/ true); if (execmem_cache_rox) { err = set_memory_rox((unsigned long)p, vm->nr_pages); if (err) goto err_free_mem; } else { err = set_memory_x((unsigned long)p, vm->nr_pages); if (err) goto err_free_mem; } err = execmem_cache_add(p, alloc_size); if (err) goto err_reset_direct_map; return 0; err_reset_direct_map: execmem_set_direct_map_valid(vm, true); err_free_mem: vfree(p); return err; } static void *execmem_cache_alloc(struct execmem_range *range, size_t size) { void *p; int err; p = __execmem_cache_alloc(range, size); if (p) return p; err = execmem_cache_populate(range, size); if (err) return NULL; return __execmem_cache_alloc(range, size); } static bool execmem_cache_free(void *ptr) { struct maple_tree *busy_areas = &execmem_cache.busy_areas; struct mutex *mutex = &execmem_cache.mutex; unsigned long addr = (unsigned long)ptr; MA_STATE(mas, busy_areas, addr, addr); size_t size; void *area; mutex_lock(mutex); area = mas_walk(&mas); if (!area) { mutex_unlock(mutex); return false; } size = mas_range_len(&mas); mas_store_gfp(&mas, NULL, GFP_KERNEL); mutex_unlock(mutex); execmem_fill_trapping_insns(ptr, size, /* writable = */ false); execmem_cache_add(ptr, size); schedule_work(&execmem_cache_clean_work); return true; } int execmem_make_temp_rw(void *ptr, size_t size) { unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long addr = (unsigned long)ptr; int ret; ret = set_memory_nx(addr, nr); if (ret) return ret; return set_memory_rw(addr, nr); } int execmem_restore_rox(void *ptr, size_t size) { unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long addr = (unsigned long)ptr; return set_memory_rox(addr, nr); } #else /* CONFIG_ARCH_HAS_EXECMEM_ROX */ static void *execmem_cache_alloc(struct execmem_range *range, size_t size) { return NULL; } static bool execmem_cache_free(void *ptr) { return false; } #endif /* CONFIG_ARCH_HAS_EXECMEM_ROX */ void *execmem_alloc(enum execmem_type type, size_t size) { struct execmem_range *range = &execmem_info->ranges[type]; bool use_cache = range->flags & EXECMEM_ROX_CACHE; unsigned long vm_flags = VM_FLUSH_RESET_PERMS; pgprot_t pgprot = range->pgprot; void *p; if (use_cache) p = execmem_cache_alloc(range, size); else p = execmem_vmalloc(range, size, pgprot, vm_flags); return kasan_reset_tag(p); } void execmem_free(void *ptr) { /* * This memory may be RO, and freeing RO memory in an interrupt is not * supported by vmalloc. 
*/ WARN_ON(in_interrupt()); if (!execmem_cache_free(ptr)) vfree(ptr); } void *execmem_update_copy(void *dst, const void *src, size_t size) { return text_poke_copy(dst, src, size); } bool execmem_is_rox(enum execmem_type type) { return !!(execmem_info->ranges[type].flags & EXECMEM_ROX_CACHE); } static bool execmem_validate(struct execmem_info *info) { struct execmem_range *r = &info->ranges[EXECMEM_DEFAULT]; if (!r->alignment || !r->start || !r->end || !pgprot_val(r->pgprot)) { pr_crit("Invalid parameters for execmem allocator, module loading will fail"); return false; } if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX)) { for (int i = EXECMEM_DEFAULT; i < EXECMEM_TYPE_MAX; i++) { r = &info->ranges[i]; if (r->flags & EXECMEM_ROX_CACHE) { pr_warn_once("ROX cache is not supported\n"); r->flags &= ~EXECMEM_ROX_CACHE; } } } return true; } static void execmem_init_missing(struct execmem_info *info) { struct execmem_range *default_range = &info->ranges[EXECMEM_DEFAULT]; for (int i = EXECMEM_DEFAULT + 1; i < EXECMEM_TYPE_MAX; i++) { struct execmem_range *r = &info->ranges[i]; if (!r->start) { if (i == EXECMEM_MODULE_DATA) r->pgprot = PAGE_KERNEL; else r->pgprot = default_range->pgprot; r->alignment = default_range->alignment; r->start = default_range->start; r->end = default_range->end; r->flags = default_range->flags; r->fallback_start = default_range->fallback_start; r->fallback_end = default_range->fallback_end; } } } struct execmem_info * __weak execmem_arch_setup(void) { return NULL; } static void __init __execmem_init(void) { struct execmem_info *info = execmem_arch_setup(); if (!info) { info = execmem_info = &default_execmem_info; info->ranges[EXECMEM_DEFAULT].start = VMALLOC_START; info->ranges[EXECMEM_DEFAULT].end = VMALLOC_END; info->ranges[EXECMEM_DEFAULT].pgprot = PAGE_KERNEL_EXEC; info->ranges[EXECMEM_DEFAULT].alignment = 1; } if (!execmem_validate(info)) return; execmem_init_missing(info); execmem_info = info; } #ifdef CONFIG_ARCH_WANTS_EXECMEM_LATE static int __init execmem_late_init(void) { __execmem_init(); return 0; } core_initcall(execmem_late_init); #else void __init execmem_init(void) { __execmem_init(); } #endif
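/*
 * Editor's note: the free/busy bookkeeping above is the subtle part of the
 * cache. execmem_cache_add() widens the range it stores over byte-adjacent
 * free neighbours before a single mas_store_gfp(), and __execmem_cache_alloc()
 * splits a free area into a busy head plus a re-inserted tail. Below is a
 * minimal standalone model of that interval arithmetic using a plain struct
 * instead of the kernel's maple trees -- an illustration under those
 * assumptions, not the actual implementation.
 */
#include <assert.h>
#include <stdint.h>

struct area { uint64_t first, last; };	/* inclusive range of addresses */

/*
 * Mirror of the coalescing in execmem_cache_add(): if the previous free
 * area ends at addr - 1, or the next one starts at addr + size, widen the
 * stored range instead of keeping separate adjacent entries.
 */
static struct area coalesce(const struct area *prev, const struct area *next,
			    uint64_t addr, uint64_t size)
{
	struct area a = { addr, addr + size - 1 };

	if (prev && prev->last == addr - 1)
		a.first = prev->first;
	if (next && next->first == addr + size)
		a.last = next->last;
	return a;
}

int main(void)
{
	struct area prev = { 0x1000, 0x1fff };
	struct area next = { 0x3000, 0x3fff };

	/* freeing [0x2000, 0x2fff] merges all three into one free area */
	struct area merged = coalesce(&prev, &next, 0x2000, 0x1000);

	assert(merged.first == 0x1000 && merged.last == 0x3fff);
	return 0;
}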
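/*
 * Editor's note: a hedged caller-side sketch of the API implemented above.
 * The helper load_opcodes() is hypothetical; execmem_alloc(),
 * execmem_update_copy() and execmem_free() are the functions defined in this
 * file, and EXECMEM_MODULE_TEXT is assumed to be one of the enum execmem_type
 * values covered by execmem_info->ranges[].
 */
#include <linux/execmem.h>

static void *load_opcodes(const void *insns, size_t size)
{
	void *p = execmem_alloc(EXECMEM_MODULE_TEXT, size);

	if (!p)
		return NULL;

	/*
	 * With EXECMEM_ROX_CACHE the returned range may already be mapped
	 * read-only + executable, so the code is installed through the
	 * text-poking helper rather than a plain memcpy().
	 */
	if (!execmem_update_copy(p, insns, size)) {
		execmem_free(p);
		return NULL;
	}

	return p;
}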
// SPDX-License-Identifier: GPL-2.0 /* * Parts of this file are * Copyright (C) 2022-2023 Intel Corporation */ #include <linux/ieee80211.h> #include <linux/export.h> #include <net/cfg80211.h> #include "nl80211.h" #include "core.h" #include "rdev-ops.h" static int ___cfg80211_stop_ap(struct cfg80211_registered_device *rdev, struct net_device *dev, unsigned int link_id, bool notify) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; lockdep_assert_wiphy(wdev->wiphy); if (!rdev->ops->stop_ap) return -EOPNOTSUPP; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EOPNOTSUPP; if (!wdev->links[link_id].ap.beacon_interval) return -ENOENT; err = rdev_stop_ap(rdev, dev, link_id); if (!err) { wdev->conn_owner_nlportid = 0; wdev->links[link_id].ap.beacon_interval = 0; memset(&wdev->links[link_id].ap.chandef, 0, sizeof(wdev->links[link_id].ap.chandef)); wdev->u.ap.ssid_len = 0; rdev_set_qos_map(rdev, dev, NULL); if (notify) nl80211_send_ap_stopped(wdev, link_id); /* Should we apply the grace period during beaconing interface * shutdown also? */ cfg80211_sched_dfs_chan_update(rdev); } schedule_work(&cfg80211_disconnect_work); return err; } int cfg80211_stop_ap(struct cfg80211_registered_device *rdev, struct net_device *dev, int link_id, bool notify) { unsigned int link; int ret = 0; if (link_id >= 0) return ___cfg80211_stop_ap(rdev, dev, link_id, notify); for_each_valid_link(dev->ieee80211_ptr, link) { int ret1 = ___cfg80211_stop_ap(rdev, dev, link, notify); if (ret1) ret = ret1; /* try the next one also if one errored */ } return ret; }
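/*
 * Editor's note: worth calling out in cfg80211_stop_ap() is that the MLD
 * loop does not bail out on the first failing link -- every valid link still
 * gets a stop attempt, and the last non-zero status is what the caller sees.
 * A condensed model of that aggregation idiom, with for_each_valid_link()
 * approximated by a plain bitmask walk (sketch only, not the cfg80211 code):
 */
static int stop_all_links(int (*stop_one)(unsigned int link),
			  unsigned long valid_links)
{
	int ret = 0;
	unsigned int link;

	for (link = 0; valid_links; valid_links >>= 1, link++) {
		int ret1;

		if (!(valid_links & 1))
			continue;

		ret1 = stop_one(link);
		if (ret1)
			ret = ret1;	/* remember the error, keep going */
	}
	return ret;
}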
/* SPDX-License-Identifier: GPL-2.0 */ /* IP Virtual Server * data structure and functionality definitions */ #ifndef _NET_IP_VS_H #define _NET_IP_VS_H #include <linux/ip_vs.h> /* definitions shared with userland */ #include <asm/types.h> /* for __uXX types */ #include <linux/list.h> /* for struct list_head */ #include <linux/spinlock.h> /* for struct rwlock_t */ #include <linux/atomic.h> /* for struct atomic_t */ #include <linux/refcount.h> /* for struct refcount_t */ #include <linux/workqueue.h> #include <linux/compiler.h> #include <linux/timer.h> #include <linux/bug.h> #include <net/checksum.h> #include <linux/netfilter.h> /* for union nf_inet_addr */ #include <linux/ip.h> #include <linux/ipv6.h> /* for struct ipv6hdr */ #include <net/ipv6.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #endif #include <net/net_namespace.h> /* Net namespace */ #include <linux/sched/isolation.h> #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 /* Generic access of ipvs struct */ static inline struct netns_ipvs *net_ipvs(struct net* net) { return net->ipvs; } /* Connections' size value needed by ip_vs_ctl.c */ extern int ip_vs_conn_tab_size; extern struct mutex __ip_vs_mutex; struct ip_vs_iphdr { int hdr_flags; /* ipvs flags */ __u32 off; /* Where IPv4 or IPv6 header starts */ __u32 len; /* IPv4 simply where L4 starts * IPv6 where L4 Transport Header starts */ __u16 fragoffs; /* IPv6 fragment offset, 0 if first frag (or not frag)*/ __s16 protocol; __s32 flags; union nf_inet_addr saddr; union nf_inet_addr daddr; }; static inline void *frag_safe_skb_hp(const struct sk_buff *skb, int offset, int len, void *buffer) { return skb_header_pointer(skb, offset, len, buffer); } /* This function handles filling *ip_vs_iphdr, both for IPv4 and IPv6. * IPv6 requires some extra work, as finding the proper header position * depends on the IPv6 extension headers. 
*/ static inline int ip_vs_fill_iph_skb_off(int af, const struct sk_buff *skb, int offset, int hdr_flags, struct ip_vs_iphdr *iphdr) { iphdr->hdr_flags = hdr_flags; iphdr->off = offset; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { struct ipv6hdr _iph; const struct ipv6hdr *iph = skb_header_pointer( skb, offset, sizeof(_iph), &_iph); if (!iph) return 0; iphdr->saddr.in6 = iph->saddr; iphdr->daddr.in6 = iph->daddr; /* ipv6_find_hdr() updates len, flags */ iphdr->len = offset; iphdr->flags = 0; iphdr->protocol = ipv6_find_hdr(skb, &iphdr->len, -1, &iphdr->fragoffs, &iphdr->flags); if (iphdr->protocol < 0) return 0; } else #endif { struct iphdr _iph; const struct iphdr *iph = skb_header_pointer( skb, offset, sizeof(_iph), &_iph); if (!iph) return 0; iphdr->len = offset + iph->ihl * 4; iphdr->fragoffs = 0; iphdr->protocol = iph->protocol; iphdr->saddr.ip = iph->saddr; iphdr->daddr.ip = iph->daddr; } return 1; } static inline int ip_vs_fill_iph_skb_icmp(int af, const struct sk_buff *skb, int offset, bool inverse, struct ip_vs_iphdr *iphdr) { int hdr_flags = IP_VS_HDR_ICMP; if (inverse) hdr_flags |= IP_VS_HDR_INVERSE; return ip_vs_fill_iph_skb_off(af, skb, offset, hdr_flags, iphdr); } static inline int ip_vs_fill_iph_skb(int af, const struct sk_buff *skb, bool inverse, struct ip_vs_iphdr *iphdr) { int hdr_flags = 0; if (inverse) hdr_flags |= IP_VS_HDR_INVERSE; return ip_vs_fill_iph_skb_off(af, skb, skb_network_offset(skb), hdr_flags, iphdr); } static inline bool ip_vs_iph_inverse(const struct ip_vs_iphdr *iph) { return !!(iph->hdr_flags & IP_VS_HDR_INVERSE); } static inline bool ip_vs_iph_icmp(const struct ip_vs_iphdr *iph) { return !!(iph->hdr_flags & IP_VS_HDR_ICMP); } static inline void ip_vs_addr_copy(int af, union nf_inet_addr *dst, const union nf_inet_addr *src) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) dst->in6 = src->in6; else #endif dst->ip = src->ip; } static inline void ip_vs_addr_set(int af, union nf_inet_addr *dst, const union nf_inet_addr *src) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { dst->in6 = src->in6; return; } #endif dst->ip = src->ip; dst->all[1] = 0; dst->all[2] = 0; dst->all[3] = 0; } static inline int ip_vs_addr_equal(int af, const union nf_inet_addr *a, const union nf_inet_addr *b) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) return ipv6_addr_equal(&a->in6, &b->in6); #endif return a->ip == b->ip; } #ifdef CONFIG_IP_VS_DEBUG #include <linux/net.h> int ip_vs_get_debug_level(void); static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len, const union nf_inet_addr *addr, int *idx) { int len; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) len = snprintf(&buf[*idx], buf_len - *idx, "[%pI6c]", &addr->in6) + 1; else #endif len = snprintf(&buf[*idx], buf_len - *idx, "%pI4", &addr->ip) + 1; *idx += len; BUG_ON(*idx > buf_len + 1); return &buf[*idx - len]; } #define IP_VS_DBG_BUF(level, msg, ...) \ do { \ char ip_vs_dbg_buf[160]; \ int ip_vs_dbg_idx = 0; \ if (level <= ip_vs_get_debug_level()) \ printk(KERN_DEBUG pr_fmt(msg), ##__VA_ARGS__); \ } while (0) #define IP_VS_ERR_BUF(msg...) \ do { \ char ip_vs_dbg_buf[160]; \ int ip_vs_dbg_idx = 0; \ pr_err(msg); \ } while (0) /* Only use from within IP_VS_DBG_BUF() or IP_VS_ERR_BUF macros */ #define IP_VS_DBG_ADDR(af, addr) \ ip_vs_dbg_addr(af, ip_vs_dbg_buf, \ sizeof(ip_vs_dbg_buf), addr, \ &ip_vs_dbg_idx) #define IP_VS_DBG(level, msg, ...) \ do { \ if (level <= ip_vs_get_debug_level()) \ printk(KERN_DEBUG pr_fmt(msg), ##__VA_ARGS__); \ } while (0) #define IP_VS_DBG_RL(msg, ...) 
\ do { \ if (net_ratelimit()) \ printk(KERN_DEBUG pr_fmt(msg), ##__VA_ARGS__); \ } while (0) #define IP_VS_DBG_PKT(level, af, pp, skb, ofs, msg) \ do { \ if (level <= ip_vs_get_debug_level()) \ pp->debug_packet(af, pp, skb, ofs, msg); \ } while (0) #define IP_VS_DBG_RL_PKT(level, af, pp, skb, ofs, msg) \ do { \ if (level <= ip_vs_get_debug_level() && \ net_ratelimit()) \ pp->debug_packet(af, pp, skb, ofs, msg); \ } while (0) #else /* NO DEBUGGING at ALL */ #define IP_VS_DBG_BUF(level, msg...) do {} while (0) #define IP_VS_ERR_BUF(msg...) do {} while (0) #define IP_VS_DBG(level, msg...) do {} while (0) #define IP_VS_DBG_RL(msg...) do {} while (0) #define IP_VS_DBG_PKT(level, af, pp, skb, ofs, msg) do {} while (0) #define IP_VS_DBG_RL_PKT(level, af, pp, skb, ofs, msg) do {} while (0) #endif #define IP_VS_BUG() BUG() #define IP_VS_ERR_RL(msg, ...) \ do { \ if (net_ratelimit()) \ pr_err(msg, ##__VA_ARGS__); \ } while (0) /* The port number of FTP service (in network order). */ #define FTPPORT cpu_to_be16(21) #define FTPDATA cpu_to_be16(20) /* TCP State Values */ enum { IP_VS_TCP_S_NONE = 0, IP_VS_TCP_S_ESTABLISHED, IP_VS_TCP_S_SYN_SENT, IP_VS_TCP_S_SYN_RECV, IP_VS_TCP_S_FIN_WAIT, IP_VS_TCP_S_TIME_WAIT, IP_VS_TCP_S_CLOSE, IP_VS_TCP_S_CLOSE_WAIT, IP_VS_TCP_S_LAST_ACK, IP_VS_TCP_S_LISTEN, IP_VS_TCP_S_SYNACK, IP_VS_TCP_S_LAST }; /* UDP State Values */ enum { IP_VS_UDP_S_NORMAL, IP_VS_UDP_S_LAST, }; /* ICMP State Values */ enum { IP_VS_ICMP_S_NORMAL, IP_VS_ICMP_S_LAST, }; /* SCTP State Values */ enum ip_vs_sctp_states { IP_VS_SCTP_S_NONE, IP_VS_SCTP_S_INIT1, IP_VS_SCTP_S_INIT, IP_VS_SCTP_S_COOKIE_SENT, IP_VS_SCTP_S_COOKIE_REPLIED, IP_VS_SCTP_S_COOKIE_WAIT, IP_VS_SCTP_S_COOKIE, IP_VS_SCTP_S_COOKIE_ECHOED, IP_VS_SCTP_S_ESTABLISHED, IP_VS_SCTP_S_SHUTDOWN_SENT, IP_VS_SCTP_S_SHUTDOWN_RECEIVED, IP_VS_SCTP_S_SHUTDOWN_ACK_SENT, IP_VS_SCTP_S_REJECTED, IP_VS_SCTP_S_CLOSED, IP_VS_SCTP_S_LAST }; /* Connection templates use bits from state */ #define IP_VS_CTPL_S_NONE 0x0000 #define IP_VS_CTPL_S_ASSURED 0x0001 #define IP_VS_CTPL_S_LAST 0x0002 /* Delta sequence info structure * Each ip_vs_conn has 2 (output AND input seq. changes). * Only used in the VS/NAT. 
*/ struct ip_vs_seq { __u32 init_seq; /* Add delta from this seq */ __u32 delta; /* Delta in sequence numbers */ __u32 previous_delta; /* Delta in sequence numbers * before last resized pkt */ }; /* counters per cpu */ struct ip_vs_counters { u64_stats_t conns; /* connections scheduled */ u64_stats_t inpkts; /* incoming packets */ u64_stats_t outpkts; /* outgoing packets */ u64_stats_t inbytes; /* incoming bytes */ u64_stats_t outbytes; /* outgoing bytes */ }; /* Stats per cpu */ struct ip_vs_cpu_stats { struct ip_vs_counters cnt; struct u64_stats_sync syncp; }; /* Default nice for estimator kthreads */ #define IPVS_EST_NICE 0 /* IPVS statistics objects */ struct ip_vs_estimator { struct hlist_node list; u64 last_inbytes; u64 last_outbytes; u64 last_conns; u64 last_inpkts; u64 last_outpkts; u64 cps; u64 inpps; u64 outpps; u64 inbps; u64 outbps; s32 ktid:16, /* kthread ID, -1=temp list */ ktrow:8, /* row/tick ID for kthread */ ktcid:8; /* chain ID for kthread tick */ }; /* * IPVS statistics object, 64-bit kernel version of struct ip_vs_stats_user */ struct ip_vs_kstats { u64 conns; /* connections scheduled */ u64 inpkts; /* incoming packets */ u64 outpkts; /* outgoing packets */ u64 inbytes; /* incoming bytes */ u64 outbytes; /* outgoing bytes */ u64 cps; /* current connection rate */ u64 inpps; /* current in packet rate */ u64 outpps; /* current out packet rate */ u64 inbps; /* current in byte rate */ u64 outbps; /* current out byte rate */ }; struct ip_vs_stats { struct ip_vs_kstats kstats; /* kernel statistics */ struct ip_vs_estimator est; /* estimator */ struct ip_vs_cpu_stats __percpu *cpustats; /* per cpu counters */ spinlock_t lock; /* spin lock */ struct ip_vs_kstats kstats0; /* reset values */ }; struct ip_vs_stats_rcu { struct ip_vs_stats s; struct rcu_head rcu_head; }; int ip_vs_stats_init_alloc(struct ip_vs_stats *s); struct ip_vs_stats *ip_vs_stats_alloc(void); void ip_vs_stats_release(struct ip_vs_stats *stats); void ip_vs_stats_free(struct ip_vs_stats *stats); /* Process estimators in multiple timer ticks (20/50/100, see ktrow) */ #define IPVS_EST_NTICKS 50 /* Estimation uses a 2-second period containing ticks (in jiffies) */ #define IPVS_EST_TICK ((2 * HZ) / IPVS_EST_NTICKS) /* Limit of CPU load per kthread (8 for 12.5%), ratio of CPU capacity (1/C). * Value of 4 and above ensures kthreads will take work without exceeding * the CPU capacity under different circumstances. 
*/ #define IPVS_EST_LOAD_DIVISOR 8 /* Kthreads should not have work that exceeds the CPU load above 50% */ #define IPVS_EST_CPU_KTHREADS (IPVS_EST_LOAD_DIVISOR / 2) /* Desired number of chains per timer tick (chain load factor in 100us units), * 48=4.8ms of 40ms tick (12% CPU usage): * 2 sec * 1000 ms in sec * 10 (100us in ms) / 8 (12.5%) / 50 */ #define IPVS_EST_CHAIN_FACTOR \ ALIGN_DOWN(2 * 1000 * 10 / IPVS_EST_LOAD_DIVISOR / IPVS_EST_NTICKS, 8) /* Compiled number of chains per tick * The defines should match cond_resched_rcu */ #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) #define IPVS_EST_TICK_CHAINS IPVS_EST_CHAIN_FACTOR #else #define IPVS_EST_TICK_CHAINS 1 #endif #if IPVS_EST_NTICKS > 127 #error Too many timer ticks for ktrow #endif /* Multiple chains processed in same tick */ struct ip_vs_est_tick_data { struct rcu_head rcu_head; struct hlist_head chains[IPVS_EST_TICK_CHAINS]; DECLARE_BITMAP(present, IPVS_EST_TICK_CHAINS); DECLARE_BITMAP(full, IPVS_EST_TICK_CHAINS); int chain_len[IPVS_EST_TICK_CHAINS]; }; /* Context for estimation kthread */ struct ip_vs_est_kt_data { struct netns_ipvs *ipvs; struct task_struct *task; /* task if running */ struct ip_vs_est_tick_data __rcu *ticks[IPVS_EST_NTICKS]; DECLARE_BITMAP(avail, IPVS_EST_NTICKS); /* tick has space for ests */ unsigned long est_timer; /* estimation timer (jiffies) */ struct ip_vs_stats *calc_stats; /* Used for calculation */ int tick_len[IPVS_EST_NTICKS]; /* est count */ int id; /* ktid per netns */ int chain_max; /* max ests per tick chain */ int tick_max; /* max ests per tick */ int est_count; /* attached ests to kthread */ int est_max_count; /* max ests per kthread */ int add_row; /* row for new ests */ int est_row; /* estimated row */ }; struct dst_entry; struct iphdr; struct ip_vs_conn; struct ip_vs_app; struct sk_buff; struct ip_vs_proto_data; struct ip_vs_protocol { struct ip_vs_protocol *next; char *name; u16 protocol; u16 num_states; int dont_defrag; void (*init)(struct ip_vs_protocol *pp); void (*exit)(struct ip_vs_protocol *pp); int (*init_netns)(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd); void (*exit_netns)(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd); int (*conn_schedule)(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph); struct ip_vs_conn * (*conn_in_get)(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph); struct ip_vs_conn * (*conn_out_get)(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph); int (*snat_handler)(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph); int (*dnat_handler)(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph); const char *(*state_name)(int state); void (*state_transition)(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd); int (*register_app)(struct netns_ipvs *ipvs, struct ip_vs_app *inc); void (*unregister_app)(struct netns_ipvs *ipvs, struct ip_vs_app *inc); int (*app_conn_bind)(struct ip_vs_conn *cp); void (*debug_packet)(int af, struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg); void (*timeout_change)(struct ip_vs_proto_data *pd, int flags); }; /* protocol data per netns */ struct ip_vs_proto_data { struct ip_vs_proto_data *next; struct ip_vs_protocol *pp; int *timeout_table; /* protocol timeout table */ 
atomic_t appcnt; /* counter of proto app incs. */ struct tcp_states_t *tcp_state_table; }; struct ip_vs_protocol *ip_vs_proto_get(unsigned short proto); struct ip_vs_proto_data *ip_vs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto); struct ip_vs_conn_param { struct netns_ipvs *ipvs; const union nf_inet_addr *caddr; const union nf_inet_addr *vaddr; __be16 cport; __be16 vport; __u16 protocol; u16 af; const struct ip_vs_pe *pe; char *pe_data; __u8 pe_data_len; }; /* IP_VS structure allocated for each dynamically scheduled connection */ struct ip_vs_conn { struct hlist_node c_list; /* hashed list heads */ /* Protocol, addresses and port numbers */ __be16 cport; __be16 dport; __be16 vport; u16 af; /* address family */ union nf_inet_addr caddr; /* client address */ union nf_inet_addr vaddr; /* virtual address */ union nf_inet_addr daddr; /* destination address */ volatile __u32 flags; /* status flags */ __u16 protocol; /* Which protocol (TCP/UDP) */ __u16 daf; /* Address family of the dest */ struct netns_ipvs *ipvs; /* counter and timer */ refcount_t refcnt; /* reference count */ struct timer_list timer; /* Expiration timer */ volatile unsigned long timeout; /* timeout */ /* Flags and state transition */ spinlock_t lock; /* lock for state transition */ volatile __u16 state; /* state info */ volatile __u16 old_state; /* old state, to be used for * state transition triggered * synchronization */ __u32 fwmark; /* Fire wall mark from skb */ unsigned long sync_endtime; /* jiffies + sent_retries */ /* Control members */ struct ip_vs_conn *control; /* Master control connection */ atomic_t n_control; /* Number of controlled ones */ struct ip_vs_dest *dest; /* real server */ atomic_t in_pkts; /* incoming packet counter */ /* Packet transmitter for different forwarding methods. If it * mangles the packet, it must return NF_DROP or better NF_STOLEN, * otherwise this must be changed to a sk_buff **. * NF_ACCEPT can be returned when destination is local. */ int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); /* Note: we can group the following members into a structure, * in order to save more space, and the following members are * only used in VS/NAT anyway */ struct ip_vs_app *app; /* bound ip_vs_app object */ void *app_data; /* Application private data */ struct_group(sync_conn_opt, struct ip_vs_seq in_seq; /* incoming seq. struct */ struct ip_vs_seq out_seq; /* outgoing seq. struct */ ); const struct ip_vs_pe *pe; char *pe_data; __u8 pe_data_len; struct rcu_head rcu_head; }; /* Extended internal versions of struct ip_vs_service_user and ip_vs_dest_user * for IPv6 support. * * We need these to conveniently pass around service and destination * options, but unfortunately, we also need to keep the old definitions to * maintain userspace backwards compatibility for the setsockopt interface. 
*/ struct ip_vs_service_user_kern { /* virtual service addresses */ u16 af; u16 protocol; union nf_inet_addr addr; /* virtual ip address */ __be16 port; u32 fwmark; /* firewall mark of service */ /* virtual service options */ char *sched_name; char *pe_name; unsigned int flags; /* virtual service flags */ unsigned int timeout; /* persistent timeout in sec */ __be32 netmask; /* persistent netmask or plen */ }; struct ip_vs_dest_user_kern { /* destination server address */ union nf_inet_addr addr; __be16 port; /* real server options */ unsigned int conn_flags; /* connection flags */ int weight; /* destination weight */ /* thresholds for active connections */ u32 u_threshold; /* upper threshold */ u32 l_threshold; /* lower threshold */ /* Address family of addr */ u16 af; u16 tun_type; /* tunnel type */ __be16 tun_port; /* tunnel port */ u16 tun_flags; /* tunnel flags */ }; /* * The information about the virtual service offered to the net and the * forwarding entries. */ struct ip_vs_service { struct hlist_node s_list; /* for normal service table */ struct hlist_node f_list; /* for fwmark-based service table */ atomic_t refcnt; /* reference counter */ u16 af; /* address family */ __u16 protocol; /* which protocol (TCP/UDP) */ union nf_inet_addr addr; /* IP address for virtual service */ __be16 port; /* port number for the service */ __u32 fwmark; /* firewall mark of the service */ unsigned int flags; /* service status flags */ unsigned int timeout; /* persistent timeout in ticks */ __be32 netmask; /* grouping granularity, mask/plen */ struct netns_ipvs *ipvs; struct list_head destinations; /* real server d-linked list */ __u32 num_dests; /* number of servers */ struct ip_vs_stats stats; /* statistics for the service */ /* for scheduling */ struct ip_vs_scheduler __rcu *scheduler; /* bound scheduler object */ spinlock_t sched_lock; /* lock sched_data */ void *sched_data; /* scheduler application data */ /* alternate persistence engine */ struct ip_vs_pe __rcu *pe; int conntrack_afmask; struct rcu_head rcu_head; }; /* Information for cached dst */ struct ip_vs_dest_dst { struct dst_entry *dst_cache; /* destination cache entry */ u32 dst_cookie; union nf_inet_addr dst_saddr; struct rcu_head rcu_head; }; /* The real server destination forwarding entry with ip address, port number, * and so on. 
*/ struct ip_vs_dest { struct list_head n_list; /* for the dests in the service */ struct hlist_node d_list; /* for table with all the dests */ u16 af; /* address family */ __be16 port; /* port number of the server */ union nf_inet_addr addr; /* IP address of the server */ volatile unsigned int flags; /* dest status flags */ atomic_t conn_flags; /* flags to copy to conn */ atomic_t weight; /* server weight */ atomic_t last_weight; /* server latest weight */ __u16 tun_type; /* tunnel type */ __be16 tun_port; /* tunnel port */ __u16 tun_flags; /* tunnel flags */ refcount_t refcnt; /* reference counter */ struct ip_vs_stats stats; /* statistics */ unsigned long idle_start; /* start time, jiffies */ /* connection counters and thresholds */ atomic_t activeconns; /* active connections */ atomic_t inactconns; /* inactive connections */ atomic_t persistconns; /* persistent connections */ __u32 u_threshold; /* upper threshold */ __u32 l_threshold; /* lower threshold */ /* for destination cache */ spinlock_t dst_lock; /* lock of dst_cache */ struct ip_vs_dest_dst __rcu *dest_dst; /* cached dst info */ /* for virtual service */ struct ip_vs_service __rcu *svc; /* service it belongs to */ __u16 protocol; /* which protocol (TCP/UDP) */ __be16 vport; /* virtual port number */ union nf_inet_addr vaddr; /* virtual IP address */ __u32 vfwmark; /* firewall mark of service */ struct rcu_head rcu_head; struct list_head t_list; /* in dest_trash */ unsigned int in_rs_table:1; /* we are in rs_table */ }; /* The scheduler object */ struct ip_vs_scheduler { struct list_head n_list; /* d-linked list head */ char *name; /* scheduler name */ atomic_t refcnt; /* reference counter */ struct module *module; /* THIS_MODULE/NULL */ /* scheduler initializing service */ int (*init_service)(struct ip_vs_service *svc); /* scheduling service finish */ void (*done_service)(struct ip_vs_service *svc); /* dest is linked */ int (*add_dest)(struct ip_vs_service *svc, struct ip_vs_dest *dest); /* dest is unlinked */ int (*del_dest)(struct ip_vs_service *svc, struct ip_vs_dest *dest); /* dest is updated */ int (*upd_dest)(struct ip_vs_service *svc, struct ip_vs_dest *dest); /* selecting a server from the given service */ struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph); }; /* The persistence engine object */ struct ip_vs_pe { struct list_head n_list; /* d-linked list head */ char *name; /* scheduler name */ atomic_t refcnt; /* reference counter */ struct module *module; /* THIS_MODULE/NULL */ /* get the connection template, if any */ int (*fill_param)(struct ip_vs_conn_param *p, struct sk_buff *skb); bool (*ct_match)(const struct ip_vs_conn_param *p, struct ip_vs_conn *ct); u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval, bool inverse); int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf); /* create connections for real-server outgoing packets */ struct ip_vs_conn* (*conn_out)(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct sk_buff *skb, const struct ip_vs_iphdr *iph, __be16 dport, __be16 cport); }; /* The application module object (a.k.a. 
app incarnation) */ struct ip_vs_app { struct list_head a_list; /* member in app list */ int type; /* IP_VS_APP_TYPE_xxx */ char *name; /* application module name */ __u16 protocol; struct module *module; /* THIS_MODULE/NULL */ struct list_head incs_list; /* list of incarnations */ /* members for application incarnations */ struct list_head p_list; /* member in proto app list */ struct ip_vs_app *app; /* its real application */ __be16 port; /* port number in net order */ atomic_t usecnt; /* usage counter */ struct rcu_head rcu_head; /* output hook: Process packet in inout direction, diff set for TCP. * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok, * 2=Mangled but checksum was not updated */ int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *, struct sk_buff *, int *diff, struct ip_vs_iphdr *ipvsh); /* input hook: Process packet in outin direction, diff set for TCP. * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok, * 2=Mangled but checksum was not updated */ int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *, struct sk_buff *, int *diff, struct ip_vs_iphdr *ipvsh); /* ip_vs_app initializer */ int (*init_conn)(struct ip_vs_app *, struct ip_vs_conn *); /* ip_vs_app finish */ int (*done_conn)(struct ip_vs_app *, struct ip_vs_conn *); /* not used now */ int (*bind_conn)(struct ip_vs_app *, struct ip_vs_conn *, struct ip_vs_protocol *); void (*unbind_conn)(struct ip_vs_app *, struct ip_vs_conn *); int * timeout_table; int * timeouts; int timeouts_size; int (*conn_schedule)(struct sk_buff *skb, struct ip_vs_app *app, int *verdict, struct ip_vs_conn **cpp); struct ip_vs_conn * (*conn_in_get)(const struct sk_buff *skb, struct ip_vs_app *app, const struct iphdr *iph, int inverse); struct ip_vs_conn * (*conn_out_get)(const struct sk_buff *skb, struct ip_vs_app *app, const struct iphdr *iph, int inverse); int (*state_transition)(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_app *app); void (*timeout_change)(struct ip_vs_app *app, int flags); }; struct ipvs_master_sync_state { struct list_head sync_queue; struct ip_vs_sync_buff *sync_buff; unsigned long sync_queue_len; unsigned int sync_queue_delay; struct delayed_work master_wakeup_work; struct netns_ipvs *ipvs; }; struct ip_vs_sync_thread_data; /* How much time to keep dests in trash */ #define IP_VS_DEST_TRASH_PERIOD (120 * HZ) struct ipvs_sync_daemon_cfg { union nf_inet_addr mcast_group; int syncid; u16 sync_maxlen; u16 mcast_port; u8 mcast_af; u8 mcast_ttl; /* multicast interface name */ char mcast_ifn[IP_VS_IFNAME_MAXLEN]; }; /* IPVS in network namespace */ struct netns_ipvs { int gen; /* Generation */ int enable; /* enable like nf_hooks do */ /* Hash table: for real service lookups */ #define IP_VS_RTAB_BITS 4 #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) struct hlist_head rs_table[IP_VS_RTAB_SIZE]; /* ip_vs_app */ struct list_head app_list; /* ip_vs_proto */ #define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ struct ip_vs_proto_data *proto_data_table[IP_VS_PROTO_TAB_SIZE]; /* ip_vs_proto_tcp */ #ifdef CONFIG_IP_VS_PROTO_TCP #define TCP_APP_TAB_BITS 4 #define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) #define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) struct list_head tcp_apps[TCP_APP_TAB_SIZE]; #endif /* ip_vs_proto_udp */ #ifdef CONFIG_IP_VS_PROTO_UDP #define UDP_APP_TAB_BITS 4 #define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) #define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) struct list_head udp_apps[UDP_APP_TAB_SIZE]; 
#endif /* ip_vs_proto_sctp */ #ifdef CONFIG_IP_VS_PROTO_SCTP #define SCTP_APP_TAB_BITS 4 #define SCTP_APP_TAB_SIZE (1 << SCTP_APP_TAB_BITS) #define SCTP_APP_TAB_MASK (SCTP_APP_TAB_SIZE - 1) /* Hash table for SCTP application incarnations */ struct list_head sctp_apps[SCTP_APP_TAB_SIZE]; #endif /* ip_vs_conn */ atomic_t conn_count; /* connection counter */ /* ip_vs_ctl */ struct ip_vs_stats_rcu *tot_stats; /* Statistics & est. */ int num_services; /* no of virtual services */ int num_services6; /* IPv6 virtual services */ /* Trash for destinations */ struct list_head dest_trash; spinlock_t dest_trash_lock; struct timer_list dest_trash_timer; /* expiration timer */ /* Service counters */ atomic_t ftpsvc_counter; atomic_t nullsvc_counter; atomic_t conn_out_counter; #ifdef CONFIG_SYSCTL /* delayed work for expiring no dest connections */ struct delayed_work expire_nodest_conn_work; /* 1/rate drop and drop-entry variables */ struct delayed_work defense_work; /* Work handler */ int drop_rate; int drop_counter; int old_secure_tcp; atomic_t dropentry; /* locks in ctl.c */ spinlock_t dropentry_lock; /* drop entry handling */ spinlock_t droppacket_lock; /* drop packet handling */ spinlock_t securetcp_lock; /* state and timeout tables */ /* sys-ctl struct */ struct ctl_table_header *sysctl_hdr; struct ctl_table *sysctl_tbl; #endif /* sysctl variables */ int sysctl_amemthresh; int sysctl_am_droprate; int sysctl_drop_entry; int sysctl_drop_packet; int sysctl_secure_tcp; #ifdef CONFIG_IP_VS_NFCT int sysctl_conntrack; #endif int sysctl_snat_reroute; int sysctl_sync_ver; int sysctl_sync_ports; int sysctl_sync_persist_mode; unsigned long sysctl_sync_qlen_max; int sysctl_sync_sock_size; int sysctl_cache_bypass; int sysctl_expire_nodest_conn; int sysctl_sloppy_tcp; int sysctl_sloppy_sctp; int sysctl_expire_quiescent_template; int sysctl_sync_threshold[2]; unsigned int sysctl_sync_refresh_period; int sysctl_sync_retries; int sysctl_nat_icmp_send; int sysctl_pmtu_disc; int sysctl_backup_only; int sysctl_conn_reuse_mode; int sysctl_schedule_icmp; int sysctl_ignore_tunneled; int sysctl_run_estimation; #ifdef CONFIG_SYSCTL cpumask_var_t sysctl_est_cpulist; /* kthread cpumask */ int est_cpulist_valid; /* cpulist set */ int sysctl_est_nice; /* kthread nice */ int est_stopped; /* stop tasks */ #endif /* ip_vs_lblc */ int sysctl_lblc_expiration; struct ctl_table_header *lblc_ctl_header; struct ctl_table *lblc_ctl_table; /* ip_vs_lblcr */ int sysctl_lblcr_expiration; struct ctl_table_header *lblcr_ctl_header; struct ctl_table *lblcr_ctl_table; /* ip_vs_est */ struct delayed_work est_reload_work;/* Reload kthread tasks */ struct mutex est_mutex; /* protect kthread tasks */ struct hlist_head est_temp_list; /* Ests during calc phase */ struct ip_vs_est_kt_data **est_kt_arr; /* Array of kthread data ptrs */ unsigned long est_max_threads;/* Hard limit of kthreads */ int est_calc_phase; /* Calculation phase */ int est_chain_max; /* Calculated chain_max */ int est_kt_count; /* Allocated ptrs */ int est_add_ktid; /* ktid where to add ests */ atomic_t est_genid; /* kthreads reload genid */ atomic_t est_genid_done; /* applied genid */ /* ip_vs_sync */ spinlock_t sync_lock; struct ipvs_master_sync_state *ms; spinlock_t sync_buff_lock; struct ip_vs_sync_thread_data *master_tinfo; struct ip_vs_sync_thread_data *backup_tinfo; int threads_mask; volatile int sync_state; struct mutex sync_mutex; struct ipvs_sync_daemon_cfg mcfg; /* Master Configuration */ struct ipvs_sync_daemon_cfg bcfg; /* Backup Configuration */ /* net name space 
ptr */ struct net *net; /* Needed by timer routines */ /* Number of heterogeneous destinations, needed because heterogeneous * are not supported when synchronization is enabled. */ unsigned int mixed_address_family_dests; unsigned int hooks_afmask; /* &1=AF_INET, &2=AF_INET6 */ }; #define DEFAULT_SYNC_THRESHOLD 3 #define DEFAULT_SYNC_PERIOD 50 #define DEFAULT_SYNC_VER 1 #define DEFAULT_SLOPPY_TCP 0 #define DEFAULT_SLOPPY_SCTP 0 #define DEFAULT_SYNC_REFRESH_PERIOD (0U * HZ) #define DEFAULT_SYNC_RETRIES 0 #define IPVS_SYNC_WAKEUP_RATE 8 #define IPVS_SYNC_QLEN_MAX (IPVS_SYNC_WAKEUP_RATE * 4) #define IPVS_SYNC_SEND_DELAY (HZ / 50) #define IPVS_SYNC_CHECK_PERIOD HZ #define IPVS_SYNC_FLUSH_TIME (HZ * 2) #define IPVS_SYNC_PORTS_MAX (1 << 6) #ifdef CONFIG_SYSCTL static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_threshold[0]; } static inline int sysctl_sync_period(struct netns_ipvs *ipvs) { return READ_ONCE(ipvs->sysctl_sync_threshold[1]); } static inline unsigned int sysctl_sync_refresh_period(struct netns_ipvs *ipvs) { return READ_ONCE(ipvs->sysctl_sync_refresh_period); } static inline int sysctl_sync_retries(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_retries; } static inline int sysctl_sync_ver(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_ver; } static inline int sysctl_sloppy_tcp(struct netns_ipvs *ipvs) { return ipvs->sysctl_sloppy_tcp; } static inline int sysctl_sloppy_sctp(struct netns_ipvs *ipvs) { return ipvs->sysctl_sloppy_sctp; } static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) { return READ_ONCE(ipvs->sysctl_sync_ports); } static inline int sysctl_sync_persist_mode(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_persist_mode; } static inline unsigned long sysctl_sync_qlen_max(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_qlen_max; } static inline int sysctl_sync_sock_size(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_sock_size; } static inline int sysctl_pmtu_disc(struct netns_ipvs *ipvs) { return ipvs->sysctl_pmtu_disc; } static inline int sysctl_backup_only(struct netns_ipvs *ipvs) { return ipvs->sync_state & IP_VS_STATE_BACKUP && ipvs->sysctl_backup_only; } static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs) { return ipvs->sysctl_conn_reuse_mode; } static inline int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return ipvs->sysctl_expire_nodest_conn; } static inline int sysctl_schedule_icmp(struct netns_ipvs *ipvs) { return ipvs->sysctl_schedule_icmp; } static inline int sysctl_ignore_tunneled(struct netns_ipvs *ipvs) { return ipvs->sysctl_ignore_tunneled; } static inline int sysctl_cache_bypass(struct netns_ipvs *ipvs) { return ipvs->sysctl_cache_bypass; } static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) { return ipvs->sysctl_run_estimation; } static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) { if (ipvs->est_cpulist_valid) return ipvs->sysctl_est_cpulist; else return housekeeping_cpumask(HK_TYPE_KTHREAD); } static inline int sysctl_est_nice(struct netns_ipvs *ipvs) { return ipvs->sysctl_est_nice; } #else static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) { return DEFAULT_SYNC_THRESHOLD; } static inline int sysctl_sync_period(struct netns_ipvs *ipvs) { return DEFAULT_SYNC_PERIOD; } static inline unsigned int sysctl_sync_refresh_period(struct netns_ipvs *ipvs) { return DEFAULT_SYNC_REFRESH_PERIOD; } static inline int sysctl_sync_retries(struct netns_ipvs *ipvs) { return DEFAULT_SYNC_RETRIES & 3; } static inline int 
sysctl_sync_ver(struct netns_ipvs *ipvs) { return DEFAULT_SYNC_VER; } static inline int sysctl_sloppy_tcp(struct netns_ipvs *ipvs) { return DEFAULT_SLOPPY_TCP; } static inline int sysctl_sloppy_sctp(struct netns_ipvs *ipvs) { return DEFAULT_SLOPPY_SCTP; } static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) { return 1; } static inline int sysctl_sync_persist_mode(struct netns_ipvs *ipvs) { return 0; } static inline unsigned long sysctl_sync_qlen_max(struct netns_ipvs *ipvs) { return IPVS_SYNC_QLEN_MAX; } static inline int sysctl_sync_sock_size(struct netns_ipvs *ipvs) { return 0; } static inline int sysctl_pmtu_disc(struct netns_ipvs *ipvs) { return 1; } static inline int sysctl_backup_only(struct netns_ipvs *ipvs) { return 0; } static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs) { return 1; } static inline int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; } static inline int sysctl_schedule_icmp(struct netns_ipvs *ipvs) { return 0; } static inline int sysctl_ignore_tunneled(struct netns_ipvs *ipvs) { return 0; } static inline int sysctl_cache_bypass(struct netns_ipvs *ipvs) { return 0; } static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) { return 1; } static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) { return housekeeping_cpumask(HK_TYPE_KTHREAD); } static inline int sysctl_est_nice(struct netns_ipvs *ipvs) { return IPVS_EST_NICE; } #endif /* IPVS core functions * (from ip_vs_core.c) */ const char *ip_vs_proto_name(unsigned int proto); void ip_vs_init_hash_table(struct list_head *table, int rows); struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct sk_buff *skb, const struct ip_vs_iphdr *iph, __be16 dport, __be16 cport); #define IP_VS_INIT_HASH_TABLE(t) ip_vs_init_hash_table((t), ARRAY_SIZE((t))) #define IP_VS_APP_TYPE_FTP 1 /* ip_vs_conn handling functions * (from ip_vs_conn.c) */ enum { IP_VS_DIR_INPUT = 0, IP_VS_DIR_OUTPUT, IP_VS_DIR_INPUT_ONLY, IP_VS_DIR_LAST, }; static inline void ip_vs_conn_fill_param(struct netns_ipvs *ipvs, int af, int protocol, const union nf_inet_addr *caddr, __be16 cport, const union nf_inet_addr *vaddr, __be16 vport, struct ip_vs_conn_param *p) { p->ipvs = ipvs; p->af = af; p->protocol = protocol; p->caddr = caddr; p->cport = cport; p->vaddr = vaddr; p->vport = vport; p->pe = NULL; p->pe_data = NULL; } struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p); struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p); struct ip_vs_conn * ip_vs_conn_in_get_proto(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph); struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p); struct ip_vs_conn * ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph); /* Get reference to gain full access to conn. * By default, RCU read-side critical sections have access only to * conn fields and its PE data, see ip_vs_conn_rcu_free() for reference. 
*/ static inline bool __ip_vs_conn_get(struct ip_vs_conn *cp) { return refcount_inc_not_zero(&cp->refcnt); } /* put back the conn without restarting its timer */ static inline void __ip_vs_conn_put(struct ip_vs_conn *cp) { smp_mb__before_atomic(); refcount_dec(&cp->refcnt); } void ip_vs_conn_put(struct ip_vs_conn *cp); void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport); struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, struct ip_vs_dest *dest, __u32 fwmark); void ip_vs_conn_expire_now(struct ip_vs_conn *cp); const char *ip_vs_state_name(const struct ip_vs_conn *cp); void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp); int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest); void ip_vs_random_dropentry(struct netns_ipvs *ipvs); int ip_vs_conn_init(void); void ip_vs_conn_cleanup(void); static inline void ip_vs_control_del(struct ip_vs_conn *cp) { struct ip_vs_conn *ctl_cp = cp->control; if (!ctl_cp) { IP_VS_ERR_BUF("request control DEL for uncontrolled: " "%s:%d to %s:%d\n", IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport)); return; } IP_VS_DBG_BUF(7, "DELeting control for: " "cp.dst=%s:%d ctl_cp.dst=%s:%d\n", IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &ctl_cp->caddr), ntohs(ctl_cp->cport)); cp->control = NULL; if (atomic_read(&ctl_cp->n_control) == 0) { IP_VS_ERR_BUF("BUG control DEL with n=0 : " "%s:%d to %s:%d\n", IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport)); return; } atomic_dec(&ctl_cp->n_control); } static inline void ip_vs_control_add(struct ip_vs_conn *cp, struct ip_vs_conn *ctl_cp) { if (cp->control) { IP_VS_ERR_BUF("request control ADD for already controlled: " "%s:%d to %s:%d\n", IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport)); ip_vs_control_del(cp); } IP_VS_DBG_BUF(7, "ADDing control for: " "cp.dst=%s:%d ctl_cp.dst=%s:%d\n", IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &ctl_cp->caddr), ntohs(ctl_cp->cport)); cp->control = ctl_cp; atomic_inc(&ctl_cp->n_control); } /* Mark our template as assured */ static inline void ip_vs_control_assure_ct(struct ip_vs_conn *cp) { struct ip_vs_conn *ct = cp->control; if (ct && !(ct->state & IP_VS_CTPL_S_ASSURED) && (ct->flags & IP_VS_CONN_F_TEMPLATE)) ct->state |= IP_VS_CTPL_S_ASSURED; } /* IPVS netns init & cleanup functions */ int ip_vs_estimator_net_init(struct netns_ipvs *ipvs); int ip_vs_control_net_init(struct netns_ipvs *ipvs); int ip_vs_protocol_net_init(struct netns_ipvs *ipvs); int ip_vs_app_net_init(struct netns_ipvs *ipvs); int ip_vs_conn_net_init(struct netns_ipvs *ipvs); int ip_vs_sync_net_init(struct netns_ipvs *ipvs); void ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_app_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_control_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_service_nets_cleanup(struct list_head *net_list); /* IPVS application functions * (from ip_vs_app.c) */ #define IP_VS_APP_MAX_PORTS 8 struct ip_vs_app *register_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app *app); void unregister_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app *app); int 
ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp); void ip_vs_unbind_app(struct ip_vs_conn *cp); int register_ip_vs_app_inc(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16 proto, __u16 port); int ip_vs_app_inc_get(struct ip_vs_app *inc); void ip_vs_app_inc_put(struct ip_vs_app *inc); int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb, struct ip_vs_iphdr *ipvsh); int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb, struct ip_vs_iphdr *ipvsh); int register_ip_vs_pe(struct ip_vs_pe *pe); int unregister_ip_vs_pe(struct ip_vs_pe *pe); struct ip_vs_pe *ip_vs_pe_getbyname(const char *name); struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name); /* Use a #define to avoid all of module.h just for these trivial ops */ #define ip_vs_pe_get(pe) \ if (pe && pe->module) \ __module_get(pe->module); #define ip_vs_pe_put(pe) \ if (pe && pe->module) \ module_put(pe->module); /* IPVS protocol functions (from ip_vs_proto.c) */ int ip_vs_protocol_init(void); void ip_vs_protocol_cleanup(void); void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags); int *ip_vs_create_timeout_table(int *table, int size); void ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg); extern struct ip_vs_protocol ip_vs_protocol_tcp; extern struct ip_vs_protocol ip_vs_protocol_udp; extern struct ip_vs_protocol ip_vs_protocol_icmp; extern struct ip_vs_protocol ip_vs_protocol_esp; extern struct ip_vs_protocol ip_vs_protocol_ah; extern struct ip_vs_protocol ip_vs_protocol_sctp; /* Registering/unregistering scheduler functions * (from ip_vs_sched.c) */ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler); int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler); int ip_vs_bind_scheduler(struct ip_vs_service *svc, struct ip_vs_scheduler *scheduler); void ip_vs_unbind_scheduler(struct ip_vs_service *svc, struct ip_vs_scheduler *sched); struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name); void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler); struct ip_vs_conn * ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *ignored, struct ip_vs_iphdr *iph); int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph); void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg); /* IPVS control data and functions (from ip_vs_ctl.c) */ extern struct ip_vs_stats ip_vs_stats; extern int sysctl_ip_vs_sync_ver; struct ip_vs_service * ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport); bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport); struct ip_vs_dest * ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport); struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af, const union nf_inet_addr *daddr, __be16 tun_port); int ip_vs_use_count_inc(void); void ip_vs_use_count_dec(void); int ip_vs_register_nl_ioctl(void); void ip_vs_unregister_nl_ioctl(void); int ip_vs_control_init(void); void ip_vs_control_cleanup(void); struct ip_vs_dest * ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af, const union nf_inet_addr *daddr, __be16 dport, const union nf_inet_addr *vaddr, __be16 vport, __u16 protocol, __u32 fwmark, __u32 flags); void 
ip_vs_try_bind_dest(struct ip_vs_conn *cp); static inline void ip_vs_dest_hold(struct ip_vs_dest *dest) { refcount_inc(&dest->refcnt); } static inline void ip_vs_dest_put(struct ip_vs_dest *dest) { smp_mb__before_atomic(); refcount_dec(&dest->refcnt); } static inline void ip_vs_dest_put_and_free(struct ip_vs_dest *dest) { if (refcount_dec_and_test(&dest->refcnt)) kfree(dest); } /* IPVS sync daemon data and function prototypes * (from ip_vs_sync.c) */ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *cfg, int state); int stop_sync_thread(struct netns_ipvs *ipvs, int state); void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts); /* IPVS rate estimator prototypes (from ip_vs_est.c) */ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_zero_estimator(struct ip_vs_stats *stats); void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats); void ip_vs_est_reload_start(struct netns_ipvs *ipvs); int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, struct ip_vs_est_kt_data *kd); void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd); static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs) { #ifdef CONFIG_SYSCTL /* Stop tasks while cpulist is empty or if disabled with flag */ ipvs->est_stopped = !sysctl_run_estimation(ipvs) || (ipvs->est_cpulist_valid && cpumask_empty(sysctl_est_cpulist(ipvs))); #endif } static inline bool ip_vs_est_stopped(struct netns_ipvs *ipvs) { #ifdef CONFIG_SYSCTL return ipvs->est_stopped; #else return false; #endif } static inline int ip_vs_est_max_threads(struct netns_ipvs *ipvs) { unsigned int limit = IPVS_EST_CPU_KTHREADS * cpumask_weight(sysctl_est_cpulist(ipvs)); return max(1U, limit); } /* Various IPVS packet transmitters (from ip_vs_xmit.c) */ int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, int offset, unsigned int hooknum, struct ip_vs_iphdr *iph); void ip_vs_dest_dst_rcu_free(struct rcu_head *head); #ifdef CONFIG_IP_VS_IPV6 int ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, int offset, unsigned int hooknum, struct ip_vs_iphdr *iph); #endif #ifdef CONFIG_SYSCTL /* This is a simple mechanism to ignore packets when * we are loaded. 
Just set ip_vs_drop_rate to 'n' and * we start to drop 1/rate of the packets */ static inline int ip_vs_todrop(struct netns_ipvs *ipvs) { if (!ipvs->drop_rate) return 0; if (--ipvs->drop_counter > 0) return 0; ipvs->drop_counter = ipvs->drop_rate; return 1; } #else static inline int ip_vs_todrop(struct netns_ipvs *ipvs) { return 0; } #endif #ifdef CONFIG_SYSCTL /* Enqueue delayed work for expiring no dest connections * Only run when sysctl_expire_nodest=1 */ static inline void ip_vs_enqueue_expire_nodest_conns(struct netns_ipvs *ipvs) { if (sysctl_expire_nodest_conn(ipvs)) queue_delayed_work(system_long_wq, &ipvs->expire_nodest_conn_work, 1); } void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs); #else static inline void ip_vs_enqueue_expire_nodest_conns(struct netns_ipvs *ipvs) {} #endif #define IP_VS_DFWD_METHOD(dest) (atomic_read(&(dest)->conn_flags) & \ IP_VS_CONN_F_FWD_MASK) /* ip_vs_fwd_tag returns the forwarding tag of the connection */ #define IP_VS_FWD_METHOD(cp) (cp->flags & IP_VS_CONN_F_FWD_MASK) static inline char ip_vs_fwd_tag(struct ip_vs_conn *cp) { char fwd; switch (IP_VS_FWD_METHOD(cp)) { case IP_VS_CONN_F_MASQ: fwd = 'M'; break; case IP_VS_CONN_F_LOCALNODE: fwd = 'L'; break; case IP_VS_CONN_F_TUNNEL: fwd = 'T'; break; case IP_VS_CONN_F_DROUTE: fwd = 'R'; break; case IP_VS_CONN_F_BYPASS: fwd = 'B'; break; default: fwd = '?'; break; } return fwd; } void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int dir); #ifdef CONFIG_IP_VS_IPV6 void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int dir); #endif __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset); static inline __wsum ip_vs_check_diff4(__be32 old, __be32 new, __wsum oldsum) { __be32 diff[2] = { ~old, new }; return csum_partial(diff, sizeof(diff), oldsum); } #ifdef CONFIG_IP_VS_IPV6 static inline __wsum ip_vs_check_diff16(const __be32 *old, const __be32 *new, __wsum oldsum) { __be32 diff[8] = { ~old[3], ~old[2], ~old[1], ~old[0], new[3], new[2], new[1], new[0] }; return csum_partial(diff, sizeof(diff), oldsum); } #endif static inline __wsum ip_vs_check_diff2(__be16 old, __be16 new, __wsum oldsum) { __be16 diff[2] = { ~old, new }; return csum_partial(diff, sizeof(diff), oldsum); } /* Forget current conntrack (unconfirmed) and attach notrack entry */ static inline void ip_vs_notrack(struct sk_buff *skb) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); if (ct) { nf_conntrack_put(&ct->ct_general); nf_ct_set(skb, NULL, IP_CT_UNTRACKED); } #endif } #ifdef CONFIG_IP_VS_NFCT /* Netfilter connection tracking * (from ip_vs_nfct.c) */ static inline int ip_vs_conntrack_enabled(struct netns_ipvs *ipvs) { #ifdef CONFIG_SYSCTL return ipvs->sysctl_conntrack; #else return 0; #endif } void ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin); int ip_vs_confirm_conntrack(struct sk_buff *skb); void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct, struct ip_vs_conn *cp, u_int8_t proto, const __be16 port, int from_rs); void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp); #else static inline int ip_vs_conntrack_enabled(struct netns_ipvs *ipvs) { return 0; } static inline void ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) { } static inline int ip_vs_confirm_conntrack(struct sk_buff *skb) { return NF_ACCEPT; } static inline void ip_vs_conn_drop_conntrack(struct 
ip_vs_conn *cp) { } #endif /* CONFIG_IP_VS_NFCT */ /* Using old conntrack that can not be redirected to another real server? */ static inline bool ip_vs_conn_uses_old_conntrack(struct ip_vs_conn *cp, struct sk_buff *skb) { #ifdef CONFIG_IP_VS_NFCT enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); if (ct && nf_ct_is_confirmed(ct)) return true; #endif return false; } static inline int ip_vs_register_conntrack(struct ip_vs_service *svc) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) int afmask = (svc->af == AF_INET6) ? 2 : 1; int ret = 0; if (!(svc->conntrack_afmask & afmask)) { ret = nf_ct_netns_get(svc->ipvs->net, svc->af); if (ret >= 0) svc->conntrack_afmask |= afmask; } return ret; #else return 0; #endif } static inline void ip_vs_unregister_conntrack(struct ip_vs_service *svc) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) int afmask = (svc->af == AF_INET6) ? 2 : 1; if (svc->conntrack_afmask & afmask) { nf_ct_netns_put(svc->ipvs->net, svc->af); svc->conntrack_afmask &= ~afmask; } #endif } int ip_vs_register_hooks(struct netns_ipvs *ipvs, unsigned int af); void ip_vs_unregister_hooks(struct netns_ipvs *ipvs, unsigned int af); static inline int ip_vs_dest_conn_overhead(struct ip_vs_dest *dest) { /* We think the overhead of processing active connections is 256 * times higher than that of inactive connections in average. (This * 256 times might not be accurate, we will change it later) We * use the following formula to estimate the overhead now: * dest->activeconns*256 + dest->inactconns */ return (atomic_read(&dest->activeconns) << 8) + atomic_read(&dest->inactconns); } #ifdef CONFIG_IP_VS_PROTO_TCP INDIRECT_CALLABLE_DECLARE(int tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)); #endif #ifdef CONFIG_IP_VS_PROTO_UDP INDIRECT_CALLABLE_DECLARE(int udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)); #endif #endif /* _NET_IP_VS_H */
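
/*
 * A minimal userspace sketch (not kernel code) of the RFC 1624
 * incremental-checksum idea behind ip_vs_check_diff4() and
 * ip_vs_check_diff2() above: when NAT rewrites a field from 'old' to
 * 'new', fold ~old and new into the running one's-complement sum
 * instead of re-summing the whole packet.  ones_sum() and
 * ones_sum_update32() are illustrative stand-ins for csum_partial(),
 * not kernel APIs.
 */
#include <stdint.h>
#include <stdio.h>

/* One's-complement sum over big-endian 16-bit words. */
static uint16_t ones_sum(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)(buf[i] << 8 | buf[i + 1]);
	if (len & 1)
		sum += (uint32_t)buf[len - 1] << 8;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* Incremental update: replace 32-bit 'old' by 'new' in an existing sum. */
static uint16_t ones_sum_update32(uint16_t sum, uint32_t old, uint32_t new)
{
	uint32_t acc = sum;

	acc += (uint16_t)~(old >> 16);
	acc += (uint16_t)~(old & 0xffff);
	acc += new >> 16;
	acc += new & 0xffff;
	while (acc >> 16)
		acc = (acc & 0xffff) + (acc >> 16);
	return (uint16_t)acc;
}

int main(void)
{
	/* 192.168.0.1 followed by two 16-bit ports */
	uint8_t pkt[8] = { 0xc0, 0xa8, 0x00, 0x01, 0x00, 0x50, 0x1f, 0x90 };
	uint32_t old = 0xc0a80001, new = 0x0a000001; /* -> 10.0.0.1 */
	uint16_t incr = ones_sum_update32(ones_sum(pkt, sizeof(pkt)), old, new);

	pkt[0] = 0x0a; pkt[1] = 0x00; pkt[2] = 0x00; pkt[3] = 0x01;
	/* both values print as 29e1: incremental == full recompute */
	printf("full=%04x incremental=%04x\n", ones_sum(pkt, sizeof(pkt)), incr);
	return 0;
}
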
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2006 - 2007 Ivo van Doorn * Copyright (C) 2007 Dmitry Torokhov * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/workqueue.h> #include <linux/capability.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/rfkill.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/device.h> #include <linux/miscdevice.h> #include <linux/wait.h> #include <linux/poll.h> #include <linux/fs.h> #include <linux/slab.h> #include "rfkill.h" #define POLL_INTERVAL (5 * HZ) #define RFKILL_BLOCK_HW BIT(0) #define RFKILL_BLOCK_SW BIT(1) #define RFKILL_BLOCK_SW_PREV BIT(2) #define RFKILL_BLOCK_ANY (RFKILL_BLOCK_HW |\ RFKILL_BLOCK_SW |\ RFKILL_BLOCK_SW_PREV) #define 
RFKILL_BLOCK_SW_SETCALL BIT(31) struct rfkill { spinlock_t lock; enum rfkill_type type; unsigned long state; unsigned long hard_block_reasons; u32 idx; bool registered; bool persistent; bool polling_paused; bool suspended; bool need_sync; const struct rfkill_ops *ops; void *data; #ifdef CONFIG_RFKILL_LEDS struct led_trigger led_trigger; const char *ledtrigname; #endif struct device dev; struct list_head node; struct delayed_work poll_work; struct work_struct uevent_work; struct work_struct sync_work; char name[]; }; #define to_rfkill(d) container_of(d, struct rfkill, dev) struct rfkill_int_event { struct list_head list; struct rfkill_event_ext ev; }; struct rfkill_data { struct list_head list; struct list_head events; struct mutex mtx; wait_queue_head_t read_wait; bool input_handler; u8 max_size; }; MODULE_AUTHOR("Ivo van Doorn <IvDoorn@gmail.com>"); MODULE_AUTHOR("Johannes Berg <johannes@sipsolutions.net>"); MODULE_DESCRIPTION("RF switch support"); MODULE_LICENSE("GPL"); /* * The locking here should be made much smarter, we currently have * a bit of a stupid situation because drivers might want to register * the rfkill struct under their own lock, and take this lock during * rfkill method calls -- which will cause an AB-BA deadlock situation. * * To fix that, we need to rework this code here to be mostly lock-free * and only use the mutex for list manipulations, not to protect the * various other global variables. Then we can avoid holding the mutex * around driver operations, and all is happy. */ static LIST_HEAD(rfkill_list); /* list of registered rf switches */ static DEFINE_MUTEX(rfkill_global_mutex); static LIST_HEAD(rfkill_fds); /* list of open fds of /dev/rfkill */ static unsigned int rfkill_default_state = 1; module_param_named(default_state, rfkill_default_state, uint, 0444); MODULE_PARM_DESC(default_state, "Default initial state for all radio types, 0 = radio off"); static struct { bool cur, sav; } rfkill_global_states[NUM_RFKILL_TYPES]; static bool rfkill_epo_lock_active; #ifdef CONFIG_RFKILL_LEDS static void rfkill_led_trigger_event(struct rfkill *rfkill) { struct led_trigger *trigger; if (!rfkill->registered) return; trigger = &rfkill->led_trigger; if (rfkill->state & RFKILL_BLOCK_ANY) led_trigger_event(trigger, LED_OFF); else led_trigger_event(trigger, LED_FULL); } static int rfkill_led_trigger_activate(struct led_classdev *led) { struct rfkill *rfkill; rfkill = container_of(led->trigger, struct rfkill, led_trigger); rfkill_led_trigger_event(rfkill); return 0; } const char *rfkill_get_led_trigger_name(struct rfkill *rfkill) { return rfkill->led_trigger.name; } EXPORT_SYMBOL(rfkill_get_led_trigger_name); void rfkill_set_led_trigger_name(struct rfkill *rfkill, const char *name) { BUG_ON(!rfkill); rfkill->ledtrigname = name; } EXPORT_SYMBOL(rfkill_set_led_trigger_name); static int rfkill_led_trigger_register(struct rfkill *rfkill) { rfkill->led_trigger.name = rfkill->ledtrigname ? 
: dev_name(&rfkill->dev); rfkill->led_trigger.activate = rfkill_led_trigger_activate; return led_trigger_register(&rfkill->led_trigger); } static void rfkill_led_trigger_unregister(struct rfkill *rfkill) { led_trigger_unregister(&rfkill->led_trigger); } static struct led_trigger rfkill_any_led_trigger; static struct led_trigger rfkill_none_led_trigger; static struct work_struct rfkill_global_led_trigger_work; static void rfkill_global_led_trigger_worker(struct work_struct *work) { enum led_brightness brightness = LED_OFF; struct rfkill *rfkill; mutex_lock(&rfkill_global_mutex); list_for_each_entry(rfkill, &rfkill_list, node) { if (!(rfkill->state & RFKILL_BLOCK_ANY)) { brightness = LED_FULL; break; } } mutex_unlock(&rfkill_global_mutex); led_trigger_event(&rfkill_any_led_trigger, brightness); led_trigger_event(&rfkill_none_led_trigger, brightness == LED_OFF ? LED_FULL : LED_OFF); } static void rfkill_global_led_trigger_event(void) { schedule_work(&rfkill_global_led_trigger_work); } static int rfkill_global_led_trigger_register(void) { int ret; INIT_WORK(&rfkill_global_led_trigger_work, rfkill_global_led_trigger_worker); rfkill_any_led_trigger.name = "rfkill-any"; ret = led_trigger_register(&rfkill_any_led_trigger); if (ret) return ret; rfkill_none_led_trigger.name = "rfkill-none"; ret = led_trigger_register(&rfkill_none_led_trigger); if (ret) led_trigger_unregister(&rfkill_any_led_trigger); else /* Delay activation until all global triggers are registered */ rfkill_global_led_trigger_event(); return ret; } static void rfkill_global_led_trigger_unregister(void) { led_trigger_unregister(&rfkill_none_led_trigger); led_trigger_unregister(&rfkill_any_led_trigger); cancel_work_sync(&rfkill_global_led_trigger_work); } #else static void rfkill_led_trigger_event(struct rfkill *rfkill) { } static inline int rfkill_led_trigger_register(struct rfkill *rfkill) { return 0; } static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill) { } static void rfkill_global_led_trigger_event(void) { } static int rfkill_global_led_trigger_register(void) { return 0; } static void rfkill_global_led_trigger_unregister(void) { } #endif /* CONFIG_RFKILL_LEDS */ static void rfkill_fill_event(struct rfkill_event_ext *ev, struct rfkill *rfkill, enum rfkill_operation op) { unsigned long flags; ev->idx = rfkill->idx; ev->type = rfkill->type; ev->op = op; spin_lock_irqsave(&rfkill->lock, flags); ev->hard = !!(rfkill->state & RFKILL_BLOCK_HW); ev->soft = !!(rfkill->state & (RFKILL_BLOCK_SW | RFKILL_BLOCK_SW_PREV)); ev->hard_block_reasons = rfkill->hard_block_reasons; spin_unlock_irqrestore(&rfkill->lock, flags); } static void rfkill_send_events(struct rfkill *rfkill, enum rfkill_operation op) { struct rfkill_data *data; struct rfkill_int_event *ev; list_for_each_entry(data, &rfkill_fds, list) { ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) continue; rfkill_fill_event(&ev->ev, rfkill, op); mutex_lock(&data->mtx); list_add_tail(&ev->list, &data->events); mutex_unlock(&data->mtx); wake_up_interruptible(&data->read_wait); } } static void rfkill_event(struct rfkill *rfkill) { if (!rfkill->registered) return; kobject_uevent(&rfkill->dev.kobj, KOBJ_CHANGE); /* also send event to /dev/rfkill */ rfkill_send_events(rfkill, RFKILL_OP_CHANGE); } /** * rfkill_set_block - wrapper for set_block method * * @rfkill: the rfkill struct to use * @blocked: the new software state * * Calls the set_block method (when applicable) and handles notifications * etc. as well. 
*/ static void rfkill_set_block(struct rfkill *rfkill, bool blocked) { unsigned long flags; bool prev, curr; int err; if (unlikely(rfkill->dev.power.power_state.event & PM_EVENT_SLEEP)) return; /* * Some platforms (...!) generate input events which affect the * _hard_ kill state -- whenever something tries to change the * current software state query the hardware state too. */ if (rfkill->ops->query) rfkill->ops->query(rfkill, rfkill->data); spin_lock_irqsave(&rfkill->lock, flags); prev = rfkill->state & RFKILL_BLOCK_SW; if (prev) rfkill->state |= RFKILL_BLOCK_SW_PREV; else rfkill->state &= ~RFKILL_BLOCK_SW_PREV; if (blocked) rfkill->state |= RFKILL_BLOCK_SW; else rfkill->state &= ~RFKILL_BLOCK_SW; rfkill->state |= RFKILL_BLOCK_SW_SETCALL; spin_unlock_irqrestore(&rfkill->lock, flags); err = rfkill->ops->set_block(rfkill->data, blocked); spin_lock_irqsave(&rfkill->lock, flags); if (err) { /* * Failed -- reset status to _PREV, which may be different * from what we have set _PREV to earlier in this function * if rfkill_set_sw_state was invoked. */ if (rfkill->state & RFKILL_BLOCK_SW_PREV) rfkill->state |= RFKILL_BLOCK_SW; else rfkill->state &= ~RFKILL_BLOCK_SW; } rfkill->state &= ~RFKILL_BLOCK_SW_SETCALL; rfkill->state &= ~RFKILL_BLOCK_SW_PREV; curr = rfkill->state & RFKILL_BLOCK_SW; spin_unlock_irqrestore(&rfkill->lock, flags); rfkill_led_trigger_event(rfkill); rfkill_global_led_trigger_event(); if (prev != curr) rfkill_event(rfkill); } static void rfkill_sync(struct rfkill *rfkill) { lockdep_assert_held(&rfkill_global_mutex); if (!rfkill->need_sync) return; rfkill_set_block(rfkill, rfkill_global_states[rfkill->type].cur); rfkill->need_sync = false; } static void rfkill_update_global_state(enum rfkill_type type, bool blocked) { int i; if (type != RFKILL_TYPE_ALL) { rfkill_global_states[type].cur = blocked; return; } for (i = 0; i < NUM_RFKILL_TYPES; i++) rfkill_global_states[i].cur = blocked; } #ifdef CONFIG_RFKILL_INPUT static atomic_t rfkill_input_disabled = ATOMIC_INIT(0); /** * __rfkill_switch_all - Toggle state of all switches of given type * @type: type of interfaces to be affected * @blocked: the new state * * This function sets the state of all switches of given type, * unless a specific switch is suspended. * * Caller must have acquired rfkill_global_mutex. */ static void __rfkill_switch_all(const enum rfkill_type type, bool blocked) { struct rfkill *rfkill; rfkill_update_global_state(type, blocked); list_for_each_entry(rfkill, &rfkill_list, node) { if (rfkill->type != type && type != RFKILL_TYPE_ALL) continue; rfkill_set_block(rfkill, blocked); } } /** * rfkill_switch_all - Toggle state of all switches of given type * @type: type of interfaces to be affected * @blocked: the new state * * Acquires rfkill_global_mutex and calls __rfkill_switch_all(@type, @state). * Please refer to __rfkill_switch_all() for details. * * Does nothing if the EPO lock is active. */ void rfkill_switch_all(enum rfkill_type type, bool blocked) { if (atomic_read(&rfkill_input_disabled)) return; mutex_lock(&rfkill_global_mutex); if (!rfkill_epo_lock_active) __rfkill_switch_all(type, blocked); mutex_unlock(&rfkill_global_mutex); } /** * rfkill_epo - emergency power off all transmitters * * This kicks all non-suspended rfkill devices to RFKILL_STATE_SOFT_BLOCKED, * ignoring everything in its path but rfkill_global_mutex and rfkill->mutex. * * The global state before the EPO is saved and can be restored later * using rfkill_restore_states(). 
*/ void rfkill_epo(void) { struct rfkill *rfkill; int i; if (atomic_read(&rfkill_input_disabled)) return; mutex_lock(&rfkill_global_mutex); rfkill_epo_lock_active = true; list_for_each_entry(rfkill, &rfkill_list, node) rfkill_set_block(rfkill, true); for (i = 0; i < NUM_RFKILL_TYPES; i++) { rfkill_global_states[i].sav = rfkill_global_states[i].cur; rfkill_global_states[i].cur = true; } mutex_unlock(&rfkill_global_mutex); } /** * rfkill_restore_states - restore global states * * Restore (and sync switches to) the global state from the * states saved in rfkill_global_states. This can undo the effects of * a call to rfkill_epo(). */ void rfkill_restore_states(void) { int i; if (atomic_read(&rfkill_input_disabled)) return; mutex_lock(&rfkill_global_mutex); rfkill_epo_lock_active = false; for (i = 0; i < NUM_RFKILL_TYPES; i++) __rfkill_switch_all(i, rfkill_global_states[i].sav); mutex_unlock(&rfkill_global_mutex); } /** * rfkill_remove_epo_lock - unlock state changes * * Used by rfkill-input to manually unlock state changes when * the EPO switch is deactivated. */ void rfkill_remove_epo_lock(void) { if (atomic_read(&rfkill_input_disabled)) return; mutex_lock(&rfkill_global_mutex); rfkill_epo_lock_active = false; mutex_unlock(&rfkill_global_mutex); } /** * rfkill_is_epo_lock_active - returns true if EPO is active * * Returns 0 (false) if there is NOT an active EPO condition, * and 1 (true) if there is an active EPO condition, which * locks all radios in one of the BLOCKED states. * * Can be called in atomic context. */ bool rfkill_is_epo_lock_active(void) { return rfkill_epo_lock_active; } /** * rfkill_get_global_sw_state - returns global state for a type * @type: the type to get the global state of * * Returns the current global state for a given wireless * device type. 
*/ bool rfkill_get_global_sw_state(const enum rfkill_type type) { return rfkill_global_states[type].cur; } #endif bool rfkill_set_hw_state_reason(struct rfkill *rfkill, bool blocked, enum rfkill_hard_block_reasons reason) { unsigned long flags; bool ret, prev; BUG_ON(!rfkill); spin_lock_irqsave(&rfkill->lock, flags); prev = !!(rfkill->hard_block_reasons & reason); if (blocked) { rfkill->state |= RFKILL_BLOCK_HW; rfkill->hard_block_reasons |= reason; } else { rfkill->hard_block_reasons &= ~reason; if (!rfkill->hard_block_reasons) rfkill->state &= ~RFKILL_BLOCK_HW; } ret = !!(rfkill->state & RFKILL_BLOCK_ANY); spin_unlock_irqrestore(&rfkill->lock, flags); rfkill_led_trigger_event(rfkill); rfkill_global_led_trigger_event(); if (rfkill->registered && prev != blocked) schedule_work(&rfkill->uevent_work); return ret; } EXPORT_SYMBOL(rfkill_set_hw_state_reason); static void __rfkill_set_sw_state(struct rfkill *rfkill, bool blocked) { u32 bit = RFKILL_BLOCK_SW; /* if in a ops->set_block right now, use other bit */ if (rfkill->state & RFKILL_BLOCK_SW_SETCALL) bit = RFKILL_BLOCK_SW_PREV; if (blocked) rfkill->state |= bit; else rfkill->state &= ~bit; } bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked) { unsigned long flags; bool prev, hwblock; BUG_ON(!rfkill); spin_lock_irqsave(&rfkill->lock, flags); prev = !!(rfkill->state & RFKILL_BLOCK_SW); __rfkill_set_sw_state(rfkill, blocked); hwblock = !!(rfkill->state & RFKILL_BLOCK_HW); blocked = blocked || hwblock; spin_unlock_irqrestore(&rfkill->lock, flags); if (!rfkill->registered) return blocked; if (prev != blocked && !hwblock) schedule_work(&rfkill->uevent_work); rfkill_led_trigger_event(rfkill); rfkill_global_led_trigger_event(); return blocked; } EXPORT_SYMBOL(rfkill_set_sw_state); void rfkill_init_sw_state(struct rfkill *rfkill, bool blocked) { unsigned long flags; BUG_ON(!rfkill); BUG_ON(rfkill->registered); spin_lock_irqsave(&rfkill->lock, flags); __rfkill_set_sw_state(rfkill, blocked); rfkill->persistent = true; spin_unlock_irqrestore(&rfkill->lock, flags); } EXPORT_SYMBOL(rfkill_init_sw_state); void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw) { unsigned long flags; bool swprev, hwprev; BUG_ON(!rfkill); spin_lock_irqsave(&rfkill->lock, flags); /* * No need to care about prev/setblock ... this is for uevent only * and that will get triggered by rfkill_set_block anyway. 
*/ swprev = !!(rfkill->state & RFKILL_BLOCK_SW); hwprev = !!(rfkill->state & RFKILL_BLOCK_HW); __rfkill_set_sw_state(rfkill, sw); if (hw) rfkill->state |= RFKILL_BLOCK_HW; else rfkill->state &= ~RFKILL_BLOCK_HW; spin_unlock_irqrestore(&rfkill->lock, flags); if (!rfkill->registered) { rfkill->persistent = true; } else { if (swprev != sw || hwprev != hw) schedule_work(&rfkill->uevent_work); rfkill_led_trigger_event(rfkill); rfkill_global_led_trigger_event(); } } EXPORT_SYMBOL(rfkill_set_states); static const char * const rfkill_types[] = { NULL, /* RFKILL_TYPE_ALL */ "wlan", "bluetooth", "ultrawideband", "wimax", "wwan", "gps", "fm", "nfc", }; enum rfkill_type rfkill_find_type(const char *name) { int i; BUILD_BUG_ON(ARRAY_SIZE(rfkill_types) != NUM_RFKILL_TYPES); if (!name) return RFKILL_TYPE_ALL; for (i = 1; i < NUM_RFKILL_TYPES; i++) if (!strcmp(name, rfkill_types[i])) return i; return RFKILL_TYPE_ALL; } EXPORT_SYMBOL(rfkill_find_type); static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rfkill *rfkill = to_rfkill(dev); return sysfs_emit(buf, "%s\n", rfkill->name); } static DEVICE_ATTR_RO(name); static ssize_t type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rfkill *rfkill = to_rfkill(dev); return sysfs_emit(buf, "%s\n", rfkill_types[rfkill->type]); } static DEVICE_ATTR_RO(type); static ssize_t index_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rfkill *rfkill = to_rfkill(dev); return sysfs_emit(buf, "%d\n", rfkill->idx); } static DEVICE_ATTR_RO(index); static ssize_t persistent_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rfkill *rfkill = to_rfkill(dev); return sysfs_emit(buf, "%d\n", rfkill->persistent); } static DEVICE_ATTR_RO(persistent); static ssize_t hard_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rfkill *rfkill = to_rfkill(dev); return sysfs_emit(buf, "%d\n", (rfkill->state & RFKILL_BLOCK_HW) ? 1 : 0); } static DEVICE_ATTR_RO(hard); static ssize_t soft_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rfkill *rfkill = to_rfkill(dev); mutex_lock(&rfkill_global_mutex); rfkill_sync(rfkill); mutex_unlock(&rfkill_global_mutex); return sysfs_emit(buf, "%d\n", (rfkill->state & RFKILL_BLOCK_SW) ? 
1 : 0); } static ssize_t soft_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct rfkill *rfkill = to_rfkill(dev); unsigned long state; int err; if (!capable(CAP_NET_ADMIN)) return -EPERM; err = kstrtoul(buf, 0, &state); if (err) return err; if (state > 1 ) return -EINVAL; mutex_lock(&rfkill_global_mutex); rfkill_sync(rfkill); rfkill_set_block(rfkill, state); mutex_unlock(&rfkill_global_mutex); return count; } static DEVICE_ATTR_RW(soft); static ssize_t hard_block_reasons_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rfkill *rfkill = to_rfkill(dev); return sysfs_emit(buf, "0x%lx\n", rfkill->hard_block_reasons); } static DEVICE_ATTR_RO(hard_block_reasons); static u8 user_state_from_blocked(unsigned long state) { if (state & RFKILL_BLOCK_HW) return RFKILL_USER_STATE_HARD_BLOCKED; if (state & RFKILL_BLOCK_SW) return RFKILL_USER_STATE_SOFT_BLOCKED; return RFKILL_USER_STATE_UNBLOCKED; } static ssize_t state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rfkill *rfkill = to_rfkill(dev); mutex_lock(&rfkill_global_mutex); rfkill_sync(rfkill); mutex_unlock(&rfkill_global_mutex); return sysfs_emit(buf, "%d\n", user_state_from_blocked(rfkill->state)); } static ssize_t state_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct rfkill *rfkill = to_rfkill(dev); unsigned long state; int err; if (!capable(CAP_NET_ADMIN)) return -EPERM; err = kstrtoul(buf, 0, &state); if (err) return err; if (state != RFKILL_USER_STATE_SOFT_BLOCKED && state != RFKILL_USER_STATE_UNBLOCKED) return -EINVAL; mutex_lock(&rfkill_global_mutex); rfkill_sync(rfkill); rfkill_set_block(rfkill, state == RFKILL_USER_STATE_SOFT_BLOCKED); mutex_unlock(&rfkill_global_mutex); return count; } static DEVICE_ATTR_RW(state); static struct attribute *rfkill_dev_attrs[] = { &dev_attr_name.attr, &dev_attr_type.attr, &dev_attr_index.attr, &dev_attr_persistent.attr, &dev_attr_state.attr, &dev_attr_soft.attr, &dev_attr_hard.attr, &dev_attr_hard_block_reasons.attr, NULL, }; ATTRIBUTE_GROUPS(rfkill_dev); static void rfkill_release(struct device *dev) { struct rfkill *rfkill = to_rfkill(dev); kfree(rfkill); } static int rfkill_dev_uevent(const struct device *dev, struct kobj_uevent_env *env) { struct rfkill *rfkill = to_rfkill(dev); unsigned long flags; unsigned long reasons; u32 state; int error; error = add_uevent_var(env, "RFKILL_NAME=%s", rfkill->name); if (error) return error; error = add_uevent_var(env, "RFKILL_TYPE=%s", rfkill_types[rfkill->type]); if (error) return error; spin_lock_irqsave(&rfkill->lock, flags); state = rfkill->state; reasons = rfkill->hard_block_reasons; spin_unlock_irqrestore(&rfkill->lock, flags); error = add_uevent_var(env, "RFKILL_STATE=%d", user_state_from_blocked(state)); if (error) return error; return add_uevent_var(env, "RFKILL_HW_BLOCK_REASON=0x%lx", reasons); } void rfkill_pause_polling(struct rfkill *rfkill) { BUG_ON(!rfkill); if (!rfkill->ops->poll) return; rfkill->polling_paused = true; cancel_delayed_work_sync(&rfkill->poll_work); } EXPORT_SYMBOL(rfkill_pause_polling); void rfkill_resume_polling(struct rfkill *rfkill) { BUG_ON(!rfkill); if (!rfkill->ops->poll) return; rfkill->polling_paused = false; if (rfkill->suspended) return; queue_delayed_work(system_power_efficient_wq, &rfkill->poll_work, 0); } EXPORT_SYMBOL(rfkill_resume_polling); #ifdef CONFIG_PM_SLEEP static int rfkill_suspend(struct device *dev) { struct rfkill *rfkill = to_rfkill(dev); rfkill->suspended = 
true; cancel_delayed_work_sync(&rfkill->poll_work); return 0; } static int rfkill_resume(struct device *dev) { struct rfkill *rfkill = to_rfkill(dev); bool cur; rfkill->suspended = false; if (!rfkill->registered) return 0; if (!rfkill->persistent) { cur = !!(rfkill->state & RFKILL_BLOCK_SW); rfkill_set_block(rfkill, cur); } if (rfkill->ops->poll && !rfkill->polling_paused) queue_delayed_work(system_power_efficient_wq, &rfkill->poll_work, 0); return 0; } static SIMPLE_DEV_PM_OPS(rfkill_pm_ops, rfkill_suspend, rfkill_resume); #define RFKILL_PM_OPS (&rfkill_pm_ops) #else #define RFKILL_PM_OPS NULL #endif static struct class rfkill_class = { .name = "rfkill", .dev_release = rfkill_release, .dev_groups = rfkill_dev_groups, .dev_uevent = rfkill_dev_uevent, .pm = RFKILL_PM_OPS, }; bool rfkill_blocked(struct rfkill *rfkill) { unsigned long flags; u32 state; spin_lock_irqsave(&rfkill->lock, flags); state = rfkill->state; spin_unlock_irqrestore(&rfkill->lock, flags); return !!(state & RFKILL_BLOCK_ANY); } EXPORT_SYMBOL(rfkill_blocked); bool rfkill_soft_blocked(struct rfkill *rfkill) { unsigned long flags; u32 state; spin_lock_irqsave(&rfkill->lock, flags); state = rfkill->state; spin_unlock_irqrestore(&rfkill->lock, flags); return !!(state & RFKILL_BLOCK_SW); } EXPORT_SYMBOL(rfkill_soft_blocked); struct rfkill * __must_check rfkill_alloc(const char *name, struct device *parent, const enum rfkill_type type, const struct rfkill_ops *ops, void *ops_data) { struct rfkill *rfkill; struct device *dev; if (WARN_ON(!ops)) return NULL; if (WARN_ON(!ops->set_block)) return NULL; if (WARN_ON(!name)) return NULL; if (WARN_ON(type == RFKILL_TYPE_ALL || type >= NUM_RFKILL_TYPES)) return NULL; rfkill = kzalloc(sizeof(*rfkill) + strlen(name) + 1, GFP_KERNEL); if (!rfkill) return NULL; spin_lock_init(&rfkill->lock); INIT_LIST_HEAD(&rfkill->node); rfkill->type = type; strcpy(rfkill->name, name); rfkill->ops = ops; rfkill->data = ops_data; dev = &rfkill->dev; dev->class = &rfkill_class; dev->parent = parent; device_initialize(dev); return rfkill; } EXPORT_SYMBOL(rfkill_alloc); static void rfkill_poll(struct work_struct *work) { struct rfkill *rfkill; rfkill = container_of(work, struct rfkill, poll_work.work); /* * Poll hardware state -- driver will use one of the * rfkill_set{,_hw,_sw}_state functions and use its * return value to update the current status. 
*/ rfkill->ops->poll(rfkill, rfkill->data); queue_delayed_work(system_power_efficient_wq, &rfkill->poll_work, round_jiffies_relative(POLL_INTERVAL)); } static void rfkill_uevent_work(struct work_struct *work) { struct rfkill *rfkill; rfkill = container_of(work, struct rfkill, uevent_work); mutex_lock(&rfkill_global_mutex); rfkill_event(rfkill); mutex_unlock(&rfkill_global_mutex); } static void rfkill_sync_work(struct work_struct *work) { struct rfkill *rfkill = container_of(work, struct rfkill, sync_work); mutex_lock(&rfkill_global_mutex); rfkill_sync(rfkill); mutex_unlock(&rfkill_global_mutex); } int __must_check rfkill_register(struct rfkill *rfkill) { static unsigned long rfkill_no; struct device *dev; int error; if (!rfkill) return -EINVAL; dev = &rfkill->dev; mutex_lock(&rfkill_global_mutex); if (rfkill->registered) { error = -EALREADY; goto unlock; } rfkill->idx = rfkill_no; dev_set_name(dev, "rfkill%lu", rfkill_no); rfkill_no++; list_add_tail(&rfkill->node, &rfkill_list); error = device_add(dev); if (error) goto remove; error = rfkill_led_trigger_register(rfkill); if (error) goto devdel; rfkill->registered = true; INIT_DELAYED_WORK(&rfkill->poll_work, rfkill_poll); INIT_WORK(&rfkill->uevent_work, rfkill_uevent_work); INIT_WORK(&rfkill->sync_work, rfkill_sync_work); if (rfkill->ops->poll) queue_delayed_work(system_power_efficient_wq, &rfkill->poll_work, round_jiffies_relative(POLL_INTERVAL)); if (!rfkill->persistent || rfkill_epo_lock_active) { rfkill->need_sync = true; schedule_work(&rfkill->sync_work); } else { #ifdef CONFIG_RFKILL_INPUT bool soft_blocked = !!(rfkill->state & RFKILL_BLOCK_SW); if (!atomic_read(&rfkill_input_disabled)) __rfkill_switch_all(rfkill->type, soft_blocked); #endif } rfkill_global_led_trigger_event(); rfkill_send_events(rfkill, RFKILL_OP_ADD); mutex_unlock(&rfkill_global_mutex); return 0; devdel: device_del(&rfkill->dev); remove: list_del_init(&rfkill->node); unlock: mutex_unlock(&rfkill_global_mutex); return error; } EXPORT_SYMBOL(rfkill_register); void rfkill_unregister(struct rfkill *rfkill) { BUG_ON(!rfkill); if (rfkill->ops->poll) cancel_delayed_work_sync(&rfkill->poll_work); cancel_work_sync(&rfkill->uevent_work); cancel_work_sync(&rfkill->sync_work); rfkill->registered = false; device_del(&rfkill->dev); mutex_lock(&rfkill_global_mutex); rfkill_send_events(rfkill, RFKILL_OP_DEL); list_del_init(&rfkill->node); rfkill_global_led_trigger_event(); mutex_unlock(&rfkill_global_mutex); rfkill_led_trigger_unregister(rfkill); } EXPORT_SYMBOL(rfkill_unregister); void rfkill_destroy(struct rfkill *rfkill) { if (rfkill) put_device(&rfkill->dev); } EXPORT_SYMBOL(rfkill_destroy); static int rfkill_fop_open(struct inode *inode, struct file *file) { struct rfkill_data *data; struct rfkill *rfkill; struct rfkill_int_event *ev, *tmp; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; data->max_size = RFKILL_EVENT_SIZE_V1; INIT_LIST_HEAD(&data->events); mutex_init(&data->mtx); init_waitqueue_head(&data->read_wait); mutex_lock(&rfkill_global_mutex); /* * start getting events from elsewhere but hold mtx to get * startup events added first */ list_for_each_entry(rfkill, &rfkill_list, node) { ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) goto free; rfkill_sync(rfkill); rfkill_fill_event(&ev->ev, rfkill, RFKILL_OP_ADD); mutex_lock(&data->mtx); list_add_tail(&ev->list, &data->events); mutex_unlock(&data->mtx); } list_add(&data->list, &rfkill_fds); mutex_unlock(&rfkill_global_mutex); file->private_data = data; return stream_open(inode, file); free: 
mutex_unlock(&rfkill_global_mutex); mutex_destroy(&data->mtx); list_for_each_entry_safe(ev, tmp, &data->events, list) kfree(ev); kfree(data); return -ENOMEM; } static __poll_t rfkill_fop_poll(struct file *file, poll_table *wait) { struct rfkill_data *data = file->private_data; __poll_t res = EPOLLOUT | EPOLLWRNORM; poll_wait(file, &data->read_wait, wait); mutex_lock(&data->mtx); if (!list_empty(&data->events)) res = EPOLLIN | EPOLLRDNORM; mutex_unlock(&data->mtx); return res; } static ssize_t rfkill_fop_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { struct rfkill_data *data = file->private_data; struct rfkill_int_event *ev; unsigned long sz; int ret; mutex_lock(&data->mtx); while (list_empty(&data->events)) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; goto out; } mutex_unlock(&data->mtx); /* since we re-check and it just compares pointers, * using !list_empty() without locking isn't a problem */ ret = wait_event_interruptible(data->read_wait, !list_empty(&data->events)); mutex_lock(&data->mtx); if (ret) goto out; } ev = list_first_entry(&data->events, struct rfkill_int_event, list); sz = min_t(unsigned long, sizeof(ev->ev), count); sz = min_t(unsigned long, sz, data->max_size); ret = sz; if (copy_to_user(buf, &ev->ev, sz)) ret = -EFAULT; list_del(&ev->list); kfree(ev); out: mutex_unlock(&data->mtx); return ret; } static ssize_t rfkill_fop_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { struct rfkill_data *data = file->private_data; struct rfkill *rfkill; struct rfkill_event_ext ev; int ret; /* we don't need the 'hard' variable but accept it */ if (count < RFKILL_EVENT_SIZE_V1 - 1) return -EINVAL; /* * Copy as much data as we can accept into our 'ev' buffer, * but tell userspace how much we've copied so it can determine * our API version even in a write() call, if it cares. 
*/ count = min(count, sizeof(ev)); count = min_t(size_t, count, data->max_size); if (copy_from_user(&ev, buf, count)) return -EFAULT; if (ev.type >= NUM_RFKILL_TYPES) return -EINVAL; mutex_lock(&rfkill_global_mutex); switch (ev.op) { case RFKILL_OP_CHANGE_ALL: rfkill_update_global_state(ev.type, ev.soft); list_for_each_entry(rfkill, &rfkill_list, node) if (rfkill->type == ev.type || ev.type == RFKILL_TYPE_ALL) rfkill_set_block(rfkill, ev.soft); ret = 0; break; case RFKILL_OP_CHANGE: list_for_each_entry(rfkill, &rfkill_list, node) if (rfkill->idx == ev.idx && (rfkill->type == ev.type || ev.type == RFKILL_TYPE_ALL)) rfkill_set_block(rfkill, ev.soft); ret = 0; break; default: ret = -EINVAL; break; } mutex_unlock(&rfkill_global_mutex); return ret ?: count; } static int rfkill_fop_release(struct inode *inode, struct file *file) { struct rfkill_data *data = file->private_data; struct rfkill_int_event *ev, *tmp; mutex_lock(&rfkill_global_mutex); list_del(&data->list); mutex_unlock(&rfkill_global_mutex); mutex_destroy(&data->mtx); list_for_each_entry_safe(ev, tmp, &data->events, list) kfree(ev); #ifdef CONFIG_RFKILL_INPUT if (data->input_handler) if (atomic_dec_return(&rfkill_input_disabled) == 0) printk(KERN_DEBUG "rfkill: input handler enabled\n"); #endif kfree(data); return 0; } static long rfkill_fop_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct rfkill_data *data = file->private_data; int ret = -ENOTTY; u32 size; if (_IOC_TYPE(cmd) != RFKILL_IOC_MAGIC) return -ENOTTY; mutex_lock(&data->mtx); switch (_IOC_NR(cmd)) { #ifdef CONFIG_RFKILL_INPUT case RFKILL_IOC_NOINPUT: if (!data->input_handler) { if (atomic_inc_return(&rfkill_input_disabled) == 1) printk(KERN_DEBUG "rfkill: input handler disabled\n"); data->input_handler = true; } ret = 0; break; #endif case RFKILL_IOC_MAX_SIZE: if (get_user(size, (__u32 __user *)arg)) { ret = -EFAULT; break; } if (size < RFKILL_EVENT_SIZE_V1 || size > U8_MAX) { ret = -EINVAL; break; } data->max_size = size; ret = 0; break; default: break; } mutex_unlock(&data->mtx); return ret; } static const struct file_operations rfkill_fops = { .owner = THIS_MODULE, .open = rfkill_fop_open, .read = rfkill_fop_read, .write = rfkill_fop_write, .poll = rfkill_fop_poll, .release = rfkill_fop_release, .unlocked_ioctl = rfkill_fop_ioctl, .compat_ioctl = compat_ptr_ioctl, }; #define RFKILL_NAME "rfkill" static struct miscdevice rfkill_miscdev = { .fops = &rfkill_fops, .name = RFKILL_NAME, .minor = RFKILL_MINOR, }; static int __init rfkill_init(void) { int error; rfkill_update_global_state(RFKILL_TYPE_ALL, !rfkill_default_state); error = class_register(&rfkill_class); if (error) goto error_class; error = misc_register(&rfkill_miscdev); if (error) goto error_misc; error = rfkill_global_led_trigger_register(); if (error) goto error_led_trigger; #ifdef CONFIG_RFKILL_INPUT error = rfkill_handler_init(); if (error) goto error_input; #endif return 0; #ifdef CONFIG_RFKILL_INPUT error_input: rfkill_global_led_trigger_unregister(); #endif error_led_trigger: misc_deregister(&rfkill_miscdev); error_misc: class_unregister(&rfkill_class); error_class: return error; } subsys_initcall(rfkill_init); static void __exit rfkill_exit(void) { #ifdef CONFIG_RFKILL_INPUT rfkill_handler_exit(); #endif rfkill_global_led_trigger_unregister(); misc_deregister(&rfkill_miscdev); class_unregister(&rfkill_class); } module_exit(rfkill_exit); MODULE_ALIAS_MISCDEV(RFKILL_MINOR); MODULE_ALIAS("devname:" RFKILL_NAME);
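
/*
 * A minimal userspace sketch of the /dev/rfkill write path handled by
 * rfkill_fop_write() above: a single RFKILL_OP_CHANGE_ALL event
 * soft-blocks every WLAN switch.  Needs write access to /dev/rfkill
 * (typically root); error handling is trimmed for brevity.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <linux/rfkill.h>

int main(void)
{
	struct rfkill_event ev;
	int fd = open("/dev/rfkill", O_WRONLY);

	if (fd < 0) {
		perror("open /dev/rfkill");
		return 1;
	}
	memset(&ev, 0, sizeof(ev));
	ev.op = RFKILL_OP_CHANGE_ALL;	/* dispatched in rfkill_fop_write() */
	ev.type = RFKILL_TYPE_WLAN;	/* only wlan switches */
	ev.soft = 1;			/* request soft block */
	if (write(fd, &ev, sizeof(ev)) < 0)
		perror("write");
	close(fd);
	return 0;
}
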
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * "Ping" sockets * * Based on ipv4/ping.c code. * * Authors: Lorenzo Colitti (IPv6 support) * Vasiliy Kulikov / Openwall (IPv4 implementation, for Linux 2.6), * Pavel Kankovsky (IPv4 implementation, for Linux 2.4.32) */ #include <net/addrconf.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/protocol.h> #include <net/udp.h> #include <net/transp_v6.h> #include <linux/proc_fs.h> #include <linux/bpf-cgroup.h> #include <net/ping.h> /* Compatibility glue so we can support IPv6 when it's compiled as a module */ static int dummy_ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { return -EAFNOSUPPORT; } static void dummy_ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { } static int dummy_icmpv6_err_convert(u8 type, u8 code, int *err) { return -EAFNOSUPPORT; } static void dummy_ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) {} static int dummy_ipv6_chk_addr(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict) { return 0; } static int ping_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { /* This check is replicated from __ip6_datagram_connect() and * intended to prevent BPF program called below from accessing * bytes that are out of the bound specified by user in addr_len. 
*/ if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len); } static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct icmp6hdr user_icmph; int addr_type; struct in6_addr *daddr; int oif = 0; struct flowi6 fl6; int err; struct dst_entry *dst; struct rt6_info *rt; struct pingfakehdr pfh; struct ipcm6_cookie ipc6; err = ping_common_sendmsg(AF_INET6, msg, len, &user_icmph, sizeof(user_icmph)); if (err) return err; memset(&fl6, 0, sizeof(fl6)); if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_in6 *, u, msg->msg_name); if (msg->msg_namelen < sizeof(*u)) return -EINVAL; if (u->sin6_family != AF_INET6) { return -EAFNOSUPPORT; } daddr = &(u->sin6_addr); if (inet6_test_bit(SNDFLOW, sk)) fl6.flowlabel = u->sin6_flowinfo & IPV6_FLOWINFO_MASK; if (__ipv6_addr_needs_scope_id(ipv6_addr_type(daddr))) oif = u->sin6_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = &sk->sk_v6_daddr; fl6.flowlabel = np->flow_label; } if (!oif) oif = sk->sk_bound_dev_if; if (!oif) oif = np->sticky_pktinfo.ipi6_ifindex; if (!oif && ipv6_addr_is_multicast(daddr)) oif = READ_ONCE(np->mcast_oif); else if (!oif) oif = READ_ONCE(np->ucast_oif); addr_type = ipv6_addr_type(daddr); if ((__ipv6_addr_needs_scope_id(addr_type) && !oif) || (addr_type & IPV6_ADDR_MAPPED) || (oif && sk->sk_bound_dev_if && oif != sk->sk_bound_dev_if && l3mdev_master_ifindex_by_index(sock_net(sk), oif) != sk->sk_bound_dev_if)) return -EINVAL; ipcm6_init_sk(&ipc6, sk); fl6.flowi6_oif = oif; if (msg->msg_controllen) { struct ipv6_txoptions opt = {}; opt.tot_len = sizeof(opt); ipc6.opt = &opt; err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6); if (err < 0) return err; /* Changes to txoptions and flow info are not implemented, yet. * Drop the options. 
*/ ipc6.opt = NULL; } fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.saddr = np->saddr; fl6.daddr = *daddr; fl6.flowi6_mark = ipc6.sockc.mark; fl6.flowi6_uid = sk->sk_uid; fl6.fl6_icmp_type = user_icmph.icmp6_type; fl6.fl6_icmp_code = user_icmph.icmp6_code; security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false); if (IS_ERR(dst)) return PTR_ERR(dst); rt = dst_rt6_info(dst); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) fl6.flowi6_oif = READ_ONCE(np->ucast_oif); pfh.icmph.type = user_icmph.icmp6_type; pfh.icmph.code = user_icmph.icmp6_code; pfh.icmph.checksum = 0; pfh.icmph.un.echo.id = inet->inet_sport; pfh.icmph.un.echo.sequence = user_icmph.icmp6_sequence; pfh.msg = msg; pfh.wcheck = 0; pfh.family = AF_INET6; if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); lock_sock(sk); err = ip6_append_data(sk, ping_getfrag, &pfh, len, sizeof(struct icmp6hdr), &ipc6, &fl6, rt, MSG_DONTWAIT); if (err) { ICMP6_INC_STATS(sock_net(sk), rt->rt6i_idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { icmpv6_push_pending_frames(sk, &fl6, (struct icmp6hdr *)&pfh.icmph, len); } release_sock(sk); dst_release(dst); if (err) return err; return len; } struct proto pingv6_prot = { .name = "PINGv6", .owner = THIS_MODULE, .init = ping_init_sock, .close = ping_close, .pre_connect = ping_v6_pre_connect, .connect = ip6_datagram_connect_v6_only, .disconnect = __udp_disconnect, .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .sendmsg = ping_v6_sendmsg, .recvmsg = ping_recvmsg, .bind = ping_bind, .backlog_rcv = ping_queue_rcv_skb, .hash = ping_hash, .unhash = ping_unhash, .get_port = ping_get_port, .put_port = ping_unhash, .obj_size = sizeof(struct raw6_sock), .ipv6_pinfo_offset = offsetof(struct raw6_sock, inet6), }; EXPORT_SYMBOL_GPL(pingv6_prot); static struct inet_protosw pingv6_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_ICMPV6, .prot = &pingv6_prot, .ops = &inet6_sockraw_ops, .flags = INET_PROTOSW_REUSE, }; #ifdef CONFIG_PROC_FS static void *ping_v6_seq_start(struct seq_file *seq, loff_t *pos) { return ping_seq_start(seq, pos, AF_INET6); } static int ping_v6_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, IPV6_SEQ_DGRAM_HEADER); } else { int bucket = ((struct ping_iter_state *) seq->private)->bucket; struct inet_sock *inet = inet_sk((struct sock *)v); __u16 srcp = ntohs(inet->inet_sport); __u16 destp = ntohs(inet->inet_dport); ip6_dgram_sock_seq_show(seq, v, srcp, destp, bucket); } return 0; } static const struct seq_operations ping_v6_seq_ops = { .start = ping_v6_seq_start, .show = ping_v6_seq_show, .next = ping_seq_next, .stop = ping_seq_stop, }; static int __net_init ping_v6_proc_init_net(struct net *net) { if (!proc_create_net("icmp6", 0444, net->proc_net, &ping_v6_seq_ops, sizeof(struct ping_iter_state))) return -ENOMEM; return 0; } static void __net_exit ping_v6_proc_exit_net(struct net *net) { remove_proc_entry("icmp6", net->proc_net); } static struct pernet_operations ping_v6_net_ops = { .init = ping_v6_proc_init_net, .exit = ping_v6_proc_exit_net, }; #endif int __init pingv6_init(void) { #ifdef CONFIG_PROC_FS int ret = register_pernet_subsys(&ping_v6_net_ops); if (ret) return ret; #endif pingv6_ops.ipv6_recv_error = ipv6_recv_error; pingv6_ops.ip6_datagram_recv_common_ctl = ip6_datagram_recv_common_ctl; 
pingv6_ops.ip6_datagram_recv_specific_ctl = ip6_datagram_recv_specific_ctl; pingv6_ops.icmpv6_err_convert = icmpv6_err_convert; pingv6_ops.ipv6_icmp_error = ipv6_icmp_error; pingv6_ops.ipv6_chk_addr = ipv6_chk_addr; return inet6_register_protosw(&pingv6_protosw); } /* This never gets called because it's not possible to unload the ipv6 module, * but just in case. */ void pingv6_exit(void) { pingv6_ops.ipv6_recv_error = dummy_ipv6_recv_error; pingv6_ops.ip6_datagram_recv_common_ctl = dummy_ip6_datagram_recv_ctl; pingv6_ops.ip6_datagram_recv_specific_ctl = dummy_ip6_datagram_recv_ctl; pingv6_ops.icmpv6_err_convert = dummy_icmpv6_err_convert; pingv6_ops.ipv6_icmp_error = dummy_ipv6_icmp_error; pingv6_ops.ipv6_chk_addr = dummy_ipv6_chk_addr; #ifdef CONFIG_PROC_FS unregister_pernet_subsys(&ping_v6_net_ops); #endif inet6_unregister_protosw(&pingv6_protosw); }
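
/*
 * A hedged userspace sketch of the unprivileged ICMPv6 "ping socket"
 * served by ping_v6_sendmsg() above.  The kernel assigns the echo
 * identifier (inet_sport) and computes the checksum; the caller's
 * group must fall inside the net.ipv4.ping_group_range sysctl, which
 * also gates the IPv6 variant.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/icmp6.h>

int main(void)
{
	struct sockaddr_in6 dst = {
		.sin6_family = AF_INET6,
		.sin6_addr = IN6ADDR_LOOPBACK_INIT,
	};
	struct icmp6_hdr req = { .icmp6_type = ICMP6_ECHO_REQUEST };
	char buf[1024];
	int fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_ICMPV6);

	if (fd < 0) {
		perror("socket");	/* likely outside ping_group_range */
		return 1;
	}
	req.icmp6_seq = htons(1);	/* the id is filled in by the kernel */
	if (sendto(fd, &req, sizeof(req), 0,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("sendto");
	else if (recv(fd, buf, sizeof(buf), 0) < 0)
		perror("recv");
	else
		puts("got ICMPv6 echo reply");
	close(fd);
	return 0;
}
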
// SPDX-License-Identifier: GPL-2.0-or-later /* * The AEGIS-128 Authenticated-Encryption Algorithm * Glue for AES-NI + SSE4.1 implementation * * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. */ #include <crypto/internal/aead.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/module.h> #include <asm/fpu/api.h> #include <asm/cpu_device_id.h> #define AEGIS128_BLOCK_ALIGN 16 #define AEGIS128_BLOCK_SIZE 16 #define AEGIS128_NONCE_SIZE 16 #define AEGIS128_STATE_BLOCKS 5 #define AEGIS128_KEY_SIZE 16 #define AEGIS128_MIN_AUTH_SIZE 8 #define AEGIS128_MAX_AUTH_SIZE 16 struct aegis_block { u8 bytes[AEGIS128_BLOCK_SIZE] __aligned(AEGIS128_BLOCK_ALIGN); }; struct aegis_state { struct aegis_block blocks[AEGIS128_STATE_BLOCKS]; }; struct aegis_ctx { struct aegis_block key; }; asmlinkage void aegis128_aesni_init(struct aegis_state *state, const struct aegis_block *key, const u8 iv[AEGIS128_NONCE_SIZE]); asmlinkage void aegis128_aesni_ad(struct aegis_state *state, const u8 *data, unsigned int len); asmlinkage void aegis128_aesni_enc(struct aegis_state *state, const u8 *src, u8 *dst, unsigned int len); asmlinkage void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, u8 *dst, unsigned int len); asmlinkage void aegis128_aesni_enc_tail(struct aegis_state *state, const u8 *src, u8 *dst, unsigned int len); asmlinkage void aegis128_aesni_dec_tail(struct aegis_state *state, const u8 *src, u8 *dst, unsigned int len); asmlinkage void aegis128_aesni_final(struct aegis_state *state, struct aegis_block *tag_xor, unsigned int assoclen, unsigned int cryptlen); static void crypto_aegis128_aesni_process_ad( struct aegis_state *state, struct scatterlist *sg_src, unsigned int assoclen) { struct scatter_walk walk; struct aegis_block buf; unsigned int pos = 0; scatterwalk_start(&walk, sg_src); while (assoclen != 0) { unsigned int size = scatterwalk_next(&walk, assoclen); const u8 *src = walk.addr; unsigned int left = size; if (pos + size >= AEGIS128_BLOCK_SIZE) { if (pos > 0) { unsigned int fill = AEGIS128_BLOCK_SIZE - pos; memcpy(buf.bytes + pos, src, fill); aegis128_aesni_ad(state, buf.bytes, AEGIS128_BLOCK_SIZE); pos = 0; left -= fill; src += fill; } aegis128_aesni_ad(state, src, left & ~(AEGIS128_BLOCK_SIZE - 1)); src += left & ~(AEGIS128_BLOCK_SIZE - 1); left &= AEGIS128_BLOCK_SIZE - 1; } memcpy(buf.bytes + pos, src, left); pos += left; assoclen -= size; 
		scatterwalk_done_src(&walk, size);
	}

	if (pos > 0) {
		memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos);
		aegis128_aesni_ad(state, buf.bytes, AEGIS128_BLOCK_SIZE);
	}
}

static __always_inline void
crypto_aegis128_aesni_process_crypt(struct aegis_state *state,
				    struct skcipher_walk *walk, bool enc)
{
	while (walk->nbytes >= AEGIS128_BLOCK_SIZE) {
		if (enc)
			aegis128_aesni_enc(state, walk->src.virt.addr,
					   walk->dst.virt.addr,
					   round_down(walk->nbytes,
						      AEGIS128_BLOCK_SIZE));
		else
			aegis128_aesni_dec(state, walk->src.virt.addr,
					   walk->dst.virt.addr,
					   round_down(walk->nbytes,
						      AEGIS128_BLOCK_SIZE));
		skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE);
	}

	if (walk->nbytes) {
		if (enc)
			aegis128_aesni_enc_tail(state, walk->src.virt.addr,
						walk->dst.virt.addr,
						walk->nbytes);
		else
			aegis128_aesni_dec_tail(state, walk->src.virt.addr,
						walk->dst.virt.addr,
						walk->nbytes);
		skcipher_walk_done(walk, 0);
	}
}

static struct aegis_ctx *crypto_aegis128_aesni_ctx(struct crypto_aead *aead)
{
	u8 *ctx = crypto_aead_ctx(aead);

	ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx));
	return (void *)ctx;
}

static int crypto_aegis128_aesni_setkey(struct crypto_aead *aead,
					const u8 *key, unsigned int keylen)
{
	struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(aead);

	if (keylen != AEGIS128_KEY_SIZE)
		return -EINVAL;

	memcpy(ctx->key.bytes, key, AEGIS128_KEY_SIZE);
	return 0;
}

static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm,
					     unsigned int authsize)
{
	if (authsize > AEGIS128_MAX_AUTH_SIZE)
		return -EINVAL;
	if (authsize < AEGIS128_MIN_AUTH_SIZE)
		return -EINVAL;
	return 0;
}

static __always_inline void
crypto_aegis128_aesni_crypt(struct aead_request *req,
			    struct aegis_block *tag_xor,
			    unsigned int cryptlen, bool enc)
{
	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
	struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm);
	struct skcipher_walk walk;
	struct aegis_state state;

	if (enc)
		skcipher_walk_aead_encrypt(&walk, req, true);
	else
		skcipher_walk_aead_decrypt(&walk, req, true);

	kernel_fpu_begin();

	aegis128_aesni_init(&state, &ctx->key, req->iv);
	crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen);
	crypto_aegis128_aesni_process_crypt(&state, &walk, enc);
	aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);

	kernel_fpu_end();
}

static int crypto_aegis128_aesni_encrypt(struct aead_request *req)
{
	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
	struct aegis_block tag = {};
	unsigned int authsize = crypto_aead_authsize(tfm);
	unsigned int cryptlen = req->cryptlen;

	crypto_aegis128_aesni_crypt(req, &tag, cryptlen, true);

	scatterwalk_map_and_copy(tag.bytes, req->dst,
				 req->assoclen + cryptlen, authsize, 1);
	return 0;
}

static int crypto_aegis128_aesni_decrypt(struct aead_request *req)
{
	static const struct aegis_block zeros = {};

	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
	struct aegis_block tag;
	unsigned int authsize = crypto_aead_authsize(tfm);
	unsigned int cryptlen = req->cryptlen - authsize;

	scatterwalk_map_and_copy(tag.bytes, req->src,
				 req->assoclen + cryptlen, authsize, 0);

	crypto_aegis128_aesni_crypt(req, &tag, cryptlen, false);

	return crypto_memneq(tag.bytes, zeros.bytes, authsize) ?
		-EBADMSG : 0;
}

static struct aead_alg crypto_aegis128_aesni_alg = {
	.setkey = crypto_aegis128_aesni_setkey,
	.setauthsize = crypto_aegis128_aesni_setauthsize,
	.encrypt = crypto_aegis128_aesni_encrypt,
	.decrypt = crypto_aegis128_aesni_decrypt,
	.ivsize = AEGIS128_NONCE_SIZE,
	.maxauthsize = AEGIS128_MAX_AUTH_SIZE,
	.chunksize = AEGIS128_BLOCK_SIZE,

	.base = {
		.cra_flags = CRYPTO_ALG_INTERNAL,
		.cra_blocksize = 1,
		.cra_ctxsize = sizeof(struct aegis_ctx) +
			       __alignof__(struct aegis_ctx),
		.cra_priority = 400,

		.cra_name = "__aegis128",
		.cra_driver_name = "__aegis128-aesni",

		.cra_module = THIS_MODULE,
	}
};

static struct simd_aead_alg *simd_alg;

static int __init crypto_aegis128_aesni_module_init(void)
{
	if (!boot_cpu_has(X86_FEATURE_XMM4_1) ||
	    !boot_cpu_has(X86_FEATURE_AES) ||
	    !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
		return -ENODEV;

	return simd_register_aeads_compat(&crypto_aegis128_aesni_alg, 1,
					  &simd_alg);
}

static void __exit crypto_aegis128_aesni_module_exit(void)
{
	simd_unregister_aeads(&crypto_aegis128_aesni_alg, 1, &simd_alg);
}

module_init(crypto_aegis128_aesni_module_init);
module_exit(crypto_aegis128_aesni_module_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE4.1 implementation");
MODULE_ALIAS_CRYPTO("aegis128");
MODULE_ALIAS_CRYPTO("aegis128-aesni");
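As a usage note, here is a hedged, minimal in-kernel sketch (function name and buffer layout are invented, not part of this driver) of driving the "aegis128" AEAD registered above through the generic crypto API. The SIMD wrapper is what makes the internal "__aegis128-aesni" implementation reachable from a synchronous caller like this.

#include <crypto/aead.h>
#include <linux/scatterlist.h>
#include <linux/err.h>

static int aegis128_encrypt_demo(u8 *buf, unsigned int assoclen,
				 unsigned int ptlen, const u8 *key,
				 u8 *iv)
{
	struct crypto_aead *tfm;
	struct aead_request *req;
	struct scatterlist sg;
	DECLARE_CRYPTO_WAIT(wait);
	int err;

	tfm = crypto_alloc_aead("aegis128", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_aead_setkey(tfm, key, 16);
	if (err)
		goto out_free_tfm;
	err = crypto_aead_setauthsize(tfm, 16);
	if (err)
		goto out_free_tfm;

	req = aead_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}

	/* buf holds AD || plaintext and has 16 spare bytes for the tag. */
	sg_init_one(&sg, buf, assoclen + ptlen + 16);
	aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
				  crypto_req_done, &wait);
	aead_request_set_ad(req, assoclen);
	aead_request_set_crypt(req, &sg, &sg, ptlen, iv);

	err = crypto_wait_req(crypto_aead_encrypt(req), &wait);

	aead_request_free(req);
out_free_tfm:
	crypto_free_aead(tfm);
	return err;
}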
// SPDX-License-Identifier: GPL-2.0-only

#include "netlink.h"
#include "common.h"
#include "bitset.h"

struct debug_req_info {
	struct ethnl_req_info		base;
};

struct debug_reply_data {
	struct ethnl_reply_data		base;
	u32				msg_mask;
};

#define DEBUG_REPDATA(__reply_base) \
	container_of(__reply_base, struct debug_reply_data, base)

const struct nla_policy ethnl_debug_get_policy[] = {
	[ETHTOOL_A_DEBUG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
};

static int debug_prepare_data(const struct ethnl_req_info *req_base,
			      struct ethnl_reply_data *reply_base,
			      const struct genl_info *info)
{
	struct debug_reply_data *data = DEBUG_REPDATA(reply_base);
	struct net_device *dev = reply_base->dev;
	int ret;

	if (!dev->ethtool_ops->get_msglevel)
		return -EOPNOTSUPP;
	ret = ethnl_ops_begin(dev);
	if (ret < 0)
		return ret;
	data->msg_mask = dev->ethtool_ops->get_msglevel(dev);
	ethnl_ops_complete(dev);

	return 0;
}

static int debug_reply_size(const struct ethnl_req_info *req_base,
			    const struct ethnl_reply_data *reply_base)
{
	const struct debug_reply_data *data = DEBUG_REPDATA(reply_base);
	bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;

	return ethnl_bitset32_size(&data->msg_mask, NULL,
				   NETIF_MSG_CLASS_COUNT,
				   netif_msg_class_names, compact);
}

static int debug_fill_reply(struct sk_buff *skb,
			    const struct ethnl_req_info *req_base,
			    const struct ethnl_reply_data *reply_base)
{
	const struct debug_reply_data *data = DEBUG_REPDATA(reply_base);
	bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;

	return ethnl_put_bitset32(skb, ETHTOOL_A_DEBUG_MSGMASK,
				  &data->msg_mask, NULL,
				  NETIF_MSG_CLASS_COUNT,
				  netif_msg_class_names, compact);
}

/* DEBUG_SET */

const struct nla_policy ethnl_debug_set_policy[] = {
	[ETHTOOL_A_DEBUG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
	[ETHTOOL_A_DEBUG_MSGMASK] = { .type = NLA_NESTED },
};

static int
ethnl_set_debug_validate(struct ethnl_req_info *req_info,
			 struct genl_info *info)
{
	const struct ethtool_ops *ops = req_info->dev->ethtool_ops;

	return ops->get_msglevel && ops->set_msglevel ? 1 : -EOPNOTSUPP;
}

static int
ethnl_set_debug(struct ethnl_req_info *req_info, struct genl_info *info)
{
	struct net_device *dev = req_info->dev;
	struct nlattr **tb = info->attrs;
	bool mod = false;
	u32 msg_mask;
	int ret;

	msg_mask = dev->ethtool_ops->get_msglevel(dev);
	ret = ethnl_update_bitset32(&msg_mask, NETIF_MSG_CLASS_COUNT,
				    tb[ETHTOOL_A_DEBUG_MSGMASK],
				    netif_msg_class_names, info->extack, &mod);
	if (ret < 0 || !mod)
		return ret;

	dev->ethtool_ops->set_msglevel(dev, msg_mask);
	return 1;
}

const struct ethnl_request_ops ethnl_debug_request_ops = {
	.request_cmd		= ETHTOOL_MSG_DEBUG_GET,
	.reply_cmd		= ETHTOOL_MSG_DEBUG_GET_REPLY,
	.hdr_attr		= ETHTOOL_A_DEBUG_HEADER,
	.req_info_size		= sizeof(struct debug_req_info),
	.reply_data_size	= sizeof(struct debug_reply_data),

	.prepare_data		= debug_prepare_data,
	.reply_size		= debug_reply_size,
	.fill_reply		= debug_fill_reply,

	.set_validate		= ethnl_set_debug_validate,
	.set			= ethnl_set_debug,
	.set_ntf_cmd		= ETHTOOL_MSG_DEBUG_NTF,
};
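The DEBUG_GET/DEBUG_SET handlers above only talk to drivers through two ethtool_ops hooks. A hedged driver-side sketch (the "foo" driver name and private struct are invented) of the minimum a driver must provide for these messages to work:

#include <linux/ethtool.h>
#include <linux/netdevice.h>

struct foo_priv {			/* hypothetical private state */
	u32 msg_enable;			/* NETIF_MSG_* bitmask */
};

static u32 foo_get_msglevel(struct net_device *dev)
{
	struct foo_priv *priv = netdev_priv(dev);

	return priv->msg_enable;
}

static void foo_set_msglevel(struct net_device *dev, u32 value)
{
	struct foo_priv *priv = netdev_priv(dev);

	priv->msg_enable = value;
}

static const struct ethtool_ops foo_ethtool_ops = {
	.get_msglevel	= foo_get_msglevel,
	.set_msglevel	= foo_set_msglevel,
};

With these hooks in place, `ethtool <dev>` reports the message level and `ethtool -s <dev> msglvl ...` ends up in ethnl_set_debug() above.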
/*
 * Copyright (c) 2018 Cumulus Networks. All rights reserved.
 * Copyright (c) 2018 David Ahern <dsa@cumulusnetworks.com>
 *
 * This software is licensed under the GNU General Public License Version 2,
 * June 1991 as shown in the file COPYING in the top-level directory of this
 * source tree.
 *
 * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS"
 * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE
 * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
 * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
 */

#include <linux/bitmap.h>
#include <linux/in6.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/rhashtable.h>
#include <linux/spinlock_types.h>
#include <linux/types.h>
#include <net/fib_notifier.h>
#include <net/inet_dscp.h>
#include <net/ip_fib.h>
#include <net/ip6_fib.h>
#include <net/fib_rules.h>
#include <net/net_namespace.h>
#include <net/nexthop.h>
#include <linux/debugfs.h>

#include "netdevsim.h"

struct nsim_fib_entry {
	u64 max;
	atomic64_t num;
};

struct nsim_per_fib_data {
	struct nsim_fib_entry fib;
	struct nsim_fib_entry rules;
};

struct nsim_fib_data {
	struct notifier_block fib_nb;
	struct nsim_per_fib_data ipv4;
	struct nsim_per_fib_data ipv6;
	struct nsim_fib_entry nexthops;
	struct rhashtable fib_rt_ht;
	struct list_head fib_rt_list;
	struct mutex fib_lock;	/* Protects FIB HT and list */
	struct notifier_block nexthop_nb;
	struct rhashtable nexthop_ht;
	struct devlink *devlink;
	struct work_struct fib_event_work;
	struct work_struct fib_flush_work;
	struct list_head fib_event_queue;
	spinlock_t fib_event_queue_lock; /* Protects fib event queue list */
	struct mutex nh_lock;	/* Protects NH HT */
	struct dentry *ddir;
	bool fail_route_offload;
	bool fail_res_nexthop_group_replace;
	bool fail_nexthop_bucket_replace;
	bool fail_route_delete;
};

struct nsim_fib_rt_key {
	unsigned char addr[sizeof(struct in6_addr)];
	unsigned char prefix_len;
	int family;
	u32 tb_id;
};

struct nsim_fib_rt {
	struct nsim_fib_rt_key key;
	struct rhash_head ht_node;
	struct list_head list;	/* Member of fib_rt_list */
};

struct nsim_fib4_rt {
	struct nsim_fib_rt common;
	struct fib_info *fi;
	dscp_t dscp;
	u8 type;
};

struct nsim_fib6_rt {
	struct nsim_fib_rt common;
	struct list_head nh_list;
	unsigned int nhs;
};

struct nsim_fib6_rt_nh {
	struct list_head list;	/* Member of nh_list */
	struct fib6_info *rt;
};

struct nsim_fib6_event {
	struct fib6_info **rt_arr;
	unsigned int nrt6;
};

struct nsim_fib_event {
	struct list_head list;	/* node in fib queue */
	union {
		struct fib_entry_notifier_info fen_info;
		struct nsim_fib6_event fib6_event;
	};
	struct nsim_fib_data *data;
	unsigned long event;
	int family;
};

static const struct rhashtable_params nsim_fib_rt_ht_params = {
	.key_offset = offsetof(struct nsim_fib_rt, key),
	.head_offset = offsetof(struct nsim_fib_rt, ht_node),
	.key_len = sizeof(struct nsim_fib_rt_key),
	.automatic_shrinking = true,
};

struct nsim_nexthop {
	struct rhash_head ht_node;
	u64 occ;
	u32 id;
	bool is_resilient;
};
static const struct rhashtable_params nsim_nexthop_ht_params = {
	.key_offset = offsetof(struct nsim_nexthop, id),
	.head_offset = offsetof(struct nsim_nexthop, ht_node),
	.key_len = sizeof(u32),
	.automatic_shrinking = true,
};

u64 nsim_fib_get_val(struct nsim_fib_data *fib_data,
		     enum nsim_resource_id res_id, bool max)
{
	struct nsim_fib_entry *entry;

	switch (res_id) {
	case NSIM_RESOURCE_IPV4_FIB:
		entry = &fib_data->ipv4.fib;
		break;
	case NSIM_RESOURCE_IPV4_FIB_RULES:
		entry = &fib_data->ipv4.rules;
		break;
	case NSIM_RESOURCE_IPV6_FIB:
		entry = &fib_data->ipv6.fib;
		break;
	case NSIM_RESOURCE_IPV6_FIB_RULES:
		entry = &fib_data->ipv6.rules;
		break;
	case NSIM_RESOURCE_NEXTHOPS:
		entry = &fib_data->nexthops;
		break;
	default:
		return 0;
	}

	return max ? entry->max : atomic64_read(&entry->num);
}

static void nsim_fib_set_max(struct nsim_fib_data *fib_data,
			     enum nsim_resource_id res_id, u64 val)
{
	struct nsim_fib_entry *entry;

	switch (res_id) {
	case NSIM_RESOURCE_IPV4_FIB:
		entry = &fib_data->ipv4.fib;
		break;
	case NSIM_RESOURCE_IPV4_FIB_RULES:
		entry = &fib_data->ipv4.rules;
		break;
	case NSIM_RESOURCE_IPV6_FIB:
		entry = &fib_data->ipv6.fib;
		break;
	case NSIM_RESOURCE_IPV6_FIB_RULES:
		entry = &fib_data->ipv6.rules;
		break;
	case NSIM_RESOURCE_NEXTHOPS:
		entry = &fib_data->nexthops;
		break;
	default:
		WARN_ON(1);
		return;
	}
	entry->max = val;
}

static int nsim_fib_rule_account(struct nsim_fib_entry *entry, bool add,
				 struct netlink_ext_ack *extack)
{
	int err = 0;

	if (add) {
		if (!atomic64_add_unless(&entry->num, 1, entry->max)) {
			err = -ENOSPC;
			NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported fib rule entries");
		}
	} else {
		atomic64_dec_if_positive(&entry->num);
	}

	return err;
}

static int nsim_fib_rule_event(struct nsim_fib_data *data,
			       struct fib_notifier_info *info, bool add)
{
	struct netlink_ext_ack *extack = info->extack;
	int err = 0;

	switch (info->family) {
	case AF_INET:
		err = nsim_fib_rule_account(&data->ipv4.rules, add, extack);
		break;
	case AF_INET6:
		err = nsim_fib_rule_account(&data->ipv6.rules, add, extack);
		break;
	}

	return err;
}

static int nsim_fib_account(struct nsim_fib_entry *entry, bool add)
{
	int err = 0;

	if (add) {
		if (!atomic64_add_unless(&entry->num, 1, entry->max))
			err = -ENOSPC;
	} else {
		atomic64_dec_if_positive(&entry->num);
	}

	return err;
}

static void nsim_fib_rt_init(struct nsim_fib_data *data,
			     struct nsim_fib_rt *fib_rt, const void *addr,
			     size_t addr_len, unsigned int prefix_len,
			     int family, u32 tb_id)
{
	memcpy(fib_rt->key.addr, addr, addr_len);
	fib_rt->key.prefix_len = prefix_len;
	fib_rt->key.family = family;
	fib_rt->key.tb_id = tb_id;
	list_add(&fib_rt->list, &data->fib_rt_list);
}

static void nsim_fib_rt_fini(struct nsim_fib_rt *fib_rt)
{
	list_del(&fib_rt->list);
}

static struct nsim_fib_rt *nsim_fib_rt_lookup(struct rhashtable *fib_rt_ht,
					      const void *addr,
					      size_t addr_len,
					      unsigned int prefix_len,
					      int family, u32 tb_id)
{
	struct nsim_fib_rt_key key;

	memset(&key, 0, sizeof(key));
	memcpy(key.addr, addr, addr_len);
	key.prefix_len = prefix_len;
	key.family = family;
	key.tb_id = tb_id;

	return rhashtable_lookup_fast(fib_rt_ht, &key, nsim_fib_rt_ht_params);
}

static struct nsim_fib4_rt *
nsim_fib4_rt_create(struct nsim_fib_data *data,
		    struct fib_entry_notifier_info *fen_info)
{
	struct nsim_fib4_rt *fib4_rt;

	fib4_rt = kzalloc(sizeof(*fib4_rt), GFP_KERNEL);
	if (!fib4_rt)
		return NULL;

	nsim_fib_rt_init(data, &fib4_rt->common, &fen_info->dst, sizeof(u32),
			 fen_info->dst_len, AF_INET, fen_info->tb_id);

	fib4_rt->fi = fen_info->fi;
	fib_info_hold(fib4_rt->fi);
	fib4_rt->dscp = fen_info->dscp;
	fib4_rt->type = fen_info->type;

	return fib4_rt;
}

static void nsim_fib4_rt_destroy(struct nsim_fib4_rt *fib4_rt)
{
	fib_info_put(fib4_rt->fi);
	nsim_fib_rt_fini(&fib4_rt->common);
	kfree(fib4_rt);
}

static struct nsim_fib4_rt *
nsim_fib4_rt_lookup(struct rhashtable *fib_rt_ht,
		    const struct fib_entry_notifier_info *fen_info)
{
	struct nsim_fib_rt *fib_rt;

	fib_rt = nsim_fib_rt_lookup(fib_rt_ht, &fen_info->dst, sizeof(u32),
				    fen_info->dst_len, AF_INET,
				    fen_info->tb_id);
	if (!fib_rt)
		return NULL;

	return container_of(fib_rt, struct nsim_fib4_rt, common);
}

static void
nsim_fib4_rt_offload_failed_flag_set(struct net *net,
				     struct fib_entry_notifier_info *fen_info)
{
	u32 *p_dst = (u32 *)&fen_info->dst;
	struct fib_rt_info fri;

	fri.fi = fen_info->fi;
	fri.tb_id = fen_info->tb_id;
	fri.dst = cpu_to_be32(*p_dst);
	fri.dst_len = fen_info->dst_len;
	fri.dscp = fen_info->dscp;
	fri.type = fen_info->type;
	fri.offload = false;
	fri.trap = false;
	fri.offload_failed = true;
	fib_alias_hw_flags_set(net, &fri);
}

static void nsim_fib4_rt_hw_flags_set(struct net *net,
				      const struct nsim_fib4_rt *fib4_rt,
				      bool trap)
{
	u32 *p_dst = (u32 *)fib4_rt->common.key.addr;
	int dst_len = fib4_rt->common.key.prefix_len;
	struct fib_rt_info fri;

	fri.fi = fib4_rt->fi;
	fri.tb_id = fib4_rt->common.key.tb_id;
	fri.dst = cpu_to_be32(*p_dst);
	fri.dst_len = dst_len;
	fri.dscp = fib4_rt->dscp;
	fri.type = fib4_rt->type;
	fri.offload = false;
	fri.trap = trap;
	fri.offload_failed = false;
	fib_alias_hw_flags_set(net, &fri);
}

static int nsim_fib4_rt_add(struct nsim_fib_data *data,
			    struct nsim_fib4_rt *fib4_rt)
{
	struct net *net = devlink_net(data->devlink);
	int err;

	err = rhashtable_insert_fast(&data->fib_rt_ht,
				     &fib4_rt->common.ht_node,
				     nsim_fib_rt_ht_params);
	if (err)
		goto err_fib_dismiss;

	/* Simulate hardware programming latency. */
	msleep(1);
	nsim_fib4_rt_hw_flags_set(net, fib4_rt, true);

	return 0;

err_fib_dismiss:
	/* Drop the accounting that was increased from the notification
	 * context when FIB_EVENT_ENTRY_REPLACE was triggered.
	 */
	nsim_fib_account(&data->ipv4.fib, false);
	return err;
}

static int nsim_fib4_rt_replace(struct nsim_fib_data *data,
				struct nsim_fib4_rt *fib4_rt,
				struct nsim_fib4_rt *fib4_rt_old)
{
	struct net *net = devlink_net(data->devlink);
	int err;

	/* We are replacing a route, so need to remove the accounting which
	 * was increased when FIB_EVENT_ENTRY_REPLACE was triggered.
	 */
	err = nsim_fib_account(&data->ipv4.fib, false);
	if (err)
		return err;

	err = rhashtable_replace_fast(&data->fib_rt_ht,
				      &fib4_rt_old->common.ht_node,
				      &fib4_rt->common.ht_node,
				      nsim_fib_rt_ht_params);
	if (err)
		return err;

	msleep(1);
	nsim_fib4_rt_hw_flags_set(net, fib4_rt, true);
	nsim_fib4_rt_hw_flags_set(net, fib4_rt_old, false);

	nsim_fib4_rt_destroy(fib4_rt_old);

	return 0;
}

static int nsim_fib4_rt_insert(struct nsim_fib_data *data,
			       struct fib_entry_notifier_info *fen_info)
{
	struct nsim_fib4_rt *fib4_rt, *fib4_rt_old;
	int err;

	if (data->fail_route_offload) {
		/* For testing purposes, user set debugfs fail_route_offload
		 * value to true. Simulate hardware programming latency and
		 * then fail.
		 */
		msleep(1);
		return -EINVAL;
	}

	fib4_rt = nsim_fib4_rt_create(data, fen_info);
	if (!fib4_rt)
		return -ENOMEM;

	fib4_rt_old = nsim_fib4_rt_lookup(&data->fib_rt_ht, fen_info);
	if (!fib4_rt_old)
		err = nsim_fib4_rt_add(data, fib4_rt);
	else
		err = nsim_fib4_rt_replace(data, fib4_rt, fib4_rt_old);

	if (err)
		nsim_fib4_rt_destroy(fib4_rt);

	return err;
}

static void nsim_fib4_rt_remove(struct nsim_fib_data *data,
				const struct fib_entry_notifier_info *fen_info)
{
	struct nsim_fib4_rt *fib4_rt;

	fib4_rt = nsim_fib4_rt_lookup(&data->fib_rt_ht, fen_info);
	if (!fib4_rt)
		return;

	rhashtable_remove_fast(&data->fib_rt_ht, &fib4_rt->common.ht_node,
			       nsim_fib_rt_ht_params);
	nsim_fib4_rt_destroy(fib4_rt);
}

static int nsim_fib4_event(struct nsim_fib_data *data,
			   struct fib_entry_notifier_info *fen_info,
			   unsigned long event)
{
	int err = 0;

	switch (event) {
	case FIB_EVENT_ENTRY_REPLACE:
		err = nsim_fib4_rt_insert(data, fen_info);
		if (err) {
			struct net *net = devlink_net(data->devlink);

			nsim_fib4_rt_offload_failed_flag_set(net, fen_info);
		}
		break;
	case FIB_EVENT_ENTRY_DEL:
		nsim_fib4_rt_remove(data, fen_info);
		break;
	default:
		break;
	}

	return err;
}

static struct nsim_fib6_rt_nh *
nsim_fib6_rt_nh_find(const struct nsim_fib6_rt *fib6_rt,
		     const struct fib6_info *rt)
{
	struct nsim_fib6_rt_nh *fib6_rt_nh;

	list_for_each_entry(fib6_rt_nh, &fib6_rt->nh_list, list) {
		if (fib6_rt_nh->rt == rt)
			return fib6_rt_nh;
	}

	return NULL;
}

static int nsim_fib6_rt_nh_add(struct nsim_fib6_rt *fib6_rt,
			       struct fib6_info *rt)
{
	struct nsim_fib6_rt_nh *fib6_rt_nh;

	fib6_rt_nh = kzalloc(sizeof(*fib6_rt_nh), GFP_KERNEL);
	if (!fib6_rt_nh)
		return -ENOMEM;

	fib6_info_hold(rt);
	fib6_rt_nh->rt = rt;
	list_add_tail(&fib6_rt_nh->list, &fib6_rt->nh_list);
	fib6_rt->nhs++;

	return 0;
}

#if IS_ENABLED(CONFIG_IPV6)
static void nsim_rt6_release(struct fib6_info *rt)
{
	fib6_info_release(rt);
}
#else
static void nsim_rt6_release(struct fib6_info *rt)
{
}
#endif

static void nsim_fib6_rt_nh_del(struct nsim_fib6_rt *fib6_rt,
				const struct fib6_info *rt)
{
	struct nsim_fib6_rt_nh *fib6_rt_nh;

	fib6_rt_nh = nsim_fib6_rt_nh_find(fib6_rt, rt);
	if (!fib6_rt_nh)
		return;

	fib6_rt->nhs--;
	list_del(&fib6_rt_nh->list);
	nsim_rt6_release(fib6_rt_nh->rt);
	kfree(fib6_rt_nh);
}

static struct nsim_fib6_rt *
nsim_fib6_rt_create(struct nsim_fib_data *data,
		    struct fib6_info **rt_arr, unsigned int nrt6)
{
	struct fib6_info *rt = rt_arr[0];
	struct nsim_fib6_rt *fib6_rt;
	int i = 0;
	int err;

	fib6_rt = kzalloc(sizeof(*fib6_rt), GFP_KERNEL);
	if (!fib6_rt)
		return ERR_PTR(-ENOMEM);

	nsim_fib_rt_init(data, &fib6_rt->common, &rt->fib6_dst.addr,
			 sizeof(rt->fib6_dst.addr), rt->fib6_dst.plen,
			 AF_INET6, rt->fib6_table->tb6_id);

	/* We consider a multipath IPv6 route as one entry, but it can be made
	 * up from several fib6_info structs (one for each nexthop), so we
	 * add them all to the same list under the entry.
	 */
	INIT_LIST_HEAD(&fib6_rt->nh_list);

	for (i = 0; i < nrt6; i++) {
		err = nsim_fib6_rt_nh_add(fib6_rt, rt_arr[i]);
		if (err)
			goto err_fib6_rt_nh_del;
	}

	return fib6_rt;

err_fib6_rt_nh_del:
	for (i--; i >= 0; i--) {
		nsim_fib6_rt_nh_del(fib6_rt, rt_arr[i]);
	}
	nsim_fib_rt_fini(&fib6_rt->common);
	kfree(fib6_rt);
	return ERR_PTR(err);
}

static void nsim_fib6_rt_destroy(struct nsim_fib6_rt *fib6_rt)
{
	struct nsim_fib6_rt_nh *iter, *tmp;

	list_for_each_entry_safe(iter, tmp, &fib6_rt->nh_list, list)
		nsim_fib6_rt_nh_del(fib6_rt, iter->rt);
	WARN_ON_ONCE(!list_empty(&fib6_rt->nh_list));
	nsim_fib_rt_fini(&fib6_rt->common);
	kfree(fib6_rt);
}

static struct nsim_fib6_rt *
nsim_fib6_rt_lookup(struct rhashtable *fib_rt_ht, const struct fib6_info *rt)
{
	struct nsim_fib_rt *fib_rt;

	fib_rt = nsim_fib_rt_lookup(fib_rt_ht, &rt->fib6_dst.addr,
				    sizeof(rt->fib6_dst.addr),
				    rt->fib6_dst.plen, AF_INET6,
				    rt->fib6_table->tb6_id);
	if (!fib_rt)
		return NULL;

	return container_of(fib_rt, struct nsim_fib6_rt, common);
}

static int nsim_fib6_rt_append(struct nsim_fib_data *data,
			       struct nsim_fib6_event *fib6_event)
{
	struct fib6_info *rt = fib6_event->rt_arr[0];
	struct nsim_fib6_rt *fib6_rt;
	int i, err;

	if (data->fail_route_offload) {
		/* For testing purposes, user set debugfs fail_route_offload
		 * value to true. Simulate hardware programming latency and
		 * then fail.
		 */
		msleep(1);
		return -EINVAL;
	}

	fib6_rt = nsim_fib6_rt_lookup(&data->fib_rt_ht, rt);
	if (!fib6_rt)
		return -EINVAL;

	for (i = 0; i < fib6_event->nrt6; i++) {
		err = nsim_fib6_rt_nh_add(fib6_rt, fib6_event->rt_arr[i]);
		if (err)
			goto err_fib6_rt_nh_del;

		WRITE_ONCE(fib6_event->rt_arr[i]->trap, true);
	}

	return 0;

err_fib6_rt_nh_del:
	for (i--; i >= 0; i--) {
		WRITE_ONCE(fib6_event->rt_arr[i]->trap, false);
		nsim_fib6_rt_nh_del(fib6_rt, fib6_event->rt_arr[i]);
	}
	return err;
}

#if IS_ENABLED(CONFIG_IPV6)
static void nsim_fib6_rt_offload_failed_flag_set(struct nsim_fib_data *data,
						 struct fib6_info **rt_arr,
						 unsigned int nrt6)
{
	struct net *net = devlink_net(data->devlink);
	int i;

	for (i = 0; i < nrt6; i++)
		fib6_info_hw_flags_set(net, rt_arr[i], false, false, true);
}
#else
static void nsim_fib6_rt_offload_failed_flag_set(struct nsim_fib_data *data,
						 struct fib6_info **rt_arr,
						 unsigned int nrt6)
{
}
#endif

#if IS_ENABLED(CONFIG_IPV6)
static void nsim_fib6_rt_hw_flags_set(struct nsim_fib_data *data,
				      const struct nsim_fib6_rt *fib6_rt,
				      bool trap)
{
	struct net *net = devlink_net(data->devlink);
	struct nsim_fib6_rt_nh *fib6_rt_nh;

	list_for_each_entry(fib6_rt_nh, &fib6_rt->nh_list, list)
		fib6_info_hw_flags_set(net, fib6_rt_nh->rt, false, trap,
				       false);
}
#else
static void nsim_fib6_rt_hw_flags_set(struct nsim_fib_data *data,
				      const struct nsim_fib6_rt *fib6_rt,
				      bool trap)
{
}
#endif

static int nsim_fib6_rt_add(struct nsim_fib_data *data,
			    struct nsim_fib6_rt *fib6_rt)
{
	int err;

	err = rhashtable_insert_fast(&data->fib_rt_ht,
				     &fib6_rt->common.ht_node,
				     nsim_fib_rt_ht_params);
	if (err)
		goto err_fib_dismiss;

	msleep(1);
	nsim_fib6_rt_hw_flags_set(data, fib6_rt, true);

	return 0;

err_fib_dismiss:
	/* Drop the accounting that was increased from the notification
	 * context when FIB_EVENT_ENTRY_REPLACE was triggered.
	 */
	nsim_fib_account(&data->ipv6.fib, false);
	return err;
}

static int nsim_fib6_rt_replace(struct nsim_fib_data *data,
				struct nsim_fib6_rt *fib6_rt,
				struct nsim_fib6_rt *fib6_rt_old)
{
	int err;

	/* We are replacing a route, so need to remove the accounting which
	 * was increased when FIB_EVENT_ENTRY_REPLACE was triggered.
	 */
	err = nsim_fib_account(&data->ipv6.fib, false);
	if (err)
		return err;

	err = rhashtable_replace_fast(&data->fib_rt_ht,
				      &fib6_rt_old->common.ht_node,
				      &fib6_rt->common.ht_node,
				      nsim_fib_rt_ht_params);
	if (err)
		return err;

	msleep(1);
	nsim_fib6_rt_hw_flags_set(data, fib6_rt, true);
	nsim_fib6_rt_hw_flags_set(data, fib6_rt_old, false);

	nsim_fib6_rt_destroy(fib6_rt_old);

	return 0;
}

static int nsim_fib6_rt_insert(struct nsim_fib_data *data,
			       struct nsim_fib6_event *fib6_event)
{
	struct fib6_info *rt = fib6_event->rt_arr[0];
	struct nsim_fib6_rt *fib6_rt, *fib6_rt_old;
	int err;

	if (data->fail_route_offload) {
		/* For testing purposes, user set debugfs fail_route_offload
		 * value to true. Simulate hardware programming latency and
		 * then fail.
		 */
		msleep(1);
		return -EINVAL;
	}

	fib6_rt = nsim_fib6_rt_create(data, fib6_event->rt_arr,
				      fib6_event->nrt6);
	if (IS_ERR(fib6_rt))
		return PTR_ERR(fib6_rt);

	fib6_rt_old = nsim_fib6_rt_lookup(&data->fib_rt_ht, rt);
	if (!fib6_rt_old)
		err = nsim_fib6_rt_add(data, fib6_rt);
	else
		err = nsim_fib6_rt_replace(data, fib6_rt, fib6_rt_old);

	if (err)
		nsim_fib6_rt_destroy(fib6_rt);

	return err;
}

static void nsim_fib6_rt_remove(struct nsim_fib_data *data,
				struct nsim_fib6_event *fib6_event)
{
	struct fib6_info *rt = fib6_event->rt_arr[0];
	struct nsim_fib6_rt *fib6_rt;
	int i;

	/* Multipath routes are first added to the FIB trie and only then
	 * notified. If we vetoed the addition, we will get a delete
	 * notification for a route we do not have. Therefore, do not warn if
	 * route was not found.
	 */
	fib6_rt = nsim_fib6_rt_lookup(&data->fib_rt_ht, rt);
	if (!fib6_rt)
		return;

	/* If not all the nexthops are deleted, then only reduce the nexthop
	 * group.
	 */
	if (fib6_event->nrt6 != fib6_rt->nhs) {
		for (i = 0; i < fib6_event->nrt6; i++)
			nsim_fib6_rt_nh_del(fib6_rt, fib6_event->rt_arr[i]);
		return;
	}

	rhashtable_remove_fast(&data->fib_rt_ht, &fib6_rt->common.ht_node,
			       nsim_fib_rt_ht_params);
	nsim_fib6_rt_destroy(fib6_rt);
}

static int nsim_fib6_event_init(struct nsim_fib6_event *fib6_event,
				struct fib6_entry_notifier_info *fen6_info)
{
	struct fib6_info *rt = fen6_info->rt;
	struct fib6_info **rt_arr;
	struct fib6_info *iter;
	unsigned int nrt6;
	int i = 0;

	nrt6 = fen6_info->nsiblings + 1;

	rt_arr = kcalloc(nrt6, sizeof(struct fib6_info *), GFP_ATOMIC);
	if (!rt_arr)
		return -ENOMEM;

	fib6_event->rt_arr = rt_arr;
	fib6_event->nrt6 = nrt6;

	rt_arr[0] = rt;
	fib6_info_hold(rt);

	if (!fen6_info->nsiblings)
		return 0;

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
		if (i == fen6_info->nsiblings)
			break;

		rt_arr[i + 1] = iter;
		fib6_info_hold(iter);
		i++;
	}
	WARN_ON_ONCE(i != fen6_info->nsiblings);

	return 0;
}

static void nsim_fib6_event_fini(struct nsim_fib6_event *fib6_event)
{
	int i;

	for (i = 0; i < fib6_event->nrt6; i++)
		nsim_rt6_release(fib6_event->rt_arr[i]);
	kfree(fib6_event->rt_arr);
}

static int nsim_fib6_event(struct nsim_fib_data *data,
			   struct nsim_fib6_event *fib6_event,
			   unsigned long event)
{
	int err;

	if (fib6_event->rt_arr[0]->fib6_src.plen)
		return 0;

	switch (event) {
	case FIB_EVENT_ENTRY_REPLACE:
		err = nsim_fib6_rt_insert(data, fib6_event);
		if (err)
			goto err_rt_offload_failed_flag_set;
		break;
	case FIB_EVENT_ENTRY_APPEND:
		err = nsim_fib6_rt_append(data, fib6_event);
		if (err)
			goto err_rt_offload_failed_flag_set;
		break;
	case FIB_EVENT_ENTRY_DEL:
		nsim_fib6_rt_remove(data, fib6_event);
		break;
	default:
		break;
	}

	return 0;

err_rt_offload_failed_flag_set:
	nsim_fib6_rt_offload_failed_flag_set(data, fib6_event->rt_arr,
					     fib6_event->nrt6);
	return err;
}
static void nsim_fib_event(struct nsim_fib_event *fib_event)
{
	switch (fib_event->family) {
	case AF_INET:
		nsim_fib4_event(fib_event->data, &fib_event->fen_info,
				fib_event->event);
		fib_info_put(fib_event->fen_info.fi);
		break;
	case AF_INET6:
		nsim_fib6_event(fib_event->data, &fib_event->fib6_event,
				fib_event->event);
		nsim_fib6_event_fini(&fib_event->fib6_event);
		break;
	}
}

static int nsim_fib4_prepare_event(struct fib_notifier_info *info,
				   struct nsim_fib_event *fib_event,
				   unsigned long event)
{
	struct nsim_fib_data *data = fib_event->data;
	struct fib_entry_notifier_info *fen_info;
	struct netlink_ext_ack *extack;
	int err = 0;

	fen_info = container_of(info, struct fib_entry_notifier_info, info);
	fib_event->fen_info = *fen_info;
	extack = info->extack;

	switch (event) {
	case FIB_EVENT_ENTRY_REPLACE:
		err = nsim_fib_account(&data->ipv4.fib, true);
		if (err) {
			NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported fib entries");
			return err;
		}
		break;
	case FIB_EVENT_ENTRY_DEL:
		if (data->fail_route_delete) {
			NL_SET_ERR_MSG_MOD(extack, "Failed to process route deletion");
			return -EINVAL;
		}
		nsim_fib_account(&data->ipv4.fib, false);
		break;
	}

	/* Take reference on fib_info to prevent it from being
	 * freed while event is queued. Release it afterwards.
	 */
	fib_info_hold(fib_event->fen_info.fi);

	return 0;
}

static int nsim_fib6_prepare_event(struct fib_notifier_info *info,
				   struct nsim_fib_event *fib_event,
				   unsigned long event)
{
	struct nsim_fib_data *data = fib_event->data;
	struct fib6_entry_notifier_info *fen6_info;
	struct netlink_ext_ack *extack;
	int err = 0;

	fen6_info = container_of(info, struct fib6_entry_notifier_info,
				 info);

	err = nsim_fib6_event_init(&fib_event->fib6_event, fen6_info);
	if (err)
		return err;

	extack = info->extack;
	switch (event) {
	case FIB_EVENT_ENTRY_REPLACE:
		err = nsim_fib_account(&data->ipv6.fib, true);
		if (err) {
			NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported fib entries");
			goto err_fib6_event_fini;
		}
		break;
	case FIB_EVENT_ENTRY_DEL:
		if (data->fail_route_delete) {
			err = -EINVAL;
			NL_SET_ERR_MSG_MOD(extack, "Failed to process route deletion");
			goto err_fib6_event_fini;
		}
		nsim_fib_account(&data->ipv6.fib, false);
		break;
	}

	return 0;

err_fib6_event_fini:
	nsim_fib6_event_fini(&fib_event->fib6_event);
	return err;
}

static int nsim_fib_event_schedule_work(struct nsim_fib_data *data,
					struct fib_notifier_info *info,
					unsigned long event)
{
	struct nsim_fib_event *fib_event;
	int err;

	if (info->family != AF_INET && info->family != AF_INET6)
		/* netdevsim does not support 'RTNL_FAMILY_IP6MR' and
		 * 'RTNL_FAMILY_IPMR' and should ignore them.
		 */
		return NOTIFY_DONE;

	fib_event = kzalloc(sizeof(*fib_event), GFP_ATOMIC);
	if (!fib_event)
		goto err_fib_event_alloc;

	fib_event->data = data;
	fib_event->event = event;
	fib_event->family = info->family;

	switch (info->family) {
	case AF_INET:
		err = nsim_fib4_prepare_event(info, fib_event, event);
		break;
	case AF_INET6:
		err = nsim_fib6_prepare_event(info, fib_event, event);
		break;
	}

	if (err)
		goto err_fib_prepare_event;

	/* Enqueue the event and trigger the work */
	spin_lock_bh(&data->fib_event_queue_lock);
	list_add_tail(&fib_event->list, &data->fib_event_queue);
	spin_unlock_bh(&data->fib_event_queue_lock);
	schedule_work(&data->fib_event_work);

	return NOTIFY_DONE;

err_fib_prepare_event:
	kfree(fib_event);
err_fib_event_alloc:
	if (event == FIB_EVENT_ENTRY_DEL)
		schedule_work(&data->fib_flush_work);
	return NOTIFY_BAD;
}

static int nsim_fib_event_nb(struct notifier_block *nb, unsigned long event,
			     void *ptr)
{
	struct nsim_fib_data *data = container_of(nb, struct nsim_fib_data,
						  fib_nb);
	struct fib_notifier_info *info = ptr;
	int err;

	switch (event) {
	case FIB_EVENT_RULE_ADD:
	case FIB_EVENT_RULE_DEL:
		err = nsim_fib_rule_event(data, info,
					  event == FIB_EVENT_RULE_ADD);
		return notifier_from_errno(err);
	case FIB_EVENT_ENTRY_REPLACE:
	case FIB_EVENT_ENTRY_APPEND:
	case FIB_EVENT_ENTRY_DEL:
		return nsim_fib_event_schedule_work(data, info, event);
	}

	return NOTIFY_DONE;
}

static void nsim_fib4_rt_free(struct nsim_fib_rt *fib_rt,
			      struct nsim_fib_data *data)
{
	struct devlink *devlink = data->devlink;
	struct nsim_fib4_rt *fib4_rt;

	fib4_rt = container_of(fib_rt, struct nsim_fib4_rt, common);
	nsim_fib4_rt_hw_flags_set(devlink_net(devlink), fib4_rt, false);
	nsim_fib_account(&data->ipv4.fib, false);
	nsim_fib4_rt_destroy(fib4_rt);
}

static void nsim_fib6_rt_free(struct nsim_fib_rt *fib_rt,
			      struct nsim_fib_data *data)
{
	struct nsim_fib6_rt *fib6_rt;

	fib6_rt = container_of(fib_rt, struct nsim_fib6_rt, common);
	nsim_fib6_rt_hw_flags_set(data, fib6_rt, false);
	nsim_fib_account(&data->ipv6.fib, false);
	nsim_fib6_rt_destroy(fib6_rt);
}

static void nsim_fib_rt_free(void *ptr, void *arg)
{
	struct nsim_fib_rt *fib_rt = ptr;
	struct nsim_fib_data *data = arg;

	switch (fib_rt->key.family) {
	case AF_INET:
		nsim_fib4_rt_free(fib_rt, data);
		break;
	case AF_INET6:
		nsim_fib6_rt_free(fib_rt, data);
		break;
	default:
		WARN_ON_ONCE(1);
	}
}

/* inconsistent dump, trying again */
static void nsim_fib_dump_inconsistent(struct notifier_block *nb)
{
	struct nsim_fib_data *data = container_of(nb, struct nsim_fib_data,
						  fib_nb);
	struct nsim_fib_rt *fib_rt, *fib_rt_tmp;

	/* Flush the work to make sure there is no race with notifications.
	 */
	flush_work(&data->fib_event_work);

	/* The notifier block is still not registered, so we do not need to
	 * take any locks here.
	 */
	list_for_each_entry_safe(fib_rt, fib_rt_tmp, &data->fib_rt_list,
				 list) {
		rhashtable_remove_fast(&data->fib_rt_ht, &fib_rt->ht_node,
				       nsim_fib_rt_ht_params);
		nsim_fib_rt_free(fib_rt, data);
	}

	atomic64_set(&data->ipv4.rules.num, 0ULL);
	atomic64_set(&data->ipv6.rules.num, 0ULL);
}

static struct nsim_nexthop *nsim_nexthop_create(struct nsim_fib_data *data,
						struct nh_notifier_info *info)
{
	struct nsim_nexthop *nexthop;
	u64 occ = 0;
	int i;

	nexthop = kzalloc(sizeof(*nexthop), GFP_KERNEL);
	if (!nexthop)
		return ERR_PTR(-ENOMEM);

	nexthop->id = info->id;

	/* Determine the number of nexthop entries the new nexthop will
	 * occupy.
	 */
	switch (info->type) {
	case NH_NOTIFIER_INFO_TYPE_SINGLE:
		occ = 1;
		break;
	case NH_NOTIFIER_INFO_TYPE_GRP:
		for (i = 0; i < info->nh_grp->num_nh; i++)
			occ += info->nh_grp->nh_entries[i].weight;
		break;
	case NH_NOTIFIER_INFO_TYPE_RES_TABLE:
		occ = info->nh_res_table->num_nh_buckets;
		nexthop->is_resilient = true;
		break;
	default:
		NL_SET_ERR_MSG_MOD(info->extack, "Unsupported nexthop type");
		kfree(nexthop);
		return ERR_PTR(-EOPNOTSUPP);
	}

	nexthop->occ = occ;
	return nexthop;
}

static void nsim_nexthop_destroy(struct nsim_nexthop *nexthop)
{
	kfree(nexthop);
}

static int nsim_nexthop_account(struct nsim_fib_data *data, u64 occ,
				bool add, struct netlink_ext_ack *extack)
{
	int i, err = 0;

	if (add) {
		for (i = 0; i < occ; i++)
			if (!atomic64_add_unless(&data->nexthops.num, 1,
						 data->nexthops.max)) {
				err = -ENOSPC;
				NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported nexthops");
				goto err_num_decrease;
			}
	} else {
		if (WARN_ON(occ > atomic64_read(&data->nexthops.num)))
			return -EINVAL;
		atomic64_sub(occ, &data->nexthops.num);
	}

	return err;

err_num_decrease:
	atomic64_sub(i, &data->nexthops.num);
	return err;
}

static void nsim_nexthop_hw_flags_set(struct net *net,
				      const struct nsim_nexthop *nexthop,
				      bool trap)
{
	int i;

	nexthop_set_hw_flags(net, nexthop->id, false, trap);

	if (!nexthop->is_resilient)
		return;

	for (i = 0; i < nexthop->occ; i++)
		nexthop_bucket_set_hw_flags(net, nexthop->id, i, false, trap);
}

static int nsim_nexthop_add(struct nsim_fib_data *data,
			    struct nsim_nexthop *nexthop,
			    struct netlink_ext_ack *extack)
{
	struct net *net = devlink_net(data->devlink);
	int err;

	err = nsim_nexthop_account(data, nexthop->occ, true, extack);
	if (err)
		return err;

	err = rhashtable_insert_fast(&data->nexthop_ht, &nexthop->ht_node,
				     nsim_nexthop_ht_params);
	if (err) {
		NL_SET_ERR_MSG_MOD(extack, "Failed to insert nexthop");
		goto err_nexthop_dismiss;
	}

	nsim_nexthop_hw_flags_set(net, nexthop, true);

	return 0;

err_nexthop_dismiss:
	nsim_nexthop_account(data, nexthop->occ, false, extack);
	return err;
}

static int nsim_nexthop_replace(struct nsim_fib_data *data,
				struct nsim_nexthop *nexthop,
				struct nsim_nexthop *nexthop_old,
				struct netlink_ext_ack *extack)
{
	struct net *net = devlink_net(data->devlink);
	int err;

	err = nsim_nexthop_account(data, nexthop->occ, true, extack);
	if (err)
		return err;

	err = rhashtable_replace_fast(&data->nexthop_ht,
				      &nexthop_old->ht_node,
				      &nexthop->ht_node,
				      nsim_nexthop_ht_params);
	if (err) {
		NL_SET_ERR_MSG_MOD(extack, "Failed to replace nexthop");
		goto err_nexthop_dismiss;
	}

	nsim_nexthop_hw_flags_set(net, nexthop, true);
	nsim_nexthop_account(data, nexthop_old->occ, false, extack);
	nsim_nexthop_destroy(nexthop_old);

	return 0;

err_nexthop_dismiss:
	nsim_nexthop_account(data, nexthop->occ, false, extack);
	return err;
}

static int nsim_nexthop_insert(struct nsim_fib_data *data,
			       struct nh_notifier_info *info)
{
	struct nsim_nexthop *nexthop, *nexthop_old;
	int err;

	nexthop = nsim_nexthop_create(data, info);
	if (IS_ERR(nexthop))
		return PTR_ERR(nexthop);

	nexthop_old = rhashtable_lookup_fast(&data->nexthop_ht, &info->id,
					     nsim_nexthop_ht_params);
	if (!nexthop_old)
		err = nsim_nexthop_add(data, nexthop, info->extack);
	else
		err = nsim_nexthop_replace(data, nexthop, nexthop_old,
					   info->extack);

	if (err)
		nsim_nexthop_destroy(nexthop);

	return err;
}

static void nsim_nexthop_remove(struct nsim_fib_data *data,
				struct nh_notifier_info *info)
{
	struct nsim_nexthop *nexthop;

	nexthop = rhashtable_lookup_fast(&data->nexthop_ht, &info->id,
					 nsim_nexthop_ht_params);
	if (!nexthop)
		return;
	rhashtable_remove_fast(&data->nexthop_ht, &nexthop->ht_node,
			       nsim_nexthop_ht_params);
	nsim_nexthop_account(data, nexthop->occ, false, info->extack);
	nsim_nexthop_destroy(nexthop);
}

static int nsim_nexthop_res_table_pre_replace(struct nsim_fib_data *data,
					      struct nh_notifier_info *info)
{
	if (data->fail_res_nexthop_group_replace) {
		NL_SET_ERR_MSG_MOD(info->extack, "Failed to replace a resilient nexthop group");
		return -EINVAL;
	}

	return 0;
}

static int nsim_nexthop_bucket_replace(struct nsim_fib_data *data,
				       struct nh_notifier_info *info)
{
	if (data->fail_nexthop_bucket_replace) {
		NL_SET_ERR_MSG_MOD(info->extack, "Failed to replace nexthop bucket");
		return -EINVAL;
	}

	nexthop_bucket_set_hw_flags(info->net, info->id,
				    info->nh_res_bucket->bucket_index,
				    false, true);

	return 0;
}

static int nsim_nexthop_event_nb(struct notifier_block *nb,
				 unsigned long event, void *ptr)
{
	struct nsim_fib_data *data = container_of(nb, struct nsim_fib_data,
						  nexthop_nb);
	struct nh_notifier_info *info = ptr;
	int err = 0;

	mutex_lock(&data->nh_lock);
	switch (event) {
	case NEXTHOP_EVENT_REPLACE:
		err = nsim_nexthop_insert(data, info);
		break;
	case NEXTHOP_EVENT_DEL:
		nsim_nexthop_remove(data, info);
		break;
	case NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE:
		err = nsim_nexthop_res_table_pre_replace(data, info);
		break;
	case NEXTHOP_EVENT_BUCKET_REPLACE:
		err = nsim_nexthop_bucket_replace(data, info);
		break;
	default:
		break;
	}
	mutex_unlock(&data->nh_lock);

	return notifier_from_errno(err);
}

static void nsim_nexthop_free(void *ptr, void *arg)
{
	struct nsim_nexthop *nexthop = ptr;
	struct nsim_fib_data *data = arg;
	struct net *net;

	net = devlink_net(data->devlink);
	nsim_nexthop_hw_flags_set(net, nexthop, false);
	nsim_nexthop_account(data, nexthop->occ, false, NULL);
	nsim_nexthop_destroy(nexthop);
}

static ssize_t nsim_nexthop_bucket_activity_write(struct file *file,
						  const char __user *user_buf,
						  size_t size, loff_t *ppos)
{
	struct nsim_fib_data *data = file->private_data;
	struct net *net = devlink_net(data->devlink);
	struct nsim_nexthop *nexthop;
	unsigned long *activity;
	loff_t pos = *ppos;
	u16 bucket_index;
	char buf[128];
	int err = 0;
	u32 nhid;

	if (pos != 0)
		return -EINVAL;
	if (size > sizeof(buf) - 1)
		return -EINVAL;
	if (copy_from_user(buf, user_buf, size))
		return -EFAULT;
	buf[size] = 0;

	if (sscanf(buf, "%u %hu", &nhid, &bucket_index) != 2)
		return -EINVAL;

	rtnl_lock();

	nexthop = rhashtable_lookup_fast(&data->nexthop_ht, &nhid,
					 nsim_nexthop_ht_params);
	if (!nexthop || !nexthop->is_resilient ||
	    bucket_index >= nexthop->occ) {
		err = -EINVAL;
		goto out;
	}

	activity = bitmap_zalloc(nexthop->occ, GFP_KERNEL);
	if (!activity) {
		err = -ENOMEM;
		goto out;
	}

	bitmap_set(activity, bucket_index, 1);
	nexthop_res_grp_activity_update(net, nhid, nexthop->occ, activity);
	bitmap_free(activity);

out:
	rtnl_unlock();
	*ppos = size;
	return err ?: size;
}

static const struct file_operations nsim_nexthop_bucket_activity_fops = {
	.open = simple_open,
	.write = nsim_nexthop_bucket_activity_write,
	.owner = THIS_MODULE,
};

static u64 nsim_fib_ipv4_resource_occ_get(void *priv)
{
	struct nsim_fib_data *data = priv;

	return nsim_fib_get_val(data, NSIM_RESOURCE_IPV4_FIB, false);
}

static u64 nsim_fib_ipv4_rules_res_occ_get(void *priv)
{
	struct nsim_fib_data *data = priv;

	return nsim_fib_get_val(data, NSIM_RESOURCE_IPV4_FIB_RULES, false);
}

static u64 nsim_fib_ipv6_resource_occ_get(void *priv)
{
	struct nsim_fib_data *data = priv;

	return nsim_fib_get_val(data, NSIM_RESOURCE_IPV6_FIB, false);
}

static u64 nsim_fib_ipv6_rules_res_occ_get(void *priv)
{
	struct nsim_fib_data *data = priv;
	return nsim_fib_get_val(data, NSIM_RESOURCE_IPV6_FIB_RULES, false);
}

static u64 nsim_fib_nexthops_res_occ_get(void *priv)
{
	struct nsim_fib_data *data = priv;

	return nsim_fib_get_val(data, NSIM_RESOURCE_NEXTHOPS, false);
}

static void nsim_fib_set_max_all(struct nsim_fib_data *data,
				 struct devlink *devlink)
{
	static const enum nsim_resource_id res_ids[] = {
		NSIM_RESOURCE_IPV4_FIB, NSIM_RESOURCE_IPV4_FIB_RULES,
		NSIM_RESOURCE_IPV6_FIB, NSIM_RESOURCE_IPV6_FIB_RULES,
		NSIM_RESOURCE_NEXTHOPS,
	};
	int i;

	for (i = 0; i < ARRAY_SIZE(res_ids); i++) {
		int err;
		u64 val;

		err = devl_resource_size_get(devlink, res_ids[i], &val);
		if (err)
			val = (u64) -1;
		nsim_fib_set_max(data, res_ids[i], val);
	}
}

static void nsim_fib_event_work(struct work_struct *work)
{
	struct nsim_fib_data *data = container_of(work, struct nsim_fib_data,
						  fib_event_work);
	struct nsim_fib_event *fib_event, *next_fib_event;

	LIST_HEAD(fib_event_queue);

	spin_lock_bh(&data->fib_event_queue_lock);
	list_splice_init(&data->fib_event_queue, &fib_event_queue);
	spin_unlock_bh(&data->fib_event_queue_lock);

	mutex_lock(&data->fib_lock);
	list_for_each_entry_safe(fib_event, next_fib_event,
				 &fib_event_queue, list) {
		nsim_fib_event(fib_event);
		list_del(&fib_event->list);
		kfree(fib_event);
		cond_resched();
	}
	mutex_unlock(&data->fib_lock);
}

static void nsim_fib_flush_work(struct work_struct *work)
{
	struct nsim_fib_data *data = container_of(work, struct nsim_fib_data,
						  fib_flush_work);
	struct nsim_fib_rt *fib_rt, *fib_rt_tmp;

	/* Process pending work. */
	flush_work(&data->fib_event_work);

	mutex_lock(&data->fib_lock);
	list_for_each_entry_safe(fib_rt, fib_rt_tmp, &data->fib_rt_list,
				 list) {
		rhashtable_remove_fast(&data->fib_rt_ht, &fib_rt->ht_node,
				       nsim_fib_rt_ht_params);
		nsim_fib_rt_free(fib_rt, data);
	}
	mutex_unlock(&data->fib_lock);
}

static int nsim_fib_debugfs_init(struct nsim_fib_data *data,
				 struct nsim_dev *nsim_dev)
{
	data->ddir = debugfs_create_dir("fib", nsim_dev->ddir);
	if (IS_ERR(data->ddir))
		return PTR_ERR(data->ddir);

	data->fail_route_offload = false;
	debugfs_create_bool("fail_route_offload", 0600, data->ddir,
			    &data->fail_route_offload);

	data->fail_res_nexthop_group_replace = false;
	debugfs_create_bool("fail_res_nexthop_group_replace", 0600,
			    data->ddir,
			    &data->fail_res_nexthop_group_replace);

	data->fail_nexthop_bucket_replace = false;
	debugfs_create_bool("fail_nexthop_bucket_replace", 0600, data->ddir,
			    &data->fail_nexthop_bucket_replace);

	debugfs_create_file("nexthop_bucket_activity", 0200, data->ddir,
			    data, &nsim_nexthop_bucket_activity_fops);

	data->fail_route_delete = false;
	debugfs_create_bool("fail_route_delete", 0600, data->ddir,
			    &data->fail_route_delete);

	return 0;
}

static void nsim_fib_debugfs_exit(struct nsim_fib_data *data)
{
	debugfs_remove_recursive(data->ddir);
}

struct nsim_fib_data *nsim_fib_create(struct devlink *devlink,
				      struct netlink_ext_ack *extack)
{
	struct nsim_fib_data *data;
	struct nsim_dev *nsim_dev;
	int err;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return ERR_PTR(-ENOMEM);
	data->devlink = devlink;

	nsim_dev = devlink_priv(devlink);
	err = nsim_fib_debugfs_init(data, nsim_dev);
	if (err)
		goto err_data_free;

	mutex_init(&data->nh_lock);
	err = rhashtable_init(&data->nexthop_ht, &nsim_nexthop_ht_params);
	if (err)
		goto err_debugfs_exit;

	mutex_init(&data->fib_lock);
	INIT_LIST_HEAD(&data->fib_rt_list);
	err = rhashtable_init(&data->fib_rt_ht, &nsim_fib_rt_ht_params);
	if (err)
		goto err_rhashtable_nexthop_destroy;

	INIT_WORK(&data->fib_event_work, nsim_fib_event_work);
	INIT_WORK(&data->fib_flush_work, nsim_fib_flush_work);
	INIT_LIST_HEAD(&data->fib_event_queue);
	spin_lock_init(&data->fib_event_queue_lock);

	nsim_fib_set_max_all(data, devlink);

	data->nexthop_nb.notifier_call = nsim_nexthop_event_nb;
	err = register_nexthop_notifier(devlink_net(devlink),
					&data->nexthop_nb, extack);
	if (err) {
		pr_err("Failed to register nexthop notifier\n");
		goto err_rhashtable_fib_destroy;
	}

	data->fib_nb.notifier_call = nsim_fib_event_nb;
	err = register_fib_notifier(devlink_net(devlink), &data->fib_nb,
				    nsim_fib_dump_inconsistent, extack);
	if (err) {
		pr_err("Failed to register fib notifier\n");
		goto err_nexthop_nb_unregister;
	}

	devl_resource_occ_get_register(devlink,
				       NSIM_RESOURCE_IPV4_FIB,
				       nsim_fib_ipv4_resource_occ_get,
				       data);
	devl_resource_occ_get_register(devlink,
				       NSIM_RESOURCE_IPV4_FIB_RULES,
				       nsim_fib_ipv4_rules_res_occ_get,
				       data);
	devl_resource_occ_get_register(devlink,
				       NSIM_RESOURCE_IPV6_FIB,
				       nsim_fib_ipv6_resource_occ_get,
				       data);
	devl_resource_occ_get_register(devlink,
				       NSIM_RESOURCE_IPV6_FIB_RULES,
				       nsim_fib_ipv6_rules_res_occ_get,
				       data);
	devl_resource_occ_get_register(devlink,
				       NSIM_RESOURCE_NEXTHOPS,
				       nsim_fib_nexthops_res_occ_get,
				       data);
	return data;

err_nexthop_nb_unregister:
	unregister_nexthop_notifier(devlink_net(devlink),
				    &data->nexthop_nb);
err_rhashtable_fib_destroy:
	cancel_work_sync(&data->fib_flush_work);
	flush_work(&data->fib_event_work);
	rhashtable_free_and_destroy(&data->fib_rt_ht, nsim_fib_rt_free,
				    data);
err_rhashtable_nexthop_destroy:
	rhashtable_free_and_destroy(&data->nexthop_ht, nsim_nexthop_free,
				    data);
	mutex_destroy(&data->fib_lock);
err_debugfs_exit:
	mutex_destroy(&data->nh_lock);
	nsim_fib_debugfs_exit(data);
err_data_free:
	kfree(data);
	return ERR_PTR(err);
}

void nsim_fib_destroy(struct devlink *devlink, struct nsim_fib_data *data)
{
	devl_resource_occ_get_unregister(devlink,
					 NSIM_RESOURCE_NEXTHOPS);
	devl_resource_occ_get_unregister(devlink,
					 NSIM_RESOURCE_IPV6_FIB_RULES);
	devl_resource_occ_get_unregister(devlink,
					 NSIM_RESOURCE_IPV6_FIB);
	devl_resource_occ_get_unregister(devlink,
					 NSIM_RESOURCE_IPV4_FIB_RULES);
	devl_resource_occ_get_unregister(devlink,
					 NSIM_RESOURCE_IPV4_FIB);
	unregister_fib_notifier(devlink_net(devlink), &data->fib_nb);
	unregister_nexthop_notifier(devlink_net(devlink), &data->nexthop_nb);
	cancel_work_sync(&data->fib_flush_work);
	flush_work(&data->fib_event_work);
	rhashtable_free_and_destroy(&data->fib_rt_ht, nsim_fib_rt_free,
				    data);
	rhashtable_free_and_destroy(&data->nexthop_ht, nsim_nexthop_free,
				    data);
	WARN_ON_ONCE(!list_empty(&data->fib_event_queue));
	WARN_ON_ONCE(!list_empty(&data->fib_rt_list));
	mutex_destroy(&data->fib_lock);
	mutex_destroy(&data->nh_lock);
	nsim_fib_debugfs_exit(data);
	kfree(data);
}
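The registration at the end of nsim_fib_create() follows the generic FIB-notifier consumer pattern. As a hedged, stripped-down sketch (all my_* names are invented; the register_fib_notifier() signature is assumed to match the one used above), a minimal consumer looks like this:

#include <linux/notifier.h>
#include <net/fib_notifier.h>

struct my_fib_listener {
	struct notifier_block nb;
	/* private bookkeeping would live here */
};

static int my_fib_event(struct notifier_block *nb, unsigned long event,
			void *ptr)
{
	struct fib_notifier_info *info = ptr;

	/* Entry events arrive in atomic context; real consumers (like
	 * netdevsim above) defer the heavy lifting to a work queue.
	 */
	if (info->family != AF_INET && info->family != AF_INET6)
		return NOTIFY_DONE;

	switch (event) {
	case FIB_EVENT_ENTRY_REPLACE:
	case FIB_EVENT_ENTRY_DEL:
		/* inspect container_of(info, ...) per family */
		break;
	}
	return NOTIFY_DONE;
}

static void my_fib_dump_inconsistent(struct notifier_block *nb)
{
	/* Flush cached state; the kernel will replay the dump. */
}

static int my_fib_listener_register(struct net *net,
				    struct my_fib_listener *l,
				    struct netlink_ext_ack *extack)
{
	l->nb.notifier_call = my_fib_event;
	return register_fib_notifier(net, &l->nb,
				     my_fib_dump_inconsistent, extack);
}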
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Sysctl interface to net af_unix subsystem.
 *
 * Authors:	Mike Shaver.
 */

#include <linux/slab.h>
#include <linux/string.h>
#include <linux/sysctl.h>

#include <net/af_unix.h>
#include <net/net_namespace.h>

#include "af_unix.h"

static struct ctl_table unix_table[] = {
	{
		.procname	= "max_dgram_qlen",
		.data		= &init_net.unx.sysctl_max_dgram_qlen,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
};

int __net_init unix_sysctl_register(struct net *net)
{
	struct ctl_table *table;

	if (net_eq(net, &init_net)) {
		table = unix_table;
	} else {
		table = kmemdup(unix_table, sizeof(unix_table), GFP_KERNEL);
		if (!table)
			goto err_alloc;

		table[0].data = &net->unx.sysctl_max_dgram_qlen;
	}

	net->unx.ctl = register_net_sysctl_sz(net, "net/unix", table,
					      ARRAY_SIZE(unix_table));
	if (net->unx.ctl == NULL)
		goto err_reg;

	return 0;

err_reg:
	if (!net_eq(net, &init_net))
		kfree(table);
err_alloc:
	return -ENOMEM;
}

void unix_sysctl_unregister(struct net *net)
{
	const struct ctl_table *table;

	table = net->unx.ctl->ctl_table_arg;
	unregister_net_sysctl_table(net->unx.ctl);
	if (!net_eq(net, &init_net))
		kfree(table);
}
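The table above surfaces exactly one per-netns knob, /proc/sys/net/unix/max_dgram_qlen. A hedged userspace sketch of reading it (the default value of 10 is an assumption about the af_unix initializer, not something this file sets):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/unix/max_dgram_qlen", "r");
	int qlen;

	if (f && fscanf(f, "%d", &qlen) == 1)
		printf("max_dgram_qlen = %d\n", qlen);	/* typically 10 */
	if (f)
		fclose(f);
	return 0;
}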
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Userspace interface
 * Linux ethernet bridge
 *
 * Authors:
 * Lennert Buytenhek <buytenh@gnu.org>
 */

#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/if_arp.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/rtnetlink.h>
#include <linux/if_ether.h>
#include <linux/slab.h>
#include <net/dsa.h>
#include <net/sock.h>
#include <linux/if_vlan.h>
#include <net/switchdev.h>
#include <net/net_namespace.h>

#include "br_private.h"

/*
 * Determine the initial path cost based on speed, using the
 * recommendations from the 802.1D standard.
 *
 * Since the driver might sleep, we must not be holding any locks.
 */
static int port_cost(struct net_device *dev)
{
	struct ethtool_link_ksettings ecmd;

	if (!__ethtool_get_link_ksettings(dev, &ecmd)) {
		switch (ecmd.base.speed) {
		case SPEED_10000:
			return 2;
		case SPEED_5000:
			return 3;
		case SPEED_2500:
			return 4;
		case SPEED_1000:
			return 5;
		case SPEED_100:
			return 19;
		case SPEED_10:
			return 100;
		case SPEED_UNKNOWN:
			return 100;
		default:
			if (ecmd.base.speed > SPEED_10000)
				return 1;
		}
	}

	/* Old silly heuristics based on name */
	if (!strncmp(dev->name, "lec", 3))
		return 7;

	if (!strncmp(dev->name, "plip", 4))
		return 2500;

	return 100;	/* assume old 10Mbps */
}

/* Check for port carrier transitions. */
void br_port_carrier_check(struct net_bridge_port *p, bool *notified)
{
	struct net_device *dev = p->dev;
	struct net_bridge *br = p->br;

	if (!(p->flags & BR_ADMIN_COST) &&
	    netif_running(dev) && netif_oper_up(dev))
		p->path_cost = port_cost(dev);

	*notified = false;
	if (!netif_running(br->dev))
		return;

	spin_lock_bh(&br->lock);
	if (netif_running(dev) && netif_oper_up(dev)) {
		if (p->state == BR_STATE_DISABLED) {
			br_stp_enable_port(p);
			*notified = true;
		}
	} else {
		if (p->state != BR_STATE_DISABLED) {
			br_stp_disable_port(p);
			*notified = true;
		}
	}
	spin_unlock_bh(&br->lock);
}

static void br_port_set_promisc(struct net_bridge_port *p)
{
	int err = 0;

	if (br_promisc_port(p))
		return;

	err = dev_set_promiscuity(p->dev, 1);
	if (err)
		return;

	br_fdb_unsync_static(p->br, p);

	p->flags |= BR_PROMISC;
}

static void br_port_clear_promisc(struct net_bridge_port *p)
{
	int err;

	/* Check if the port is already non-promisc or if it doesn't
	 * support UNICAST filtering.  Without unicast filtering support
	 * we'll end up re-enabling promisc mode anyway, so just check for
	 * it here.
	 */
	if (!br_promisc_port(p) || !(p->dev->priv_flags & IFF_UNICAST_FLT))
		return;

	/* Since we'll be clearing the promisc mode, program the port
	 * first so that we don't have interruption in traffic.
	 */
	err = br_fdb_sync_static(p->br, p);
	if (err)
		return;

	dev_set_promiscuity(p->dev, -1);
	p->flags &= ~BR_PROMISC;
}
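/*
 * The promiscuity management below reduces to a single predicate. A minimal
 * standalone sketch of that decision, illustrative only; uc_flt, auto_cnt and
 * is_auto are hypothetical stand-ins for IFF_UNICAST_FLT, br->auto_cnt and
 * br_auto_port().
 */
#include <stdbool.h>

/* A port may run non-promiscuous only if it filters unicast in hardware
 * and at most one auto port exists, that auto port being itself.
 */
static bool port_needs_promisc(bool uc_flt, unsigned int auto_cnt, bool is_auto)
{
	return !(uc_flt && (auto_cnt == 0 || (auto_cnt == 1 && is_auto)));
}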
/* When a port is added or removed or when certain port flags
 * change, this function is called to automatically manage
 * promiscuity setting of all the bridge ports.  We are always called
 * under RTNL so can skip using rcu primitives.
 */
void br_manage_promisc(struct net_bridge *br)
{
	struct net_bridge_port *p;
	bool set_all = false;

	/* If vlan filtering is disabled or bridge interface is placed
	 * into promiscuous mode, place all ports in promiscuous mode.
	 */
	if ((br->dev->flags & IFF_PROMISC) || !br_vlan_enabled(br->dev))
		set_all = true;

	list_for_each_entry(p, &br->port_list, list) {
		if (set_all) {
			br_port_set_promisc(p);
		} else {
			/* If the number of auto-ports is <= 1, then all other
			 * ports will have their output configuration
			 * statically specified through fdbs. Since ingress
			 * on the auto-port becomes forwarding/egress to other
			 * ports and egress configuration is statically known,
			 * we can say that ingress configuration of the
			 * auto-port is also statically known.
			 * This lets us disable promiscuous mode and write
			 * this config to hw.
			 */
			if ((p->dev->priv_flags & IFF_UNICAST_FLT) &&
			    (br->auto_cnt == 0 ||
			     (br->auto_cnt == 1 && br_auto_port(p))))
				br_port_clear_promisc(p);
			else
				br_port_set_promisc(p);
		}
	}
}

int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev)
{
	struct net_bridge_port *old_backup = rtnl_dereference(p->backup_port);
	struct net_bridge_port *backup_p = NULL;

	ASSERT_RTNL();

	if (backup_dev) {
		if (!netif_is_bridge_port(backup_dev))
			return -ENOENT;

		backup_p = br_port_get_rtnl(backup_dev);
		if (backup_p->br != p->br)
			return -EINVAL;
	}

	if (p == backup_p)
		return -EINVAL;

	if (old_backup == backup_p)
		return 0;

	/* if the backup link is already set, clear it */
	if (old_backup)
		old_backup->backup_redirected_cnt--;

	if (backup_p)
		backup_p->backup_redirected_cnt++;
	rcu_assign_pointer(p->backup_port, backup_p);

	return 0;
}

static void nbp_backup_clear(struct net_bridge_port *p)
{
	nbp_backup_change(p, NULL);
	if (p->backup_redirected_cnt) {
		struct net_bridge_port *cur_p;

		list_for_each_entry(cur_p, &p->br->port_list, list) {
			struct net_bridge_port *backup_p;

			backup_p = rtnl_dereference(cur_p->backup_port);
			if (backup_p == p)
				nbp_backup_change(cur_p, NULL);
		}
	}

	WARN_ON(rcu_access_pointer(p->backup_port) || p->backup_redirected_cnt);
}

static void nbp_update_port_count(struct net_bridge *br)
{
	struct net_bridge_port *p;
	u32 cnt = 0;

	list_for_each_entry(p, &br->port_list, list) {
		if (br_auto_port(p))
			cnt++;
	}
	if (br->auto_cnt != cnt) {
		br->auto_cnt = cnt;
		br_manage_promisc(br);
	}
}

static void nbp_delete_promisc(struct net_bridge_port *p)
{
	/* If port is currently promiscuous, unset promiscuity.
	 * Otherwise, it is a static port so remove all addresses
	 * from it.
	 */
	dev_set_allmulti(p->dev, -1);
	if (br_promisc_port(p))
		dev_set_promiscuity(p->dev, -1);
	else
		br_fdb_unsync_static(p->br, p);
}

static void release_nbp(struct kobject *kobj)
{
	struct net_bridge_port *p = container_of(kobj, struct net_bridge_port,
						 kobj);
	kfree(p);
}

static void brport_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid)
{
	struct net_bridge_port *p = kobj_to_brport(kobj);

	net_ns_get_ownership(dev_net(p->dev), uid, gid);
}

static const struct kobj_type brport_ktype = {
#ifdef CONFIG_SYSFS
	.sysfs_ops = &brport_sysfs_ops,
#endif
	.release = release_nbp,
	.get_ownership = brport_get_ownership,
};

static void destroy_nbp(struct net_bridge_port *p)
{
	struct net_device *dev = p->dev;

	p->br = NULL;
	p->dev = NULL;
	netdev_put(dev, &p->dev_tracker);

	kobject_put(&p->kobj);
}

static void destroy_nbp_rcu(struct rcu_head *head)
{
	struct net_bridge_port *p =
			container_of(head, struct net_bridge_port, rcu);
	destroy_nbp(p);
}

static unsigned get_max_headroom(struct net_bridge *br)
{
	unsigned max_headroom = 0;
	struct net_bridge_port *p;

	list_for_each_entry(p, &br->port_list, list) {
		unsigned dev_headroom = netdev_get_fwd_headroom(p->dev);

		if (dev_headroom > max_headroom)
			max_headroom = dev_headroom;
	}

	return max_headroom;
}

static void update_headroom(struct net_bridge *br, int new_hr)
{
	struct net_bridge_port *p;

	list_for_each_entry(p, &br->port_list, list)
		netdev_set_rx_headroom(p->dev, new_hr);

	br->dev->needed_headroom = new_hr;
}
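/*
 * For reference, the backup-port redirection managed by nbp_backup_change()
 * is normally configured from userspace with iproute2, along the lines of
 * "ip link set dev eth0 type bridge_slave backup_port eth1" (interface names
 * are placeholders); clearing the attribute again maps to
 * nbp_backup_change(p, NULL).
 */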
/* Deleting a port (interface) from a bridge is done in two steps via RCU.
 * The first step marks the device as down; that deletes
 * all the timers and stops new packets from flowing through.
 *
 * Final cleanup doesn't occur until after all CPUs have finished
 * processing packets.
 *
 * Protected from multiple admin operations by the RTNL mutex.
 */
static void del_nbp(struct net_bridge_port *p)
{
	struct net_bridge *br = p->br;
	struct net_device *dev = p->dev;

	sysfs_remove_link(br->ifobj, p->dev->name);

	nbp_delete_promisc(p);

	spin_lock_bh(&br->lock);
	br_stp_disable_port(p);
	spin_unlock_bh(&br->lock);

	br_mrp_port_del(br, p);
	br_cfm_port_del(br, p);

	br_ifinfo_notify(RTM_DELLINK, NULL, p);

	list_del_rcu(&p->list);
	if (netdev_get_fwd_headroom(dev) == br->dev->needed_headroom)
		update_headroom(br, get_max_headroom(br));
	netdev_reset_rx_headroom(dev);

	nbp_vlan_flush(p);
	br_fdb_delete_by_port(br, p, 0, 1);
	switchdev_deferred_process();
	nbp_backup_clear(p);

	nbp_update_port_count(br);

	netdev_upper_dev_unlink(dev, br->dev);

	dev->priv_flags &= ~IFF_BRIDGE_PORT;

	netdev_rx_handler_unregister(dev);

	br_multicast_del_port(p);

	kobject_uevent(&p->kobj, KOBJ_REMOVE);
	kobject_del(&p->kobj);

	br_netpoll_disable(p);

	call_rcu(&p->rcu, destroy_nbp_rcu);
}

/* Delete bridge device */
void br_dev_delete(struct net_device *dev, struct list_head *head)
{
	struct net_bridge *br = netdev_priv(dev);
	struct net_bridge_port *p, *n;

	list_for_each_entry_safe(p, n, &br->port_list, list) {
		del_nbp(p);
	}

	br_recalculate_neigh_suppress_enabled(br);

	br_fdb_delete_by_port(br, NULL, 0, 1);

	cancel_delayed_work_sync(&br->gc_work);

	br_sysfs_delbr(br->dev);
	unregister_netdevice_queue(br->dev, head);
}

/* find an available port number */
static int find_portno(struct net_bridge *br)
{
	int index;
	struct net_bridge_port *p;
	unsigned long *inuse;

	inuse = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL);
	if (!inuse)
		return -ENOMEM;

	__set_bit(0, inuse);	/* zero is reserved */
	list_for_each_entry(p, &br->port_list, list)
		__set_bit(p->port_no, inuse);

	index = find_first_zero_bit(inuse, BR_MAX_PORTS);
	bitmap_free(inuse);

	return (index >= BR_MAX_PORTS) ? -EXFULL : index;
}

/* called with RTNL but without bridge lock */
static struct net_bridge_port *new_nbp(struct net_bridge *br,
				       struct net_device *dev)
{
	struct net_bridge_port *p;
	int index, err;

	index = find_portno(br);
	if (index < 0)
		return ERR_PTR(index);

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (p == NULL)
		return ERR_PTR(-ENOMEM);

	p->br = br;
	netdev_hold(dev, &p->dev_tracker, GFP_KERNEL);
	p->dev = dev;
	p->path_cost = port_cost(dev);
	p->priority = 0x8000 >> BR_PORT_BITS;
	p->port_no = index;
	p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD;
	br_init_port(p);
	br_set_state(p, BR_STATE_DISABLED);
	br_stp_port_timer_init(p);
	err = br_multicast_add_port(p);
	if (err) {
		netdev_put(dev, &p->dev_tracker);
		kfree(p);
		p = ERR_PTR(err);
	}

	return p;
}

int br_add_bridge(struct net *net, const char *name)
{
	struct net_device *dev;
	int res;

	dev = alloc_netdev(sizeof(struct net_bridge), name, NET_NAME_UNKNOWN,
			   br_dev_setup);
	if (!dev)
		return -ENOMEM;

	dev_net_set(dev, net);
	dev->rtnl_link_ops = &br_link_ops;

	res = register_netdevice(dev);
	if (res)
		free_netdev(dev);
	return res;
}
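/*
 * br_add_bridge() and br_del_bridge() are reachable through netlink as well
 * as the classic bridge ioctls. A minimal userspace sketch of the ioctl path
 * (assuming CAP_NET_ADMIN; the bridge name "br0" is a placeholder):
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>
#include <linux/sockios.h>	/* SIOCBRADDBR / SIOCBRDELBR */

int main(void)
{
	int fd = socket(AF_LOCAL, SOCK_STREAM, 0);

	/* Handled by the bridge ioctl code, ending up in br_add_bridge(). */
	if (fd < 0 || ioctl(fd, SIOCBRADDBR, "br0") < 0)
		perror("SIOCBRADDBR");

	close(fd);
	return 0;
}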
int br_del_bridge(struct net *net, const char *name)
{
	struct net_device *dev;
	int ret = 0;

	dev = __dev_get_by_name(net, name);
	if (dev == NULL)
		ret = -ENXIO;	/* Could not find device */
	else if (!netif_is_bridge_master(dev)) {
		/* Attempt to delete non bridge device! */
		ret = -EPERM;
	} else if (dev->flags & IFF_UP) {
		/* Not shutdown yet. */
		ret = -EBUSY;
	} else
		br_dev_delete(dev, NULL);

	return ret;
}

/* MTU of the bridge pseudo-device: ETH_DATA_LEN or the minimum of the ports */
static int br_mtu_min(const struct net_bridge *br)
{
	const struct net_bridge_port *p;
	int ret_mtu = 0;

	list_for_each_entry(p, &br->port_list, list)
		if (!ret_mtu || ret_mtu > p->dev->mtu)
			ret_mtu = p->dev->mtu;

	return ret_mtu ? ret_mtu : ETH_DATA_LEN;
}

void br_mtu_auto_adjust(struct net_bridge *br)
{
	ASSERT_RTNL();

	/* if the bridge MTU was manually configured don't mess with it */
	if (br_opt_get(br, BROPT_MTU_SET_BY_USER))
		return;

	/* change to the minimum MTU and clear the flag which was set by
	 * the bridge ndo_change_mtu callback
	 */
	dev_set_mtu(br->dev, br_mtu_min(br));
	br_opt_toggle(br, BROPT_MTU_SET_BY_USER, false);
}

static void br_set_gso_limits(struct net_bridge *br)
{
	unsigned int tso_max_size = TSO_MAX_SIZE;
	const struct net_bridge_port *p;
	u16 tso_max_segs = TSO_MAX_SEGS;

	list_for_each_entry(p, &br->port_list, list) {
		tso_max_size = min(tso_max_size, p->dev->tso_max_size);
		tso_max_segs = min(tso_max_segs, p->dev->tso_max_segs);
	}
	netif_set_tso_max_size(br->dev, tso_max_size);
	netif_set_tso_max_segs(br->dev, tso_max_segs);
}

/*
 * Recomputes features using slave's features
 */
netdev_features_t br_features_recompute(struct net_bridge *br,
					netdev_features_t features)
{
	struct net_bridge_port *p;
	netdev_features_t mask;

	if (list_empty(&br->port_list))
		return features;

	mask = features;
	features &= ~NETIF_F_ONE_FOR_ALL;

	list_for_each_entry(p, &br->port_list, list) {
		features = netdev_increment_features(features,
						     p->dev->features, mask);
	}
	features = netdev_add_tso_features(features, mask);

	return features;
}

/* called with RTNL */
int br_add_if(struct net_bridge *br, struct net_device *dev,
	      struct netlink_ext_ack *extack)
{
	struct net_bridge_port *p;
	int err = 0;
	unsigned br_hr, dev_hr;
	bool changed_addr, fdb_synced = false;

	/* Don't allow bridging non-ethernet like devices. */
	if ((dev->flags & IFF_LOOPBACK) ||
	    dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN ||
	    !is_valid_ether_addr(dev->dev_addr))
		return -EINVAL;

	/* No bridging of bridges */
	if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) {
		NL_SET_ERR_MSG(extack,
			       "Can not enslave a bridge to a bridge");
		return -ELOOP;
	}

	/* Device has master upper dev */
	if (netdev_master_upper_dev_get(dev))
		return -EBUSY;

	/* No bridging devices that dislike that (e.g. wireless) */
	if (dev->priv_flags & IFF_DONT_BRIDGE) {
		NL_SET_ERR_MSG(extack,
			       "Device does not allow enslaving to a bridge");
		return -EOPNOTSUPP;
	}

	p = new_nbp(br, dev);
	if (IS_ERR(p))
		return PTR_ERR(p);

	call_netdevice_notifiers(NETDEV_JOIN, dev);

	err = dev_set_allmulti(dev, 1);
	if (err) {
		br_multicast_del_port(p);
		netdev_put(dev, &p->dev_tracker);
		kfree(p);	/* kobject not yet init'd, manually free */
		goto err1;
	}

	err = kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj),
				   SYSFS_BRIDGE_PORT_ATTR);
	if (err)
		goto err2;

	err = br_sysfs_addif(p);
	if (err)
		goto err2;

	err = br_netpoll_enable(p);
	if (err)
		goto err3;

	err = netdev_rx_handler_register(dev, br_get_rx_handler(dev), p);
	if (err)
		goto err4;

	dev->priv_flags |= IFF_BRIDGE_PORT;

	err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL, extack);
	if (err)
		goto err5;

	dev_disable_lro(dev);

	list_add_rcu(&p->list, &br->port_list);

	nbp_update_port_count(br);
	if (!br_promisc_port(p) && (p->dev->priv_flags & IFF_UNICAST_FLT)) {
		/* When updating the port count we also update all ports'
		 * promiscuous mode.
		 * A port leaving promiscuous mode normally gets the bridge's
		 * fdb synced to the unicast filter (if supported), however,
		 * `br_port_clear_promisc` does not distinguish between
		 * non-promiscuous ports and *new* ports, so we need to
		 * sync explicitly here.
		 */
		fdb_synced = br_fdb_sync_static(br, p) == 0;
		if (!fdb_synced)
			netdev_err(dev, "failed to sync bridge static fdb addresses to this port\n");
	}

	netdev_update_features(br->dev);

	br_hr = br->dev->needed_headroom;
	dev_hr = netdev_get_fwd_headroom(dev);
	if (br_hr < dev_hr)
		update_headroom(br, dev_hr);
	else
		netdev_set_rx_headroom(dev, br_hr);

	if (br_fdb_add_local(br, p, dev->dev_addr, 0))
		netdev_err(dev, "failed insert local address bridge forwarding table\n");

	if (br->dev->addr_assign_type != NET_ADDR_SET) {
		/* Ask for permission to use this MAC address now, even if we
		 * don't end up choosing it below.
		 */
		err = dev_pre_changeaddr_notify(br->dev, dev->dev_addr, extack);
		if (err)
			goto err6;
	}

	err = nbp_vlan_init(p, extack);
	if (err) {
		netdev_err(dev, "failed to initialize vlan filtering on this port\n");
		goto err6;
	}

	spin_lock_bh(&br->lock);
	changed_addr = br_stp_recalculate_bridge_id(br);

	if (netif_running(dev) && netif_oper_up(dev) &&
	    (br->dev->flags & IFF_UP))
		br_stp_enable_port(p);
	spin_unlock_bh(&br->lock);

	br_ifinfo_notify(RTM_NEWLINK, NULL, p);

	if (changed_addr)
		call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);

	br_mtu_auto_adjust(br);
	br_set_gso_limits(br);

	kobject_uevent(&p->kobj, KOBJ_ADD);

	return 0;

err6:
	if (fdb_synced)
		br_fdb_unsync_static(br, p);
	list_del_rcu(&p->list);
	br_fdb_delete_by_port(br, p, 0, 1);
	nbp_update_port_count(br);
	netdev_upper_dev_unlink(dev, br->dev);
err5:
	dev->priv_flags &= ~IFF_BRIDGE_PORT;
	netdev_rx_handler_unregister(dev);
err4:
	br_netpoll_disable(p);
err3:
	sysfs_remove_link(br->ifobj, p->dev->name);
err2:
	br_multicast_del_port(p);
	netdev_put(dev, &p->dev_tracker);
	kobject_put(&p->kobj);
	dev_set_allmulti(dev, -1);
err1:
	return err;
}

/* called with RTNL */
int br_del_if(struct net_bridge *br, struct net_device *dev)
{
	struct net_bridge_port *p;
	bool changed_addr;

	p = br_port_get_rtnl(dev);
	if (!p || p->br != br)
		return -EINVAL;

	/* Since more than one interface can be attached to a bridge,
	 * there may still be an alternate path for netconsole to use;
	 * therefore there is no reason for a NETDEV_RELEASE event.
	 */
	del_nbp(p);

	br_mtu_auto_adjust(br);
	br_set_gso_limits(br);

	spin_lock_bh(&br->lock);
	changed_addr = br_stp_recalculate_bridge_id(br);
	spin_unlock_bh(&br->lock);

	if (changed_addr)
		call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);

	netdev_update_features(br->dev);

	return 0;
}

void br_port_flags_change(struct net_bridge_port *p, unsigned long mask)
{
	struct net_bridge *br = p->br;

	if (mask & BR_AUTO_MASK)
		nbp_update_port_count(br);

	if (mask & (BR_NEIGH_SUPPRESS | BR_NEIGH_VLAN_SUPPRESS))
		br_recalculate_neigh_suppress_enabled(br);
}

bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag)
{
	struct net_bridge_port *p;

	p = br_port_get_rtnl_rcu(dev);
	if (!p)
		return false;

	return p->flags & flag;
}
EXPORT_SYMBOL_GPL(br_port_flag_is_set);
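/*
 * Similarly, br_add_if() is where the SIOCBRADDIF ioctl (brctl addif) lands.
 * A hedged userspace sketch enslaving an existing device, assuming "br0" and
 * "eth0" exist and the caller has CAP_NET_ADMIN:
 */
#include <net/if.h>		/* struct ifreq, if_nametoindex() */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>
#include <linux/sockios.h>	/* SIOCBRADDIF */

int main(void)
{
	struct ifreq ifr;
	int fd = socket(AF_LOCAL, SOCK_STREAM, 0);

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "br0", IFNAMSIZ - 1);
	ifr.ifr_ifindex = if_nametoindex("eth0");

	/* Reaches br_add_if() under RTNL. */
	if (fd < 0 || !ifr.ifr_ifindex || ioctl(fd, SIOCBRADDIF, &ifr) < 0)
		perror("SIOCBRADDIF");

	close(fd);
	return 0;
}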
// SPDX-License-Identifier: GPL-2.0-or-later
/* Copyright Amazon.com Inc. or its affiliates. */

#include <linux/init.h>
#include <linux/netdevice.h>
#include <linux/notifier.h>
#include <linux/rtnetlink.h>
#include <net/net_namespace.h>
#include <net/netdev_lock.h>
#include <net/netns/generic.h>

int netdev_debug_event(struct notifier_block *nb, unsigned long event,
		       void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);
	enum netdev_cmd cmd = event;

	/* Keep enum and don't add default to trigger -Werror=switch */
	switch (cmd) {
	case NETDEV_XDP_FEAT_CHANGE:
		netdev_assert_locked(dev);
		fallthrough;
	case NETDEV_CHANGE:
	case NETDEV_REGISTER:
	case NETDEV_UP:
		netdev_ops_assert_locked(dev);
		fallthrough;
	case NETDEV_DOWN:
	case NETDEV_REBOOT:
	case NETDEV_UNREGISTER:
	case NETDEV_CHANGEMTU:
	case NETDEV_CHANGEADDR:
	case NETDEV_PRE_CHANGEADDR:
	case NETDEV_GOING_DOWN:
	case NETDEV_FEAT_CHANGE:
	case NETDEV_BONDING_FAILOVER:
	case NETDEV_PRE_UP:
	case NETDEV_PRE_TYPE_CHANGE:
	case NETDEV_POST_TYPE_CHANGE:
	case NETDEV_POST_INIT:
	case NETDEV_PRE_UNINIT:
	case NETDEV_RELEASE:
	case NETDEV_NOTIFY_PEERS:
	case NETDEV_JOIN:
	case NETDEV_CHANGEUPPER:
	case NETDEV_RESEND_IGMP:
	case NETDEV_PRECHANGEMTU:
	case NETDEV_CHANGEINFODATA:
	case NETDEV_BONDING_INFO:
	case NETDEV_PRECHANGEUPPER:
	case NETDEV_CHANGELOWERSTATE:
	case NETDEV_UDP_TUNNEL_PUSH_INFO:
	case NETDEV_UDP_TUNNEL_DROP_INFO:
	case NETDEV_CHANGE_TX_QUEUE_LEN:
	case NETDEV_CVLAN_FILTER_PUSH_INFO:
	case NETDEV_CVLAN_FILTER_DROP_INFO:
	case NETDEV_SVLAN_FILTER_PUSH_INFO:
	case NETDEV_SVLAN_FILTER_DROP_INFO:
	case NETDEV_OFFLOAD_XSTATS_ENABLE:
	case NETDEV_OFFLOAD_XSTATS_DISABLE:
	case NETDEV_OFFLOAD_XSTATS_REPORT_USED:
	case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA:
		ASSERT_RTNL();
		break;

	case NETDEV_CHANGENAME:
		ASSERT_RTNL_NET(net);
		break;
	}

	return NOTIFY_DONE;
}
EXPORT_SYMBOL_NS_GPL(netdev_debug_event, "NETDEV_INTERNAL");

static int rtnl_net_debug_net_id;

static int __net_init rtnl_net_debug_net_init(struct net *net)
{
	struct notifier_block *nb;

	nb = net_generic(net, rtnl_net_debug_net_id);
	nb->notifier_call = netdev_debug_event;

	return register_netdevice_notifier_net(net, nb);
}

static void __net_exit rtnl_net_debug_net_exit(struct net *net)
{
	struct notifier_block *nb;

	nb = net_generic(net, rtnl_net_debug_net_id);
	unregister_netdevice_notifier_net(net, nb);
}

static struct pernet_operations rtnl_net_debug_net_ops __net_initdata = {
	.init = rtnl_net_debug_net_init,
	.exit = rtnl_net_debug_net_exit,
	.id = &rtnl_net_debug_net_id,
	.size = sizeof(struct notifier_block),
};

static struct notifier_block rtnl_net_debug_block = {
	.notifier_call = netdev_debug_event,
};

static int __init rtnl_net_debug_init(void)
{
	int ret;

	ret = register_pernet_subsys(&rtnl_net_debug_net_ops);
	if (ret)
		return ret;

	ret = register_netdevice_notifier(&rtnl_net_debug_block);
	if (ret)
		unregister_pernet_subsys(&rtnl_net_debug_net_ops);

	return ret;
}

subsys_initcall(rtnl_net_debug_init);
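/*
 * The same notifier plumbing is easy to reuse for one-off debugging. A
 * minimal out-of-tree module sketch (hypothetical, not part of the file
 * above) that registers on the global chain only and logs every event:
 */
#include <linux/module.h>
#include <linux/netdevice.h>

static int demo_event(struct notifier_block *nb, unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	pr_info("netdev event %lu on %s\n", event, dev->name);
	return NOTIFY_DONE;
}

static struct notifier_block demo_nb = { .notifier_call = demo_event };

static int __init demo_init(void)
{
	return register_netdevice_notifier(&demo_nb);
}

static void __exit demo_exit(void)
{
	unregister_netdevice_notifier(&demo_nb);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");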
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/**
 * file phonet.h
 *
 * Phonet sockets kernel interface
 *
 * Copyright (C) 2008 Nokia Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA
 */

#ifndef _UAPILINUX_PHONET_H
#define _UAPILINUX_PHONET_H

#include <linux/types.h>
#include <linux/socket.h>

/* Automatic protocol selection */
#define PN_PROTO_TRANSPORT	0
/* Phonet datagram socket */
#define PN_PROTO_PHONET		1
/* Phonet pipe */
#define PN_PROTO_PIPE		2
#define PHONET_NPROTO		3

/* Socket options for SOL_PNPIPE level */
#define PNPIPE_ENCAP		1
#define PNPIPE_IFINDEX		2
#define PNPIPE_HANDLE		3
#define PNPIPE_INITSTATE	4

#define PNADDR_ANY		0
#define PNADDR_BROADCAST	0xFC
#define PNPORT_RESOURCE_ROUTING	0

/* Values for PNPIPE_ENCAP option */
#define PNPIPE_ENCAP_NONE	0
#define PNPIPE_ENCAP_IP		1

/* ioctls */
#define SIOCPNGETOBJECT		(SIOCPROTOPRIVATE + 0)
#define SIOCPNENABLEPIPE	(SIOCPROTOPRIVATE + 13)
#define SIOCPNADDRESOURCE	(SIOCPROTOPRIVATE + 14)
#define SIOCPNDELRESOURCE	(SIOCPROTOPRIVATE + 15)

/* Phonet protocol header */
struct phonethdr {
	__u8	pn_rdev;
	__u8	pn_sdev;
	__u8	pn_res;
	__be16	pn_length;
	__u8	pn_robj;
	__u8	pn_sobj;
} __attribute__((packed));

/* Common Phonet payload header */
struct phonetmsg {
	__u8	pn_trans_id;	/* transaction ID */
	__u8	pn_msg_id;	/* message type */
	union {
		struct {
			__u8	pn_submsg_id;	/* message subtype */
			__u8	pn_data[5];
		} base;
		struct {
			__u16	pn_e_res_id;	/* extended resource ID */
			__u8	pn_e_submsg_id;	/* message subtype */
			__u8	pn_e_data[3];
		} ext;
	} pn_msg_u;
};
#define PN_COMMON_MESSAGE	0xF0
#define PN_COMMGR		0x10
#define PN_PREFIX		0xE0 /* resource for extended messages */
#define pn_submsg_id		pn_msg_u.base.pn_submsg_id
#define pn_e_submsg_id		pn_msg_u.ext.pn_e_submsg_id
#define pn_e_res_id		pn_msg_u.ext.pn_e_res_id
#define pn_data			pn_msg_u.base.pn_data
#define pn_e_data		pn_msg_u.ext.pn_e_data

/* data for unreachable errors */
#define PN_COMM_SERVICE_NOT_IDENTIFIED_RESP	0x01
#define PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP	0x14
#define pn_orig_msg_id		pn_data[0]
#define pn_status		pn_data[1]
#define pn_e_orig_msg_id	pn_e_data[0]
#define pn_e_status		pn_e_data[1]
/* Phonet socket address structure */
struct sockaddr_pn {
	__kernel_sa_family_t spn_family;
	__u8 spn_obj;
	__u8 spn_dev;
	__u8 spn_resource;
	__u8 spn_zero[sizeof(struct sockaddr) - sizeof(__kernel_sa_family_t) - 3];
} __attribute__((packed));

/* Well known address */
#define PN_DEV_PC	0x10

static inline __u16 pn_object(__u8 addr, __u16 port)
{
	return (addr << 8) | (port & 0x3ff);
}

static inline __u8 pn_obj(__u16 handle)
{
	return handle & 0xff;
}

static inline __u8 pn_dev(__u16 handle)
{
	return handle >> 8;
}

static inline __u16 pn_port(__u16 handle)
{
	return handle & 0x3ff;
}

static inline __u8 pn_addr(__u16 handle)
{
	return (handle >> 8) & 0xfc;
}

static inline void pn_sockaddr_set_addr(struct sockaddr_pn *spn, __u8 addr)
{
	spn->spn_dev &= 0x03;
	spn->spn_dev |= addr & 0xfc;
}

static inline void pn_sockaddr_set_port(struct sockaddr_pn *spn, __u16 port)
{
	spn->spn_dev &= 0xfc;
	spn->spn_dev |= (port >> 8) & 0x03;
	spn->spn_obj = port & 0xff;
}

static inline void pn_sockaddr_set_object(struct sockaddr_pn *spn,
						__u16 handle)
{
	spn->spn_dev = pn_dev(handle);
	spn->spn_obj = pn_obj(handle);
}

static inline void pn_sockaddr_set_resource(struct sockaddr_pn *spn,
						__u8 resource)
{
	spn->spn_resource = resource;
}

static inline __u8 pn_sockaddr_get_addr(const struct sockaddr_pn *spn)
{
	return spn->spn_dev & 0xfc;
}

static inline __u16 pn_sockaddr_get_port(const struct sockaddr_pn *spn)
{
	return ((spn->spn_dev & 0x03) << 8) | spn->spn_obj;
}

static inline __u16 pn_sockaddr_get_object(const struct sockaddr_pn *spn)
{
	return pn_object(spn->spn_dev, spn->spn_obj);
}

static inline __u8 pn_sockaddr_get_resource(const struct sockaddr_pn *spn)
{
	return spn->spn_resource;
}

/* Phonet device ioctl requests */

#endif /* _UAPILINUX_PHONET_H */
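/*
 * pn_object() packs the six high bits of a device address and a 10-bit port
 * into one 16-bit handle; pn_addr() and pn_port() undo it. A quick
 * illustrative round-trip (userspace, assuming the UAPI header is installed;
 * the port value 0x123 is arbitrary):
 */
#include <linux/phonet.h>
#include <assert.h>
#include <stdio.h>

int main(void)
{
	__u16 handle = pn_object(PN_DEV_PC, 0x123);	/* 0x10 << 8 | 0x123 = 0x1123 */

	assert(pn_addr(handle) == PN_DEV_PC);	/* high byte & 0xfc */
	assert(pn_port(handle) == 0x123);	/* low 10 bits */
	printf("handle 0x%04x\n", handle);
	return 0;
}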
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
 */

#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_SYNPROXY.h>

#include <net/netfilter/nf_synproxy.h>

static unsigned int
synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
	const struct xt_synproxy_info *info = par->targinfo;
	struct net *net = xt_net(par);
	struct synproxy_net *snet = synproxy_pernet(net);
	struct synproxy_options opts = {};
	struct tcphdr *th, _th;

	if (nf_ip_checksum(skb, xt_hooknum(par), par->thoff, IPPROTO_TCP))
		return NF_DROP;

	th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
	if (th == NULL)
		return NF_DROP;

	if (!synproxy_parse_options(skb, par->thoff, th, &opts))
		return NF_DROP;

	if (th->syn && !(th->ack || th->fin || th->rst)) {
		/* Initial SYN from client */
		this_cpu_inc(snet->stats->syn_received);

		if (th->ece && th->cwr)
			opts.options |= XT_SYNPROXY_OPT_ECN;

		opts.options &= info->options;
		opts.mss_encode = opts.mss_option;
		opts.mss_option = info->mss;
		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
			synproxy_init_timestamp_cookie(info, &opts);
		else
			opts.options &= ~(XT_SYNPROXY_OPT_WSCALE |
					  XT_SYNPROXY_OPT_SACK_PERM |
					  XT_SYNPROXY_OPT_ECN);

		synproxy_send_client_synack(net, skb, th, &opts);
		consume_skb(skb);
		return NF_STOLEN;
	} else if (th->ack && !(th->fin || th->rst || th->syn)) {
		/* ACK from client */
		if (synproxy_recv_client_ack(net, skb, th, &opts,
					     ntohl(th->seq))) {
			consume_skb(skb);
			return NF_STOLEN;
		} else {
			return NF_DROP;
		}
	}

	return XT_CONTINUE;
}

static int synproxy_tg4_check(const struct xt_tgchk_param *par)
{
	struct synproxy_net *snet = synproxy_pernet(par->net);
	const struct ipt_entry *e = par->entryinfo;
	int err;

	if (e->ip.proto != IPPROTO_TCP ||
	    e->ip.invflags & XT_INV_PROTO)
		return -EINVAL;

	err = nf_ct_netns_get(par->net, par->family);
	if (err)
		return err;

	err = nf_synproxy_ipv4_init(snet, par->net);
	if (err) {
		nf_ct_netns_put(par->net, par->family);
		return err;
	}

	return err;
}

static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
{
	struct synproxy_net *snet = synproxy_pernet(par->net);

	nf_synproxy_ipv4_fini(snet, par->net);
	nf_ct_netns_put(par->net, par->family);
}

static struct xt_target synproxy_tg4_reg __read_mostly = {
	.name		= "SYNPROXY",
	.family		= NFPROTO_IPV4,
	.hooks		= (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD),
	.target		= synproxy_tg4,
	.targetsize	= sizeof(struct xt_synproxy_info),
	.checkentry	= synproxy_tg4_check,
	.destroy	= synproxy_tg4_destroy,
	.me		= THIS_MODULE,
};

static int __init synproxy_tg4_init(void)
{
	return xt_register_target(&synproxy_tg4_reg);
}

static void __exit synproxy_tg4_exit(void)
{
	xt_unregister_target(&synproxy_tg4_reg);
}

module_init(synproxy_tg4_init);
module_exit(synproxy_tg4_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_DESCRIPTION("Intercept TCP connections and establish them using syncookies");
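For context, the target above is normally driven by the widely documented
three-rule SYNPROXY pattern (interface and port are placeholders): untrack
inbound SYNs, hand untracked/invalid packets to SYNPROXY, and drop whatever
remains invalid:

	iptables -t raw -A PREROUTING -i eth0 -p tcp --dport 80 --syn -j CT --notrack
	iptables -A INPUT -i eth0 -p tcp --dport 80 \
		-m conntrack --ctstate INVALID,UNTRACKED \
		-j SYNPROXY --sack-perm --timestamp --wscale 7 --mss 1460
	iptables -A INPUT -i eth0 -p tcp --dport 80 -m conntrack --ctstate INVALID -j DROP

The recipe usually also sets net.netfilter.nf_conntrack_tcp_loose=0 so that
mid-stream ACKs are classified INVALID rather than silently accepted.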
// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "SMP alternatives: " fmt

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/perf_event.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
#include <linux/stop_machine.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <linux/bsearch.h>
#include <linux/sync_core.h>
#include <linux/execmem.h>
#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/mce.h>
#include <asm/nmi.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/insn.h>
#include <asm/io.h>
#include <asm/fixmap.h>
#include <asm/paravirt.h>
#include <asm/asm-prototypes.h>
#include <asm/cfi.h>
#include <asm/ibt.h>
#include <asm/set_memory.h>

int __read_mostly alternatives_patched;

EXPORT_SYMBOL_GPL(alternatives_patched);

#define MAX_PATCH_LEN (255-1)

#define DA_ALL		(~0)
#define DA_ALT		0x01
#define DA_RET		0x02
#define DA_RETPOLINE	0x04
#define DA_ENDBR	0x08
#define DA_SMP		0x10

static unsigned int debug_alternative;

static int __init debug_alt(char *str)
{
	if (str && *str == '=')
		str++;

	if (!str || kstrtouint(str, 0, &debug_alternative))
		debug_alternative = DA_ALL;

	return 1;
}
__setup("debug-alternative", debug_alt);

static int noreplace_smp;

static int __init setup_noreplace_smp(char *str)
{
	noreplace_smp = 1;
	return 1;
}
__setup("noreplace-smp", setup_noreplace_smp);

#define DPRINTK(type, fmt, args...)					\
do {									\
	if (debug_alternative & DA_##type)				\
		printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args);		\
} while (0)
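/*
 * Worked reading of debug_alt(): booting with debug-alternative=0x01 traces
 * only DA_ALT sites and debug-alternative=0x03 adds DA_RET; with no value,
 * or an unparseable one, the mask falls back to DA_ALL. A leading '='
 * handed over by the early-param code is skipped before kstrtouint()
 * parses the mask.
 */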
#define DUMP_BYTES(type, buf, len, fmt, args...)			\
do {									\
	if (unlikely(debug_alternative & DA_##type)) {			\
		int j;							\
									\
		if (!(len))						\
			break;						\
									\
		printk(KERN_DEBUG pr_fmt(fmt), ##args);			\
		for (j = 0; j < (len) - 1; j++)				\
			printk(KERN_CONT "%02hhx ", buf[j]);		\
		printk(KERN_CONT "%02hhx\n", buf[j]);			\
	}								\
} while (0)

static const unsigned char x86nops[] =
{
	BYTES_NOP1,
	BYTES_NOP2,
	BYTES_NOP3,
	BYTES_NOP4,
	BYTES_NOP5,
	BYTES_NOP6,
	BYTES_NOP7,
	BYTES_NOP8,
#ifdef CONFIG_64BIT
	BYTES_NOP9,
	BYTES_NOP10,
	BYTES_NOP11,
#endif
};

const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
{
	NULL,
	x86nops,
	x86nops + 1,
	x86nops + 1 + 2,
	x86nops + 1 + 2 + 3,
	x86nops + 1 + 2 + 3 + 4,
	x86nops + 1 + 2 + 3 + 4 + 5,
	x86nops + 1 + 2 + 3 + 4 + 5 + 6,
	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
#ifdef CONFIG_64BIT
	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9,
	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10,
#endif
};

#ifdef CONFIG_FINEIBT
static bool cfi_paranoid __ro_after_init;
#endif

#ifdef CONFIG_MITIGATION_ITS

#ifdef CONFIG_MODULES
static struct module *its_mod;
#endif
static void *its_page;
static unsigned int its_offset;

/* Initialize a thunk with the "jmp *reg; int3" instructions. */
static void *its_init_thunk(void *thunk, int reg)
{
	u8 *bytes = thunk;
	int offset = 0;
	int i = 0;

#ifdef CONFIG_FINEIBT
	if (cfi_paranoid) {
		/*
		 * When ITS uses indirect branch thunk the fineibt_paranoid
		 * caller sequence doesn't fit in the caller site. So put the
		 * remaining part of the sequence (<ea> + JNE) into the ITS
		 * thunk.
		 */
		bytes[i++] = 0xea; /* invalid instruction */
		bytes[i++] = 0x75; /* JNE */
		bytes[i++] = 0xfd;

		offset = 1;
	}
#endif

	if (reg >= 8) {
		bytes[i++] = 0x41; /* REX.B prefix */
		reg -= 8;
	}
	bytes[i++] = 0xff;
	bytes[i++] = 0xe0 + reg; /* jmp *reg */
	bytes[i++] = 0xcc;

	return thunk + offset;
}

#ifdef CONFIG_MODULES
void its_init_mod(struct module *mod)
{
	if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
		return;

	mutex_lock(&text_mutex);
	its_mod = mod;
	its_page = NULL;
}

void its_fini_mod(struct module *mod)
{
	if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
		return;

	WARN_ON_ONCE(its_mod != mod);

	its_mod = NULL;
	its_page = NULL;
	mutex_unlock(&text_mutex);

	for (int i = 0; i < mod->its_num_pages; i++) {
		void *page = mod->its_page_array[i];
		execmem_restore_rox(page, PAGE_SIZE);
	}
}

void its_free_mod(struct module *mod)
{
	if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
		return;

	for (int i = 0; i < mod->its_num_pages; i++) {
		void *page = mod->its_page_array[i];
		execmem_free(page);
	}
	kfree(mod->its_page_array);
}
#endif /* CONFIG_MODULES */

static void *its_alloc(void)
{
	void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE);

	if (!page)
		return NULL;

#ifdef CONFIG_MODULES
	if (its_mod) {
		void *tmp = krealloc(its_mod->its_page_array,
				     (its_mod->its_num_pages+1) * sizeof(void *),
				     GFP_KERNEL);
		if (!tmp)
			return NULL;

		its_mod->its_page_array = tmp;
		its_mod->its_page_array[its_mod->its_num_pages++] = page;

		execmem_make_temp_rw(page, PAGE_SIZE);
	}
#endif /* CONFIG_MODULES */

	return no_free_ptr(page);
}
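/*
 * Concretely, for reg = 11 (%r11) and !cfi_paranoid, its_init_thunk() above
 * emits the bytes 41 ff e3 cc, i.e. REX.B, jmp *%r11, int3. An illustrative
 * restatement of that encoding rule, runnable in userspace:
 */
#include <stdint.h>
#include <stdio.h>

/* Sketch of the 'jmp *reg; int3' encoding; not the kernel function itself. */
static int encode_jmp_reg(uint8_t *bytes, int reg)
{
	int i = 0;

	if (reg >= 8) {
		bytes[i++] = 0x41;	/* REX.B selects r8-r15 */
		reg -= 8;
	}
	bytes[i++] = 0xff;		/* opcode group 5 */
	bytes[i++] = 0xe0 + reg;	/* ModRM: JMP r/m64, mod = 11 */
	bytes[i++] = 0xcc;		/* INT3 */
	return i;
}

int main(void)
{
	uint8_t b[4];
	int n = encode_jmp_reg(b, 11);	/* %r11 */

	for (int j = 0; j < n; j++)
		printf("%02x ", b[j]);	/* prints: 41 ff e3 cc */
	printf("\n");
	return 0;
}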
static void *its_allocate_thunk(int reg)
{
	int size = 3 + (reg / 8);
	void *thunk;

#ifdef CONFIG_FINEIBT
	/*
	 * The ITS thunk contains an indirect jump and an int3 instruction so
	 * its size is 3 or 4 bytes depending on the register used. If CFI
	 * paranoid is used then 3 extra bytes are added in the ITS thunk to
	 * complete the fineibt_paranoid caller sequence.
	 */
	if (cfi_paranoid)
		size += 3;
#endif

	if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) {
		its_page = its_alloc();
		if (!its_page) {
			pr_err("ITS page allocation failed\n");
			return NULL;
		}
		memset(its_page, INT3_INSN_OPCODE, PAGE_SIZE);
		its_offset = 32;
	}

	/*
	 * If the indirect branch instruction will be in the lower half
	 * of a cacheline, then update the offset to reach the upper half.
	 */
	if ((its_offset + size - 1) % 64 < 32)
		its_offset = ((its_offset - 1) | 0x3F) + 33;

	thunk = its_page + its_offset;
	its_offset += size;

	return its_init_thunk(thunk, reg);
}

u8 *its_static_thunk(int reg)
{
	u8 *thunk = __x86_indirect_its_thunk_array[reg];

#ifdef CONFIG_FINEIBT
	/* Paranoid thunk starts 2 bytes before */
	if (cfi_paranoid)
		return thunk - 2;
#endif
	return thunk;
}

#endif

/*
 * Nomenclature for variable names to simplify and clarify this code and ease
 * any potential staring at it:
 *
 * @instr: source address of the original instructions in the kernel text as
 * generated by the compiler.
 *
 * @buf: temporary buffer on which the patching operates. This buffer is
 * eventually text-poked into the kernel image.
 *
 * @replacement/@repl: pointer to the opcodes which are replacing @instr, located
 * in the .altinstr_replacement section.
 */

/*
 * Fill the buffer with a single effective instruction of size @len.
 *
 * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
 * for every single-byte NOP, try to generate the maximally available NOP of
 * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one
 * for each single-byte NOP). If @len to fill out is > ASM_NOP_MAX, pad with
 * INT3 and *jump* over instead of executing long and daft NOPs.
 */
static void add_nop(u8 *buf, unsigned int len)
{
	u8 *target = buf + len;

	if (!len)
		return;

	if (len <= ASM_NOP_MAX) {
		memcpy(buf, x86_nops[len], len);
		return;
	}

	if (len < 128) {
		__text_gen_insn(buf, JMP8_INSN_OPCODE, buf, target, JMP8_INSN_SIZE);
		buf += JMP8_INSN_SIZE;
	} else {
		__text_gen_insn(buf, JMP32_INSN_OPCODE, buf, target, JMP32_INSN_SIZE);
		buf += JMP32_INSN_SIZE;
	}

	for (;buf < target; buf++)
		*buf = INT3_INSN_OPCODE;
}

extern s32 __retpoline_sites[], __retpoline_sites_end[];
extern s32 __return_sites[], __return_sites_end[];
extern s32 __cfi_sites[], __cfi_sites_end[];
extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void text_poke_early(void *addr, const void *opcode, size_t len);

/*
 * Matches NOP and NOPL, not any of the other possible NOPs.
 */
static bool insn_is_nop(struct insn *insn)
{
	/* Anything NOP, but no REP NOP */
	if (insn->opcode.bytes[0] == 0x90 &&
	    (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3))
		return true;

	/* NOPL */
	if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
		return true;

	/* TODO: more nops */

	return false;
}

/*
 * Find the offset of the first non-NOP instruction starting at @offset
 * but no further than @len.
 */
static int skip_nops(u8 *buf, int offset, int len)
{
	struct insn insn;

	for (; offset < len; offset += insn.length) {
		if (insn_decode_kernel(&insn, &buf[offset]))
			break;

		if (!insn_is_nop(&insn))
			break;
	}

	return offset;
}
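/*
 * Worked example of add_nop(): len = 6 just copies the canonical 6-byte NOP
 * from x86_nops[6]. len = 20 exceeds ASM_NOP_MAX (11 with CONFIG_64BIT, per
 * the table above), so the buffer becomes "eb 12" (a JMP8 to buf + 20;
 * rel8 = 20 - 2 = 0x12) followed by eighteen "cc" (INT3) bytes that are
 * jumped over and never executed.
 */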
/*
 * "noinline" to cause control flow change and thus invalidate I$ and
 * cause refetch after modification.
 */
static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len)
{
	for (int next, i = 0; i < len; i = next) {
		struct insn insn;

		if (insn_decode_kernel(&insn, &buf[i]))
			return;

		next = i + insn.length;

		if (insn_is_nop(&insn)) {
			int nop = i;

			/* Has the NOP already been optimized? */
			if (i + insn.length == len)
				return;

			next = skip_nops(buf, next, len);

			add_nop(buf + nop, next - nop);
			DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next);
		}
	}
}

/*
 * In this context, "source" is where the instructions are placed in the
 * section .altinstr_replacement, for example during kernel build by the
 * toolchain.
 * "Destination" is where the instructions are being patched in by this
 * machinery.
 *
 * The source offset is:
 *
 *   src_imm = target - src_next_ip                      (1)
 *
 * and the target offset is:
 *
 *   dst_imm = target - dst_next_ip                      (2)
 *
 * so rework (1) as an expression for target like:
 *
 *   target = src_imm + src_next_ip                      (1a)
 *
 * and substitute in (2) to get:
 *
 *   dst_imm = (src_imm + src_next_ip) - dst_next_ip     (3)
 *
 * Now, since the instruction stream is 'identical' at src and dst (it
 * is being copied after all) it can be stated that:
 *
 *   src_next_ip = src + ip_offset
 *   dst_next_ip = dst + ip_offset                       (4)
 *
 * Substitute (4) in (3) and observe ip_offset being cancelled out to
 * obtain:
 *
 *   dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
 *           = src_imm + src - dst + ip_offset - ip_offset
 *           = src_imm + src - dst                       (5)
 *
 * IOW, only the relative displacement of the code block matters.
 */
#define apply_reloc_n(n_, p_, d_)				\
do {								\
	s32 v = *(s##n_ *)(p_);					\
	v += (d_);						\
	BUG_ON((v >> 31) != (v >> (n_-1)));			\
	*(s##n_ *)(p_) = (s##n_)v;				\
} while (0)

static __always_inline
void apply_reloc(int n, void *ptr, uintptr_t diff)
{
	switch (n) {
	case 1: apply_reloc_n(8, ptr, diff); break;
	case 2: apply_reloc_n(16, ptr, diff); break;
	case 4: apply_reloc_n(32, ptr, diff); break;
	default: BUG();
	}
}

static __always_inline
bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
{
	u8 *target = src + offset;

	/*
	 * If the target is inside the patched block, it's relative to the
	 * block itself and does not need relocation.
	 */
	return (target < src || target > src + src_len);
}
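/*
 * Plugging numbers into (5): if a 5-byte CALL at src = 0x1000 has
 * src_imm = 0x200 (so it targets 0x1000 + 5 + 0x200 = 0x1205) and the block
 * is copied to dst = 0x3000, the patched immediate must become
 *
 *   dst_imm = src_imm + src - dst = 0x200 + 0x1000 - 0x3000 = -0x1e00,
 *
 * and indeed 0x3000 + 5 - 0x1e00 = 0x1205 again. This is exactly the
 * (repl - instr) delta that apply_reloc() adds below.
 */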
static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
{
	for (int next, i = 0; i < instrlen; i = next) {
		struct insn insn;

		if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
			return;

		next = i + insn.length;

		switch (insn.opcode.bytes[0]) {
		case 0x0f:
			if (insn.opcode.bytes[1] < 0x80 ||
			    insn.opcode.bytes[1] > 0x8f)
				break;

			fallthrough;	/* Jcc.d32 */
		case 0x70 ... 0x7f:	/* Jcc.d8 */
		case JMP8_INSN_OPCODE:
		case JMP32_INSN_OPCODE:
		case CALL_INSN_OPCODE:
			if (need_reloc(next + insn.immediate.value, repl, repl_len)) {
				apply_reloc(insn.immediate.nbytes,
					    buf + i + insn_offset_immediate(&insn),
					    repl - instr);
			}

			/*
			 * Where possible, convert JMP.d32 into JMP.d8.
			 */
			if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
				s32 imm = insn.immediate.value;
				imm += repl - instr;
				imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
				if ((imm >> 31) == (imm >> 7)) {
					buf[i+0] = JMP8_INSN_OPCODE;
					buf[i+1] = (s8)imm;

					memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2);
				}
			}
			break;
		}

		if (insn_rip_relative(&insn)) {
			if (need_reloc(next + insn.displacement.value, repl, repl_len)) {
				apply_reloc(insn.displacement.nbytes,
					    buf + i + insn_offset_displacement(&insn),
					    repl - instr);
			}
		}
	}
}

void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
{
	__apply_relocation(buf, instr, instrlen, repl, repl_len);
	optimize_nops(instr, buf, instrlen);
}

/* Low-level backend functions usable from alternative code replacements. */
DEFINE_ASM_FUNC(nop_func, "", .entry.text);
EXPORT_SYMBOL_GPL(nop_func);

noinstr void BUG_func(void)
{
	BUG();
}
EXPORT_SYMBOL(BUG_func);

#define CALL_RIP_REL_OPCODE	0xff
#define CALL_RIP_REL_MODRM	0x15

/*
 * Rewrite the "call BUG_func" replacement to point to the target of the
 * indirect pv_ops call "call *disp(%ip)".
 */
static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
{
	void *target, *bug = &BUG_func;
	s32 disp;

	if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) {
		pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n");
		BUG();
	}

	if (a->instrlen != 6 ||
	    instr[0] != CALL_RIP_REL_OPCODE ||
	    instr[1] != CALL_RIP_REL_MODRM) {
		pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n");
		BUG();
	}

	/* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */
	disp = *(s32 *)(instr + 2);
#ifdef CONFIG_X86_64
	/* ff 15 00 00 00 00   call   *0x0(%rip) */
	/* target address is stored at "next instruction + disp". */
	target = *(void **)(instr + a->instrlen + disp);
#else
	/* ff 15 00 00 00 00   call   *0x0 */
	/* target address is stored at disp. */
	target = *(void **)disp;
#endif
	if (!target)
		target = bug;

	/* (BUG_func - .) + (target - BUG_func) := target - . */
	*(s32 *)(insn_buff + 1) += target - bug;

	if (target == &nop_func)
		return 0;

	return 5;
}

static inline u8 * instr_va(struct alt_instr *i)
{
	return (u8 *)&i->instr_offset + i->instr_offset;
}
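/*
 * The closing adjustment in alt_replace_call() is worth unpacking: insn_buff
 * holds "call BUG_func" whose immediate is (BUG_func - next_ip). Adding
 * (target - BUG_func) leaves (target - next_ip), i.e. a direct call to the
 * pv_ops target, without ever computing next_ip explicitly. That is what
 * the comment "(BUG_func - .) + (target - BUG_func) := target - ." states.
 */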
*/ for (a = start; a < end; a++) { int insn_buff_sz = 0; /* * In case of nested ALTERNATIVE()s the outer alternative might * add more padding. To ensure consistent patching find the max * padding for all alt_instr entries for this site (nested * alternatives result in consecutive entries). */ for (b = a+1; b < end && instr_va(b) == instr_va(a); b++) { u8 len = max(a->instrlen, b->instrlen); a->instrlen = b->instrlen = len; } instr = instr_va(a); replacement = (u8 *)&a->repl_offset + a->repl_offset; BUG_ON(a->instrlen > sizeof(insn_buff)); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); /* * Patch if either: * - feature is present * - feature not present but ALT_FLAG_NOT is set to mean, * patch if feature is *NOT* present. */ if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { memcpy(insn_buff, instr, a->instrlen); optimize_nops(instr, insn_buff, a->instrlen); text_poke_early(instr, insn_buff, a->instrlen); continue; } DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x", a->cpuid >> 5, a->cpuid & 0x1f, instr, instr, a->instrlen, replacement, a->replacementlen, a->flags); memcpy(insn_buff, replacement, a->replacementlen); insn_buff_sz = a->replacementlen; if (a->flags & ALT_FLAG_DIRECT_CALL) { insn_buff_sz = alt_replace_call(instr, insn_buff, a); if (insn_buff_sz < 0) continue; } for (; insn_buff_sz < a->instrlen; insn_buff_sz++) insn_buff[insn_buff_sz] = 0x90; apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr); text_poke_early(instr, insn_buff, insn_buff_sz); } kasan_enable_current(); } static inline bool is_jcc32(struct insn *insn) { /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80; } #if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL) /* * CALL/JMP *%\reg */ static int emit_indirect(int op, int reg, u8 *bytes) { int i = 0; u8 modrm; switch (op) { case CALL_INSN_OPCODE: modrm = 0x10; /* Reg = 2; CALL r/m */ break; case JMP32_INSN_OPCODE: modrm = 0x20; /* Reg = 4; JMP r/m */ break; default: WARN_ON_ONCE(1); return -1; } if (reg >= 8) { bytes[i++] = 0x41; /* REX.B prefix */ reg -= 8; } modrm |= 0xc0; /* Mod = 3 */ modrm += reg; bytes[i++] = 0xff; /* opcode */ bytes[i++] = modrm; return i; } static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes, void *call_dest, void *jmp_dest) { u8 op = insn->opcode.bytes[0]; int i = 0; /* * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional * tail-calls. Deal with them. 
*/ if (is_jcc32(insn)) { bytes[i++] = op; op = insn->opcode.bytes[1]; goto clang_jcc; } if (insn->length == 6) bytes[i++] = 0x2e; /* CS-prefix */ switch (op) { case CALL_INSN_OPCODE: __text_gen_insn(bytes+i, op, addr+i, call_dest, CALL_INSN_SIZE); i += CALL_INSN_SIZE; break; case JMP32_INSN_OPCODE: clang_jcc: __text_gen_insn(bytes+i, op, addr+i, jmp_dest, JMP32_INSN_SIZE); i += JMP32_INSN_SIZE; break; default: WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr); return -1; } WARN_ON_ONCE(i != insn->length); return i; } static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) { return __emit_trampoline(addr, insn, bytes, __x86_indirect_call_thunk_array[reg], __x86_indirect_jump_thunk_array[reg]); } #ifdef CONFIG_MITIGATION_ITS static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes) { u8 *thunk = __x86_indirect_its_thunk_array[reg]; u8 *tmp = its_allocate_thunk(reg); if (tmp) thunk = tmp; return __emit_trampoline(addr, insn, bytes, thunk, thunk); } /* Check if an indirect branch is at ITS-unsafe address */ static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg) { if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) return false; /* Indirect branch opcode is 2 or 3 bytes depending on reg */ addr += 1 + reg / 8; /* Lower-half of the cacheline? */ return !(addr & 0x20); } #else /* CONFIG_MITIGATION_ITS */ #ifdef CONFIG_FINEIBT static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg) { return false; } #endif #endif /* CONFIG_MITIGATION_ITS */ /* * Rewrite the compiler generated retpoline thunk calls. * * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate * indirect instructions, avoiding the extra indirection. * * For example, convert: * * CALL __x86_indirect_thunk_\reg * * into: * * CALL *%\reg * * It also tries to inline spectre_v2=retpoline,lfence when size permits. */ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) { retpoline_thunk_t *target; int reg, ret, i = 0; u8 op, cc; target = addr + insn->length + insn->immediate.value; reg = target - __x86_indirect_thunk_array; if (WARN_ON_ONCE(reg & ~0xf)) return -1; /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */ BUG_ON(reg == 4); if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) return emit_call_track_retpoline(addr, insn, reg, bytes); return -1; } op = insn->opcode.bytes[0]; /* * Convert: * * Jcc.d32 __x86_indirect_thunk_\reg * * into: * * Jncc.d8 1f * [ LFENCE ] * JMP *%\reg * [ NOP ] * 1: */ if (is_jcc32(insn)) { cc = insn->opcode.bytes[1] & 0xf; cc ^= 1; /* invert condition */ bytes[i++] = 0x70 + cc; /* Jcc.d8 */ bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */ /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */ op = JMP32_INSN_OPCODE; } /* * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE. */ if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { bytes[i++] = 0x0f; bytes[i++] = 0xae; bytes[i++] = 0xe8; /* LFENCE */ } #ifdef CONFIG_MITIGATION_ITS /* * Check if the address of last byte of emitted-indirect is in * lower-half of the cacheline. Such branches need ITS mitigation. 
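 *
 * Example (illustrative): for "call *%r11" (41 ff d3) starting at
 * ...0x38, the last byte sits at ...0x3a; bit 5 is set, i.e. the upper
 * half of the 64-byte cacheline, so inlining is safe. Starting at
 * ...0x18 the last byte lands in the lower half and the branch is
 * routed through an ITS-safe thunk instead.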
*/ if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + i, reg)) return emit_its_trampoline(addr, insn, reg, bytes); #endif ret = emit_indirect(op, reg, bytes + i); if (ret < 0) return ret; i += ret; /* * The compiler is supposed to EMIT an INT3 after every unconditional * JMP instruction due to AMD BTC. However, if the compiler is too old * or MITIGATION_SLS isn't enabled, we still need an INT3 after * indirect JMPs even on Intel. */ if (op == JMP32_INSN_OPCODE && i < insn->length) bytes[i++] = INT3_INSN_OPCODE; for (; i < insn->length;) bytes[i++] = BYTES_NOP1; return i; } /* * Generated by 'objtool --retpoline'. */ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; struct insn insn; int len, ret; u8 bytes[16]; u8 op1, op2; u8 *dest; ret = insn_decode_kernel(&insn, addr); if (WARN_ON_ONCE(ret < 0)) continue; op1 = insn.opcode.bytes[0]; op2 = insn.opcode.bytes[1]; switch (op1) { case 0x70 ... 0x7f: /* Jcc.d8 */ /* See cfi_paranoid. */ WARN_ON_ONCE(cfi_mode != CFI_FINEIBT); continue; case CALL_INSN_OPCODE: case JMP32_INSN_OPCODE: /* Check for cfi_paranoid + ITS */ dest = addr + insn.length + insn.immediate.value; if (dest[-1] == 0xea && (dest[0] & 0xf0) == 0x70) { WARN_ON_ONCE(cfi_mode != CFI_FINEIBT); continue; } break; case 0x0f: /* escape */ if (op2 >= 0x80 && op2 <= 0x8f) break; fallthrough; default: WARN_ON_ONCE(1); continue; } DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS", addr, addr, insn.length, addr + insn.length + insn.immediate.value); len = patch_retpoline(addr, &insn, bytes); if (len == insn.length) { optimize_nops(addr, bytes, len); DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr); DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); text_poke_early(addr, bytes, len); } } } #ifdef CONFIG_MITIGATION_RETHUNK bool cpu_wants_rethunk(void) { return cpu_feature_enabled(X86_FEATURE_RETHUNK); } bool cpu_wants_rethunk_at(void *addr) { if (!cpu_feature_enabled(X86_FEATURE_RETHUNK)) return false; if (x86_return_thunk != its_return_thunk) return true; return !((unsigned long)addr & 0x20); } /* * Rewrite the compiler generated return thunk tail-calls. * * For example, convert: * * JMP __x86_return_thunk * * into: * * RET */ static int patch_return(void *addr, struct insn *insn, u8 *bytes) { int i = 0; /* Patch the custom return thunks... */ if (cpu_wants_rethunk_at(addr)) { i = JMP32_INSN_SIZE; __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); } else { /* ... or patch them out if not needed. 
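 *
 * In that case a 5-byte "jmp __x86_return_thunk" site ends up as
 * "c3 cc cc cc cc": a bare RET followed by INT3 padding (illustrative
 * byte listing, assuming the usual 5-byte JMP32 site).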
*/ bytes[i++] = RET_INSN_OPCODE; } for (; i < insn->length;) bytes[i++] = INT3_INSN_OPCODE; return i; } void __init_or_module noinline apply_returns(s32 *start, s32 *end) { s32 *s; if (cpu_wants_rethunk()) static_call_force_reinit(); for (s = start; s < end; s++) { void *dest = NULL, *addr = (void *)s + *s; struct insn insn; int len, ret; u8 bytes[16]; u8 op; ret = insn_decode_kernel(&insn, addr); if (WARN_ON_ONCE(ret < 0)) continue; op = insn.opcode.bytes[0]; if (op == JMP32_INSN_OPCODE) dest = addr + insn.length + insn.immediate.value; if (__static_call_fixup(addr, op, dest) || WARN_ONCE(dest != &__x86_return_thunk, "missing return thunk: %pS-%pS: %*ph", addr, dest, 5, addr)) continue; DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS", addr, addr, insn.length, addr + insn.length + insn.immediate.value); len = patch_return(addr, &insn, bytes); if (len == insn.length) { DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr); DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr); text_poke_early(addr, bytes, len); } } } #else /* !CONFIG_MITIGATION_RETHUNK: */ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #endif /* !CONFIG_MITIGATION_RETHUNK */ #else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #endif /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ #ifdef CONFIG_X86_KERNEL_IBT __noendbr bool is_endbr(u32 *val) { u32 endbr; __get_kernel_nofault(&endbr, val, u32, Efault); return __is_endbr(endbr); Efault: return false; } #ifdef CONFIG_FINEIBT static __noendbr bool exact_endbr(u32 *val) { u32 endbr; __get_kernel_nofault(&endbr, val, u32, Efault); return endbr == gen_endbr(); Efault: return false; } #endif static void poison_cfi(void *addr); static void __init_or_module poison_endbr(void *addr) { u32 poison = gen_endbr_poison(); if (WARN_ON_ONCE(!is_endbr(addr))) return; DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr); /* * When we have IBT, the lack of ENDBR will trigger #CP */ DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr); DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr); text_poke_early(addr, &poison, 4); } /* * Generated by: objtool --ibt * * Seal the functions for indirect calls by clobbering the ENDBR instructions * and the kCFI hash value. 
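 *
 * Concretely (illustrative): the 4-byte ENDBR64 (f3 0f 1e fa) at each
 * sealed site is overwritten with a 4-byte non-ENDBR NOP, so a stray
 * indirect branch to the sealed function raises #CP instead of running.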
*/ void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; poison_endbr(addr); if (IS_ENABLED(CONFIG_FINEIBT)) poison_cfi(addr - 16); } } #else /* !CONFIG_X86_KERNEL_IBT: */ void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } #endif /* !CONFIG_X86_KERNEL_IBT */ #ifdef CONFIG_CFI_AUTO_DEFAULT # define __CFI_DEFAULT CFI_AUTO #elif defined(CONFIG_CFI_CLANG) # define __CFI_DEFAULT CFI_KCFI #else # define __CFI_DEFAULT CFI_OFF #endif enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT; #ifdef CONFIG_FINEIBT_BHI bool cfi_bhi __ro_after_init = false; #endif #ifdef CONFIG_CFI_CLANG struct bpf_insn; /* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */ extern unsigned int __bpf_prog_runX(const void *ctx, const struct bpf_insn *insn); KCFI_REFERENCE(__bpf_prog_runX); /* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */ asm ( " .pushsection .data..ro_after_init,\"aw\",@progbits \n" " .type cfi_bpf_hash,@object \n" " .globl cfi_bpf_hash \n" " .p2align 2, 0x0 \n" "cfi_bpf_hash: \n" " .long __kcfi_typeid___bpf_prog_runX \n" " .size cfi_bpf_hash, 4 \n" " .popsection \n" ); /* Must match bpf_callback_t */ extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64); KCFI_REFERENCE(__bpf_callback_fn); /* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */ asm ( " .pushsection .data..ro_after_init,\"aw\",@progbits \n" " .type cfi_bpf_subprog_hash,@object \n" " .globl cfi_bpf_subprog_hash \n" " .p2align 2, 0x0 \n" "cfi_bpf_subprog_hash: \n" " .long __kcfi_typeid___bpf_callback_fn \n" " .size cfi_bpf_subprog_hash, 4 \n" " .popsection \n" ); u32 cfi_get_func_hash(void *func) { u32 hash; func -= cfi_get_offset(); switch (cfi_mode) { case CFI_FINEIBT: func += 7; break; case CFI_KCFI: func += 1; break; default: return 0; } if (get_kernel_nofault(hash, func)) return 0; return hash; } int cfi_get_func_arity(void *func) { bhi_thunk *target; s32 disp; if (cfi_mode != CFI_FINEIBT && !cfi_bhi) return 0; if (get_kernel_nofault(disp, func - 4)) return 0; target = func + disp; return target - __bhi_args; } #endif #ifdef CONFIG_FINEIBT static bool cfi_rand __ro_after_init = true; static u32 cfi_seed __ro_after_init; /* * Re-hash the CFI hash with a boot-time seed while making sure the result is * not a valid ENDBR instruction. 
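 *
 * The avoidance loop below is a Galois-LFSR step: while either hash or
 * -hash would still decode as ENDBR, shift right and fold the dropped
 * bit back in via the 0x80200003 taps, stepping to a nearby value that
 * no longer collides.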
*/ static u32 cfi_rehash(u32 hash) { hash ^= cfi_seed; while (unlikely(__is_endbr(hash) || __is_endbr(-hash))) { bool lsb = hash & 1; hash >>= 1; if (lsb) hash ^= 0x80200003; } return hash; } static __init int cfi_parse_cmdline(char *str) { if (!str) return -EINVAL; while (str) { char *next = strchr(str, ','); if (next) { *next = 0; next++; } if (!strcmp(str, "auto")) { cfi_mode = CFI_AUTO; } else if (!strcmp(str, "off")) { cfi_mode = CFI_OFF; cfi_rand = false; } else if (!strcmp(str, "kcfi")) { cfi_mode = CFI_KCFI; } else if (!strcmp(str, "fineibt")) { cfi_mode = CFI_FINEIBT; } else if (!strcmp(str, "norand")) { cfi_rand = false; } else if (!strcmp(str, "warn")) { pr_alert("CFI mismatch non-fatal!\n"); cfi_warn = true; } else if (!strcmp(str, "paranoid")) { if (cfi_mode == CFI_FINEIBT) { cfi_paranoid = true; } else { pr_err("Ignoring paranoid; depends on fineibt.\n"); } } else if (!strcmp(str, "bhi")) { #ifdef CONFIG_FINEIBT_BHI if (cfi_mode == CFI_FINEIBT) { cfi_bhi = true; } else { pr_err("Ignoring bhi; depends on fineibt.\n"); } #else pr_err("Ignoring bhi; depends on FINEIBT_BHI=y.\n"); #endif } else { pr_err("Ignoring unknown cfi option (%s).", str); } str = next; } return 0; } early_param("cfi", cfi_parse_cmdline); /* * kCFI FineIBT * * __cfi_\func: __cfi_\func: * movl $0x12345678,%eax // 5 endbr64 // 4 * nop subl $0x12345678,%r10d // 7 * nop jne __cfi_\func+6 // 2 * nop nop3 // 3 * nop * nop * nop * nop * nop * nop * nop * nop * * * caller: caller: * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6 * addl $-15(%r11),%r10d // 4 lea -0x10(%r11),%r11 // 4 * je 1f // 2 nop4 // 4 * ud2 // 2 * 1: cs call __x86_indirect_thunk_r11 // 6 call *%r11; nop3; // 6 * */ /* * <fineibt_preamble_start>: * 0: f3 0f 1e fa endbr64 * 4: 41 81 <ea> 78 56 34 12 sub $0x12345678, %r10d * b: 75 f9 jne 6 <fineibt_preamble_start+0x6> * d: 0f 1f 00 nopl (%rax) * * Note that the JNE target is the 0xEA byte inside the SUB, this decodes as * (bad) on x86_64 and raises #UD. */ asm( ".pushsection .rodata \n" "fineibt_preamble_start: \n" " endbr64 \n" " subl $0x12345678, %r10d \n" "fineibt_preamble_bhi: \n" " jne fineibt_preamble_start+6 \n" ASM_NOP3 "fineibt_preamble_end: \n" ".popsection\n" ); extern u8 fineibt_preamble_start[]; extern u8 fineibt_preamble_bhi[]; extern u8 fineibt_preamble_end[]; #define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start) #define fineibt_preamble_bhi (fineibt_preamble_bhi - fineibt_preamble_start) #define fineibt_preamble_ud 6 #define fineibt_preamble_hash 7 /* * <fineibt_caller_start>: * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d * 6: 4d 8d 5b f0 lea -0x10(%r11), %r11 * a: 0f 1f 40 00 nopl 0x0(%rax) */ asm( ".pushsection .rodata \n" "fineibt_caller_start: \n" " movl $0x12345678, %r10d \n" " lea -0x10(%r11), %r11 \n" ASM_NOP4 "fineibt_caller_end: \n" ".popsection \n" ); extern u8 fineibt_caller_start[]; extern u8 fineibt_caller_end[]; #define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start) #define fineibt_caller_hash 2 #define fineibt_caller_jmp (fineibt_caller_size - 2) /* * Since FineIBT does hash validation on the callee side it is prone to * circumvention attacks where a 'naked' ENDBR instruction exists that * is not part of the fineibt_preamble sequence. * * Notably the x86 entry points must be ENDBR and equally cannot be * fineibt_preamble. * * The fineibt_paranoid caller sequence adds additional caller side * hash validation. This stops such circumvention attacks dead, but at the cost * of adding a load. 
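 *
 * The extra load is the "cmp -0x9(%r11), %r10d" shown below: with %r11
 * still pointing at func()+0, -0x9(%r11) addresses the hash stored at
 * offset 7 of the FineIBT preamble (func()-16+7), so the caller checks
 * the callee-side hash before transferring control.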
* * <fineibt_paranoid_start>: * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d * 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d * a: 4d 8d 5b <f0> lea -0x10(%r11), %r11 * e: 75 fd jne d <fineibt_paranoid_start+0xd> * 10: 41 ff d3 call *%r11 * 13: 90 nop * * Notably LEA does not modify flags and can be reordered with the CMP, * avoiding a dependency. Again, using a non-taken (backwards) branch * for the failure case, abusing LEA's immediate 0xf0 as LOCK prefix for the * Jcc.d8, causing #UD. */ asm( ".pushsection .rodata \n" "fineibt_paranoid_start: \n" " movl $0x12345678, %r10d \n" " cmpl -9(%r11), %r10d \n" " lea -0x10(%r11), %r11 \n" " jne fineibt_paranoid_start+0xd \n" "fineibt_paranoid_ind: \n" " call *%r11 \n" " nop \n" "fineibt_paranoid_end: \n" ".popsection \n" ); extern u8 fineibt_paranoid_start[]; extern u8 fineibt_paranoid_ind[]; extern u8 fineibt_paranoid_end[]; #define fineibt_paranoid_size (fineibt_paranoid_end - fineibt_paranoid_start) #define fineibt_paranoid_ind (fineibt_paranoid_ind - fineibt_paranoid_start) #define fineibt_paranoid_ud 0xd static u32 decode_preamble_hash(void *addr, int *reg) { u8 *p = addr; /* b8+reg 78 56 34 12 movl $0x12345678,\reg */ if (p[0] >= 0xb8 && p[0] < 0xc0) { if (reg) *reg = p[0] - 0xb8; return *(u32 *)(addr + 1); } return 0; /* invalid hash value */ } static u32 decode_caller_hash(void *addr) { u8 *p = addr; /* 41 ba 88 a9 cb ed mov $(-0x12345678),%r10d */ if (p[0] == 0x41 && p[1] == 0xba) return -*(u32 *)(addr + 2); /* e8 0c 88 a9 cb ed jmp.d8 +12 */ if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp) return -*(u32 *)(addr + 2); return 0; /* invalid hash value */ } /* .retpoline_sites */ static int cfi_disable_callers(s32 *start, s32 *end) { /* * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate * in tact for later usage. Also see decode_caller_hash() and * cfi_rewrite_callers(). */ const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp }; s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; u32 hash; addr -= fineibt_caller_size; hash = decode_caller_hash(addr); if (!hash) /* nocfi callers */ continue; text_poke_early(addr, jmp, 2); } return 0; } static int cfi_enable_callers(s32 *start, s32 *end) { /* * Re-enable kCFI, undo what cfi_disable_callers() did. */ const u8 mov[] = { 0x41, 0xba }; s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; u32 hash; addr -= fineibt_caller_size; hash = decode_caller_hash(addr); if (!hash) /* nocfi callers */ continue; text_poke_early(addr, mov, 2); } return 0; } /* .cfi_sites */ static int cfi_rand_preamble(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; u32 hash; hash = decode_preamble_hash(addr, NULL); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; hash = cfi_rehash(hash); text_poke_early(addr + 1, &hash, 4); } return 0; } static void cfi_fineibt_bhi_preamble(void *addr, int arity) { if (!arity) return; if (!cfi_warn && arity == 1) { /* * Crazy scheme to allow arity-1 inline: * * __cfi_foo: * 0: f3 0f 1e fa endbr64 * 4: 41 81 <ea> 78 56 34 12 sub 0x12345678, %r10d * b: 49 0f 45 fa cmovne %r10, %rdi * f: 75 f5 jne __cfi_foo+6 * 11: 0f 1f 00 nopl (%rax) * * Code that direct calls to foo()+0, decodes the tail end as: * * foo: * 0: f5 cmc * 1: 0f 1f 00 nopl (%rax) * * which clobbers CF, but does not affect anything ABI * wise. * * Notably, this scheme is incompatible with permissive CFI * because the CMOVcc is unconditional and RDI will have been * clobbered. 
*/ const u8 magic[9] = { 0x49, 0x0f, 0x45, 0xfa, 0x75, 0xf5, BYTES_NOP3, }; text_poke_early(addr + fineibt_preamble_bhi, magic, 9); return; } text_poke_early(addr + fineibt_preamble_bhi, text_gen_insn(CALL_INSN_OPCODE, addr + fineibt_preamble_bhi, __bhi_args[arity]), CALL_INSN_SIZE); } static int cfi_rewrite_preamble(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; int arity; u32 hash; /* * When the function doesn't start with ENDBR the compiler will * have determined there are no indirect calls to it and we * don't need no CFI either. */ if (!is_endbr(addr + 16)) continue; hash = decode_preamble_hash(addr, &arity); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); text_poke_early(addr + fineibt_preamble_hash, &hash, 4); WARN_ONCE(!IS_ENABLED(CONFIG_FINEIBT_BHI) && arity, "kCFI preamble has wrong register at: %pS %*ph\n", addr, 5, addr); if (cfi_bhi) cfi_fineibt_bhi_preamble(addr, arity); } return 0; } static void cfi_rewrite_endbr(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; if (!exact_endbr(addr + 16)) continue; poison_endbr(addr + 16); } } /* .retpoline_sites */ static int cfi_rand_callers(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; u32 hash; addr -= fineibt_caller_size; hash = decode_caller_hash(addr); if (hash) { hash = -cfi_rehash(hash); text_poke_early(addr + 2, &hash, 4); } } return 0; } static int emit_paranoid_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes) { u8 *thunk = (void *)__x86_indirect_its_thunk_array[reg] - 2; #ifdef CONFIG_MITIGATION_ITS u8 *tmp = its_allocate_thunk(reg); if (tmp) thunk = tmp; #endif return __emit_trampoline(addr, insn, bytes, thunk, thunk); } static int cfi_rewrite_callers(s32 *start, s32 *end) { s32 *s; BUG_ON(fineibt_paranoid_size != 20); for (s = start; s < end; s++) { void *addr = (void *)s + *s; struct insn insn; u8 bytes[20]; u32 hash; int ret; u8 op; addr -= fineibt_caller_size; hash = decode_caller_hash(addr); if (!hash) continue; if (!cfi_paranoid) { text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); text_poke_early(addr + fineibt_caller_hash, &hash, 4); /* rely on apply_retpolines() */ continue; } /* cfi_paranoid */ ret = insn_decode_kernel(&insn, addr + fineibt_caller_size); if (WARN_ON_ONCE(ret < 0)) continue; op = insn.opcode.bytes[0]; if (op != CALL_INSN_OPCODE && op != JMP32_INSN_OPCODE) { WARN_ON_ONCE(1); continue; } memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size); memcpy(bytes + fineibt_caller_hash, &hash, 4); if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + fineibt_paranoid_ind, 11)) { emit_paranoid_trampoline(addr + fineibt_caller_size, &insn, 11, bytes + fineibt_caller_size); } else { ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind); if (WARN_ON_ONCE(ret != 3)) continue; } text_poke_early(addr, bytes, fineibt_paranoid_size); } return 0; } static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, s32 *start_cfi, s32 *end_cfi, bool builtin) { int ret; if (WARN_ONCE(fineibt_preamble_size != 16, "FineIBT preamble wrong size: %ld", fineibt_preamble_size)) return; if (cfi_mode == CFI_AUTO) { cfi_mode = CFI_KCFI; if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) { /* * FRED has much 
saner context on exception entry and * is less easy to take advantage of. */ if (!cpu_feature_enabled(X86_FEATURE_FRED)) cfi_paranoid = true; cfi_mode = CFI_FINEIBT; } } /* * Rewrite the callers to not use the __cfi_ stubs, such that we might * rewrite them. This disables all CFI. If this succeeds but any of the * later stages fails, we're without CFI. */ ret = cfi_disable_callers(start_retpoline, end_retpoline); if (ret) goto err; if (cfi_rand) { if (builtin) { cfi_seed = get_random_u32(); cfi_bpf_hash = cfi_rehash(cfi_bpf_hash); cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash); } ret = cfi_rand_preamble(start_cfi, end_cfi); if (ret) goto err; ret = cfi_rand_callers(start_retpoline, end_retpoline); if (ret) goto err; } switch (cfi_mode) { case CFI_OFF: if (builtin) pr_info("Disabling CFI\n"); return; case CFI_KCFI: ret = cfi_enable_callers(start_retpoline, end_retpoline); if (ret) goto err; if (builtin) pr_info("Using kCFI\n"); return; case CFI_FINEIBT: /* place the FineIBT preamble at func()-16 */ ret = cfi_rewrite_preamble(start_cfi, end_cfi); if (ret) goto err; /* rewrite the callers to target func()-16 */ ret = cfi_rewrite_callers(start_retpoline, end_retpoline); if (ret) goto err; /* now that nobody targets func()+0, remove ENDBR there */ cfi_rewrite_endbr(start_cfi, end_cfi); if (builtin) { pr_info("Using %sFineIBT%s CFI\n", cfi_paranoid ? "paranoid " : "", cfi_bhi ? "+BHI" : ""); } return; default: break; } err: pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n"); } static inline void poison_hash(void *addr) { *(u32 *)addr = 0; } static void poison_cfi(void *addr) { /* * Compilers manage to be inconsistent with ENDBR vs __cfi prefixes, * some (static) functions for which they can determine the address * is never taken do not get a __cfi prefix, but *DO* get an ENDBR. * * As such, these functions will get sealed, but we need to be careful * to not unconditionally scribble the previous function. */ switch (cfi_mode) { case CFI_FINEIBT: /* * FineIBT prefix should start with an ENDBR. */ if (!is_endbr(addr)) break; /* * __cfi_\func: * osp nopl (%rax) * subl $0, %r10d * jz 1f * ud2 * 1: nop */ poison_endbr(addr); poison_hash(addr + fineibt_preamble_hash); break; case CFI_KCFI: /* * kCFI prefix should start with a valid hash. */ if (!decode_preamble_hash(addr, NULL)) break; /* * __cfi_\func: * movl $0, %eax * .skip 11, 0x90 */ poison_hash(addr + 1); break; default: break; } } /* * When regs->ip points to a 0xEA byte in the FineIBT preamble, * return true and fill out target and type. * * We check the preamble by checking for the ENDBR instruction relative to the * 0xEA instruction. */ static bool decode_fineibt_preamble(struct pt_regs *regs, unsigned long *target, u32 *type) { unsigned long addr = regs->ip - fineibt_preamble_ud; u32 hash; if (!exact_endbr((void *)addr)) return false; *target = addr + fineibt_preamble_size; __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault); *type = (u32)regs->r10 + hash; /* * Since regs->ip points to the middle of an instruction; it cannot * continue with the normal fixup. */ regs->ip = *target; return true; Efault: return false; } /* * regs->ip points to one of the UD2 in __bhi_args[]. 
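 *
 * Recovery sketch: the __bhi_args thunk was reached via the CALL placed
 * in the last 5 bytes of the FineIBT preamble, so the return address on
 * the stack is func()+0, i.e. the branch target itself; stepping back
 * fineibt_preamble_size bytes from it locates the preamble's ENDBR and
 * hash for validation.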
*/ static bool decode_fineibt_bhi(struct pt_regs *regs, unsigned long *target, u32 *type) { unsigned long addr; u32 hash; if (!cfi_bhi) return false; if (regs->ip < (unsigned long)__bhi_args || regs->ip >= (unsigned long)__bhi_args_end) return false; /* * Fetch the return address from the stack, this points to the * FineIBT preamble. Since the CALL instruction is in the 5 last * bytes of the preamble, the return address is in fact the target * address. */ __get_kernel_nofault(&addr, regs->sp, unsigned long, Efault); *target = addr; addr -= fineibt_preamble_size; if (!exact_endbr((void *)addr)) return false; __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault); *type = (u32)regs->r10 + hash; /* * The UD2 sites are constructed with a RET immediately following, * as such the non-fatal case can use the regular fixup. */ return true; Efault: return false; } static bool is_paranoid_thunk(unsigned long addr) { u32 thunk; __get_kernel_nofault(&thunk, (u32 *)addr, u32, Efault); return (thunk & 0x00FFFFFF) == 0xfd75ea; Efault: return false; } /* * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[] * sequence, or to an invalid instruction (0xea) + Jcc.d8 for cfi_paranoid + ITS * thunk. */ static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type) { unsigned long addr = regs->ip - fineibt_paranoid_ud; if (!cfi_paranoid) return false; if (is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) { *target = regs->r11 + fineibt_preamble_size; *type = regs->r10; /* * Since the trapping instruction is the exact, but LOCK prefixed, * Jcc.d8 that got us here, the normal fixup will work. */ return true; } /* * The cfi_paranoid + ITS thunk combination results in: * * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d * 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d * a: 4d 8d 5b f0 lea -0x10(%r11), %r11 * e: 2e e8 XX XX XX XX cs call __x86_indirect_paranoid_thunk_r11 * * Where the paranoid_thunk looks like: * * 1d: <ea> (bad) * __x86_indirect_paranoid_thunk_r11: * 1e: 75 fd jne 1d * __x86_indirect_its_thunk_r11: * 20: 41 ff eb jmp *%r11 * 23: cc int3 * */ if (is_paranoid_thunk(regs->ip)) { *target = regs->r11 + fineibt_preamble_size; *type = regs->r10; regs->ip = *target; return true; } return false; } bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type) { if (decode_fineibt_paranoid(regs, target, type)) return true; if (decode_fineibt_bhi(regs, target, type)) return true; return decode_fineibt_preamble(regs, target, type); } #else /* !CONFIG_FINEIBT: */ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, s32 *start_cfi, s32 *end_cfi, bool builtin) { } #ifdef CONFIG_X86_KERNEL_IBT static void poison_cfi(void *addr) { } #endif #endif /* !CONFIG_FINEIBT */ void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, s32 *start_cfi, s32 *end_cfi) { return __apply_fineibt(start_retpoline, end_retpoline, start_cfi, end_cfi, /* .builtin = */ false); } #ifdef CONFIG_SMP static void alternatives_smp_lock(const s32 *start, const s32 *end, u8 *text, u8 *text_end) { const s32 *poff; for (poff = start; poff < end; poff++) { u8 *ptr = (u8 *)poff + *poff; if (!*poff || ptr < text || ptr >= text_end) continue; /* turn DS segment override prefix into lock prefix */ if (*ptr == 0x3e) text_poke(ptr, ((unsigned char []){0xf0}), 1); } } static void alternatives_smp_unlock(const s32 *start, const s32 *end, u8 *text, u8 *text_end) { const s32 *poff; for (poff = start; poff < end; poff++) { u8 *ptr = (u8 *)poff + *poff; if (!*poff 
|| ptr < text || ptr >= text_end) continue; /* turn lock prefix into DS segment override prefix */ if (*ptr == 0xf0) text_poke(ptr, ((unsigned char []){0x3E}), 1); } } struct smp_alt_module { /* what is this ??? */ struct module *mod; char *name; /* ptrs to lock prefixes */ const s32 *locks; const s32 *locks_end; /* .text segment, needed to avoid patching init code ;) */ u8 *text; u8 *text_end; struct list_head next; }; static LIST_HEAD(smp_alt_modules); static bool uniproc_patched = false; /* protected by text_mutex */ void __init_or_module alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, void *text, void *text_end) { struct smp_alt_module *smp; mutex_lock(&text_mutex); if (!uniproc_patched) goto unlock; if (num_possible_cpus() == 1) /* Don't bother remembering, we'll never have to undo it. */ goto smp_unlock; smp = kzalloc(sizeof(*smp), GFP_KERNEL); if (NULL == smp) /* we'll run the (safe but slow) SMP code then ... */ goto unlock; smp->mod = mod; smp->name = name; smp->locks = locks; smp->locks_end = locks_end; smp->text = text; smp->text_end = text_end; DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n", smp->locks, smp->locks_end, smp->text, smp->text_end, smp->name); list_add_tail(&smp->next, &smp_alt_modules); smp_unlock: alternatives_smp_unlock(locks, locks_end, text, text_end); unlock: mutex_unlock(&text_mutex); } void __init_or_module alternatives_smp_module_del(struct module *mod) { struct smp_alt_module *item; mutex_lock(&text_mutex); list_for_each_entry(item, &smp_alt_modules, next) { if (mod != item->mod) continue; list_del(&item->next); kfree(item); break; } mutex_unlock(&text_mutex); } void alternatives_enable_smp(void) { struct smp_alt_module *mod; /* Why bother if there are no other CPUs? */ BUG_ON(num_possible_cpus() == 1); mutex_lock(&text_mutex); if (uniproc_patched) { pr_info("switching to SMP code\n"); BUG_ON(num_online_cpus() != 1); clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_lock(mod->locks, mod->locks_end, mod->text, mod->text_end); uniproc_patched = false; } mutex_unlock(&text_mutex); } /* * Return 1 if the address range is reserved for SMP-alternatives. * Must hold text_mutex. */ int alternatives_text_reserved(void *start, void *end) { struct smp_alt_module *mod; const s32 *poff; u8 *text_start = start; u8 *text_end = end; lockdep_assert_held(&text_mutex); list_for_each_entry(mod, &smp_alt_modules, next) { if (mod->text > text_end || mod->text_end < text_start) continue; for (poff = mod->locks; poff < mod->locks_end; poff++) { const u8 *ptr = (const u8 *)poff + *poff; if (text_start <= ptr && text_end > ptr) return 1; } } return 0; } #endif /* CONFIG_SMP */ /* * Self-test for the INT3 based CALL emulation code. * * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up * properly and that there is a stack gap between the INT3 frame and the * previous context. Without this gap doing a virtual PUSH on the interrupted * stack would corrupt the INT3 IRET frame. * * See entry_{32,64}.S for more details. */ /* * We define the int3_magic() function in assembly to control the calling * convention such that we can 'call' it from assembly. 
*/ extern void int3_magic(unsigned int *ptr); /* defined in asm */ asm ( " .pushsection .init.text, \"ax\", @progbits\n" " .type int3_magic, @function\n" "int3_magic:\n" ANNOTATE_NOENDBR " movl $1, (%" _ASM_ARG1 ")\n" ASM_RET " .size int3_magic, .-int3_magic\n" " .popsection\n" ); extern void int3_selftest_ip(void); /* defined in asm below */ static int __init int3_exception_notify(struct notifier_block *self, unsigned long val, void *data) { unsigned long selftest = (unsigned long)&int3_selftest_ip; struct die_args *args = data; struct pt_regs *regs = args->regs; OPTIMIZER_HIDE_VAR(selftest); if (!regs || user_mode(regs)) return NOTIFY_DONE; if (val != DIE_INT3) return NOTIFY_DONE; if (regs->ip - INT3_INSN_SIZE != selftest) return NOTIFY_DONE; int3_emulate_call(regs, (unsigned long)&int3_magic); return NOTIFY_STOP; } /* Must be noinline to ensure uniqueness of int3_selftest_ip. */ static noinline void __init int3_selftest(void) { static __initdata struct notifier_block int3_exception_nb = { .notifier_call = int3_exception_notify, .priority = INT_MAX-1, /* last */ }; unsigned int val = 0; BUG_ON(register_die_notifier(&int3_exception_nb)); /* * Basically: int3_magic(&val); but really complicated :-) * * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb * notifier above will emulate CALL for us. */ asm volatile ("int3_selftest_ip:\n\t" ANNOTATE_NOENDBR " int3; nop; nop; nop; nop\n\t" : ASM_CALL_CONSTRAINT : __ASM_SEL_RAW(a, D) (&val) : "memory"); BUG_ON(val != 1); unregister_die_notifier(&int3_exception_nb); } static __initdata int __alt_reloc_selftest_addr; extern void __init __alt_reloc_selftest(void *arg); __visible noinline void __init __alt_reloc_selftest(void *arg) { WARN_ON(arg != &__alt_reloc_selftest_addr); } static noinline void __init alt_reloc_selftest(void) { /* * Tests apply_relocation(). * * This has a relative immediate (CALL) in a place other than the first * instruction and additionally on x86_64 we get a RIP-relative LEA: * * lea 0x0(%rip),%rdi # 5d0: R_X86_64_PC32 .init.data+0x5566c * call +0 # 5d5: R_X86_64_PLT32 __alt_reloc_selftest-0x4 * * Getting this wrong will either crash and burn or tickle the WARN * above. */ asm_inline volatile ( ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS) : ASM_CALL_CONSTRAINT : [mem] "m" (__alt_reloc_selftest_addr) : _ASM_ARG1 ); } void __init alternative_instructions(void) { u64 ibt; int3_selftest(); /* * The patching is not fully atomic, so try to avoid local * interruptions that might execute the to be patched code. * Other CPUs are not running. */ stop_nmi(); /* * Don't stop machine check exceptions while patching. * MCEs only happen when something got corrupted and in this * case we must do something about the corruption. * Ignoring it is worse than an unlikely patching race. * Also machine checks tend to be broadcast and if one CPU * goes into machine check the others follow quickly, so we don't * expect a machine check to cause undue problems during to code * patching. */ /* * Make sure to set (artificial) features depending on used paravirt * functions which can later influence alternative patching. */ paravirt_set_cap(); /* Keep CET-IBT disabled until caller/callee are patched */ ibt = ibt_save(/*disable*/ true); __apply_fineibt(__retpoline_sites, __retpoline_sites_end, __cfi_sites, __cfi_sites_end, true); /* * Rewrite the retpolines, must be done before alternatives since * those can rewrite the retpoline thunks. 
*/ apply_retpolines(__retpoline_sites, __retpoline_sites_end); apply_returns(__return_sites, __return_sites_end); /* * Adjust all CALL instructions to point to func()-10, including * those in .altinstr_replacement. */ callthunks_patch_builtin_calls(); apply_alternatives(__alt_instructions, __alt_instructions_end); /* * Seal all functions that do not have their address taken. */ apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); ibt_restore(ibt); #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) { uniproc_patched = true; alternatives_smp_module_add(NULL, "core kernel", __smp_locks, __smp_locks_end, _text, _etext); } if (!uniproc_patched || num_possible_cpus() == 1) { free_init_pages("SMP alternatives", (unsigned long)__smp_locks, (unsigned long)__smp_locks_end); } #endif restart_nmi(); alternatives_patched = 1; alt_reloc_selftest(); } /** * text_poke_early - Update instructions on a live kernel at boot time * @addr: address to modify * @opcode: source of the copy * @len: length to copy * * When you use this code to patch more than one byte of an instruction * you need to make sure that other CPUs cannot execute this code in parallel. * Also no thread must be currently preempted in the middle of these * instructions. And on the local CPU you need to be protected against NMI or * MCE handlers seeing an inconsistent instruction while you patch. */ void __init_or_module text_poke_early(void *addr, const void *opcode, size_t len) { unsigned long flags; if (boot_cpu_has(X86_FEATURE_NX) && is_module_text_address((unsigned long)addr)) { /* * Modules text is marked initially as non-executable, so the * code cannot be running and speculative code-fetches are * prevented. Just change the code. */ memcpy(addr, opcode, len); } else { local_irq_save(flags); memcpy(addr, opcode, len); sync_core(); local_irq_restore(flags); /* * Could also do a CLFLUSH here to speed up CPU recovery; but * that causes hangs on some VIA CPUs. */ } } typedef struct { struct mm_struct *mm; } temp_mm_state_t; /* * Using a temporary mm allows to set temporary mappings that are not accessible * by other CPUs. Such mappings are needed to perform sensitive memory writes * that override the kernel memory protections (e.g., W^X), without exposing the * temporary page-table mappings that are required for these write operations to * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the * mapping is torn down. * * Context: The temporary mm needs to be used exclusively by a single core. To * harden security IRQs must be disabled while the temporary mm is * loaded, thereby preventing interrupt handler bugs from overriding * the kernel memory protection. */ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) { temp_mm_state_t temp_state; lockdep_assert_irqs_disabled(); /* * Make sure not to be in TLB lazy mode, as otherwise we'll end up * with a stale address space WITHOUT being in lazy mode after * restoring the previous mm. */ if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) leave_mm(); temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); switch_mm_irqs_off(NULL, mm, current); /* * If breakpoints are enabled, disable them while the temporary mm is * used. Userspace might set up watchpoints on addresses that are used * in the temporary mm, which would lead to wrong signals being sent or * crashes. 
* * Note that breakpoints are not disabled selectively, which also causes * kernel breakpoints (e.g., perf's) to be disabled. This might be * undesirable, but still seems reasonable as the code that runs in the * temporary mm should be short. */ if (hw_breakpoint_active()) hw_breakpoint_disable(); return temp_state; } __ro_after_init struct mm_struct *poking_mm; __ro_after_init unsigned long poking_addr; static inline void unuse_temporary_mm(temp_mm_state_t prev_state) { lockdep_assert_irqs_disabled(); switch_mm_irqs_off(NULL, prev_state.mm, current); /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm)); /* * Restore the breakpoints if they were disabled before the temporary mm * was loaded. */ if (hw_breakpoint_active()) hw_breakpoint_restore(); } static void text_poke_memcpy(void *dst, const void *src, size_t len) { memcpy(dst, src, len); } static void text_poke_memset(void *dst, const void *src, size_t len) { int c = *(const int *)src; memset(dst, c, len); } typedef void text_poke_f(void *dst, const void *src, size_t len); static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len) { bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; struct page *pages[2] = {NULL}; temp_mm_state_t prev; unsigned long flags; pte_t pte, *ptep; spinlock_t *ptl; pgprot_t pgprot; /* * While boot memory allocator is running we cannot use struct pages as * they are not yet initialized. There is no way to recover. */ BUG_ON(!after_bootmem); if (!core_kernel_text((unsigned long)addr)) { pages[0] = vmalloc_to_page(addr); if (cross_page_boundary) pages[1] = vmalloc_to_page(addr + PAGE_SIZE); } else { pages[0] = virt_to_page(addr); WARN_ON(!PageReserved(pages[0])); if (cross_page_boundary) pages[1] = virt_to_page(addr + PAGE_SIZE); } /* * If something went wrong, crash and burn since recovery paths are not * implemented. */ BUG_ON(!pages[0] || (cross_page_boundary && !pages[1])); /* * Map the page without the global bit, as TLB flushing is done with * flush_tlb_mm_range(), which is intended for non-global PTEs. */ pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL); /* * The lock is not really needed, but this allows to avoid open-coding. */ ptep = get_locked_pte(poking_mm, poking_addr, &ptl); /* * This must not fail; preallocated in poking_init(). */ VM_BUG_ON(!ptep); local_irq_save(flags); pte = mk_pte(pages[0], pgprot); set_pte_at(poking_mm, poking_addr, ptep, pte); if (cross_page_boundary) { pte = mk_pte(pages[1], pgprot); set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); } /* * Loading the temporary mm behaves as a compiler barrier, which * guarantees that the PTE will be set at the time memcpy() is done. */ prev = use_temporary_mm(poking_mm); kasan_disable_current(); func((u8 *)poking_addr + offset_in_page(addr), src, len); kasan_enable_current(); /* * Ensure that the PTE is only cleared after the instructions of memcpy * were issued by using a compiler barrier. */ barrier(); pte_clear(poking_mm, poking_addr, ptep); if (cross_page_boundary) pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); /* * Loading the previous page-table hierarchy requires a serializing * instruction that already allows the core to see the updated version. * Xen-PV is assumed to serialize execution in a similar manner. */ unuse_temporary_mm(prev); /* * Flushing the TLB might involve IPIs, which would require enabled * IRQs, but not if the mm is not used, as it is in this point. 
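 *
 * (unuse_temporary_mm() has already cleared this CPU from
 * mm_cpumask(poking_mm), so the flush below stays local and needs no
 * IPIs despite IRQs being disabled.)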
*/ flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + (cross_page_boundary ? 2 : 1) * PAGE_SIZE, PAGE_SHIFT, false); if (func == text_poke_memcpy) { /* * If the text does not match what we just wrote then something is * fundamentally screwy; there's nothing we can really do about that. */ BUG_ON(memcmp(addr, src, len)); } local_irq_restore(flags); pte_unmap_unlock(ptep, ptl); return addr; } /** * text_poke - Update instructions on a live kernel * @addr: address to modify * @opcode: source of the copy * @len: length to copy * * Only atomic text poke/set should be allowed when not doing early patching. * It means the size must be writable atomically and the address must be aligned * in a way that permits an atomic write. It also makes sure we fit on a single * page. * * Note that the caller must ensure that if the modified code is part of a * module, the module would not be removed during poking. This can be achieved * by registering a module notifier, and ordering module removal and patching * through a mutex. */ void *text_poke(void *addr, const void *opcode, size_t len) { lockdep_assert_held(&text_mutex); return __text_poke(text_poke_memcpy, addr, opcode, len); } /** * text_poke_kgdb - Update instructions on a live kernel by kgdb * @addr: address to modify * @opcode: source of the copy * @len: length to copy * * Only atomic text poke/set should be allowed when not doing early patching. * It means the size must be writable atomically and the address must be aligned * in a way that permits an atomic write. It also makes sure we fit on a single * page. * * Context: should only be used by kgdb, which ensures no other core is running, * despite the fact it does not hold the text_mutex. */ void *text_poke_kgdb(void *addr, const void *opcode, size_t len) { return __text_poke(text_poke_memcpy, addr, opcode, len); } void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok) { unsigned long start = (unsigned long)addr; size_t patched = 0; if (WARN_ON_ONCE(!core_ok && core_kernel_text(start))) return NULL; while (patched < len) { unsigned long ptr = start + patched; size_t s; s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s); patched += s; } return addr; } /** * text_poke_copy - Copy instructions into (an unused part of) RX memory * @addr: address to modify * @opcode: source of the copy * @len: length to copy, could be more than 2x PAGE_SIZE * * Not safe against concurrent execution; useful for JITs to dump * new code blocks into unused regions of RX memory. Can be used in * conjunction with synchronize_rcu_tasks() to wait for existing * execution to quiesce after having made sure no existing functions * pointers are live. */ void *text_poke_copy(void *addr, const void *opcode, size_t len) { mutex_lock(&text_mutex); addr = text_poke_copy_locked(addr, opcode, len, false); mutex_unlock(&text_mutex); return addr; } /** * text_poke_set - memset into (an unused part of) RX memory * @addr: address to modify * @c: the byte to fill the area with * @len: length to copy, could be more than 2x PAGE_SIZE * * This is useful to overwrite unused regions of RX memory with illegal * instructions. 
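 *
 * Typical use (illustrative): text_poke_set(addr, 0xcc, len) floods a
 * region with INT3 so that any stale jump into it traps immediately.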
*/ void *text_poke_set(void *addr, int c, size_t len) { unsigned long start = (unsigned long)addr; size_t patched = 0; if (WARN_ON_ONCE(core_kernel_text(start))) return NULL; mutex_lock(&text_mutex); while (patched < len) { unsigned long ptr = start + patched; size_t s; s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s); patched += s; } mutex_unlock(&text_mutex); return addr; } static void do_sync_core(void *info) { sync_core(); } void text_poke_sync(void) { on_each_cpu(do_sync_core, NULL, 1); } /* * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of * this thing. When len == 6 everything is prefixed with 0x0f and we map * opcode to Jcc.d8, using len to distinguish. */ struct text_poke_loc { /* addr := _stext + rel_addr */ s32 rel_addr; s32 disp; u8 len; u8 opcode; const u8 text[POKE_MAX_OPCODE_SIZE]; /* see text_poke_bp_batch() */ u8 old; }; struct bp_patching_desc { struct text_poke_loc *vec; int nr_entries; atomic_t refs; }; static struct bp_patching_desc bp_desc; static __always_inline struct bp_patching_desc *try_get_desc(void) { struct bp_patching_desc *desc = &bp_desc; if (!raw_atomic_inc_not_zero(&desc->refs)) return NULL; return desc; } static __always_inline void put_desc(void) { struct bp_patching_desc *desc = &bp_desc; smp_mb__before_atomic(); raw_atomic_dec(&desc->refs); } static __always_inline void *text_poke_addr(struct text_poke_loc *tp) { return _stext + tp->rel_addr; } static __always_inline int patch_cmp(const void *key, const void *elt) { struct text_poke_loc *tp = (struct text_poke_loc *) elt; if (key < text_poke_addr(tp)) return -1; if (key > text_poke_addr(tp)) return 1; return 0; } noinstr int poke_int3_handler(struct pt_regs *regs) { struct bp_patching_desc *desc; struct text_poke_loc *tp; int ret = 0; void *ip; if (user_mode(regs)) return 0; /* * Having observed our INT3 instruction, we now must observe * bp_desc with non-zero refcount: * * bp_desc.refs = 1 INT3 * WMB RMB * write INT3 if (bp_desc.refs != 0) */ smp_rmb(); desc = try_get_desc(); if (!desc) return 0; /* * Discount the INT3. See text_poke_bp_batch(). */ ip = (void *) regs->ip - INT3_INSN_SIZE; /* * Skip the binary search if there is a single member in the vector. */ if (unlikely(desc->nr_entries > 1)) { tp = __inline_bsearch(ip, desc->vec, desc->nr_entries, sizeof(struct text_poke_loc), patch_cmp); if (!tp) goto out_put; } else { tp = desc->vec; if (text_poke_addr(tp) != ip) goto out_put; } ip += tp->len; switch (tp->opcode) { case INT3_INSN_OPCODE: /* * Someone poked an explicit INT3, they'll want to handle it, * do not consume. */ goto out_put; case RET_INSN_OPCODE: int3_emulate_ret(regs); break; case CALL_INSN_OPCODE: int3_emulate_call(regs, (long)ip + tp->disp); break; case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: int3_emulate_jmp(regs, (long)ip + tp->disp); break; case 0x70 ... 0x7f: /* Jcc */ int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp); break; default: BUG(); } ret = 1; out_put: put_desc(); return ret; } #define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) static struct text_poke_loc tp_vec[TP_VEC_MAX]; static int tp_vec_nr; /** * text_poke_bp_batch() -- update instructions on live kernel on SMP * @tp: vector of instructions to patch * @nr_entries: number of entries in the vector * * Modify multi-byte instruction by using int3 breakpoint on SMP. * We completely avoid stop_machine() here, and achieve the * synchronization using int3 breakpoint. 
* * The way it is done: * - For each entry in the vector: * - add a int3 trap to the address that will be patched * - sync cores * - For each entry in the vector: * - update all but the first byte of the patched range * - sync cores * - For each entry in the vector: * - replace the first byte (int3) by the first byte of * replacing opcode * - sync cores */ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) { unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; int do_sync; lockdep_assert_held(&text_mutex); bp_desc.vec = tp; bp_desc.nr_entries = nr_entries; /* * Corresponds to the implicit memory barrier in try_get_desc() to * ensure reading a non-zero refcount provides up to date bp_desc data. */ atomic_set_release(&bp_desc.refs, 1); /* * Function tracing can enable thousands of places that need to be * updated. This can take quite some time, and with full kernel debugging * enabled, this could cause the softlockup watchdog to trigger. * This function gets called every 256 entries added to be patched. * Call cond_resched() here to make sure that other tasks can get scheduled * while processing all the functions being patched. */ cond_resched(); /* * Corresponding read barrier in int3 notifier for making sure the * nr_entries and handler are correctly ordered wrt. patching. */ smp_wmb(); /* * First step: add a int3 trap to the address that will be patched. */ for (i = 0; i < nr_entries; i++) { tp[i].old = *(u8 *)text_poke_addr(&tp[i]); text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); } text_poke_sync(); /* * Second step: update all but the first byte of the patched range. */ for (do_sync = 0, i = 0; i < nr_entries; i++) { u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, }; u8 _new[POKE_MAX_OPCODE_SIZE+1]; const u8 *new = tp[i].text; int len = tp[i].len; if (len - INT3_INSN_SIZE > 0) { memcpy(old + INT3_INSN_SIZE, text_poke_addr(&tp[i]) + INT3_INSN_SIZE, len - INT3_INSN_SIZE); if (len == 6) { _new[0] = 0x0f; memcpy(_new + 1, new, 5); new = _new; } text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, new + INT3_INSN_SIZE, len - INT3_INSN_SIZE); do_sync++; } /* * Emit a perf event to record the text poke, primarily to * support Intel PT decoding which must walk the executable code * to reconstruct the trace. The flow up to here is: * - write INT3 byte * - IPI-SYNC * - write instruction tail * At this point the actual control flow will be through the * INT3 and handler and not hit the old or new instruction. * Intel PT outputs FUP/TIP packets for the INT3, so the flow * can still be decoded. Subsequently: * - emit RECORD_TEXT_POKE with the new instruction * - IPI-SYNC * - write first byte * - IPI-SYNC * So before the text poke event timestamp, the decoder will see * either the old instruction flow or FUP/TIP of INT3. After the * text poke event timestamp, the decoder will see either the * new instruction flow or FUP/TIP of INT3. Thus decoders can * use the timestamp as the point at which to modify the * executable code. * The old instruction is recorded so that the event can be * processed forwards or backwards. */ perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len); } if (do_sync) { /* * According to Intel, this core syncing is very likely * not necessary and we'd be safe even without it. But * better safe than sorry (plus there's not only Intel). */ text_poke_sync(); } /* * Third step: replace the first byte (int3) by the first byte of * replacing opcode. 
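 *
 * Note the len == 6 case below: Jcc.d32 instructions are stored in
 * tp->text without their leading 0x0f escape (see text_poke_loc_init()),
 * so the final byte to restore is 0x0f rather than text[0].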
*/ for (do_sync = 0, i = 0; i < nr_entries; i++) { u8 byte = tp[i].text[0]; if (tp[i].len == 6) byte = 0x0f; if (byte == INT3_INSN_OPCODE) continue; text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE); do_sync++; } if (do_sync) text_poke_sync(); /* * Remove and wait for refs to be zero. */ if (!atomic_dec_and_test(&bp_desc.refs)) atomic_cond_read_acquire(&bp_desc.refs, !VAL); } static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, const void *opcode, size_t len, const void *emulate) { struct insn insn; int ret, i = 0; if (len == 6) i = 1; memcpy((void *)tp->text, opcode+i, len-i); if (!emulate) emulate = opcode; ret = insn_decode_kernel(&insn, emulate); BUG_ON(ret < 0); tp->rel_addr = addr - (void *)_stext; tp->len = len; tp->opcode = insn.opcode.bytes[0]; if (is_jcc32(&insn)) { /* * Map Jcc.d32 onto Jcc.d8 and use len to distinguish. */ tp->opcode = insn.opcode.bytes[1] - 0x10; } switch (tp->opcode) { case RET_INSN_OPCODE: case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: /* * Control flow instructions without implied execution of the * next instruction can be padded with INT3. */ for (i = insn.length; i < len; i++) BUG_ON(tp->text[i] != INT3_INSN_OPCODE); break; default: BUG_ON(len != insn.length); } switch (tp->opcode) { case INT3_INSN_OPCODE: case RET_INSN_OPCODE: break; case CALL_INSN_OPCODE: case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: case 0x70 ... 0x7f: /* Jcc */ tp->disp = insn.immediate.value; break; default: /* assume NOP */ switch (len) { case 2: /* NOP2 -- emulate as JMP8+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); tp->opcode = JMP8_INSN_OPCODE; tp->disp = 0; break; case 5: /* NOP5 -- emulate as JMP32+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); tp->opcode = JMP32_INSN_OPCODE; tp->disp = 0; break; default: /* unknown instruction */ BUG(); } break; } } /* * We hard rely on the tp_vec being ordered; ensure this is so by flushing * early if needed. */ static bool tp_order_fail(void *addr) { struct text_poke_loc *tp; if (!tp_vec_nr) return false; if (!addr) /* force */ return true; tp = &tp_vec[tp_vec_nr - 1]; if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) return true; return false; } static void text_poke_flush(void *addr) { if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { text_poke_bp_batch(tp_vec, tp_vec_nr); tp_vec_nr = 0; } } void text_poke_finish(void) { text_poke_flush(NULL); } void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) { struct text_poke_loc *tp; text_poke_flush(addr); tp = &tp_vec[tp_vec_nr++]; text_poke_loc_init(tp, addr, opcode, len, emulate); } /** * text_poke_bp() -- update instructions on live kernel on SMP * @addr: address to patch * @opcode: opcode of new instruction * @len: length to copy * @emulate: instruction to be emulated * * Update a single instruction with the vector in the stack, avoiding * dynamically allocated memory. This function should be used when it is * not possible to allocate memory. */ void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) { struct text_poke_loc tp; text_poke_loc_init(&tp, addr, opcode, len, emulate); text_poke_bp_batch(&tp, 1); }
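/*
 * Standalone illustration (not part of this file): the displacement
 * rework from equation (5) near the top, dst_imm = src_imm + src - dst,
 * checked by a small user-space program. All names and addresses below
 * are invented for the demo; only the arithmetic mirrors what
 * apply_relocation() does to relative immediates.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/*
	 * Pretend layout: a 5-byte CALL at src whose target lies outside
	 * both the source and destination blocks, so it needs relocation.
	 */
	uintptr_t src    = 0x1000;	/* replacement bytes assembled here */
	uintptr_t dst    = 0x4000;	/* bytes are patched in over here   */
	uintptr_t target = 0x9000;	/* call destination                 */

	int32_t src_imm = (int32_t)(target - (src + 5));	/* rel32 at src */
	int32_t dst_imm = src_imm + (int32_t)(src - dst);	/* equation (5) */

	/* The relocated immediate must still reach the same target. */
	assert((uintptr_t)(dst + 5 + dst_imm) == target);
	printf("src_imm=%d dst_imm=%d: target preserved\n",
	       (int)src_imm, (int)dst_imm);
	return 0;
}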
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2002-2005, Instant802 Networks, Inc.
 * Copyright 2005-2006, Devicescape Software, Inc.
 * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
 * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2014  Intel Mobile Communications GmbH
 * Copyright (C) 2018-2024 Intel Corporation
 *
 * Transmit and frame generation functions.
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/skbuff.h>
#include <linux/if_vlan.h>
#include <linux/etherdevice.h>
#include <linux/bitmap.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <net/net_namespace.h>
#include <net/ieee80211_radiotap.h>
#include <net/cfg80211.h>
#include <net/mac80211.h>
#include <net/codel.h>
#include <net/codel_impl.h>
#include <linux/unaligned.h>
#include <net/fq_impl.h>
#include <net/sock.h>
#include <net/gso.h>

#include "ieee80211_i.h"
#include "driver-ops.h"
#include "led.h"
#include "mesh.h"
#include "wep.h"
#include "wpa.h"
#include "wme.h"
#include "rate.h"

/* misc utils */

static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
                                 struct sk_buff *skb, int group_addr,
                                 int next_frag_len)
{
    int rate, mrate, erp, dur, i;
    struct ieee80211_rate *txrate;
    struct ieee80211_local *local = tx->local;
    struct ieee80211_supported_band *sband;
    struct ieee80211_hdr *hdr;
    struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);

    /* assume HW handles this */
    if (tx->rate.flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))
        return 0;

    /* uh huh? */
    if (WARN_ON_ONCE(tx->rate.idx < 0))
        return 0;

    sband = local->hw.wiphy->bands[info->band];
    txrate = &sband->bitrates[tx->rate.idx];

    erp = txrate->flags & IEEE80211_RATE_ERP_G;

    /* device is expected to do this */
    if (sband->band == NL80211_BAND_S1GHZ)
        return 0;

    /*
     * data and mgmt (except PS Poll):
     * - during CFP: 32768
     * - during contention period:
     *   if addr1 is group address: 0
     *   if more fragments = 0 and addr1 is individual address: time to
     *      transmit one ACK plus SIFS
     *   if more fragments = 1 and addr1 is individual address: time to
     *      transmit next fragment plus 2 x ACK plus 3 x SIFS
     *
     * IEEE 802.11, 9.6:
     * - control response frame (CTS or ACK) shall be transmitted using the
     *   same rate as the immediately previous frame in the frame exchange
     *   sequence, if this rate belongs to the PHY mandatory rates, or else
     *   at the highest possible rate belonging to the PHY rates in the
     *   BSSBasicRateSet
     */
    hdr = (struct ieee80211_hdr *)skb->data;
    if (ieee80211_is_ctl(hdr->frame_control)) {
        /* TODO: These control frames are not currently sent by
         * mac80211, but should they be implemented, this function
         * needs to be updated to support duration field calculation.
         *
         * RTS: time needed to transmit pending data/mgmt frame plus
         *	one CTS frame plus one ACK frame plus 3 x SIFS
         * CTS: duration of immediately previous RTS minus time
         *	required to transmit CTS and its SIFS
         * ACK: 0 if immediately previous directed data/mgmt had
         *	more=0, with more=1 duration in ACK frame is duration
         *	from previous frame minus time needed to transmit ACK
         *	and its SIFS
         * PS Poll: BIT(15) | BIT(14) | aid
         */
        return 0;
    }

    /* data/mgmt */
    if (0 /* FIX: data/mgmt during CFP */)
        return cpu_to_le16(32768);

    if (group_addr) /* Group address as the destination - no ACK */
        return 0;

    /* Individual destination address:
     * IEEE 802.11, Ch. 9.6 (after IEEE 802.11g changes)
     * CTS and ACK frames shall be transmitted using the highest rate in
     * basic rate set that is less than or equal to the rate of the
     * immediately previous frame and that is using the same modulation
     * (CCK or OFDM). If no basic rate set matches with these requirements,
     * the highest mandatory rate of the PHY that is less than or equal to
     * the rate of the previous frame is used.
     * Mandatory rates for IEEE 802.11g PHY: 1, 2, 5.5, 11, 6, 12, 24 Mbps
     */
    rate = -1;
    /* use lowest available if everything fails */
    mrate = sband->bitrates[0].bitrate;
    for (i = 0; i < sband->n_bitrates; i++) {
        struct ieee80211_rate *r = &sband->bitrates[i];
        u32 flag;

        if (r->bitrate > txrate->bitrate)
            break;

        if (tx->sdata->vif.bss_conf.basic_rates & BIT(i))
            rate = r->bitrate;

        switch (sband->band) {
        case NL80211_BAND_2GHZ:
        case NL80211_BAND_LC:
            if (tx->sdata->deflink.operating_11g_mode)
                flag = IEEE80211_RATE_MANDATORY_G;
            else
                flag = IEEE80211_RATE_MANDATORY_B;
            break;
        case NL80211_BAND_5GHZ:
        case NL80211_BAND_6GHZ:
            flag = IEEE80211_RATE_MANDATORY_A;
            break;
        default:
            flag = 0;
            WARN_ON(1);
            break;
        }

        if (r->flags & flag)
            mrate = r->bitrate;
    }
    if (rate == -1) {
        /* No matching basic rate found; use highest suitable mandatory
         * PHY rate */
        rate = mrate;
    }

    /* Don't calculate ACKs for QoS Frames with NoAck Policy set */
    if (ieee80211_is_data_qos(hdr->frame_control) &&
        *(ieee80211_get_qos_ctl(hdr)) & IEEE80211_QOS_CTL_ACK_POLICY_NOACK)
        dur = 0;
    else
        /* Time needed to transmit ACK
         * (10 bytes + 4-byte FCS = 112 bits) plus SIFS; rounded up
         * to closest integer */
        dur = ieee80211_frame_duration(sband->band, 10, rate, erp,
                tx->sdata->vif.bss_conf.use_short_preamble);

    if (next_frag_len) {
        /* Frame is fragmented: duration increases with time needed to
         * transmit next fragment plus ACK and 2 x SIFS. */
        dur *= 2; /* ACK + SIFS */
        /* next fragment */
        dur += ieee80211_frame_duration(sband->band, next_frag_len,
                txrate->bitrate, erp,
                tx->sdata->vif.bss_conf.use_short_preamble);
    }

    return cpu_to_le16(dur);
}
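The response-rate rule in the loop above ("highest basic rate <= TX rate,
else highest mandatory rate <= TX rate") can be distilled out of the
mac80211 data structures. A self-contained sketch, with plain arrays in
place of struct ieee80211_supported_band; every name here is hypothetical:

/*
 * Illustrative sketch only. Rates are in 100 kbps units (matching
 * struct ieee80211_rate::bitrate); "basic" and "mandatory" are bitmaps
 * over the rate-table indices, which are sorted ascending.
 */
static int example_resp_rate(const int *rates, int n, unsigned int basic,
                             unsigned int mandatory, int tx_rate)
{
    int i, rate = -1, mrate = rates[0];

    for (i = 0; i < n && rates[i] <= tx_rate; i++) {
        if (basic & (1U << i))
            rate = rates[i];    /* highest basic rate <= TX rate */
        if (mandatory & (1U << i))
            mrate = rates[i];   /* fallback: mandatory PHY rate */
    }
    return rate != -1 ? rate : mrate;
}

For an 802.11g table with a TX rate of 240 (24 Mbps) and a basic set of
{10, 20, 55, 110}, this picks 110; with an empty basic set it falls back
to the highest mandatory rate not exceeding 240.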

/* tx handlers */

static ieee80211_tx_result debug_noinline
ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx)
{
    struct ieee80211_local *local = tx->local;
    struct ieee80211_if_managed *ifmgd;
    struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);

    /* driver doesn't support power save */
    if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS))
        return TX_CONTINUE;

    /* hardware does dynamic power save */
    if (ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS))
        return TX_CONTINUE;

    /* dynamic power save disabled */
    if (local->hw.conf.dynamic_ps_timeout <= 0)
        return TX_CONTINUE;

    /* we are scanning, don't enable power save */
    if (local->scanning)
        return TX_CONTINUE;

    if (!local->ps_sdata)
        return TX_CONTINUE;

    /* No point if we're going to suspend */
    if (local->quiescing)
        return TX_CONTINUE;

    /* dynamic ps is supported only in managed mode */
    if (tx->sdata->vif.type != NL80211_IFTYPE_STATION)
        return TX_CONTINUE;

    if (unlikely(info->flags & IEEE80211_TX_INTFL_OFFCHAN_TX_OK))
        return TX_CONTINUE;

    ifmgd = &tx->sdata->u.mgd;

    /*
     * Don't wake up from power save if u-apsd is enabled, the voip AC has
     * u-apsd enabled and the frame is in the voip class. This effectively
     * means that even if all access categories have u-apsd enabled, in
     * practice u-apsd is only used with the voip AC. This is a
     * workaround for the case when received voip class packets do not
     * have a correct qos tag for some reason, due to the network or the
     * peer application.
     *
     * Note: ifmgd->uapsd_queues access is racy here. If the value is
     * changed via debugfs, the user needs to reassociate manually to have
     * everything in sync.
     */
    if ((ifmgd->flags & IEEE80211_STA_UAPSD_ENABLED) &&
        (ifmgd->uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO) &&
        skb_get_queue_mapping(tx->skb) == IEEE80211_AC_VO)
        return TX_CONTINUE;

    if (local->hw.conf.flags & IEEE80211_CONF_PS) {
        ieee80211_stop_queues_by_reason(&local->hw,
                                        IEEE80211_MAX_QUEUE_MAP,
                                        IEEE80211_QUEUE_STOP_REASON_PS,
                                        false);
        ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED;
        wiphy_work_queue(local->hw.wiphy,
                         &local->dynamic_ps_disable_work);
    }

    /* Don't restart the timer if we're not disassociated */
    if (!ifmgd->associated)
        return TX_CONTINUE;

    mod_timer(&local->dynamic_ps_timer, jiffies +
              msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout));

    return TX_CONTINUE;
}

static ieee80211_tx_result debug_noinline
ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx)
{
    struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
    struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
    bool assoc = false;

    if (unlikely(info->flags & IEEE80211_TX_CTL_INJECTED))
        return TX_CONTINUE;

    if (unlikely(test_bit(SCAN_SW_SCANNING, &tx->local->scanning)) &&
        test_bit(SDATA_STATE_OFFCHANNEL, &tx->sdata->state) &&
        !ieee80211_is_probe_req(hdr->frame_control) &&
        !ieee80211_is_any_nullfunc(hdr->frame_control))
        /*
         * When software scanning only nullfunc frames (to notify
         * the sleep state to the AP) and probe requests (for the
         * active scan) are allowed, all other frames should not be
         * sent and we should not get here, but if we do
         * nonetheless, drop them to avoid sending them
         * off-channel. See the link below and
         * ieee80211_start_scan() for more.
         *
         * http://article.gmane.org/gmane.linux.kernel.wireless.general/30089
         */
        return TX_DROP;

    if (tx->sdata->vif.type == NL80211_IFTYPE_OCB)
        return TX_CONTINUE;

    if (tx->flags & IEEE80211_TX_PS_BUFFERED)
        return TX_CONTINUE;

    if (tx->sta)
        assoc = test_sta_flag(tx->sta, WLAN_STA_ASSOC);

    if (likely(tx->flags & IEEE80211_TX_UNICAST)) {
        if (unlikely(!assoc &&
                     ieee80211_is_data(hdr->frame_control))) {
#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
            sdata_info(tx->sdata,
                       "dropped data frame to not associated station %pM\n",
                       hdr->addr1);
#endif
            I802_DEBUG_INC(tx->local->tx_handlers_drop_not_assoc);
            return TX_DROP;
        }
    } else if (unlikely(ieee80211_is_data(hdr->frame_control) &&
                        ieee80211_vif_get_num_mcast_if(tx->sdata) == 0)) {
        /*
         * No associated STAs - no need to send multicast
         * frames.
         */
        return TX_DROP;
    }

    return TX_CONTINUE;
}
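For orientation (the real dispatcher lives later in this file and is not
shown in this excerpt): handlers like the two above are run as a chain, and
the first one that does not return TX_CONTINUE ends processing. A minimal
sketch of that contract, using only handlers already defined at this point;
the example_run_early_handlers() name is made up:

/*
 * Illustrative sketch only: the handler-chain contract. TX_CONTINUE
 * passes the frame to the next handler, TX_QUEUED means the handler
 * took ownership of the skb, TX_DROP discards it.
 */
static ieee80211_tx_result
example_run_early_handlers(struct ieee80211_tx_data *tx)
{
    ieee80211_tx_result res;

    res = ieee80211_tx_h_dynamic_ps(tx);
    if (res != TX_CONTINUE)
        return res;

    res = ieee80211_tx_h_check_assoc(tx);
    if (res != TX_CONTINUE)
        return res;

    /* ...further handlers follow in the real transmit path... */
    return TX_CONTINUE;
}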

/* This function is called whenever the AP is about to exceed the maximum limit
 * of buffered frames for power saving STAs. This situation should not really
 * happen often during normal operation, so dropping the oldest buffered packet
 * from each queue should be OK to make some room for new frames. */
static void purge_old_ps_buffers(struct ieee80211_local *local)
{
    int total = 0, purged = 0;
    struct sk_buff *skb;
    struct ieee80211_sub_if_data *sdata;
    struct sta_info *sta;

    list_for_each_entry_rcu(sdata, &local->interfaces, list) {
        struct ps_data *ps;

        if (sdata->vif.type == NL80211_IFTYPE_AP)
            ps = &sdata->u.ap.ps;
        else if (ieee80211_vif_is_mesh(&sdata->vif))
            ps = &sdata->u.mesh.ps;
        else
            continue;

        skb = skb_dequeue(&ps->bc_buf);
        if (skb) {
            purged++;
            ieee80211_free_txskb(&local->hw, skb);
        }
        total += skb_queue_len(&ps->bc_buf);
    }

    /*
     * Drop one frame from each station from the lowest-priority
     * AC that has frames at all.
     */
    list_for_each_entry_rcu(sta, &local->sta_list, list) {
        int ac;

        for (ac = IEEE80211_AC_BK; ac >= IEEE80211_AC_VO; ac--) {
            skb = skb_dequeue(&sta->ps_tx_buf[ac]);
            total += skb_queue_len(&sta->ps_tx_buf[ac]);
            if (skb) {
                purged++;
                ieee80211_free_txskb(&local->hw, skb);
                break;
            }
        }
    }

    local->total_ps_buffered = total;
    ps_dbg_hw(&local->hw, "PS buffers full - purged %d frames\n", purged);
}

static ieee80211_tx_result
ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx)
{
    struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
    struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
    struct ps_data *ps;

    /*
     * broadcast/multicast frame
     *
     * If any of the associated/peer stations is in power save mode,
     * the frame is buffered to be sent after DTIM beacon frame.
     * This is done either by the hardware or us.
     */

    /* powersaving STAs currently only in AP/VLAN/mesh mode */
    if (tx->sdata->vif.type == NL80211_IFTYPE_AP ||
        tx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
        if (!tx->sdata->bss)
            return TX_CONTINUE;

        ps = &tx->sdata->bss->ps;
    } else if (ieee80211_vif_is_mesh(&tx->sdata->vif)) {
        ps = &tx->sdata->u.mesh.ps;
    } else {
        return TX_CONTINUE;
    }

    /* no buffering for ordered frames */
    if (ieee80211_has_order(hdr->frame_control))
        return TX_CONTINUE;

    if (ieee80211_is_probe_req(hdr->frame_control))
        return TX_CONTINUE;

    if (ieee80211_hw_check(&tx->local->hw, QUEUE_CONTROL))
        info->hw_queue = tx->sdata->vif.cab_queue;

    /* no stations in PS mode and no buffered packets */
    if (!atomic_read(&ps->num_sta_ps) && skb_queue_empty(&ps->bc_buf))
        return TX_CONTINUE;

    info->flags |= IEEE80211_TX_CTL_SEND_AFTER_DTIM;

    /* device releases frame after DTIM beacon */
    if (!ieee80211_hw_check(&tx->local->hw, HOST_BROADCAST_PS_BUFFERING))
        return TX_CONTINUE;

    /* buffered in mac80211 */
    if (tx->local->total_ps_buffered >= TOTAL_MAX_TX_BUFFER)
        purge_old_ps_buffers(tx->local);

    if (skb_queue_len(&ps->bc_buf) >= AP_MAX_BC_BUFFER) {
        ps_dbg(tx->sdata,
               "BC TX buffer full - dropping the oldest frame\n");
        ieee80211_free_txskb(&tx->local->hw, skb_dequeue(&ps->bc_buf));
    } else
        tx->local->total_ps_buffered++;

    skb_queue_tail(&ps->bc_buf, tx->skb);

    return TX_QUEUED;
}

static int ieee80211_use_mfp(__le16 fc, struct sta_info *sta,
                             struct sk_buff *skb)
{
    if (!ieee80211_is_mgmt(fc))
        return 0;

    if (sta == NULL || !test_sta_flag(sta, WLAN_STA_MFP))
        return 0;

    if (!ieee80211_is_robust_mgmt_frame(skb))
        return 0;

    return 1;
}

static ieee80211_tx_result
ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx)
{
    struct sta_info *sta = tx->sta;
    struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
    struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
    struct ieee80211_local *local = tx->local;

    if (unlikely(!sta))
        return TX_CONTINUE;

    if (unlikely((test_sta_flag(sta, WLAN_STA_PS_STA) ||
                  test_sta_flag(sta, WLAN_STA_PS_DRIVER) ||
                  test_sta_flag(sta, WLAN_STA_PS_DELIVER)) &&
                 !(info->flags & IEEE80211_TX_CTL_NO_PS_BUFFER))) {
        int ac = skb_get_queue_mapping(tx->skb);

        if (ieee80211_is_mgmt(hdr->frame_control) &&
            !ieee80211_is_bufferable_mmpdu(tx->skb)) {
            info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER;
            return TX_CONTINUE;
        }

        ps_dbg(sta->sdata, "STA %pM aid %d: PS buffer for AC %d\n",
               sta->sta.addr, sta->sta.aid, ac);
        if (tx->local->total_ps_buffered >= TOTAL_MAX_TX_BUFFER)
            purge_old_ps_buffers(tx->local);

        /* sync with ieee80211_sta_ps_deliver_wakeup */
        spin_lock(&sta->ps_lock);
        /*
         * STA woke up in the meantime and all the frames on ps_tx_buf
         * have been queued to the pending queue. No reordering can
         * happen, go ahead and Tx the packet.
         */
        if (!test_sta_flag(sta, WLAN_STA_PS_STA) &&
            !test_sta_flag(sta, WLAN_STA_PS_DRIVER) &&
            !test_sta_flag(sta, WLAN_STA_PS_DELIVER)) {
            spin_unlock(&sta->ps_lock);
            return TX_CONTINUE;
        }

        if (skb_queue_len(&sta->ps_tx_buf[ac]) >= STA_MAX_TX_BUFFER) {
            struct sk_buff *old = skb_dequeue(&sta->ps_tx_buf[ac]);
            ps_dbg(tx->sdata,
                   "STA %pM TX buffer for AC %d full - dropping oldest frame\n",
                   sta->sta.addr, ac);
            ieee80211_free_txskb(&local->hw, old);
        } else
            tx->local->total_ps_buffered++;

        info->control.jiffies = jiffies;
        info->control.vif = &tx->sdata->vif;
        info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING;
        info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS;
        skb_queue_tail(&sta->ps_tx_buf[ac], tx->skb);
        spin_unlock(&sta->ps_lock);

        if (!timer_pending(&local->sta_cleanup))
            mod_timer(&local->sta_cleanup,
                      round_jiffies(jiffies +
                                    STA_INFO_CLEANUP_INTERVAL));

        /*
         * We queued up some frames, so the TIM bit might
         * need to be set, recalculate it.
         */
        sta_info_recalc_tim(sta);

        return TX_QUEUED;
    } else if (unlikely(test_sta_flag(sta, WLAN_STA_PS_STA))) {
        ps_dbg(tx->sdata,
               "STA %pM in PS mode, but polling/in SP -> send frame\n",
               sta->sta.addr);
    }

    return TX_CONTINUE;
}

static ieee80211_tx_result debug_noinline
ieee80211_tx_h_ps_buf(struct ieee80211_tx_data *tx)
{
    if (unlikely(tx->flags & IEEE80211_TX_PS_BUFFERED))
        return TX_CONTINUE;

    if (tx->flags & IEEE80211_TX_UNICAST)
        return ieee80211_tx_h_unicast_ps_buf(tx);
    else
        return ieee80211_tx_h_multicast_ps_buf(tx);
}

static ieee80211_tx_result debug_noinline
ieee80211_tx_h_check_control_port_protocol(struct ieee80211_tx_data *tx)
{
    struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);

    if (unlikely(tx->sdata->control_port_protocol == tx->skb->protocol)) {
        if (tx->sdata->control_port_no_encrypt)
            info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
        info->control.flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO;
        info->flags |= IEEE80211_TX_CTL_USE_MINRATE;
    }

    return TX_CONTINUE;
}

static struct ieee80211_key *
ieee80211_select_link_key(struct ieee80211_tx_data *tx)
{
    struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
    struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
    struct ieee80211_link_data *link;
    unsigned int link_id;

    link_id = u32_get_bits(info->control.flags, IEEE80211_TX_CTRL_MLO_LINK);
    if (link_id == IEEE80211_LINK_UNSPECIFIED) {
        link = &tx->sdata->deflink;
    } else {
        link = rcu_dereference(tx->sdata->link[link_id]);
        if (!link)
            return NULL;
    }

    if (ieee80211_is_group_privacy_action(tx->skb))
        return rcu_dereference(link->default_multicast_key);
    else if (ieee80211_is_mgmt(hdr->frame_control) &&
             is_multicast_ether_addr(hdr->addr1) &&
             ieee80211_is_robust_mgmt_frame(tx->skb))
        return rcu_dereference(link->default_mgmt_key);
    else if (is_multicast_ether_addr(hdr->addr1))
        return rcu_dereference(link->default_multicast_key);

    return NULL;
}

static ieee80211_tx_result debug_noinline
ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
{
    struct ieee80211_key *key;
    struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
    struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;

    if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT)) {
        tx->key = NULL;
        return TX_CONTINUE;
    }

    if (tx->sta &&
        (key = rcu_dereference(tx->sta->ptk[tx->sta->ptk_idx])))
        tx->key = key;
    else if ((key = ieee80211_select_link_key(tx)))
        tx->key = key;
    else if (!is_multicast_ether_addr(hdr->addr1) &&
             (key = rcu_dereference(tx->sdata->default_unicast_key)))
        tx->key = key;
    else
        tx->key = NULL;

    if (tx->key) {
        bool skip_hw = false;

        /* TODO: add threshold stuff again */
        switch (tx->key->conf.cipher) {
        case WLAN_CIPHER_SUITE_WEP40:
        case WLAN_CIPHER_SUITE_WEP104:
        case WLAN_CIPHER_SUITE_TKIP:
            if (!ieee80211_is_data_present(hdr->frame_control))
                tx->key = NULL;
            break;
        case WLAN_CIPHER_SUITE_CCMP:
        case WLAN_CIPHER_SUITE_CCMP_256:
        case WLAN_CIPHER_SUITE_GCMP:
        case WLAN_CIPHER_SUITE_GCMP_256:
            if (!ieee80211_is_data_present(hdr->frame_control) &&
                !ieee80211_use_mfp(hdr->frame_control, tx->sta,
                                   tx->skb) &&
                !ieee80211_is_group_privacy_action(tx->skb))
                tx->key = NULL;
            else
                skip_hw = (tx->key->conf.flags &
                           IEEE80211_KEY_FLAG_SW_MGMT_TX) &&
                          ieee80211_is_mgmt(hdr->frame_control);
            break;
        case WLAN_CIPHER_SUITE_AES_CMAC:
        case WLAN_CIPHER_SUITE_BIP_CMAC_256:
        case WLAN_CIPHER_SUITE_BIP_GMAC_128:
        case WLAN_CIPHER_SUITE_BIP_GMAC_256:
            if (!ieee80211_is_mgmt(hdr->frame_control))
                tx->key = NULL;
            break;
        }

        if (unlikely(tx->key && tx->key->flags & KEY_FLAG_TAINTED &&
                     !ieee80211_is_deauth(hdr->frame_control)) &&
            tx->skb->protocol != tx->sdata->control_port_protocol)
            return TX_DROP;

        if (!skip_hw && tx->key &&
            tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)
            info->control.hw_key = &tx->key->conf;
    } else if (ieee80211_is_data_present(hdr->frame_control) && tx->sta &&
               test_sta_flag(tx->sta, WLAN_STA_USES_ENCRYPTION)) {
        return TX_DROP;
    }

    return TX_CONTINUE;
}

static ieee80211_tx_result debug_noinline
ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx)
{
    struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
    struct ieee80211_hdr *hdr = (void *)tx->skb->data;
    struct ieee80211_supported_band *sband;
    u32 len;
    struct ieee80211_tx_rate_control txrc;
    struct ieee80211_sta_rates *ratetbl = NULL;
    bool encap = info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP;
    bool assoc = false;

    memset(&txrc, 0, sizeof(txrc));

    sband = tx->local->hw.wiphy->bands[info->band];

    len = min_t(u32, tx->skb->len + FCS_LEN,
                tx->local->hw.wiphy->frag_threshold);

    /* set up the tx rate control struct we give the RC algo */
    txrc.hw = &tx->local->hw;
    txrc.sband = sband;
    txrc.bss_conf = &tx->sdata->vif.bss_conf;
    txrc.skb = tx->skb;
    txrc.reported_rate.idx = -1;

    if (unlikely(info->control.flags & IEEE80211_TX_CTRL_DONT_USE_RATE_MASK)) {
        txrc.rate_idx_mask = ~0;
    } else {
        txrc.rate_idx_mask = tx->sdata->rc_rateidx_mask[info->band];

        if (tx->sdata->rc_has_mcs_mask[info->band])
            txrc.rate_idx_mcs_mask =
                tx->sdata->rc_rateidx_mcs_mask[info->band];
    }

    txrc.bss = (tx->sdata->vif.type == NL80211_IFTYPE_AP ||
                tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT ||
                tx->sdata->vif.type == NL80211_IFTYPE_ADHOC ||
                tx->sdata->vif.type == NL80211_IFTYPE_OCB);

    /* set up RTS protection if desired */
    if (len > tx->local->hw.wiphy->rts_threshold) {
        txrc.rts = true;
    }

    info->control.use_rts = txrc.rts;

    info->control.use_cts_prot = tx->sdata->vif.bss_conf.use_cts_prot;

    /*
     * Use short preamble if the BSS can handle it, but not for
     * management frames unless we know the receiver can handle
     * that -- the management frame might be to a station that
     * just wants a probe response.
     */
    if (tx->sdata->vif.bss_conf.use_short_preamble &&
        (ieee80211_is_tx_data(tx->skb) ||
         (tx->sta && test_sta_flag(tx->sta, WLAN_STA_SHORT_PREAMBLE))))
        txrc.short_preamble = true;

    info->control.short_preamble = txrc.short_preamble;

    /* don't ask rate control when rate already injected via radiotap */
    if (info->control.flags & IEEE80211_TX_CTRL_RATE_INJECT)
        return TX_CONTINUE;

    if (tx->sta)
        assoc = test_sta_flag(tx->sta, WLAN_STA_ASSOC);

    /*
     * Let's not bother rate control if we're associated and cannot
     * talk to the sta. This should not happen.
     */
    if (WARN(test_bit(SCAN_SW_SCANNING, &tx->local->scanning) && assoc &&
             !rate_usable_index_exists(sband, &tx->sta->sta),
             "%s: Dropped data frame as no usable bitrate found while "
             "scanning and associated. Target station: "
             "%pM on %d GHz band\n",
             tx->sdata->name,
             encap ? ((struct ethhdr *)hdr)->h_dest : hdr->addr1,
             info->band ? 5 : 2))
        return TX_DROP;

    /*
     * If we're associated with the sta at this point we know we can at
     * least send the frame at the lowest bit rate.
     */
    rate_control_get_rate(tx->sdata, tx->sta, &txrc);

    if (tx->sta && !info->control.skip_table)
        ratetbl = rcu_dereference(tx->sta->sta.rates);

    if (unlikely(info->control.rates[0].idx < 0)) {
        if (ratetbl) {
            struct ieee80211_tx_rate rate = {
                .idx = ratetbl->rate[0].idx,
                .flags = ratetbl->rate[0].flags,
                .count = ratetbl->rate[0].count
            };

            if (ratetbl->rate[0].idx < 0)
                return TX_DROP;

            tx->rate = rate;
        } else {
            return TX_DROP;
        }
    } else {
        tx->rate = info->control.rates[0];
    }

    if (txrc.reported_rate.idx < 0) {
        txrc.reported_rate = tx->rate;
        if (tx->sta && ieee80211_is_tx_data(tx->skb))
            tx->sta->deflink.tx_stats.last_rate = txrc.reported_rate;
    } else if (tx->sta)
        tx->sta->deflink.tx_stats.last_rate = txrc.reported_rate;

    if (ratetbl)
        return TX_CONTINUE;

    if (unlikely(!info->control.rates[0].count))
        info->control.rates[0].count = 1;

    if (WARN_ON_ONCE((info->control.rates[0].count > 1) &&
                     (info->flags & IEEE80211_TX_CTL_NO_ACK)))
        info->control.rates[0].count = 1;

    return TX_CONTINUE;
}

static __le16 ieee80211_tx_next_seq(struct sta_info *sta, int tid)
{
    u16 *seq = &sta->tid_seq[tid];
    __le16 ret = cpu_to_le16(*seq);

    /* Increase the sequence number. */
    *seq = (*seq + 0x10) & IEEE80211_SCTL_SEQ;

    return ret;
}
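The 0x10 step above exists because the low four bits of seq_ctrl carry the
fragment number (IEEE80211_SCTL_FRAG) and the upper twelve the sequence
number (IEEE80211_SCTL_SEQ == 0xfff0). A small standalone illustration of
the wrap behaviour; the function name is made up:

/*
 * Illustrative sketch only: stepping a seq_ctrl counter by one frame.
 * 0x0000 -> 0x0010 -> ... -> 0xfff0 -> 0x0000, i.e. a 12-bit counter
 * that never disturbs the fragment-number bits.
 */
static u16 example_next_seq_ctrl(u16 seq)
{
    return (seq + 0x10) & IEEE80211_SCTL_SEQ;
}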

static ieee80211_tx_result debug_noinline
ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
{
    struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
    struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
    int tid;

    /*
     * Packet injection may want to control the sequence
     * number, if we have no matching interface then we
     * neither assign one ourselves nor ask the driver to.
     */
    if (unlikely(info->control.vif->type == NL80211_IFTYPE_MONITOR))
        return TX_CONTINUE;

    if (unlikely(ieee80211_is_ctl(hdr->frame_control)))
        return TX_CONTINUE;

    if (ieee80211_hdrlen(hdr->frame_control) < 24)
        return TX_CONTINUE;

    if (ieee80211_is_qos_nullfunc(hdr->frame_control))
        return TX_CONTINUE;

    if (info->control.flags & IEEE80211_TX_CTRL_NO_SEQNO)
        return TX_CONTINUE;

    /* SNS11 from 802.11be 10.3.2.14 */
    if (unlikely(is_multicast_ether_addr(hdr->addr1) &&
                 ieee80211_vif_is_mld(info->control.vif) &&
                 info->control.vif->type == NL80211_IFTYPE_AP)) {
        if (info->control.flags & IEEE80211_TX_CTRL_MCAST_MLO_FIRST_TX)
            tx->sdata->mld_mcast_seq += 0x10;
        hdr->seq_ctrl = cpu_to_le16(tx->sdata->mld_mcast_seq);
        return TX_CONTINUE;
    }

    /*
     * Anything but QoS data that has a sequence number field
     * (is long enough) gets a sequence number from the global
     * counter. QoS data frames with a multicast destination
     * also use the global counter (802.11-2012 9.3.2.10).
     */
    if (!ieee80211_is_data_qos(hdr->frame_control) ||
        is_multicast_ether_addr(hdr->addr1)) {
        /* driver should assign sequence number */
        info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
        /* for pure STA mode without beacons, we can do it */
        hdr->seq_ctrl = cpu_to_le16(tx->sdata->sequence_number);
        tx->sdata->sequence_number += 0x10;
        if (tx->sta)
            tx->sta->deflink.tx_stats.msdu[IEEE80211_NUM_TIDS]++;
        return TX_CONTINUE;
    }

    /*
     * This should be true for injected/management frames only, for
     * management frames we have set the IEEE80211_TX_CTL_ASSIGN_SEQ
     * above since they are not QoS-data frames.
     */
    if (!tx->sta)
        return TX_CONTINUE;

    /* include per-STA, per-TID sequence counter */
    tid = ieee80211_get_tid(hdr);
    tx->sta->deflink.tx_stats.msdu[tid]++;

    hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);

    return TX_CONTINUE;
}

static int ieee80211_fragment(struct ieee80211_tx_data *tx,
                              struct sk_buff *skb, int hdrlen,
                              int frag_threshold)
{
    struct ieee80211_local *local = tx->local;
    struct ieee80211_tx_info *info;
    struct sk_buff *tmp;
    int per_fragm = frag_threshold - hdrlen - FCS_LEN;
    int pos = hdrlen + per_fragm;
    int rem = skb->len - hdrlen - per_fragm;

    if (WARN_ON(rem < 0))
        return -EINVAL;

    /* first fragment was already added to queue by caller */

    while (rem) {
        int fraglen = per_fragm;

        if (fraglen > rem)
            fraglen = rem;
        rem -= fraglen;
        tmp = dev_alloc_skb(local->tx_headroom +
                            frag_threshold +
                            IEEE80211_ENCRYPT_HEADROOM +
                            IEEE80211_ENCRYPT_TAILROOM);
        if (!tmp)
            return -ENOMEM;

        __skb_queue_tail(&tx->skbs, tmp);

        skb_reserve(tmp,
                    local->tx_headroom + IEEE80211_ENCRYPT_HEADROOM);

        /* copy control information */
        memcpy(tmp->cb, skb->cb, sizeof(tmp->cb));

        info = IEEE80211_SKB_CB(tmp);
        info->flags &= ~(IEEE80211_TX_CTL_CLEAR_PS_FILT |
                         IEEE80211_TX_CTL_FIRST_FRAGMENT);

        if (rem)
            info->flags |= IEEE80211_TX_CTL_MORE_FRAMES;

        skb_copy_queue_mapping(tmp, skb);
        tmp->priority = skb->priority;
        tmp->dev = skb->dev;

        /* copy header and data */
        skb_put_data(tmp, skb->data, hdrlen);
        skb_put_data(tmp, skb->data + pos, fraglen);

        pos += fraglen;
    }

    /* adjust first fragment's length */
    skb_trim(skb, hdrlen + per_fragm);
    return 0;
}
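A worked example of the arithmetic above, with made-up numbers: for
frag_threshold = 256 and a 24-byte header, per_fragm = 256 - 24 - 4 (FCS)
= 228 bytes of payload per fragment. A frame with skb->len = 700 then
leaves rem = 700 - 24 - 228 = 448, which the loop splits into further
fragments of 228 and 220 payload bytes -- three fragments in total, the
first being the original skb trimmed to hdrlen + per_fragm = 252 bytes.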
*/ if (ieee80211_fragment(tx, skb, hdrlen, frag_threshold)) return TX_DROP; /* update duration/seq/flags of fragments */ fragnum = 0; skb_queue_walk(&tx->skbs, skb) { const __le16 morefrags = cpu_to_le16(IEEE80211_FCTL_MOREFRAGS); hdr = (void *)skb->data; info = IEEE80211_SKB_CB(skb); if (!skb_queue_is_last(&tx->skbs, skb)) { hdr->frame_control |= morefrags; /* * No multi-rate retries for fragmented frames, that * would completely throw off the NAV at other STAs. */ info->control.rates[1].idx = -1; info->control.rates[2].idx = -1; info->control.rates[3].idx = -1; BUILD_BUG_ON(IEEE80211_TX_MAX_RATES != 4); info->flags &= ~IEEE80211_TX_CTL_RATE_CTRL_PROBE; } else { hdr->frame_control &= ~morefrags; } hdr->seq_ctrl |= cpu_to_le16(fragnum & IEEE80211_SCTL_FRAG); fragnum++; } return TX_CONTINUE; } static ieee80211_tx_result debug_noinline ieee80211_tx_h_stats(struct ieee80211_tx_data *tx) { struct sk_buff *skb; int ac = -1; if (!tx->sta) return TX_CONTINUE; skb_queue_walk(&tx->skbs, skb) { ac = skb_get_queue_mapping(skb); tx->sta->deflink.tx_stats.bytes[ac] += skb->len; } if (ac >= 0) tx->sta->deflink.tx_stats.packets[ac]++; return TX_CONTINUE; } static ieee80211_tx_result debug_noinline ieee80211_tx_h_encrypt(struct ieee80211_tx_data *tx) { if (!tx->key) return TX_CONTINUE; switch (tx->key->conf.cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: return ieee80211_crypto_wep_encrypt(tx); case WLAN_CIPHER_SUITE_TKIP: return ieee80211_crypto_tkip_encrypt(tx); case WLAN_CIPHER_SUITE_CCMP: return ieee80211_crypto_ccmp_encrypt( tx, IEEE80211_CCMP_MIC_LEN); case WLAN_CIPHER_SUITE_CCMP_256: return ieee80211_crypto_ccmp_encrypt( tx, IEEE80211_CCMP_256_MIC_LEN); case WLAN_CIPHER_SUITE_AES_CMAC: return ieee80211_crypto_aes_cmac_encrypt(tx); case WLAN_CIPHER_SUITE_BIP_CMAC_256: return ieee80211_crypto_aes_cmac_256_encrypt(tx); case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: return ieee80211_crypto_aes_gmac_encrypt(tx); case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: return ieee80211_crypto_gcmp_encrypt(tx); } return TX_DROP; } static ieee80211_tx_result debug_noinline ieee80211_tx_h_calculate_duration(struct ieee80211_tx_data *tx) { struct sk_buff *skb; struct ieee80211_hdr *hdr; int next_len; bool group_addr; skb_queue_walk(&tx->skbs, skb) { hdr = (void *) skb->data; if (unlikely(ieee80211_is_pspoll(hdr->frame_control))) break; /* must not overwrite AID */ if (!skb_queue_is_last(&tx->skbs, skb)) { struct sk_buff *next = skb_queue_next(&tx->skbs, skb); next_len = next->len; } else next_len = 0; group_addr = is_multicast_ether_addr(hdr->addr1); hdr->duration_id = ieee80211_duration(tx, skb, group_addr, next_len); } return TX_CONTINUE; } /* actual transmit path */ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx, struct sk_buff *skb, struct ieee80211_tx_info *info, struct tid_ampdu_tx *tid_tx, int tid) { bool queued = false; bool reset_agg_timer = false; struct sk_buff *purge_skb = NULL; if (test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) { reset_agg_timer = true; } else if (test_bit(HT_AGG_STATE_WANT_START, &tid_tx->state)) { /* * nothing -- this aggregation session is being started * but that might still fail with the driver */ } else if (!tx->sta->sta.txq[tid]) { spin_lock(&tx->sta->lock); /* * Need to re-check now, because we may get here * * 1) in the window during which the setup is actually * already done, but not marked yet because not all * packets are spliced over to the driver pending * queue yet -- if this happened we 
acquire the lock * either before or after the splice happens, but * need to recheck which of these cases happened. * * 2) during session teardown, if the OPERATIONAL bit * was cleared due to the teardown but the pointer * hasn't been assigned NULL yet (or we loaded it * before it was assigned) -- in this case it may * now be NULL which means we should just let the * packet pass through because splicing the frames * back is already done. */ tid_tx = rcu_dereference_protected_tid_tx(tx->sta, tid); if (!tid_tx) { /* do nothing, let packet pass through */ } else if (test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) { reset_agg_timer = true; } else { queued = true; if (info->flags & IEEE80211_TX_CTL_NO_PS_BUFFER) { clear_sta_flag(tx->sta, WLAN_STA_SP); ps_dbg(tx->sta->sdata, "STA %pM aid %d: SP frame queued, close the SP w/o telling the peer\n", tx->sta->sta.addr, tx->sta->sta.aid); } info->control.vif = &tx->sdata->vif; info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; __skb_queue_tail(&tid_tx->pending, skb); if (skb_queue_len(&tid_tx->pending) > STA_MAX_TX_BUFFER) purge_skb = __skb_dequeue(&tid_tx->pending); } spin_unlock(&tx->sta->lock); if (purge_skb) ieee80211_free_txskb(&tx->local->hw, purge_skb); } /* reset session timer */ if (reset_agg_timer) tid_tx->last_tx = jiffies; return queued; } void ieee80211_aggr_check(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { struct rate_control_ref *ref = sdata->local->rate_ctrl; u16 tid; if (!ref || !(ref->ops->capa & RATE_CTRL_CAPA_AMPDU_TRIGGER)) return; if (!sta || (!sta->sta.valid_links && !sta->sta.deflink.ht_cap.ht_supported) || !sta->sta.wme || skb_get_queue_mapping(skb) == IEEE80211_AC_VO || skb->protocol == sdata->control_port_protocol) return; tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; if (likely(sta->ampdu_mlme.tid_tx[tid])) return; ieee80211_start_tx_ba_session(&sta->sta, tid, 0); } /* * initialises @tx * pass %NULL for the station if unknown, a valid pointer if known * or an ERR_PTR() if the station is known not to exist */ static ieee80211_tx_result ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_data *tx, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; struct ieee80211_hdr *hdr; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); bool aggr_check = false; int tid; memset(tx, 0, sizeof(*tx)); tx->skb = skb; tx->local = local; tx->sdata = sdata; __skb_queue_head_init(&tx->skbs); /* * If this flag is set to true anywhere, and we get here, * we are doing the needed processing, so remove the flag * now. 
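	 * (ieee80211_tx_prep_agg() above sets the flag again when it parks a
	 * frame on the tid_tx pending queue, so the frame is fully processed
	 * once those queues are spliced back out.)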
*/ info->control.flags &= ~IEEE80211_TX_INTCFL_NEED_TXPROCESSING; hdr = (struct ieee80211_hdr *) skb->data; if (likely(sta)) { if (!IS_ERR(sta)) tx->sta = sta; } else { if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { tx->sta = rcu_dereference(sdata->u.vlan.sta); if (!tx->sta && sdata->wdev.use_4addr) return TX_DROP; } else if (tx->sdata->control_port_protocol == tx->skb->protocol) { tx->sta = sta_info_get_bss(sdata, hdr->addr1); } if (!tx->sta && !is_multicast_ether_addr(hdr->addr1)) { tx->sta = sta_info_get(sdata, hdr->addr1); aggr_check = true; } } if (tx->sta && ieee80211_is_data_qos(hdr->frame_control) && !ieee80211_is_qos_nullfunc(hdr->frame_control) && ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION) && !ieee80211_hw_check(&local->hw, TX_AMPDU_SETUP_IN_HW)) { struct tid_ampdu_tx *tid_tx; tid = ieee80211_get_tid(hdr); tid_tx = rcu_dereference(tx->sta->ampdu_mlme.tid_tx[tid]); if (!tid_tx && aggr_check) { ieee80211_aggr_check(sdata, tx->sta, skb); tid_tx = rcu_dereference(tx->sta->ampdu_mlme.tid_tx[tid]); } if (tid_tx) { bool queued; queued = ieee80211_tx_prep_agg(tx, skb, info, tid_tx, tid); if (unlikely(queued)) return TX_QUEUED; } } if (is_multicast_ether_addr(hdr->addr1)) { tx->flags &= ~IEEE80211_TX_UNICAST; info->flags |= IEEE80211_TX_CTL_NO_ACK; } else tx->flags |= IEEE80211_TX_UNICAST; if (!(info->flags & IEEE80211_TX_CTL_DONTFRAG)) { if (!(tx->flags & IEEE80211_TX_UNICAST) || skb->len + FCS_LEN <= local->hw.wiphy->frag_threshold || info->flags & IEEE80211_TX_CTL_AMPDU) info->flags |= IEEE80211_TX_CTL_DONTFRAG; } if (!tx->sta) info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT; else if (test_and_clear_sta_flag(tx->sta, WLAN_STA_CLEAR_PS_FILT)) { info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT; ieee80211_check_fast_xmit(tx->sta); } info->flags |= IEEE80211_TX_CTL_FIRST_FRAGMENT; return TX_CONTINUE; } static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, struct ieee80211_vif *vif, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_txq *txq = NULL; if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) || (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) return NULL; if (!(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) && unlikely(!ieee80211_is_data_present(hdr->frame_control))) { if ((!ieee80211_is_mgmt(hdr->frame_control) || ieee80211_is_bufferable_mmpdu(skb) || vif->type == NL80211_IFTYPE_STATION) && sta && sta->uploaded) { /* * This will be NULL if the driver didn't set the * opt-in hardware flag. 
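	 * (sta->sta.txq[IEEE80211_NUM_TIDS] is the extra slot past the
	 * per-TID queues; ieee80211_txq_init() below only wires it up when
	 * the driver opts in via STA_MMPDU_TXQ or BUFF_MMPDU_TXQ.)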
*/ txq = sta->sta.txq[IEEE80211_NUM_TIDS]; } } else if (sta) { u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; if (!sta->uploaded) return NULL; txq = sta->sta.txq[tid]; } else { txq = vif->txq; } if (!txq) return NULL; return to_txq_info(txq); } static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb) { struct sk_buff *next; codel_time_t now = codel_get_time(); skb_list_walk_safe(skb, skb, next) IEEE80211_SKB_CB(skb)->control.enqueue_time = now; } static u32 codel_skb_len_func(const struct sk_buff *skb) { return skb->len; } static codel_time_t codel_skb_time_func(const struct sk_buff *skb) { const struct ieee80211_tx_info *info; info = (const struct ieee80211_tx_info *)skb->cb; return info->control.enqueue_time; } static struct sk_buff *codel_dequeue_func(struct codel_vars *cvars, void *ctx) { struct ieee80211_local *local; struct txq_info *txqi; struct fq *fq; struct fq_flow *flow; txqi = ctx; local = vif_to_sdata(txqi->txq.vif)->local; fq = &local->fq; if (cvars == &txqi->def_cvars) flow = &txqi->tin.default_flow; else flow = &fq->flows[cvars - local->cvars]; return fq_flow_dequeue(fq, flow); } static void codel_drop_func(struct sk_buff *skb, void *ctx) { struct ieee80211_local *local; struct ieee80211_hw *hw; struct txq_info *txqi; txqi = ctx; local = vif_to_sdata(txqi->txq.vif)->local; hw = &local->hw; ieee80211_free_txskb(hw, skb); } static struct sk_buff *fq_tin_dequeue_func(struct fq *fq, struct fq_tin *tin, struct fq_flow *flow) { struct ieee80211_local *local; struct txq_info *txqi; struct codel_vars *cvars; struct codel_params *cparams; struct codel_stats *cstats; local = container_of(fq, struct ieee80211_local, fq); txqi = container_of(tin, struct txq_info, tin); cparams = &local->cparams; cstats = &txqi->cstats; if (flow == &tin->default_flow) cvars = &txqi->def_cvars; else cvars = &local->cvars[flow - fq->flows]; return codel_dequeue(txqi, &flow->backlog, cparams, cvars, cstats, codel_skb_len_func, codel_skb_time_func, codel_drop_func, codel_dequeue_func); } static void fq_skb_free_func(struct fq *fq, struct fq_tin *tin, struct fq_flow *flow, struct sk_buff *skb) { struct ieee80211_local *local; local = container_of(fq, struct ieee80211_local, fq); ieee80211_free_txskb(&local->hw, skb); } static void ieee80211_txq_enqueue(struct ieee80211_local *local, struct txq_info *txqi, struct sk_buff *skb) { struct fq *fq = &local->fq; struct fq_tin *tin = &txqi->tin; u32 flow_idx = fq_flow_idx(fq, skb); ieee80211_set_skb_enqueue_time(skb); spin_lock_bh(&fq->lock); /* * For management frames, don't really apply codel etc., * we don't want to apply any shaping or anything we just * want to simplify the driver API by having them on the * txqi. 
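	 * (Such frames are queued on txqi->frags below and bypass
	 * fq_tin_enqueue() entirely, so neither codel nor the fq hashing
	 * ever sees them.)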
*/ if (unlikely(txqi->txq.tid == IEEE80211_NUM_TIDS)) { IEEE80211_SKB_CB(skb)->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; __skb_queue_tail(&txqi->frags, skb); } else { fq_tin_enqueue(fq, tin, flow_idx, skb, fq_skb_free_func); } spin_unlock_bh(&fq->lock); } static bool fq_vlan_filter_func(struct fq *fq, struct fq_tin *tin, struct fq_flow *flow, struct sk_buff *skb, void *data) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); return info->control.vif == data; } void ieee80211_txq_remove_vlan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { struct fq *fq = &local->fq; struct txq_info *txqi; struct fq_tin *tin; struct ieee80211_sub_if_data *ap; if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_AP_VLAN)) return; ap = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); if (!ap->vif.txq) return; txqi = to_txq_info(ap->vif.txq); tin = &txqi->tin; spin_lock_bh(&fq->lock); fq_tin_filter(fq, tin, fq_vlan_filter_func, &sdata->vif, fq_skb_free_func); spin_unlock_bh(&fq->lock); } void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct txq_info *txqi, int tid) { fq_tin_init(&txqi->tin); codel_vars_init(&txqi->def_cvars); codel_stats_init(&txqi->cstats); __skb_queue_head_init(&txqi->frags); INIT_LIST_HEAD(&txqi->schedule_order); txqi->txq.vif = &sdata->vif; if (!sta) { sdata->vif.txq = &txqi->txq; txqi->txq.tid = 0; txqi->txq.ac = IEEE80211_AC_BE; return; } if (tid == IEEE80211_NUM_TIDS) { if (sdata->vif.type == NL80211_IFTYPE_STATION) { /* Drivers need to opt in to the management MPDU TXQ */ if (!ieee80211_hw_check(&sdata->local->hw, STA_MMPDU_TXQ)) return; } else if (!ieee80211_hw_check(&sdata->local->hw, BUFF_MMPDU_TXQ)) { /* Drivers need to opt in to the bufferable MMPDU TXQ */ return; } txqi->txq.ac = IEEE80211_AC_VO; } else { txqi->txq.ac = ieee80211_ac_from_tid(tid); } txqi->txq.sta = &sta->sta; txqi->txq.tid = tid; sta->sta.txq[tid] = &txqi->txq; } void ieee80211_txq_purge(struct ieee80211_local *local, struct txq_info *txqi) { struct fq *fq = &local->fq; struct fq_tin *tin = &txqi->tin; spin_lock_bh(&fq->lock); fq_tin_reset(fq, tin, fq_skb_free_func); ieee80211_purge_tx_queue(&local->hw, &txqi->frags); spin_unlock_bh(&fq->lock); spin_lock_bh(&local->active_txq_lock[txqi->txq.ac]); list_del_init(&txqi->schedule_order); spin_unlock_bh(&local->active_txq_lock[txqi->txq.ac]); } void ieee80211_txq_set_params(struct ieee80211_local *local) { if (local->hw.wiphy->txq_limit) local->fq.limit = local->hw.wiphy->txq_limit; else local->hw.wiphy->txq_limit = local->fq.limit; if (local->hw.wiphy->txq_memory_limit) local->fq.memory_limit = local->hw.wiphy->txq_memory_limit; else local->hw.wiphy->txq_memory_limit = local->fq.memory_limit; if (local->hw.wiphy->txq_quantum) local->fq.quantum = local->hw.wiphy->txq_quantum; else local->hw.wiphy->txq_quantum = local->fq.quantum; } int ieee80211_txq_setup_flows(struct ieee80211_local *local) { struct fq *fq = &local->fq; int ret; int i; bool supp_vht = false; enum nl80211_band band; ret = fq_init(fq, 4096); if (ret) return ret; /* * If the hardware doesn't support VHT, it is safe to limit the maximum * queue size. 4 Mbytes is 64 max-size aggregates in 802.11n. 
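	 * (A maximum-size 802.11n A-MPDU is 65535 bytes, so 64 of them is
	 * roughly the 4 MB cap applied below when no supported band has VHT.)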
*/ for (band = 0; band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[band]; if (!sband) continue; supp_vht = supp_vht || sband->vht_cap.vht_supported; } if (!supp_vht) fq->memory_limit = 4 << 20; /* 4 Mbytes */ codel_params_init(&local->cparams); local->cparams.interval = MS2TIME(100); local->cparams.target = MS2TIME(20); local->cparams.ecn = true; local->cvars = kvcalloc(fq->flows_cnt, sizeof(local->cvars[0]), GFP_KERNEL); if (!local->cvars) { spin_lock_bh(&fq->lock); fq_reset(fq, fq_skb_free_func); spin_unlock_bh(&fq->lock); return -ENOMEM; } for (i = 0; i < fq->flows_cnt; i++) codel_vars_init(&local->cvars[i]); ieee80211_txq_set_params(local); return 0; } void ieee80211_txq_teardown_flows(struct ieee80211_local *local) { struct fq *fq = &local->fq; kvfree(local->cvars); local->cvars = NULL; spin_lock_bh(&fq->lock); fq_reset(fq, fq_skb_free_func); spin_unlock_bh(&fq->lock); } static bool ieee80211_queue_skb(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_vif *vif; struct txq_info *txqi; if (sdata->vif.type == NL80211_IFTYPE_MONITOR) return false; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); vif = &sdata->vif; txqi = ieee80211_get_txq(local, vif, sta, skb); if (!txqi) return false; ieee80211_txq_enqueue(local, txqi, skb); schedule_and_wake_txq(local, txqi); return true; } static bool ieee80211_tx_frags(struct ieee80211_local *local, struct ieee80211_vif *vif, struct sta_info *sta, struct sk_buff_head *skbs, bool txpending) { struct ieee80211_tx_control control = {}; struct sk_buff *skb, *tmp; unsigned long flags; skb_queue_walk_safe(skbs, skb, tmp) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int q = info->hw_queue; #ifdef CONFIG_MAC80211_VERBOSE_DEBUG if (WARN_ON_ONCE(q >= local->hw.queues)) { __skb_unlink(skb, skbs); ieee80211_free_txskb(&local->hw, skb); continue; } #endif spin_lock_irqsave(&local->queue_stop_reason_lock, flags); if (local->queue_stop_reasons[q] || (!txpending && !skb_queue_empty(&local->pending[q]))) { if (unlikely(info->flags & IEEE80211_TX_INTFL_OFFCHAN_TX_OK)) { if (local->queue_stop_reasons[q] & ~BIT(IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL)) { /* * Drop off-channel frames if queues * are stopped for any reason other * than off-channel operation. Never * queue them. */ spin_unlock_irqrestore( &local->queue_stop_reason_lock, flags); ieee80211_purge_tx_queue(&local->hw, skbs); return true; } } else { /* * Since queue is stopped, queue up frames for * later transmission from the tx-pending * tasklet when the queue is woken again. */ if (txpending) skb_queue_splice_init(skbs, &local->pending[q]); else skb_queue_splice_tail_init(skbs, &local->pending[q]); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); return false; } } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); info->control.vif = vif; control.sta = sta ? &sta->sta : NULL; __skb_unlink(skb, skbs); drv_tx(local, &control, skb); } return true; } /* * Returns false if the frame couldn't be transmitted but was queued instead. 
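 * (The queued case happens in ieee80211_tx_frags() above when a hardware
 * queue is stopped: the skbs are spliced onto local->pending[q] and sent
 * again from the tx-pending tasklet once the queue wakes.)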
*/ static bool __ieee80211_tx(struct ieee80211_local *local, struct sk_buff_head *skbs, struct sta_info *sta, bool txpending) { struct ieee80211_tx_info *info; struct ieee80211_sub_if_data *sdata; struct ieee80211_vif *vif; struct sk_buff *skb; bool result; if (WARN_ON(skb_queue_empty(skbs))) return true; skb = skb_peek(skbs); info = IEEE80211_SKB_CB(skb); sdata = vif_to_sdata(info->control.vif); if (sta && !sta->uploaded) sta = NULL; switch (sdata->vif.type) { case NL80211_IFTYPE_MONITOR: if ((sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) || ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { vif = &sdata->vif; break; } sdata = rcu_dereference(local->monitor_sdata); if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { vif = &sdata->vif; info->hw_queue = vif->hw_queue[skb_get_queue_mapping(skb)]; } else if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { ieee80211_purge_tx_queue(&local->hw, skbs); return true; } else vif = NULL; break; case NL80211_IFTYPE_AP_VLAN: sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); fallthrough; default: vif = &sdata->vif; break; } result = ieee80211_tx_frags(local, vif, sta, skbs, txpending); WARN_ON_ONCE(!skb_queue_empty(skbs)); return result; } /* * Invoke TX handlers, return 0 on success and non-zero if the * frame was dropped or queued. * * The handlers are split into an early and late part. The latter is everything * that can be sensitive to reordering, and will be deferred to after packets * are dequeued from the intermediate queues (when they are enabled). */ static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx) { ieee80211_tx_result res = TX_DROP; #define CALL_TXH(txh) \ do { \ res = txh(tx); \ if (res != TX_CONTINUE) \ goto txh_done; \ } while (0) CALL_TXH(ieee80211_tx_h_dynamic_ps); CALL_TXH(ieee80211_tx_h_check_assoc); CALL_TXH(ieee80211_tx_h_ps_buf); CALL_TXH(ieee80211_tx_h_check_control_port_protocol); CALL_TXH(ieee80211_tx_h_select_key); txh_done: if (unlikely(res == TX_DROP)) { I802_DEBUG_INC(tx->local->tx_handlers_drop); if (tx->skb) ieee80211_free_txskb(&tx->local->hw, tx->skb); else ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs); return -1; } else if (unlikely(res == TX_QUEUED)) { I802_DEBUG_INC(tx->local->tx_handlers_queued); return -1; } return 0; } /* * Late handlers can be called while the sta lock is held. Handlers that can * cause packets to be generated will cause deadlock! */ static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); ieee80211_tx_result res = TX_CONTINUE; if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL)) CALL_TXH(ieee80211_tx_h_rate_ctrl); if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) { __skb_queue_tail(&tx->skbs, tx->skb); tx->skb = NULL; goto txh_done; } CALL_TXH(ieee80211_tx_h_michael_mic_add); CALL_TXH(ieee80211_tx_h_sequence); CALL_TXH(ieee80211_tx_h_fragment); /* handlers after fragment must be aware of tx info fragmentation! 
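	 * (In other words, they must walk the tx->skbs queue rather than
	 * look only at tx->skb, as ieee80211_tx_h_stats() does.)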
*/ CALL_TXH(ieee80211_tx_h_stats); CALL_TXH(ieee80211_tx_h_encrypt); if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL)) CALL_TXH(ieee80211_tx_h_calculate_duration); #undef CALL_TXH txh_done: if (unlikely(res == TX_DROP)) { I802_DEBUG_INC(tx->local->tx_handlers_drop); if (tx->skb) ieee80211_free_txskb(&tx->local->hw, tx->skb); else ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs); return -1; } else if (unlikely(res == TX_QUEUED)) { I802_DEBUG_INC(tx->local->tx_handlers_queued); return -1; } return 0; } static int invoke_tx_handlers(struct ieee80211_tx_data *tx) { int r = invoke_tx_handlers_early(tx); if (r) return r; return invoke_tx_handlers_late(tx); } bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct sk_buff *skb, int band, struct ieee80211_sta **sta) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_tx_data tx; struct sk_buff *skb2; if (ieee80211_tx_prepare(sdata, &tx, NULL, skb) == TX_DROP) return false; info->band = band; info->control.vif = vif; info->hw_queue = vif->hw_queue[skb_get_queue_mapping(skb)]; if (invoke_tx_handlers(&tx)) return false; if (sta) { if (tx.sta) *sta = &tx.sta->sta; else *sta = NULL; } /* this function isn't suitable for fragmented data frames */ skb2 = __skb_dequeue(&tx.skbs); if (WARN_ON(skb2 != skb || !skb_queue_empty(&tx.skbs))) { ieee80211_free_txskb(hw, skb2); ieee80211_purge_tx_queue(hw, &tx.skbs); return false; } return true; } EXPORT_SYMBOL(ieee80211_tx_prepare_skb); /* * Returns false if the frame couldn't be transmitted but was queued instead. */ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb, bool txpending) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_data tx; ieee80211_tx_result res_prepare; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); bool result = true; if (unlikely(skb->len < 10)) { dev_kfree_skb(skb); return true; } /* initialises tx */ res_prepare = ieee80211_tx_prepare(sdata, &tx, sta, skb); if (unlikely(res_prepare == TX_DROP)) { ieee80211_free_txskb(&local->hw, skb); return true; } else if (unlikely(res_prepare == TX_QUEUED)) { return true; } /* set up hw_queue value early */ if (!(info->flags & IEEE80211_TX_CTL_TX_OFFCHAN) || !ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; if (invoke_tx_handlers_early(&tx)) return true; if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb)) return true; if (!invoke_tx_handlers_late(&tx)) result = __ieee80211_tx(local, &tx.skbs, tx.sta, txpending); return result; } /* device xmit handlers */ enum ieee80211_encrypt { ENCRYPT_NO, ENCRYPT_MGMT, ENCRYPT_DATA, }; static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int head_need, enum ieee80211_encrypt encrypt) { struct ieee80211_local *local = sdata->local; bool enc_tailroom; int tail_need = 0; enc_tailroom = encrypt == ENCRYPT_MGMT || (encrypt == ENCRYPT_DATA && sdata->crypto_tx_tailroom_needed_cnt); if (enc_tailroom) { tail_need = IEEE80211_ENCRYPT_TAILROOM; tail_need -= skb_tailroom(skb); tail_need = max_t(int, tail_need, 0); } if (skb_cloned(skb) && (!ieee80211_hw_check(&local->hw, SUPPORTS_CLONED_SKBS) || !skb_clone_writable(skb, ETH_HLEN) || enc_tailroom)) I802_DEBUG_INC(local->tx_expand_skb_head_cloned); else if (head_need || tail_need) I802_DEBUG_INC(local->tx_expand_skb_head); else return 0; if (pskb_expand_head(skb, head_need, 
tail_need, GFP_ATOMIC)) { wiphy_debug(local->hw.wiphy, "failed to reallocate TX buffer\n"); return -ENOMEM; } return 0; } void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; int headroom; enum ieee80211_encrypt encrypt; if (info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT) encrypt = ENCRYPT_NO; else if (ieee80211_is_mgmt(hdr->frame_control)) encrypt = ENCRYPT_MGMT; else encrypt = ENCRYPT_DATA; headroom = local->tx_headroom; if (encrypt != ENCRYPT_NO) headroom += IEEE80211_ENCRYPT_HEADROOM; headroom -= skb_headroom(skb); headroom = max_t(int, 0, headroom); if (ieee80211_skb_resize(sdata, skb, headroom, encrypt)) { ieee80211_free_txskb(&local->hw, skb); return; } /* reload after potential resize */ hdr = (struct ieee80211_hdr *) skb->data; info->control.vif = &sdata->vif; if (ieee80211_vif_is_mesh(&sdata->vif)) { if (ieee80211_is_data(hdr->frame_control) && is_unicast_ether_addr(hdr->addr1)) { if (mesh_nexthop_resolve(sdata, skb)) return; /* skb queued: don't free */ } else { ieee80211_mps_set_frame_flags(sdata, NULL, hdr); } } ieee80211_set_qos_hdr(sdata, skb); ieee80211_tx(sdata, sta, skb, false); } static bool ieee80211_validate_radiotap_len(struct sk_buff *skb) { struct ieee80211_radiotap_header *rthdr = (struct ieee80211_radiotap_header *)skb->data; /* check for not even having the fixed radiotap header part */ if (unlikely(skb->len < sizeof(struct ieee80211_radiotap_header))) return false; /* too short to be possibly valid */ /* is it a header version we can trust to find length from? */ if (unlikely(rthdr->it_version)) return false; /* only version 0 is supported */ /* does the skb contain enough to deliver on the alleged length? */ if (unlikely(skb->len < ieee80211_get_radiotap_len(skb->data))) return false; /* skb too short for claimed rt header extent */ return true; } bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, struct net_device *dev) { struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); struct ieee80211_radiotap_iterator iterator; struct ieee80211_radiotap_header *rthdr = (struct ieee80211_radiotap_header *) skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int ret = ieee80211_radiotap_iterator_init(&iterator, rthdr, skb->len, NULL); u16 txflags; u16 rate = 0; bool rate_found = false; u8 rate_retries = 0; u16 rate_flags = 0; u8 mcs_known, mcs_flags, mcs_bw; u16 vht_known; u8 vht_mcs = 0, vht_nss = 0; int i; if (!ieee80211_validate_radiotap_len(skb)) return false; info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | IEEE80211_TX_CTL_DONTFRAG; /* * for every radiotap entry that is present * (ieee80211_radiotap_iterator_next returns -ENOENT when no more * entries present, or -EINVAL on error) */ while (!ret) { ret = ieee80211_radiotap_iterator_next(&iterator); if (ret) continue; /* see if this argument is something we can use */ switch (iterator.this_arg_index) { /* * You must take care when dereferencing iterator.this_arg * for multibyte types... the pointer is not aligned. Use * get_unaligned((type *)iterator.this_arg) to dereference * iterator.this_arg for type "type" safely on all arches. */ case IEEE80211_RADIOTAP_FLAGS: if (*iterator.this_arg & IEEE80211_RADIOTAP_F_FCS) { /* * this indicates that the skb we have been * handed has the 32-bit FCS CRC at the end... 
* we should react to that by snipping it off * because it will be recomputed and added * on transmission */ if (skb->len < (iterator._max_length + FCS_LEN)) return false; skb_trim(skb, skb->len - FCS_LEN); } if (*iterator.this_arg & IEEE80211_RADIOTAP_F_WEP) info->flags &= ~IEEE80211_TX_INTFL_DONT_ENCRYPT; if (*iterator.this_arg & IEEE80211_RADIOTAP_F_FRAG) info->flags &= ~IEEE80211_TX_CTL_DONTFRAG; break; case IEEE80211_RADIOTAP_TX_FLAGS: txflags = get_unaligned_le16(iterator.this_arg); if (txflags & IEEE80211_RADIOTAP_F_TX_NOACK) info->flags |= IEEE80211_TX_CTL_NO_ACK; if (txflags & IEEE80211_RADIOTAP_F_TX_NOSEQNO) info->control.flags |= IEEE80211_TX_CTRL_NO_SEQNO; if (txflags & IEEE80211_RADIOTAP_F_TX_ORDER) info->control.flags |= IEEE80211_TX_CTRL_DONT_REORDER; break; case IEEE80211_RADIOTAP_RATE: rate = *iterator.this_arg; rate_flags = 0; rate_found = true; break; case IEEE80211_RADIOTAP_ANTENNA: /* this can appear multiple times, keep a bitmap */ info->control.antennas |= BIT(*iterator.this_arg); break; case IEEE80211_RADIOTAP_DATA_RETRIES: rate_retries = *iterator.this_arg; break; case IEEE80211_RADIOTAP_MCS: mcs_known = iterator.this_arg[0]; mcs_flags = iterator.this_arg[1]; if (!(mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_MCS)) break; rate_found = true; rate = iterator.this_arg[2]; rate_flags = IEEE80211_TX_RC_MCS; if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_GI && mcs_flags & IEEE80211_RADIOTAP_MCS_SGI) rate_flags |= IEEE80211_TX_RC_SHORT_GI; mcs_bw = mcs_flags & IEEE80211_RADIOTAP_MCS_BW_MASK; if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_BW && mcs_bw == IEEE80211_RADIOTAP_MCS_BW_40) rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_FEC && mcs_flags & IEEE80211_RADIOTAP_MCS_FEC_LDPC) info->flags |= IEEE80211_TX_CTL_LDPC; if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_STBC) { u8 stbc = u8_get_bits(mcs_flags, IEEE80211_RADIOTAP_MCS_STBC_MASK); info->flags |= u32_encode_bits(stbc, IEEE80211_TX_CTL_STBC); } break; case IEEE80211_RADIOTAP_VHT: vht_known = get_unaligned_le16(iterator.this_arg); rate_found = true; rate_flags = IEEE80211_TX_RC_VHT_MCS; if ((vht_known & IEEE80211_RADIOTAP_VHT_KNOWN_GI) && (iterator.this_arg[2] & IEEE80211_RADIOTAP_VHT_FLAG_SGI)) rate_flags |= IEEE80211_TX_RC_SHORT_GI; if (vht_known & IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH) { if (iterator.this_arg[3] == 1) rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; else if (iterator.this_arg[3] == 4) rate_flags |= IEEE80211_TX_RC_80_MHZ_WIDTH; else if (iterator.this_arg[3] == 11) rate_flags |= IEEE80211_TX_RC_160_MHZ_WIDTH; } vht_mcs = iterator.this_arg[4] >> 4; if (vht_mcs > 11) vht_mcs = 0; vht_nss = iterator.this_arg[4] & 0xF; if (!vht_nss || vht_nss > 8) vht_nss = 1; break; /* * Please update the file * Documentation/networking/mac80211-injection.rst * when parsing new fields here. 
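		 * (For the VHT entry above, radiotap bandwidth codes 1, 4
		 * and 11 are mapped to the 40, 80 and 160 MHz rate flags;
		 * any other code is left at the default 20 MHz.)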
*/ default: break; } } if (ret != -ENOENT) /* ie, if we didn't simply run out of fields */ return false; if (rate_found) { struct ieee80211_supported_band *sband = local->hw.wiphy->bands[info->band]; info->control.flags |= IEEE80211_TX_CTRL_RATE_INJECT; for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { info->control.rates[i].idx = -1; info->control.rates[i].flags = 0; info->control.rates[i].count = 0; } if (rate_flags & IEEE80211_TX_RC_MCS) { /* reset antennas if not enough */ if (IEEE80211_HT_MCS_CHAINS(rate) > hweight8(info->control.antennas)) info->control.antennas = 0; info->control.rates[0].idx = rate; } else if (rate_flags & IEEE80211_TX_RC_VHT_MCS) { /* reset antennas if not enough */ if (vht_nss > hweight8(info->control.antennas)) info->control.antennas = 0; ieee80211_rate_set_vht(info->control.rates, vht_mcs, vht_nss); } else if (sband) { for (i = 0; i < sband->n_bitrates; i++) { if (rate * 5 != sband->bitrates[i].bitrate) continue; info->control.rates[0].idx = i; break; } } if (info->control.rates[0].idx < 0) info->control.flags &= ~IEEE80211_TX_CTRL_RATE_INJECT; info->control.rates[0].flags = rate_flags; info->control.rates[0].count = min_t(u8, rate_retries + 1, local->hw.max_rate_tries); } return true; } netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr; struct ieee80211_sub_if_data *tmp_sdata, *sdata; struct cfg80211_chan_def *chandef; u16 len_rthdr; int hdrlen; sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (unlikely(!ieee80211_sdata_running(sdata))) goto fail; memset(info, 0, sizeof(*info)); info->flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_CTL_INJECTED; /* Sanity-check the length of the radiotap header */ if (!ieee80211_validate_radiotap_len(skb)) goto fail; /* we now know there is a radiotap header with a length we can use */ len_rthdr = ieee80211_get_radiotap_len(skb->data); /* * fix up the pointers accounting for the radiotap * header still being in there. We are being given * a precooked IEEE80211 header so no need for * normal processing */ skb_set_mac_header(skb, len_rthdr); /* * these are just fixed to the end of the rt area since we * don't have any better information and at this point, nobody cares */ skb_set_network_header(skb, len_rthdr); skb_set_transport_header(skb, len_rthdr); if (skb->len < len_rthdr + 2) goto fail; hdr = (struct ieee80211_hdr *)(skb->data + len_rthdr); hdrlen = ieee80211_hdrlen(hdr->frame_control); if (skb->len < len_rthdr + hdrlen) goto fail; /* * Initialize skb->protocol if the injected frame is a data frame * carrying a rfc1042 header */ if (ieee80211_is_data(hdr->frame_control) && skb->len >= len_rthdr + hdrlen + sizeof(rfc1042_header) + 2) { u8 *payload = (u8 *)hdr + hdrlen; if (ether_addr_equal(payload, rfc1042_header)) skb->protocol = cpu_to_be16((payload[6] << 8) | payload[7]); } rcu_read_lock(); /* * We process outgoing injected frames that have a local address * we handle as though they are non-injected frames. * This code here isn't entirely correct, the local MAC address * isn't always enough to find the interface to use; for proper * VLAN support we have an nl80211-based mechanism. * * This is necessary, for example, for old hostapd versions that * don't use nl80211-based management TX/RX. 
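	 * (Concretely, the loop below picks any running non-monitor,
	 * non-AP_VLAN interface whose MAC address matches addr2, the
	 * transmitter address, of the injected frame.)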
 */
	list_for_each_entry_rcu(tmp_sdata, &local->interfaces, list) {
		if (!ieee80211_sdata_running(tmp_sdata))
			continue;
		if (tmp_sdata->vif.type == NL80211_IFTYPE_MONITOR ||
		    tmp_sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
			continue;
		if (ether_addr_equal(tmp_sdata->vif.addr, hdr->addr2)) {
			sdata = tmp_sdata;
			break;
		}
	}

	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
	if (!chanctx_conf) {
		tmp_sdata = rcu_dereference(local->monitor_sdata);
		if (tmp_sdata)
			chanctx_conf =
				rcu_dereference(tmp_sdata->vif.bss_conf.chanctx_conf);
	}

	if (chanctx_conf)
		chandef = &chanctx_conf->def;
	else
		goto fail_rcu;

	/*
	 * If driver/HW supports IEEE80211_CHAN_CAN_MONITOR we still
	 * shouldn't transmit on disabled channels.
	 */
	if (!cfg80211_chandef_usable(local->hw.wiphy, chandef,
				     IEEE80211_CHAN_DISABLED))
		goto fail_rcu;

	/*
	 * Frame injection is not allowed if beaconing is not allowed
	 * or if we need radar detection. Beaconing is usually not allowed when
	 * the mode or operation (Adhoc, AP, Mesh) does not support DFS.
	 * Passive scan is also used in world regulatory domains where
	 * your country is not known and as such it should be treated as
	 * NO TX unless the channel is explicitly allowed, in which case
	 * your current regulatory domain would not have the passive scan
	 * flag.
	 *
	 * Since AP mode uses monitor interfaces to inject/TX management
	 * frames we can make AP mode the exception to this rule once it
	 * supports radar detection as its implementation can deal with
	 * radar detection by itself. We can do that later by adding a
	 * monitor flag to interfaces used for AP support.
	 */
	if (!cfg80211_reg_can_beacon(local->hw.wiphy, chandef,
				     sdata->vif.type))
		goto fail_rcu;

	info->band = chandef->chan->band;

	/*
	 * Initialize skb->priority according to frame type and TID class,
	 * with respect to the sub interface that the frame will actually
	 * be transmitted on. If the DONT_REORDER flag is set, the original
	 * skb-priority is preserved to assure frames injected with this
	 * flag are not reordered relative to each other.
	 */
	ieee80211_select_queue_80211(sdata, skb, hdr);
	skb_set_queue_mapping(skb, ieee80211_ac_from_tid(skb->priority));

	/*
	 * Process the radiotap header. This will now take into account the
	 * selected chandef above to accurately set injection rates and
	 * retransmissions.
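	 * (info->band was just set from the chosen chandef; the fallback
	 * bitrate lookup in ieee80211_parse_tx_radiotap() indexes
	 * wiphy->bands[info->band], so the ordering matters here.)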
*/ if (!ieee80211_parse_tx_radiotap(skb, dev)) goto fail_rcu; /* remove the injection radiotap header */ skb_pull(skb, len_rthdr); ieee80211_xmit(sdata, NULL, skb); rcu_read_unlock(); return NETDEV_TX_OK; fail_rcu: rcu_read_unlock(); fail: dev_kfree_skb(skb); return NETDEV_TX_OK; /* meaning, we dealt with the skb */ } static inline bool ieee80211_is_tdls_setup(struct sk_buff *skb) { u16 ethertype = (skb->data[12] << 8) | skb->data[13]; return ethertype == ETH_P_TDLS && skb->len > 14 && skb->data[14] == WLAN_TDLS_SNAP_RFTYPE; } int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct sta_info **sta_out) { struct sta_info *sta; switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: sta = rcu_dereference(sdata->u.vlan.sta); if (sta) { *sta_out = sta; return 0; } else if (sdata->wdev.use_4addr) { return -ENOLINK; } fallthrough; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_OCB: case NL80211_IFTYPE_ADHOC: if (is_multicast_ether_addr(skb->data)) { *sta_out = ERR_PTR(-ENOENT); return 0; } sta = sta_info_get_bss(sdata, skb->data); break; #ifdef CONFIG_MAC80211_MESH case NL80211_IFTYPE_MESH_POINT: /* determined much later */ *sta_out = NULL; return 0; #endif case NL80211_IFTYPE_STATION: if (sdata->wdev.wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS) { sta = sta_info_get(sdata, skb->data); if (sta && test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH)) { *sta_out = sta; return 0; } /* * TDLS link during setup - throw out frames to * peer. Allow TDLS-setup frames to unauthorized * peers for the special case of a link teardown * after a TDLS sta is removed due to being * unreachable. */ if (!ieee80211_is_tdls_setup(skb)) return -EINVAL; } } sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); if (!sta) return -ENOLINK; break; default: return -EINVAL; } *sta_out = sta ?: ERR_PTR(-ENOENT); return 0; } static u16 ieee80211_store_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, u32 *info_flags, u64 *cookie) { struct sk_buff *ack_skb; u16 info_id = 0; if (skb->sk) ack_skb = skb_clone_sk(skb); else ack_skb = skb_clone(skb, GFP_ATOMIC); if (ack_skb) { unsigned long flags; int id; spin_lock_irqsave(&local->ack_status_lock, flags); id = idr_alloc(&local->ack_status_frames, ack_skb, 1, 0x2000, GFP_ATOMIC); spin_unlock_irqrestore(&local->ack_status_lock, flags); if (id >= 0) { info_id = id; *info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; if (cookie) { *cookie = ieee80211_mgmt_tx_cookie(local); IEEE80211_SKB_CB(ack_skb)->ack.cookie = *cookie; } } else { kfree_skb(ack_skb); } } return info_id; } /** * ieee80211_build_hdr - build 802.11 header in the given frame * @sdata: virtual interface to build the header for * @skb: the skb to build the header in * @info_flags: skb flags to set * @sta: the station pointer * @ctrl_flags: info control flags to set * @cookie: cookie pointer to fill (if not %NULL) * * This function takes the skb with 802.3 header and reformats the header to * the appropriate IEEE 802.11 header based on which interface the packet is * being transmitted on. * * Note that this function also takes care of the TX status request and * potential unsharing of the SKB - this needs to be interleaved with the * header building. 
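 * (For example, the TX status clone taken via ieee80211_store_ack_skb()
 * is made while the frame still carries its 802.3 header, i.e. before
 * the 802.11 header is pushed on.)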
* * The function requires the read-side RCU lock held * * Returns: the (possibly reallocated) skb or an ERR_PTR() code */ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 info_flags, struct sta_info *sta, u32 ctrl_flags, u64 *cookie) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info; int head_need; u16 ethertype, hdrlen, meshhdrlen = 0; __le16 fc; struct ieee80211_hdr hdr; struct ieee80211s_hdr mesh_hdr __maybe_unused; struct mesh_path __maybe_unused *mppath = NULL, *mpath = NULL; const u8 *encaps_data; int encaps_len, skip_header_bytes; bool wme_sta = false, authorized = false; bool tdls_peer; bool multicast; u16 info_id = 0; struct ieee80211_chanctx_conf *chanctx_conf = NULL; enum nl80211_band band; int ret; u8 link_id = u32_get_bits(ctrl_flags, IEEE80211_TX_CTRL_MLO_LINK); if (IS_ERR(sta)) sta = NULL; #ifdef CONFIG_MAC80211_DEBUGFS if (local->force_tx_status) info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; #endif /* convert Ethernet header to proper 802.11 header (based on * operation mode) */ ethertype = (skb->data[12] << 8) | skb->data[13]; fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA); if (!ieee80211_vif_is_mld(&sdata->vif)) chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: if (sdata->wdev.use_4addr) { fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); /* RA TA DA SA */ memcpy(hdr.addr1, sta->sta.addr, ETH_ALEN); memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN); memcpy(hdr.addr3, skb->data, ETH_ALEN); memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN); hdrlen = 30; authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED); wme_sta = sta->sta.wme; } if (!ieee80211_vif_is_mld(&sdata->vif)) { struct ieee80211_sub_if_data *ap_sdata; /* override chanctx_conf from AP (we don't have one) */ ap_sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); chanctx_conf = rcu_dereference(ap_sdata->vif.bss_conf.chanctx_conf); } if (sdata->wdev.use_4addr) break; fallthrough; case NL80211_IFTYPE_AP: fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS); /* DA BSSID SA */ memcpy(hdr.addr1, skb->data, ETH_ALEN); if (ieee80211_vif_is_mld(&sdata->vif) && sta && !sta->sta.mlo) { struct ieee80211_link_data *link; link_id = sta->deflink.link_id; link = rcu_dereference(sdata->link[link_id]); if (WARN_ON(!link)) { ret = -ENOLINK; goto free; } memcpy(hdr.addr2, link->conf->addr, ETH_ALEN); } else if (link_id == IEEE80211_LINK_UNSPECIFIED || (sta && sta->sta.mlo)) { memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN); } else { struct ieee80211_bss_conf *conf; conf = rcu_dereference(sdata->vif.link_conf[link_id]); if (unlikely(!conf)) { ret = -ENOLINK; goto free; } memcpy(hdr.addr2, conf->addr, ETH_ALEN); } memcpy(hdr.addr3, skb->data + ETH_ALEN, ETH_ALEN); hdrlen = 24; break; #ifdef CONFIG_MAC80211_MESH case NL80211_IFTYPE_MESH_POINT: if (!is_multicast_ether_addr(skb->data)) { struct sta_info *next_hop; bool mpp_lookup = true; mpath = mesh_path_lookup(sdata, skb->data); if (mpath) { mpp_lookup = false; next_hop = rcu_dereference(mpath->next_hop); if (!next_hop || !(mpath->flags & (MESH_PATH_ACTIVE | MESH_PATH_RESOLVING))) mpp_lookup = true; } if (mpp_lookup) { mppath = mpp_path_lookup(sdata, skb->data); if (mppath) mppath->exp_time = jiffies; } if (mppath && mpath) mesh_path_del(sdata, mpath->dst); } /* * Use address extension if it is a packet from * another interface or if we know the destination * is being proxied by a portal (i.e. 
portal address * differs from proxied address) */ if (ether_addr_equal(sdata->vif.addr, skb->data + ETH_ALEN) && !(mppath && !ether_addr_equal(mppath->mpp, skb->data))) { hdrlen = ieee80211_fill_mesh_addresses(&hdr, &fc, skb->data, skb->data + ETH_ALEN); meshhdrlen = ieee80211_new_mesh_header(sdata, &mesh_hdr, NULL, NULL); } else { /* DS -> MBSS (802.11-2012 13.11.3.3). * For unicast with unknown forwarding information, * destination might be in the MBSS or if that fails * forwarded to another mesh gate. In either case * resolution will be handled in ieee80211_xmit(), so * leave the original DA. This also works for mcast */ const u8 *mesh_da = skb->data; if (mppath) mesh_da = mppath->mpp; else if (mpath) mesh_da = mpath->dst; hdrlen = ieee80211_fill_mesh_addresses(&hdr, &fc, mesh_da, sdata->vif.addr); if (is_multicast_ether_addr(mesh_da)) /* DA TA mSA AE:SA */ meshhdrlen = ieee80211_new_mesh_header( sdata, &mesh_hdr, skb->data + ETH_ALEN, NULL); else /* RA TA mDA mSA AE:DA SA */ meshhdrlen = ieee80211_new_mesh_header( sdata, &mesh_hdr, skb->data, skb->data + ETH_ALEN); } /* For injected frames, fill RA right away as nexthop lookup * will be skipped. */ if ((ctrl_flags & IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP) && is_zero_ether_addr(hdr.addr1)) memcpy(hdr.addr1, skb->data, ETH_ALEN); break; #endif case NL80211_IFTYPE_STATION: /* we already did checks when looking up the RA STA */ tdls_peer = test_sta_flag(sta, WLAN_STA_TDLS_PEER); if (tdls_peer) { /* For TDLS only one link can be valid with peer STA */ int tdls_link_id = ieee80211_tdls_sta_link_id(sta); struct ieee80211_link_data *link; /* DA SA BSSID */ memcpy(hdr.addr1, skb->data, ETH_ALEN); memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); link = rcu_dereference(sdata->link[tdls_link_id]); if (WARN_ON_ONCE(!link)) { ret = -EINVAL; goto free; } memcpy(hdr.addr3, link->u.mgd.bssid, ETH_ALEN); hdrlen = 24; } else if (sdata->u.mgd.use_4addr && cpu_to_be16(ethertype) != sdata->control_port_protocol) { fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); /* RA TA DA SA */ memcpy(hdr.addr1, sdata->deflink.u.mgd.bssid, ETH_ALEN); memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN); memcpy(hdr.addr3, skb->data, ETH_ALEN); memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN); hdrlen = 30; } else { fc |= cpu_to_le16(IEEE80211_FCTL_TODS); /* BSSID SA DA */ memcpy(hdr.addr1, sdata->vif.cfg.ap_addr, ETH_ALEN); memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); memcpy(hdr.addr3, skb->data, ETH_ALEN); hdrlen = 24; } break; case NL80211_IFTYPE_OCB: /* DA SA BSSID */ memcpy(hdr.addr1, skb->data, ETH_ALEN); memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); eth_broadcast_addr(hdr.addr3); hdrlen = 24; break; case NL80211_IFTYPE_ADHOC: /* DA SA BSSID */ memcpy(hdr.addr1, skb->data, ETH_ALEN); memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); memcpy(hdr.addr3, sdata->u.ibss.bssid, ETH_ALEN); hdrlen = 24; break; default: ret = -EINVAL; goto free; } if (!chanctx_conf) { if (!ieee80211_vif_is_mld(&sdata->vif)) { ret = -ENOTCONN; goto free; } /* MLD transmissions must not rely on the band */ band = 0; } else { band = chanctx_conf->def.chan->band; } multicast = is_multicast_ether_addr(hdr.addr1); /* sta is always NULL for mesh */ if (sta) { authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED); wme_sta = sta->sta.wme; } else if (ieee80211_vif_is_mesh(&sdata->vif)) { /* For mesh, the use of the QoS header is mandatory */ wme_sta = true; } /* receiver does QoS (which also means we do) use it */ if (wme_sta) { fc |= cpu_to_le16(IEEE80211_STYPE_QOS_DATA); hdrlen += 2; } 
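	/*
	 * At this point hdrlen covers the full 802.11 header, including the
	 * 2-byte QoS control field when wme_sta is set; the header bytes
	 * themselves are pushed onto the skb further below.
	 */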
/* * Drop unicast frames to unauthorised stations unless they are * EAPOL frames from the local station. */ if (unlikely(!ieee80211_vif_is_mesh(&sdata->vif) && (sdata->vif.type != NL80211_IFTYPE_OCB) && !multicast && !authorized && (cpu_to_be16(ethertype) != sdata->control_port_protocol || !ieee80211_is_our_addr(sdata, skb->data + ETH_ALEN, NULL)))) { #ifdef CONFIG_MAC80211_VERBOSE_DEBUG net_info_ratelimited("%s: dropped frame to %pM (unauthorized port)\n", sdata->name, hdr.addr1); #endif I802_DEBUG_INC(local->tx_handlers_drop_unauth_port); ret = -EPERM; goto free; } if (unlikely(!multicast && ((skb->sk && sock_flag(skb->sk, SOCK_WIFI_STATUS)) || ctrl_flags & IEEE80211_TX_CTL_REQ_TX_STATUS))) info_id = ieee80211_store_ack_skb(local, skb, &info_flags, cookie); /* * If the skb is shared we need to obtain our own copy. */ skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) { ret = -ENOMEM; goto free; } hdr.frame_control = fc; hdr.duration_id = 0; hdr.seq_ctrl = 0; skip_header_bytes = ETH_HLEN; if (ethertype == ETH_P_AARP || ethertype == ETH_P_IPX) { encaps_data = bridge_tunnel_header; encaps_len = sizeof(bridge_tunnel_header); skip_header_bytes -= 2; } else if (ethertype >= ETH_P_802_3_MIN) { encaps_data = rfc1042_header; encaps_len = sizeof(rfc1042_header); skip_header_bytes -= 2; } else { encaps_data = NULL; encaps_len = 0; } skb_pull(skb, skip_header_bytes); head_need = hdrlen + encaps_len + meshhdrlen - skb_headroom(skb); /* * So we need to modify the skb header and hence need a copy of * that. The head_need variable above doesn't, so far, include * the needed header space that we don't need right away. If we * can, then we don't reallocate right now but only after the * frame arrives at the master device (if it does...) * * If we cannot, however, then we will reallocate to include all * the ever needed space. Also, if we need to reallocate it anyway, * make it big enough for everything we may ever need. */ if (head_need > 0 || skb_cloned(skb)) { head_need += IEEE80211_ENCRYPT_HEADROOM; head_need += local->tx_headroom; head_need = max_t(int, 0, head_need); if (ieee80211_skb_resize(sdata, skb, head_need, ENCRYPT_DATA)) { ieee80211_free_txskb(&local->hw, skb); skb = NULL; return ERR_PTR(-ENOMEM); } } if (encaps_data) memcpy(skb_push(skb, encaps_len), encaps_data, encaps_len); #ifdef CONFIG_MAC80211_MESH if (meshhdrlen > 0) memcpy(skb_push(skb, meshhdrlen), &mesh_hdr, meshhdrlen); #endif if (ieee80211_is_data_qos(fc)) { __le16 *qos_control; qos_control = skb_push(skb, 2); memcpy(skb_push(skb, hdrlen - 2), &hdr, hdrlen - 2); /* * Maybe we could actually set some fields here, for now just * initialise to zero to indicate no special operation. */ *qos_control = 0; } else memcpy(skb_push(skb, hdrlen), &hdr, hdrlen); skb_reset_mac_header(skb); info = IEEE80211_SKB_CB(skb); memset(info, 0, sizeof(*info)); info->flags = info_flags; if (info_id) { info->status_data = info_id; info->status_data_idr = 1; } info->band = band; if (likely(!cookie)) { ctrl_flags |= u32_encode_bits(link_id, IEEE80211_TX_CTRL_MLO_LINK); } else { unsigned int pre_conf_link_id; /* * ctrl_flags already have been set by * ieee80211_tx_control_port(), here * we just sanity check that */ pre_conf_link_id = u32_get_bits(ctrl_flags, IEEE80211_TX_CTRL_MLO_LINK); if (pre_conf_link_id != link_id && link_id != IEEE80211_LINK_UNSPECIFIED) { #ifdef CONFIG_MAC80211_VERBOSE_DEBUG net_info_ratelimited("%s: dropped frame to %pM with bad link ID request (%d vs. 
%d)\n",
					    sdata->name, hdr.addr1,
					    pre_conf_link_id, link_id);
#endif
			ret = -EINVAL;
			goto free;
		}
	}

	info->control.flags = ctrl_flags;

	return skb;
 free:
	kfree_skb(skb);
	return ERR_PTR(ret);
}

/*
 * fast-xmit overview
 *
 * The core idea of this fast-xmit is to remove per-packet checks by checking
 * them out of band. ieee80211_check_fast_xmit() implements the out-of-band
 * checks that are needed to get the sta->fast_tx pointer assigned, after which
 * much less work can be done per packet. For example, fragmentation must be
 * disabled or the fast_tx pointer will not be set. All the conditions are seen
 * in the code here.
 *
 * Once assigned, the fast_tx data structure also caches the per-packet 802.11
 * header and other data to aid packet processing in ieee80211_xmit_fast().
 *
 * The most difficult part of this is that when any of these assumptions
 * change, an external trigger (i.e. a call to ieee80211_clear_fast_xmit(),
 * ieee80211_check_fast_xmit() or friends) is required to reset the data,
 * since the per-packet code no longer checks the conditions. This is reflected
 * by the calls to these functions throughout the rest of the code, and must be
 * maintained if any of the TX path checks change.
 */

void ieee80211_check_fast_xmit(struct sta_info *sta)
{
	struct ieee80211_fast_tx build = {}, *fast_tx = NULL, *old;
	struct ieee80211_local *local = sta->local;
	struct ieee80211_sub_if_data *sdata = sta->sdata;
	struct ieee80211_hdr *hdr = (void *)build.hdr;
	struct ieee80211_chanctx_conf *chanctx_conf;
	__le16 fc;

	if (!ieee80211_hw_check(&local->hw, SUPPORT_FAST_XMIT))
		return;

	if (ieee80211_vif_is_mesh(&sdata->vif))
		mesh_fast_tx_flush_sta(sdata, sta);

	/*
	 * Locking here protects both the pointer itself and against concurrent
	 * invocations winning data access races to, e.g., the key pointer that
	 * is used.
	 * Without it, the invocation of this function right after the key
	 * pointer changes wouldn't be sufficient, as another CPU could access
	 * the pointer, then stall, and then do the cache update after the CPU
	 * that invalidated the key.
	 * With the locking, such scenarios cannot happen as the check for the
	 * key and the fast-tx assignment are done atomically, so the CPU that
	 * modifies the key will either wait or the other one will see the key
	 * cleared/changed already.
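	 * (The counterpart is ieee80211_clear_fast_xmit(), which takes the
	 * same sta->lock before resetting the pointer; that pairing is what
	 * makes the check-and-assign below atomic with respect to key
	 * changes.)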
*/ spin_lock_bh(&sta->lock); if (ieee80211_hw_check(&local->hw, SUPPORTS_PS) && !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS) && sdata->vif.type == NL80211_IFTYPE_STATION) goto out; if (!test_sta_flag(sta, WLAN_STA_AUTHORIZED) || !sta->uploaded) goto out; if (test_sta_flag(sta, WLAN_STA_PS_STA) || test_sta_flag(sta, WLAN_STA_PS_DRIVER) || test_sta_flag(sta, WLAN_STA_PS_DELIVER) || test_sta_flag(sta, WLAN_STA_CLEAR_PS_FILT)) goto out; if (sdata->noack_map) goto out; /* fast-xmit doesn't handle fragmentation at all */ if (local->hw.wiphy->frag_threshold != (u32)-1 && !ieee80211_hw_check(&local->hw, SUPPORTS_TX_FRAG)) goto out; if (!ieee80211_vif_is_mld(&sdata->vif)) { rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (!chanctx_conf) { rcu_read_unlock(); goto out; } build.band = chanctx_conf->def.chan->band; rcu_read_unlock(); } else { /* MLD transmissions must not rely on the band */ build.band = 0; } fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA); switch (sdata->vif.type) { case NL80211_IFTYPE_ADHOC: /* DA SA BSSID */ build.da_offs = offsetof(struct ieee80211_hdr, addr1); build.sa_offs = offsetof(struct ieee80211_hdr, addr2); memcpy(hdr->addr3, sdata->u.ibss.bssid, ETH_ALEN); build.hdr_len = 24; break; case NL80211_IFTYPE_STATION: if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { /* For TDLS only one link can be valid with peer STA */ int tdls_link_id = ieee80211_tdls_sta_link_id(sta); struct ieee80211_link_data *link; /* DA SA BSSID */ build.da_offs = offsetof(struct ieee80211_hdr, addr1); build.sa_offs = offsetof(struct ieee80211_hdr, addr2); rcu_read_lock(); link = rcu_dereference(sdata->link[tdls_link_id]); if (!WARN_ON_ONCE(!link)) memcpy(hdr->addr3, link->u.mgd.bssid, ETH_ALEN); rcu_read_unlock(); build.hdr_len = 24; break; } if (sdata->u.mgd.use_4addr) { /* non-regular ethertype cannot use the fastpath */ fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); /* RA TA DA SA */ memcpy(hdr->addr1, sdata->deflink.u.mgd.bssid, ETH_ALEN); memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); build.da_offs = offsetof(struct ieee80211_hdr, addr3); build.sa_offs = offsetof(struct ieee80211_hdr, addr4); build.hdr_len = 30; break; } fc |= cpu_to_le16(IEEE80211_FCTL_TODS); /* BSSID SA DA */ memcpy(hdr->addr1, sdata->vif.cfg.ap_addr, ETH_ALEN); build.da_offs = offsetof(struct ieee80211_hdr, addr3); build.sa_offs = offsetof(struct ieee80211_hdr, addr2); build.hdr_len = 24; break; case NL80211_IFTYPE_AP_VLAN: if (sdata->wdev.use_4addr) { fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); /* RA TA DA SA */ memcpy(hdr->addr1, sta->sta.addr, ETH_ALEN); memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); build.da_offs = offsetof(struct ieee80211_hdr, addr3); build.sa_offs = offsetof(struct ieee80211_hdr, addr4); build.hdr_len = 30; break; } fallthrough; case NL80211_IFTYPE_AP: fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS); /* DA BSSID SA */ build.da_offs = offsetof(struct ieee80211_hdr, addr1); if (sta->sta.mlo || !ieee80211_vif_is_mld(&sdata->vif)) { memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); } else { unsigned int link_id = sta->deflink.link_id; struct ieee80211_link_data *link; rcu_read_lock(); link = rcu_dereference(sdata->link[link_id]); if (WARN_ON(!link)) { rcu_read_unlock(); goto out; } memcpy(hdr->addr2, link->conf->addr, ETH_ALEN); rcu_read_unlock(); } build.sa_offs = offsetof(struct ieee80211_hdr, addr3); build.hdr_len = 24; break; default: /* not handled on fast-xmit */ goto out; } if (sta->sta.wme) { build.hdr_len += 2; fc |= 
cpu_to_le16(IEEE80211_STYPE_QOS_DATA); } /* We store the key here so there's no point in using rcu_dereference() * but that's fine because the code that changes the pointers will call * this function after doing so. For a single CPU that would be enough, * for multiple see the comment above. */ build.key = rcu_access_pointer(sta->ptk[sta->ptk_idx]); if (!build.key) build.key = rcu_access_pointer(sdata->default_unicast_key); if (build.key) { bool gen_iv, iv_spc, mmic; gen_iv = build.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV; iv_spc = build.key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE; mmic = build.key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | IEEE80211_KEY_FLAG_PUT_MIC_SPACE); /* don't handle software crypto */ if (!(build.key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) goto out; /* Key is being removed */ if (build.key->flags & KEY_FLAG_TAINTED) goto out; switch (build.key->conf.cipher) { case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: if (gen_iv) build.pn_offs = build.hdr_len; if (gen_iv || iv_spc) build.hdr_len += IEEE80211_CCMP_HDR_LEN; break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: if (gen_iv) build.pn_offs = build.hdr_len; if (gen_iv || iv_spc) build.hdr_len += IEEE80211_GCMP_HDR_LEN; break; case WLAN_CIPHER_SUITE_TKIP: /* cannot handle MMIC or IV generation in xmit-fast */ if (mmic || gen_iv) goto out; if (iv_spc) build.hdr_len += IEEE80211_TKIP_IV_LEN; break; case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: /* cannot handle IV generation in fast-xmit */ if (gen_iv) goto out; if (iv_spc) build.hdr_len += IEEE80211_WEP_IV_LEN; break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: WARN(1, "management cipher suite 0x%x enabled for data\n", build.key->conf.cipher); goto out; default: /* we don't know how to generate IVs for this at all */ if (WARN_ON(gen_iv)) goto out; } fc |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); } hdr->frame_control = fc; memcpy(build.hdr + build.hdr_len, rfc1042_header, sizeof(rfc1042_header)); build.hdr_len += sizeof(rfc1042_header); fast_tx = kmemdup(&build, sizeof(build), GFP_ATOMIC); /* if the kmemdup fails, continue w/o fast_tx */ out: /* we might have raced against another call to this function */ old = rcu_dereference_protected(sta->fast_tx, lockdep_is_held(&sta->lock)); rcu_assign_pointer(sta->fast_tx, fast_tx); if (old) kfree_rcu(old, rcu_head); spin_unlock_bh(&sta->lock); } void ieee80211_check_fast_xmit_all(struct ieee80211_local *local) { struct sta_info *sta; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) ieee80211_check_fast_xmit(sta); rcu_read_unlock(); } void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct sta_info *sta; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sdata != sta->sdata && (!sta->sdata->bss || sta->sdata->bss != sdata->bss)) continue; ieee80211_check_fast_xmit(sta); } rcu_read_unlock(); } void ieee80211_clear_fast_xmit(struct sta_info *sta) { struct ieee80211_fast_tx *fast_tx; spin_lock_bh(&sta->lock); fast_tx = rcu_dereference_protected(sta->fast_tx, lockdep_is_held(&sta->lock)); RCU_INIT_POINTER(sta->fast_tx, NULL); spin_unlock_bh(&sta->lock); if (fast_tx) kfree_rcu(fast_tx, rcu_head); } static bool ieee80211_amsdu_realloc_pad(struct ieee80211_local *local, struct sk_buff *skb, int headroom) { if (skb_headroom(skb) < headroom) { 
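		/*
		 * Growing the headroom here is the A-MSDU slow path; the
		 * debug counter below tracks how often it happens.
		 */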
static bool ieee80211_amsdu_realloc_pad(struct ieee80211_local *local,
					struct sk_buff *skb, int headroom)
{
	if (skb_headroom(skb) < headroom) {
		I802_DEBUG_INC(local->tx_expand_skb_head);

		if (pskb_expand_head(skb, headroom, 0, GFP_ATOMIC)) {
			wiphy_debug(local->hw.wiphy,
				    "failed to reallocate TX buffer\n");
			return false;
		}
	}

	return true;
}

static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata,
					 struct ieee80211_fast_tx *fast_tx,
					 struct sk_buff *skb)
{
	struct ieee80211_local *local = sdata->local;
	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
	struct ieee80211_hdr *hdr;
	struct ethhdr *amsdu_hdr;
	int hdr_len = fast_tx->hdr_len - sizeof(rfc1042_header);
	int subframe_len = skb->len - hdr_len;
	void *data;
	u8 *qc, *h_80211_src, *h_80211_dst;
	const u8 *bssid;

	if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE)
		return false;

	if (info->control.flags & IEEE80211_TX_CTRL_AMSDU)
		return true;

	if (!ieee80211_amsdu_realloc_pad(local, skb,
					 sizeof(*amsdu_hdr) +
					 local->hw.extra_tx_headroom))
		return false;

	data = skb_push(skb, sizeof(*amsdu_hdr));
	memmove(data, data + sizeof(*amsdu_hdr), hdr_len);

	hdr = data;
	amsdu_hdr = data + hdr_len;
	/* h_80211_src/dst is addr* field within hdr */
	h_80211_src = data + fast_tx->sa_offs;
	h_80211_dst = data + fast_tx->da_offs;

	amsdu_hdr->h_proto = cpu_to_be16(subframe_len);
	ether_addr_copy(amsdu_hdr->h_source, h_80211_src);
	ether_addr_copy(amsdu_hdr->h_dest, h_80211_dst);

	/* according to IEEE 802.11-2012 8.3.2 table 8-19, the outer SA/DA
	 * fields need to be changed to BSSID for A-MSDU frames depending
	 * on FromDS/ToDS values.
	 */
	switch (sdata->vif.type) {
	case NL80211_IFTYPE_STATION:
		bssid = sdata->vif.cfg.ap_addr;
		break;
	case NL80211_IFTYPE_AP:
	case NL80211_IFTYPE_AP_VLAN:
		bssid = sdata->vif.addr;
		break;
	default:
		bssid = NULL;
	}

	if (bssid && ieee80211_has_fromds(hdr->frame_control))
		ether_addr_copy(h_80211_src, bssid);

	if (bssid && ieee80211_has_tods(hdr->frame_control))
		ether_addr_copy(h_80211_dst, bssid);

	qc = ieee80211_get_qos_ctl(hdr);
	*qc |= IEEE80211_QOS_CTL_A_MSDU_PRESENT;

	info->control.flags |= IEEE80211_TX_CTRL_AMSDU;

	return true;
}
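/* Try to merge skb into the A-MSDU at the tail of the flow's queue;
 * returns false (so the caller transmits skb normally) whenever
 * hardware, driver or peer limits would be exceeded.
 */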
static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
				      struct sta_info *sta,
				      struct ieee80211_fast_tx *fast_tx,
				      struct sk_buff *skb,
				      const u8 *da, const u8 *sa)
{
	struct ieee80211_local *local = sdata->local;
	struct fq *fq = &local->fq;
	struct fq_tin *tin;
	struct fq_flow *flow;
	u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
	struct ieee80211_txq *txq = sta->sta.txq[tid];
	struct txq_info *txqi;
	struct sk_buff **frag_tail, *head;
	int subframe_len = skb->len - ETH_ALEN;
	u8 max_subframes = sta->sta.max_amsdu_subframes;
	int max_frags = local->hw.max_tx_fragments;
	int max_amsdu_len = sta->sta.cur->max_amsdu_len;
	int orig_truesize;
	u32 flow_idx;
	__be16 len;
	void *data;
	bool ret = false;
	unsigned int orig_len;
	int n = 2, nfrags, pad = 0;
	u16 hdrlen;

	if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
		return false;

	if (sdata->vif.offload_flags & IEEE80211_OFFLOAD_ENCAP_ENABLED)
		return false;

	if (ieee80211_vif_is_mesh(&sdata->vif))
		return false;

	if (skb_is_gso(skb))
		return false;

	if (!txq)
		return false;

	txqi = to_txq_info(txq);
	if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags))
		return false;

	if (sta->sta.cur->max_rc_amsdu_len)
		max_amsdu_len = min_t(int, max_amsdu_len,
				      sta->sta.cur->max_rc_amsdu_len);

	if (sta->sta.cur->max_tid_amsdu_len[tid])
		max_amsdu_len = min_t(int, max_amsdu_len,
				      sta->sta.cur->max_tid_amsdu_len[tid]);

	flow_idx = fq_flow_idx(fq, skb);

	spin_lock_bh(&fq->lock);

	/* TODO: Ideally aggregation should be done on dequeue to remain
	 * responsive to environment changes.
	 */

	tin = &txqi->tin;
	flow = fq_flow_classify(fq, tin, flow_idx, skb);
	head = skb_peek_tail(&flow->queue);
	if (!head || skb_is_gso(head))
		goto out;

	orig_truesize = head->truesize;
	orig_len = head->len;

	if (skb->len + head->len > max_amsdu_len)
		goto out;

	nfrags = 1 + skb_shinfo(skb)->nr_frags;
	nfrags += 1 + skb_shinfo(head)->nr_frags;
	frag_tail = &skb_shinfo(head)->frag_list;
	while (*frag_tail) {
		nfrags += 1 + skb_shinfo(*frag_tail)->nr_frags;
		frag_tail = &(*frag_tail)->next;
		n++;
	}

	if (max_subframes && n > max_subframes)
		goto out;

	if (max_frags && nfrags > max_frags)
		goto out;

	if (!drv_can_aggregate_in_amsdu(local, head, skb))
		goto out;

	if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head))
		goto out;

	/* If n == 2, the "while (*frag_tail)" loop above didn't execute
	 * and frag_tail should be &skb_shinfo(head)->frag_list.
	 * However, ieee80211_amsdu_prepare_head() can reallocate it.
	 * Reload frag_tail to have it pointing to the correct place.
	 */
	if (n == 2)
		frag_tail = &skb_shinfo(head)->frag_list;

	/*
	 * Pad out the previous subframe to a multiple of 4 by adding the
	 * padding to the next one, that's being added. Note that head->len
	 * is the length of the full A-MSDU, but that works since each time
	 * we add a new subframe we pad out the previous one to a multiple
	 * of 4 and thus it no longer matters in the next round.
	 */
	hdrlen = fast_tx->hdr_len - sizeof(rfc1042_header);
	if ((head->len - hdrlen) & 3)
		pad = 4 - ((head->len - hdrlen) & 3);

	if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(rfc1042_header) +
						     2 + pad))
		goto out_recalc;

	ret = true;
	data = skb_push(skb, ETH_ALEN + 2);
	ether_addr_copy(data, da);
	ether_addr_copy(data + ETH_ALEN, sa);

	data += 2 * ETH_ALEN;
	len = cpu_to_be16(subframe_len);
	memcpy(data, &len, 2);
	memcpy(data + 2, rfc1042_header, sizeof(rfc1042_header));

	memset(skb_push(skb, pad), 0, pad);

	head->len += skb->len;
	head->data_len += skb->len;
	*frag_tail = skb;

 out_recalc:
	fq->memory_usage += head->truesize - orig_truesize;
	if (head->len != orig_len) {
		flow->backlog += head->len - orig_len;
		tin->backlog_bytes += head->len - orig_len;
	}
 out:
	spin_unlock_bh(&fq->lock);

	return ret;
}
/*
 * Can be called while the sta lock is held. Anything that can cause packets to
 * be generated will cause deadlock!
 */
static ieee80211_tx_result
ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
			   struct sta_info *sta, u8 pn_offs,
			   struct ieee80211_key *key,
			   struct ieee80211_tx_data *tx)
{
	struct sk_buff *skb = tx->skb;
	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
	struct ieee80211_hdr *hdr = (void *)skb->data;
	u8 tid = IEEE80211_NUM_TIDS;

	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL) &&
	    ieee80211_tx_h_rate_ctrl(tx) != TX_CONTINUE)
		return TX_DROP;

	if (key)
		info->control.hw_key = &key->conf;

	dev_sw_netstats_tx_add(skb->dev, 1, skb->len);

	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
		tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
	} else {
		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
		hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
		sdata->sequence_number += 0x10;
	}

	if (skb_shinfo(skb)->gso_size)
		sta->deflink.tx_stats.msdu[tid] +=
			DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
	else
		sta->deflink.tx_stats.msdu[tid]++;

	info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)];

	/* statistics normally done by ieee80211_tx_h_stats (but that
	 * has to consider fragmentation, so is more complex)
	 */
	sta->deflink.tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len;
	sta->deflink.tx_stats.packets[skb_get_queue_mapping(skb)]++;

	if (pn_offs) {
		u64 pn;
		u8 *crypto_hdr = skb->data + pn_offs;

		switch (key->conf.cipher) {
		case WLAN_CIPHER_SUITE_CCMP:
		case WLAN_CIPHER_SUITE_CCMP_256:
		case WLAN_CIPHER_SUITE_GCMP:
		case WLAN_CIPHER_SUITE_GCMP_256:
			pn = atomic64_inc_return(&key->conf.tx_pn);
			crypto_hdr[0] = pn;
			crypto_hdr[1] = pn >> 8;
			crypto_hdr[3] = 0x20 | (key->conf.keyidx << 6);
			crypto_hdr[4] = pn >> 16;
			crypto_hdr[5] = pn >> 24;
			crypto_hdr[6] = pn >> 32;
			crypto_hdr[7] = pn >> 40;
			break;
		}
	}

	return TX_CONTINUE;
}

static netdev_features_t
ieee80211_sdata_netdev_features(struct ieee80211_sub_if_data *sdata)
{
	if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN)
		return sdata->vif.netdev_features;

	if (!sdata->bss)
		return 0;

	sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap);
	return sdata->vif.netdev_features;
}

static struct sk_buff *
ieee80211_tx_skb_fixup(struct sk_buff *skb, netdev_features_t features)
{
	if (skb_is_gso(skb)) {
		struct sk_buff *segs;

		segs = skb_gso_segment(skb, features);
		if (!segs)
			return skb;
		if (IS_ERR(segs))
			goto free;

		consume_skb(skb);
		return segs;
	}

	if (skb_needs_linearize(skb, features) && __skb_linearize(skb))
		goto free;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		int ofs = skb_checksum_start_offset(skb);

		if (skb->encapsulation)
			skb_set_inner_transport_header(skb, ofs);
		else
			skb_set_transport_header(skb, ofs);

		if (skb_csum_hwoffload_help(skb, features))
			goto free;
	}

	skb_mark_not_on_list(skb);
	return skb;

free:
	kfree_skb(skb);
	return NULL;
}
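/* Transmit one 802.3 frame using the precomputed fast_tx header
 * template; the frame may instead be absorbed into a pending A-MSDU
 * by ieee80211_amsdu_aggregate().
 */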
void __ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
			   struct sta_info *sta,
			   struct ieee80211_fast_tx *fast_tx,
			   struct sk_buff *skb, bool ampdu,
			   const u8 *da, const u8 *sa)
{
	struct ieee80211_local *local = sdata->local;
	struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
	struct ieee80211_tx_info *info;
	struct ieee80211_tx_data tx;
	ieee80211_tx_result r;
	int hw_headroom = sdata->local->hw.extra_tx_headroom;
	int extra_head = fast_tx->hdr_len - (ETH_HLEN - 2);

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (unlikely(!skb))
		return;

	if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
	    ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb, da, sa))
		return;

	/* will not be crypto-handled beyond what we do here, so use false
	 * as the may-encrypt argument for the resize to not account for
	 * more room than we already have in 'extra_head'
	 */
	if (unlikely(ieee80211_skb_resize(sdata, skb,
					  max_t(int, extra_head + hw_headroom -
						     skb_headroom(skb), 0),
					  ENCRYPT_NO)))
		goto free;

	hdr = skb_push(skb, extra_head);
	memcpy(skb->data, fast_tx->hdr, fast_tx->hdr_len);
	memcpy(skb->data + fast_tx->da_offs, da, ETH_ALEN);
	memcpy(skb->data + fast_tx->sa_offs, sa, ETH_ALEN);

	info = IEEE80211_SKB_CB(skb);
	memset(info, 0, sizeof(*info));
	info->band = fast_tx->band;
	info->control.vif = &sdata->vif;
	info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT |
		      IEEE80211_TX_CTL_DONTFRAG;
	info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT |
			      u32_encode_bits(IEEE80211_LINK_UNSPECIFIED,
					      IEEE80211_TX_CTRL_MLO_LINK);
#ifdef CONFIG_MAC80211_DEBUGFS
	if (local->force_tx_status)
		info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS;
#endif

	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
		u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;

		*ieee80211_get_qos_ctl(hdr) = tid;
	}

	__skb_queue_head_init(&tx.skbs);

	tx.flags = IEEE80211_TX_UNICAST;
	tx.local = local;
	tx.sdata = sdata;
	tx.sta = sta;
	tx.key = fast_tx->key;

	if (ieee80211_queue_skb(local, sdata, sta, skb))
		return;

	tx.skb = skb;
	r = ieee80211_xmit_fast_finish(sdata, sta, fast_tx->pn_offs,
				       fast_tx->key, &tx);
	tx.skb = NULL;

	if (r == TX_DROP)
		goto free;

	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
		sdata = container_of(sdata->bss,
				     struct ieee80211_sub_if_data, u.ap);

	__skb_queue_tail(&tx.skbs, skb);
	ieee80211_tx_frags(local, &sdata->vif, sta, &tx.skbs, false);
	return;

free:
	kfree_skb(skb);
}

static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
				struct sta_info *sta,
				struct ieee80211_fast_tx *fast_tx,
				struct sk_buff *skb)
{
	u16 ethertype = (skb->data[12] << 8) | skb->data[13];
	struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
	struct tid_ampdu_tx *tid_tx = NULL;
	struct sk_buff *next;
	struct ethhdr eth;
	u8 tid = IEEE80211_NUM_TIDS;

	/* control port protocol needs a lot of special handling */
	if (cpu_to_be16(ethertype) == sdata->control_port_protocol)
		return false;

	/* only RFC 1042 SNAP */
	if (ethertype < ETH_P_802_3_MIN)
		return false;

	/* don't handle TX status request here either */
	if (skb->sk && sock_flag(skb->sk, SOCK_WIFI_STATUS))
		return false;

	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
		tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
		tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]);
		if (tid_tx) {
			if (!test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state))
				return false;
			if (tid_tx->timeout)
				tid_tx->last_tx = jiffies;
		}
	}

	memcpy(&eth, skb->data, ETH_HLEN - 2);

	/* after this point (skb is modified) we cannot return false */
	skb = ieee80211_tx_skb_fixup(skb, ieee80211_sdata_netdev_features(sdata));
	if (!skb)
		return true;

	skb_list_walk_safe(skb, skb, next) {
		skb_mark_not_on_list(skb);
		__ieee80211_xmit_fast(sdata, sta, fast_tx, skb, tid_tx,
				      eth.h_dest, eth.h_source);
	}

	return true;
}
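/* Driver-facing dequeue: pull the next frame off the TXQ and run the
 * (late) TX handlers that were deferred when the frame was queued.
 */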
struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
				     struct ieee80211_txq *txq)
{
	struct ieee80211_local *local = hw_to_local(hw);
	struct txq_info *txqi = container_of(txq, struct txq_info, txq);
	struct ieee80211_hdr *hdr;
	struct sk_buff *skb = NULL;
	struct fq *fq = &local->fq;
	struct fq_tin *tin = &txqi->tin;
	struct ieee80211_tx_info *info;
	struct ieee80211_tx_data tx;
	ieee80211_tx_result r;
	struct ieee80211_vif *vif = txq->vif;
	int q = vif->hw_queue[txq->ac];
	unsigned long flags;
	bool q_stopped;

	WARN_ON_ONCE(softirq_count() == 0);

	if (!ieee80211_txq_airtime_check(hw, txq))
		return NULL;

begin:
	spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
	q_stopped = local->queue_stop_reasons[q];
	spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);

	if (unlikely(q_stopped)) {
		/* mark for waking later */
		set_bit(IEEE80211_TXQ_DIRTY, &txqi->flags);
		return NULL;
	}

	spin_lock_bh(&fq->lock);

	/* Make sure fragments stay together. */
	skb = __skb_dequeue(&txqi->frags);
	if (unlikely(skb)) {
		if (!(IEEE80211_SKB_CB(skb)->control.flags &
				IEEE80211_TX_INTCFL_NEED_TXPROCESSING))
			goto out;
		IEEE80211_SKB_CB(skb)->control.flags &=
			~IEEE80211_TX_INTCFL_NEED_TXPROCESSING;
	} else {
		if (unlikely(test_bit(IEEE80211_TXQ_STOP, &txqi->flags)))
			goto out;

		skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
	}

	if (!skb)
		goto out;

	spin_unlock_bh(&fq->lock);

	hdr = (struct ieee80211_hdr *)skb->data;
	info = IEEE80211_SKB_CB(skb);

	memset(&tx, 0, sizeof(tx));
	__skb_queue_head_init(&tx.skbs);
	tx.local = local;
	tx.skb = skb;
	tx.sdata = vif_to_sdata(info->control.vif);

	if (txq->sta) {
		tx.sta = container_of(txq->sta, struct sta_info, sta);
		/*
		 * Drop unicast frames to unauthorised stations unless they are
		 * injected frames or EAPOL frames from the local station.
		 */
		if (unlikely(!(info->flags & IEEE80211_TX_CTL_INJECTED) &&
			     ieee80211_is_data(hdr->frame_control) &&
			     !ieee80211_vif_is_mesh(&tx.sdata->vif) &&
			     tx.sdata->vif.type != NL80211_IFTYPE_OCB &&
			     !is_multicast_ether_addr(hdr->addr1) &&
			     !test_sta_flag(tx.sta, WLAN_STA_AUTHORIZED) &&
			     (!(info->control.flags &
				IEEE80211_TX_CTRL_PORT_CTRL_PROTO) ||
			      !ieee80211_is_our_addr(tx.sdata, hdr->addr2,
						     NULL)))) {
			I802_DEBUG_INC(local->tx_handlers_drop_unauth_port);
			ieee80211_free_txskb(&local->hw, skb);
			goto begin;
		}
	}

	/*
	 * The key can be removed while the packet was queued, so need to call
	 * this here to get the current key.
	 */
	r = ieee80211_tx_h_select_key(&tx);
	if (r != TX_CONTINUE) {
		ieee80211_free_txskb(&local->hw, skb);
		goto begin;
	}

	if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
		info->flags |= (IEEE80211_TX_CTL_AMPDU |
				IEEE80211_TX_CTL_DONTFRAG);

	if (info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) {
		if (!ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) {
			r = ieee80211_tx_h_rate_ctrl(&tx);
			if (r != TX_CONTINUE) {
				ieee80211_free_txskb(&local->hw, skb);
				goto begin;
			}
		}
		goto encap_out;
	}

	if (info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
		struct sta_info *sta = container_of(txq->sta, struct sta_info,
						    sta);
		u8 pn_offs = 0;

		if (tx.key &&
		    (tx.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV))
			pn_offs = ieee80211_hdrlen(hdr->frame_control);

		r = ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs,
					       tx.key, &tx);
		if (r != TX_CONTINUE) {
			ieee80211_free_txskb(&local->hw, skb);
			goto begin;
		}
	} else {
		if (invoke_tx_handlers_late(&tx))
			goto begin;

		skb = __skb_dequeue(&tx.skbs);
		info = IEEE80211_SKB_CB(skb);

		if (!skb_queue_empty(&tx.skbs)) {
			spin_lock_bh(&fq->lock);
			skb_queue_splice_tail(&tx.skbs, &txqi->frags);
			spin_unlock_bh(&fq->lock);
		}
	}

	if (skb_has_frag_list(skb) &&
	    !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) {
		if (skb_linearize(skb)) {
			ieee80211_free_txskb(&local->hw, skb);
			goto begin;
		}
	}

	switch (tx.sdata->vif.type) {
	case NL80211_IFTYPE_MONITOR:
		if ((tx.sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) ||
		    ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) {
			vif = &tx.sdata->vif;
			break;
		}
		tx.sdata = rcu_dereference(local->monitor_sdata);
		if (tx.sdata &&
		    ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) {
			vif = &tx.sdata->vif;
			info->hw_queue =
				vif->hw_queue[skb_get_queue_mapping(skb)];
		} else if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) {
			ieee80211_free_txskb(&local->hw, skb);
			goto begin;
		} else {
			info->control.vif = NULL;
			return skb;
		}
		break;
	case NL80211_IFTYPE_AP_VLAN:
		tx.sdata = container_of(tx.sdata->bss,
					struct ieee80211_sub_if_data, u.ap);
		fallthrough;
	default:
		vif = &tx.sdata->vif;
		break;
	}

encap_out:
	info->control.vif = vif;

	if (tx.sta &&
	    wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) {
		bool ampdu = txq->ac != IEEE80211_AC_VO;
		u32 airtime;

		airtime = ieee80211_calc_expected_tx_airtime(hw, vif, txq->sta,
							     skb->len, ampdu);
		if (airtime) {
			airtime = ieee80211_info_set_tx_time_est(info, airtime);
			ieee80211_sta_update_pending_airtime(local, tx.sta,
							     txq->ac, airtime,
							     false);
		}
	}

	return skb;

out:
	spin_unlock_bh(&fq->lock);

	return skb;
}
EXPORT_SYMBOL(ieee80211_tx_dequeue);

static inline s32 ieee80211_sta_deficit(struct sta_info *sta, u8 ac)
{
	struct airtime_info *air_info = &sta->airtime[ac];

	return air_info->deficit - atomic_read(&air_info->aql_tx_pending);
}

static void ieee80211_txq_set_active(struct txq_info *txqi)
{
	struct sta_info *sta;

	if (!txqi->txq.sta)
		return;

	sta = container_of(txqi->txq.sta, struct sta_info, sta);
	sta->airtime[txqi->txq.ac].last_active = jiffies;
}

static bool ieee80211_txq_keep_active(struct txq_info *txqi)
{
	struct sta_info *sta;

	if (!txqi->txq.sta)
		return false;

	sta = container_of(txqi->txq.sta, struct sta_info, sta);
	if (ieee80211_sta_deficit(sta, txqi->txq.ac) >= 0)
		return false;

	return ieee80211_sta_keep_active(sta, txqi->txq.ac);
}
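/* Airtime-fairness DRR scheduler: return the next TXQ with a
 * non-negative deficit in this scheduling round, replenishing the
 * deficit of the queues that are passed over.
 */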
struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac)
{
	struct ieee80211_local *local = hw_to_local(hw);
	struct ieee80211_txq *ret = NULL;
	struct txq_info *txqi = NULL, *head = NULL;
	bool found_eligible_txq = false;

	spin_lock_bh(&local->active_txq_lock[ac]);

	if (!local->schedule_round[ac])
		goto out;

begin:
	txqi = list_first_entry_or_null(&local->active_txqs[ac],
					struct txq_info,
					schedule_order);
	if (!txqi)
		goto out;

	if (txqi == head) {
		if (!found_eligible_txq)
			goto out;
		else
			found_eligible_txq = false;
	}

	if (!head)
		head = txqi;

	if (txqi->txq.sta) {
		struct sta_info *sta = container_of(txqi->txq.sta,
						    struct sta_info, sta);
		bool aql_check = ieee80211_txq_airtime_check(hw, &txqi->txq);
		s32 deficit = ieee80211_sta_deficit(sta, txqi->txq.ac);

		if (aql_check)
			found_eligible_txq = true;

		if (deficit < 0)
			sta->airtime[txqi->txq.ac].deficit +=
				sta->airtime_weight;

		if (deficit < 0 || !aql_check) {
			list_move_tail(&txqi->schedule_order,
				       &local->active_txqs[txqi->txq.ac]);
			goto begin;
		}
	}

	if (txqi->schedule_round == local->schedule_round[ac])
		goto out;

	list_del_init(&txqi->schedule_order);
	txqi->schedule_round = local->schedule_round[ac];
	ret = &txqi->txq;

out:
	spin_unlock_bh(&local->active_txq_lock[ac]);
	return ret;
}
EXPORT_SYMBOL(ieee80211_next_txq);

void __ieee80211_schedule_txq(struct ieee80211_hw *hw,
			      struct ieee80211_txq *txq,
			      bool force)
{
	struct ieee80211_local *local = hw_to_local(hw);
	struct txq_info *txqi = to_txq_info(txq);
	bool has_queue;

	spin_lock_bh(&local->active_txq_lock[txq->ac]);

	has_queue = force || txq_has_queue(txq);
	if (list_empty(&txqi->schedule_order) &&
	    (has_queue || ieee80211_txq_keep_active(txqi))) {
		/* If airtime accounting is active, always enqueue STAs at the
		 * head of the list to ensure that they only get moved to the
		 * back by the airtime DRR scheduler once they have a negative
		 * deficit. A station that already has a negative deficit will
		 * get immediately moved to the back of the list on the next
		 * call to ieee80211_next_txq().
		 */
		if (txqi->txq.sta && local->airtime_flags && has_queue &&
		    wiphy_ext_feature_isset(local->hw.wiphy,
					    NL80211_EXT_FEATURE_AIRTIME_FAIRNESS))
			list_add(&txqi->schedule_order,
				 &local->active_txqs[txq->ac]);
		else
			list_add_tail(&txqi->schedule_order,
				      &local->active_txqs[txq->ac]);
		if (has_queue)
			ieee80211_txq_set_active(txqi);
	}

	spin_unlock_bh(&local->active_txq_lock[txq->ac]);
}
EXPORT_SYMBOL(__ieee80211_schedule_txq);

DEFINE_STATIC_KEY_FALSE(aql_disable);

bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw,
				 struct ieee80211_txq *txq)
{
	struct sta_info *sta;
	struct ieee80211_local *local = hw_to_local(hw);

	if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL))
		return true;

	if (static_branch_unlikely(&aql_disable))
		return true;

	if (!txq->sta)
		return true;

	if (unlikely(txq->tid == IEEE80211_NUM_TIDS))
		return true;

	sta = container_of(txq->sta, struct sta_info, sta);
	if (atomic_read(&sta->airtime[txq->ac].aql_tx_pending) <
	    sta->airtime[txq->ac].aql_limit_low)
		return true;

	if (atomic_read(&local->aql_total_pending_airtime) <
	    local->aql_threshold &&
	    atomic_read(&sta->airtime[txq->ac].aql_tx_pending) <
	    sta->airtime[txq->ac].aql_limit_high)
		return true;

	return false;
}
EXPORT_SYMBOL(ieee80211_txq_airtime_check);

static bool
ieee80211_txq_schedule_airtime_check(struct ieee80211_local *local, u8 ac)
{
	unsigned int num_txq = 0;
	struct txq_info *txq;
	u32 aql_limit;

	if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL))
		return true;

	list_for_each_entry(txq, &local->active_txqs[ac], schedule_order)
		num_txq++;

	aql_limit = (num_txq - 1) * local->aql_txq_limit_low[ac] / 2 +
		    local->aql_txq_limit_high[ac];

	return atomic_read(&local->aql_ac_pending_airtime[ac]) < aql_limit;
}
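/* For drivers with their own TXQ scheduling: check whether this TXQ
 * may transmit now without breaking airtime fairness, refilling the
 * deficit of every queue that is skipped over.
 */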
bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw,
				struct ieee80211_txq *txq)
{
	struct ieee80211_local *local = hw_to_local(hw);
	struct txq_info *iter, *tmp, *txqi = to_txq_info(txq);
	struct sta_info *sta;
	u8 ac = txq->ac;

	spin_lock_bh(&local->active_txq_lock[ac]);

	if (!txqi->txq.sta)
		goto out;

	if (list_empty(&txqi->schedule_order))
		goto out;

	if (!ieee80211_txq_schedule_airtime_check(local, ac))
		goto out;

	list_for_each_entry_safe(iter, tmp, &local->active_txqs[ac],
				 schedule_order) {
		if (iter == txqi)
			break;

		if (!iter->txq.sta) {
			list_move_tail(&iter->schedule_order,
				       &local->active_txqs[ac]);
			continue;
		}
		sta = container_of(iter->txq.sta, struct sta_info, sta);
		if (ieee80211_sta_deficit(sta, ac) < 0)
			sta->airtime[ac].deficit += sta->airtime_weight;
		list_move_tail(&iter->schedule_order, &local->active_txqs[ac]);
	}

	sta = container_of(txqi->txq.sta, struct sta_info, sta);
	if (sta->airtime[ac].deficit >= 0)
		goto out;

	sta->airtime[ac].deficit += sta->airtime_weight;
	list_move_tail(&txqi->schedule_order, &local->active_txqs[ac]);
	spin_unlock_bh(&local->active_txq_lock[ac]);

	return false;
out:
	if (!list_empty(&txqi->schedule_order))
		list_del_init(&txqi->schedule_order);
	spin_unlock_bh(&local->active_txq_lock[ac]);

	return true;
}
EXPORT_SYMBOL(ieee80211_txq_may_transmit);

void ieee80211_txq_schedule_start(struct ieee80211_hw *hw, u8 ac)
{
	struct ieee80211_local *local = hw_to_local(hw);

	spin_lock_bh(&local->active_txq_lock[ac]);

	if (ieee80211_txq_schedule_airtime_check(local, ac)) {
		local->schedule_round[ac]++;
		if (!local->schedule_round[ac])
			local->schedule_round[ac]++;
	} else {
		local->schedule_round[ac] = 0;
	}

	spin_unlock_bh(&local->active_txq_lock[ac]);
}
EXPORT_SYMBOL(ieee80211_txq_schedule_start);
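/* Common 802.3 TX entry point: resolve the destination STA, try the
 * fast-xmit path, and otherwise build a full 802.11 header for each
 * resulting frame.
 */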
void __ieee80211_subif_start_xmit(struct sk_buff *skb,
				  struct net_device *dev,
				  u32 info_flags,
				  u32 ctrl_flags,
				  u64 *cookie)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_local *local = sdata->local;
	struct sta_info *sta;
	struct sk_buff *next;
	int len = skb->len;

	if (unlikely(!ieee80211_sdata_running(sdata) || skb->len < ETH_HLEN)) {
		kfree_skb(skb);
		return;
	}

	sk_pacing_shift_update(skb->sk, sdata->local->hw.tx_sk_pacing_shift);

	rcu_read_lock();

	if (ieee80211_vif_is_mesh(&sdata->vif) &&
	    ieee80211_hw_check(&local->hw, SUPPORT_FAST_XMIT) &&
	    ieee80211_mesh_xmit_fast(sdata, skb, ctrl_flags))
		goto out;

	if (ieee80211_lookup_ra_sta(sdata, skb, &sta))
		goto out_free;

	if (IS_ERR(sta))
		sta = NULL;

	skb_set_queue_mapping(skb, ieee80211_select_queue(sdata, sta, skb));
	ieee80211_aggr_check(sdata, sta, skb);

	if (sta) {
		struct ieee80211_fast_tx *fast_tx;

		fast_tx = rcu_dereference(sta->fast_tx);

		if (fast_tx &&
		    ieee80211_xmit_fast(sdata, sta, fast_tx, skb))
			goto out;
	}

	/* the frame could be fragmented, software-encrypted, and other
	 * things so we cannot really handle checksum or GSO offload.
	 * fix it up in software before we handle anything else.
	 */
	skb = ieee80211_tx_skb_fixup(skb, 0);
	if (!skb) {
		len = 0;
		goto out;
	}

	skb_list_walk_safe(skb, skb, next) {
		skb_mark_not_on_list(skb);

		if (skb->protocol == sdata->control_port_protocol)
			ctrl_flags |= IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP;

		skb = ieee80211_build_hdr(sdata, skb, info_flags,
					  sta, ctrl_flags, cookie);
		if (IS_ERR(skb)) {
			kfree_skb_list(next);
			goto out;
		}

		dev_sw_netstats_tx_add(dev, 1, skb->len);

		ieee80211_xmit(sdata, sta, skb);
	}
	goto out;
 out_free:
	kfree_skb(skb);
	len = 0;
 out:
	if (len)
		ieee80211_tpt_led_trig_tx(local, len);
	rcu_read_unlock();
}

static int ieee80211_change_da(struct sk_buff *skb, struct sta_info *sta)
{
	struct ethhdr *eth;
	int err;

	err = skb_ensure_writable(skb, ETH_HLEN);
	if (unlikely(err))
		return err;

	eth = (void *)skb->data;
	ether_addr_copy(eth->h_dest, sta->sta.addr);

	return 0;
}

static bool ieee80211_multicast_to_unicast(struct sk_buff *skb,
					   struct net_device *dev)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	const struct ethhdr *eth = (void *)skb->data;
	const struct vlan_ethhdr *ethvlan = (void *)skb->data;
	__be16 ethertype;

	switch (sdata->vif.type) {
	case NL80211_IFTYPE_AP_VLAN:
		if (sdata->u.vlan.sta)
			return false;
		if (sdata->wdev.use_4addr)
			return false;
		fallthrough;
	case NL80211_IFTYPE_AP:
		/* check runtime toggle for this bss */
		if (!sdata->bss->multicast_to_unicast)
			return false;
		break;
	default:
		return false;
	}

	/* multicast to unicast conversion only for some payload */
	ethertype = eth->h_proto;
	if (ethertype == htons(ETH_P_8021Q) && skb->len >= VLAN_ETH_HLEN)
		ethertype = ethvlan->h_vlan_encapsulated_proto;
	switch (ethertype) {
	case htons(ETH_P_ARP):
	case htons(ETH_P_IP):
	case htons(ETH_P_IPV6):
		break;
	default:
		return false;
	}

	return true;
}

static void
ieee80211_convert_to_unicast(struct sk_buff *skb, struct net_device *dev,
			     struct sk_buff_head *queue)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_local *local = sdata->local;
	const struct ethhdr *eth = (struct ethhdr *)skb->data;
	struct sta_info *sta, *first = NULL;
	struct sk_buff *cloned_skb;

	rcu_read_lock();

	list_for_each_entry_rcu(sta, &local->sta_list, list) {
		if (sdata != sta->sdata)
			/* AP-VLAN mismatch */
			continue;
		if (unlikely(ether_addr_equal(eth->h_source, sta->sta.addr)))
			/* do not send back to source */
			continue;
		if (!first) {
			first = sta;
			continue;
		}
		cloned_skb = skb_clone(skb, GFP_ATOMIC);
		if (!cloned_skb)
			goto multicast;
		if (unlikely(ieee80211_change_da(cloned_skb, sta))) {
			dev_kfree_skb(cloned_skb);
			goto multicast;
		}
		__skb_queue_tail(queue, cloned_skb);
	}

	if (likely(first)) {
		if (unlikely(ieee80211_change_da(skb, first)))
			goto multicast;
		__skb_queue_tail(queue, skb);
	} else {
		/* no STA connected, drop */
		kfree_skb(skb);
		skb = NULL;
	}

	goto out;
multicast:
	__skb_queue_purge(queue);
	__skb_queue_tail(queue, skb);
out:
	rcu_read_unlock();
}

static void ieee80211_mlo_multicast_tx_one(struct ieee80211_sub_if_data *sdata,
					   struct sk_buff *skb, u32 ctrl_flags,
					   unsigned int link_id)
{
	struct sk_buff *out;

	out = skb_copy(skb, GFP_ATOMIC);
	if (!out)
		return;

	ctrl_flags |= u32_encode_bits(link_id, IEEE80211_TX_CTRL_MLO_LINK);
	__ieee80211_subif_start_xmit(out, sdata->dev, 0, ctrl_flags, NULL);
}

static void ieee80211_mlo_multicast_tx(struct net_device *dev,
				       struct sk_buff *skb)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	unsigned long links = sdata->vif.active_links;
	unsigned int link;
	u32 ctrl_flags = IEEE80211_TX_CTRL_MCAST_MLO_FIRST_TX;

	if (hweight16(links) == 1) {
		ctrl_flags |= u32_encode_bits(__ffs(links),
IEEE80211_TX_CTRL_MLO_LINK); __ieee80211_subif_start_xmit(skb, sdata->dev, 0, ctrl_flags, NULL); return; } for_each_set_bit(link, &links, IEEE80211_MLD_MAX_NUM_LINKS) { ieee80211_mlo_multicast_tx_one(sdata, skb, ctrl_flags, link); ctrl_flags = 0; } kfree_skb(skb); } /** * ieee80211_subif_start_xmit - netif start_xmit function for 802.3 vifs * @skb: packet to be sent * @dev: incoming interface * * On failure skb will be freed. * * Returns: the netdev TX status (but really only %NETDEV_TX_OK) */ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); const struct ethhdr *eth = (void *)skb->data; if (likely(!is_multicast_ether_addr(eth->h_dest))) goto normal; if (unlikely(!ieee80211_sdata_running(sdata))) { kfree_skb(skb); return NETDEV_TX_OK; } if (unlikely(ieee80211_multicast_to_unicast(skb, dev))) { struct sk_buff_head queue; __skb_queue_head_init(&queue); ieee80211_convert_to_unicast(skb, dev, &queue); while ((skb = __skb_dequeue(&queue))) __ieee80211_subif_start_xmit(skb, dev, 0, IEEE80211_TX_CTRL_MLO_LINK_UNSPEC, NULL); } else if (ieee80211_vif_is_mld(&sdata->vif) && ((sdata->vif.type == NL80211_IFTYPE_AP && !ieee80211_hw_check(&sdata->local->hw, MLO_MCAST_MULTI_LINK_TX)) || (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && !sdata->wdev.use_4addr))) { ieee80211_mlo_multicast_tx(dev, skb); } else { normal: __ieee80211_subif_start_xmit(skb, dev, 0, IEEE80211_TX_CTRL_MLO_LINK_UNSPEC, NULL); } return NETDEV_TX_OK; } static bool __ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct sta_info *sta, bool txpending) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_control control = {}; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_sta *pubsta = NULL; unsigned long flags; int q = info->hw_queue; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); if (local->queue_stop_reasons[q] || (!txpending && !skb_queue_empty(&local->pending[q]))) { if (txpending) skb_queue_head(&local->pending[q], skb); else skb_queue_tail(&local->pending[q], skb); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); return false; } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); if (sta && sta->uploaded) pubsta = &sta->sta; control.sta = pubsta; drv_tx(local, &control, skb); return true; } static bool ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct sta_info *sta, bool txpending) { struct ieee80211_local *local = sdata->local; struct sk_buff *next; bool ret = true; if (ieee80211_queue_skb(local, sdata, sta, skb)) return true; skb_list_walk_safe(skb, skb, next) { skb_mark_not_on_list(skb); if (!__ieee80211_tx_8023(sdata, skb, sta, txpending)) ret = false; } return ret; } static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, struct net_device *dev, struct sta_info *sta, struct ieee80211_key *key, struct sk_buff *skb) { struct ieee80211_tx_info *info; struct ieee80211_local *local = sdata->local; struct tid_ampdu_tx *tid_tx; struct sk_buff *seg, *next; unsigned int skbs = 0, len = 0; u16 queue; u8 tid; queue = ieee80211_select_queue(sdata, sta, skb); skb_set_queue_mapping(skb, queue); if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning)) && test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state)) goto out_free; skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) return; ieee80211_aggr_check(sdata, sta, skb); tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; tid_tx = 
rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); if (tid_tx) { if (!test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) { /* fall back to non-offload slow path */ __ieee80211_subif_start_xmit(skb, dev, 0, IEEE80211_TX_CTRL_MLO_LINK_UNSPEC, NULL); return; } if (tid_tx->timeout) tid_tx->last_tx = jiffies; } skb = ieee80211_tx_skb_fixup(skb, ieee80211_sdata_netdev_features(sdata)); if (!skb) return; info = IEEE80211_SKB_CB(skb); memset(info, 0, sizeof(*info)); info->hw_queue = sdata->vif.hw_queue[queue]; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); info->flags |= IEEE80211_TX_CTL_HW_80211_ENCAP; info->control.vif = &sdata->vif; if (key) info->control.hw_key = &key->conf; skb_list_walk_safe(skb, seg, next) { skbs++; len += seg->len; if (seg != skb) memcpy(IEEE80211_SKB_CB(seg), info, sizeof(*info)); } if (unlikely(skb->sk && sock_flag(skb->sk, SOCK_WIFI_STATUS))) { info->status_data = ieee80211_store_ack_skb(local, skb, &info->flags, NULL); if (info->status_data) info->status_data_idr = 1; } dev_sw_netstats_tx_add(dev, skbs, len); sta->deflink.tx_stats.packets[queue] += skbs; sta->deflink.tx_stats.bytes[queue] += len; ieee80211_tpt_led_trig_tx(local, len); ieee80211_tx_8023(sdata, skb, sta, false); return; out_free: kfree_skb(skb); } netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ethhdr *ehdr = (struct ethhdr *)skb->data; struct ieee80211_key *key; struct sta_info *sta; if (unlikely(!ieee80211_sdata_running(sdata) || skb->len < ETH_HLEN)) { kfree_skb(skb); return NETDEV_TX_OK; } rcu_read_lock(); if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) { kfree_skb(skb); goto out; } if (unlikely(IS_ERR_OR_NULL(sta) || !sta->uploaded || !test_sta_flag(sta, WLAN_STA_AUTHORIZED) || sdata->control_port_protocol == ehdr->h_proto)) goto skip_offload; key = rcu_dereference(sta->ptk[sta->ptk_idx]); if (!key) key = rcu_dereference(sdata->default_unicast_key); if (key && (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) || key->conf.cipher == WLAN_CIPHER_SUITE_TKIP)) goto skip_offload; sk_pacing_shift_update(skb->sk, sdata->local->hw.tx_sk_pacing_shift); ieee80211_8023_xmit(sdata, dev, sta, key, skb); goto out; skip_offload: ieee80211_subif_start_xmit(skb, dev); out: rcu_read_unlock(); return NETDEV_TX_OK; } struct sk_buff * ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 info_flags) { struct ieee80211_hdr *hdr; struct ieee80211_tx_data tx = { .local = sdata->local, .sdata = sdata, }; struct sta_info *sta; rcu_read_lock(); if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) { kfree_skb(skb); skb = ERR_PTR(-EINVAL); goto out; } skb = ieee80211_build_hdr(sdata, skb, info_flags, sta, IEEE80211_TX_CTRL_MLO_LINK_UNSPEC, NULL); if (IS_ERR(skb)) goto out; hdr = (void *)skb->data; tx.sta = sta_info_get(sdata, hdr->addr1); tx.skb = skb; if (ieee80211_tx_h_select_key(&tx) != TX_CONTINUE) { rcu_read_unlock(); kfree_skb(skb); return ERR_PTR(-EINVAL); } out: rcu_read_unlock(); return skb; } /* * ieee80211_clear_tx_pending may not be called in a context where * it is possible that it packets could come in again. 
*/ void ieee80211_clear_tx_pending(struct ieee80211_local *local) { struct sk_buff *skb; int i; for (i = 0; i < local->hw.queues; i++) { while ((skb = skb_dequeue(&local->pending[i])) != NULL) ieee80211_free_txskb(&local->hw, skb); } } /* * Returns false if the frame couldn't be transmitted but was queued instead, * which in this case means re-queued -- take as an indication to stop sending * more pending frames. */ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_sub_if_data *sdata; struct sta_info *sta; struct ieee80211_hdr *hdr; bool result; struct ieee80211_chanctx_conf *chanctx_conf; sdata = vif_to_sdata(info->control.vif); if (info->control.flags & IEEE80211_TX_INTCFL_NEED_TXPROCESSING) { /* update band only for non-MLD */ if (!ieee80211_vif_is_mld(&sdata->vif)) { chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (unlikely(!chanctx_conf)) { dev_kfree_skb(skb); return true; } info->band = chanctx_conf->def.chan->band; } result = ieee80211_tx(sdata, NULL, skb, true); } else if (info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) { if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) { dev_kfree_skb(skb); return true; } if (IS_ERR(sta) || (sta && !sta->uploaded)) sta = NULL; result = ieee80211_tx_8023(sdata, skb, sta, true); } else { struct sk_buff_head skbs; __skb_queue_head_init(&skbs); __skb_queue_tail(&skbs, skb); hdr = (struct ieee80211_hdr *)skb->data; sta = sta_info_get(sdata, hdr->addr1); result = __ieee80211_tx(local, &skbs, sta, true); } return result; } /* * Transmit all pending packets. Called from tasklet. */ void ieee80211_tx_pending(struct tasklet_struct *t) { struct ieee80211_local *local = from_tasklet(local, t, tx_pending_tasklet); unsigned long flags; int i; bool txok; rcu_read_lock(); spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for (i = 0; i < local->hw.queues; i++) { /* * If queue is stopped by something other than due to pending * frames, or we have no pending frames, proceed to next queue. */ if (local->queue_stop_reasons[i] || skb_queue_empty(&local->pending[i])) continue; while (!skb_queue_empty(&local->pending[i])) { struct sk_buff *skb = __skb_dequeue(&local->pending[i]); struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); if (WARN_ON(!info->control.vif)) { ieee80211_free_txskb(&local->hw, skb); continue; } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); txok = ieee80211_tx_pending_skb(local, skb); spin_lock_irqsave(&local->queue_stop_reason_lock, flags); if (!txok) break; } } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); rcu_read_unlock(); } /* functions for drivers to get certain frames */ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, struct ps_data *ps, struct sk_buff *skb, bool is_template) { u8 *pos, *tim; int aid0 = 0; int i, have_bits = 0, n1, n2; struct ieee80211_bss_conf *link_conf = link->conf; /* Generate bitmap for TIM only if there are any STAs in power save * mode. 
*/ if (atomic_read(&ps->num_sta_ps) > 0) /* in the hope that this is faster than * checking byte-for-byte */ have_bits = !bitmap_empty((unsigned long *)ps->tim, IEEE80211_MAX_AID+1); if (!is_template) { if (ps->dtim_count == 0) ps->dtim_count = link_conf->dtim_period - 1; else ps->dtim_count--; } tim = pos = skb_put(skb, 5); *pos++ = WLAN_EID_TIM; *pos++ = 3; *pos++ = ps->dtim_count; *pos++ = link_conf->dtim_period; if (ps->dtim_count == 0 && !skb_queue_empty(&ps->bc_buf)) aid0 = 1; ps->dtim_bc_mc = aid0 == 1; if (have_bits) { /* Find largest even number N1 so that bits numbered 1 through * (N1 x 8) - 1 in the bitmap are 0 and number N2 so that bits * (N2 + 1) x 8 through 2007 are 0. */ n1 = 0; for (i = 0; i < IEEE80211_MAX_TIM_LEN; i++) { if (ps->tim[i]) { n1 = i & 0xfe; break; } } n2 = n1; for (i = IEEE80211_MAX_TIM_LEN - 1; i >= n1; i--) { if (ps->tim[i]) { n2 = i; break; } } /* Bitmap control */ *pos++ = n1 | aid0; /* Part Virt Bitmap */ skb_put_data(skb, ps->tim + n1, n2 - n1 + 1); tim[1] = n2 - n1 + 4; } else { *pos++ = aid0; /* Bitmap control */ if (ieee80211_get_link_sband(link)->band != NL80211_BAND_S1GHZ) { tim[1] = 4; /* Part Virt Bitmap */ skb_put_u8(skb, 0); } } } static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, struct ps_data *ps, struct sk_buff *skb, bool is_template) { struct ieee80211_local *local = sdata->local; /* * Not very nice, but we want to allow the driver to call * ieee80211_beacon_get() as a response to the set_tim() * callback. That, however, is already invoked under the * sta_lock to guarantee consistent and race-free update * of the tim bitmap in mac80211 and the driver. */ if (local->tim_in_locked_section) { __ieee80211_beacon_add_tim(sdata, link, ps, skb, is_template); } else { spin_lock_bh(&local->tim_lock); __ieee80211_beacon_add_tim(sdata, link, ps, skb, is_template); spin_unlock_bh(&local->tim_lock); } return 0; } static void ieee80211_set_beacon_cntdwn(struct ieee80211_sub_if_data *sdata, struct beacon_data *beacon, struct ieee80211_link_data *link) { u8 *beacon_data, count, max_count = 1; struct probe_resp *resp; size_t beacon_data_len; u16 *bcn_offsets; int i; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: beacon_data = beacon->tail; beacon_data_len = beacon->tail_len; break; case NL80211_IFTYPE_ADHOC: beacon_data = beacon->head; beacon_data_len = beacon->head_len; break; case NL80211_IFTYPE_MESH_POINT: beacon_data = beacon->head; beacon_data_len = beacon->head_len; break; default: return; } resp = rcu_dereference(link->u.ap.probe_resp); bcn_offsets = beacon->cntdwn_counter_offsets; count = beacon->cntdwn_current_counter; if (link->conf->csa_active) max_count = IEEE80211_MAX_CNTDWN_COUNTERS_NUM; for (i = 0; i < max_count; ++i) { if (bcn_offsets[i]) { if (WARN_ON_ONCE(bcn_offsets[i] >= beacon_data_len)) return; beacon_data[bcn_offsets[i]] = count; } if (sdata->vif.type == NL80211_IFTYPE_AP && resp) { u16 *resp_offsets = resp->cntdwn_counter_offsets; resp->data[resp_offsets[i]] = count; } } } static u8 __ieee80211_beacon_update_cntdwn(struct beacon_data *beacon) { beacon->cntdwn_current_counter--; /* the counter should never reach 0 */ WARN_ON_ONCE(!beacon->cntdwn_current_counter); return beacon->cntdwn_current_counter; } u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_link_data *link; struct beacon_data *beacon = NULL; u8 count = 0; if (WARN_ON(link_id >= 
IEEE80211_MLD_MAX_NUM_LINKS)) return 0; rcu_read_lock(); link = rcu_dereference(sdata->link[link_id]); if (!link) goto unlock; if (sdata->vif.type == NL80211_IFTYPE_AP) beacon = rcu_dereference(link->u.ap.beacon); else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) beacon = rcu_dereference(sdata->u.ibss.presp); else if (ieee80211_vif_is_mesh(&sdata->vif)) beacon = rcu_dereference(sdata->u.mesh.beacon); if (!beacon) goto unlock; count = __ieee80211_beacon_update_cntdwn(beacon); unlock: rcu_read_unlock(); return count; } EXPORT_SYMBOL(ieee80211_beacon_update_cntdwn); void ieee80211_beacon_set_cntdwn(struct ieee80211_vif *vif, u8 counter) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct beacon_data *beacon = NULL; rcu_read_lock(); if (sdata->vif.type == NL80211_IFTYPE_AP) beacon = rcu_dereference(sdata->deflink.u.ap.beacon); else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) beacon = rcu_dereference(sdata->u.ibss.presp); else if (ieee80211_vif_is_mesh(&sdata->vif)) beacon = rcu_dereference(sdata->u.mesh.beacon); if (!beacon) goto unlock; if (counter < beacon->cntdwn_current_counter) beacon->cntdwn_current_counter = counter; unlock: rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_beacon_set_cntdwn); bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_link_data *link; struct beacon_data *beacon = NULL; u8 *beacon_data; size_t beacon_data_len; int ret = false; if (!ieee80211_sdata_running(sdata)) return false; if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) return 0; rcu_read_lock(); link = rcu_dereference(sdata->link[link_id]); if (!link) goto out; if (vif->type == NL80211_IFTYPE_AP) { beacon = rcu_dereference(link->u.ap.beacon); if (WARN_ON(!beacon || !beacon->tail)) goto out; beacon_data = beacon->tail; beacon_data_len = beacon->tail_len; } else if (vif->type == NL80211_IFTYPE_ADHOC) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; beacon = rcu_dereference(ifibss->presp); if (!beacon) goto out; beacon_data = beacon->head; beacon_data_len = beacon->head_len; } else if (vif->type == NL80211_IFTYPE_MESH_POINT) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; beacon = rcu_dereference(ifmsh->beacon); if (!beacon) goto out; beacon_data = beacon->head; beacon_data_len = beacon->head_len; } else { WARN_ON(1); goto out; } if (!beacon->cntdwn_counter_offsets[0]) goto out; if (WARN_ON_ONCE(beacon->cntdwn_counter_offsets[0] > beacon_data_len)) goto out; if (beacon_data[beacon->cntdwn_counter_offsets[0]] == 1) ret = true; out: rcu_read_unlock(); return ret; } EXPORT_SYMBOL(ieee80211_beacon_cntdwn_is_complete); static int ieee80211_beacon_protect(struct sk_buff *skb, struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link) { ieee80211_tx_result res; struct ieee80211_tx_data tx; struct sk_buff *check_skb; memset(&tx, 0, sizeof(tx)); tx.key = rcu_dereference(link->default_beacon_key); if (!tx.key) return 0; if (unlikely(tx.key->flags & KEY_FLAG_TAINTED)) { tx.key = NULL; return -EINVAL; } if (!(tx.key->conf.flags & IEEE80211_KEY_FLAG_SW_MGMT_TX) && tx.key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) IEEE80211_SKB_CB(skb)->control.hw_key = &tx.key->conf; tx.local = local; tx.sdata = sdata; __skb_queue_head_init(&tx.skbs); __skb_queue_tail(&tx.skbs, skb); res = ieee80211_tx_h_encrypt(&tx); check_skb = __skb_dequeue(&tx.skbs); /* we may crash after this, but it'd be a bug in crypto */ WARN_ON(check_skb != skb); if (WARN_ON_ONCE(res != 
TX_CONTINUE)) return -EINVAL; return 0; } static void ieee80211_beacon_get_finish(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_link_data *link, struct ieee80211_mutable_offsets *offs, struct beacon_data *beacon, struct sk_buff *skb, struct ieee80211_chanctx_conf *chanctx_conf, u16 csa_off_base) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_tx_info *info; enum nl80211_band band; struct ieee80211_tx_rate_control txrc; /* CSA offsets */ if (offs && beacon) { u16 i; for (i = 0; i < IEEE80211_MAX_CNTDWN_COUNTERS_NUM; i++) { u16 csa_off = beacon->cntdwn_counter_offsets[i]; if (!csa_off) continue; offs->cntdwn_counter_offs[i] = csa_off_base + csa_off; } } band = chanctx_conf->def.chan->band; info = IEEE80211_SKB_CB(skb); info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; info->flags |= IEEE80211_TX_CTL_NO_ACK; info->band = band; memset(&txrc, 0, sizeof(txrc)); txrc.hw = hw; txrc.sband = local->hw.wiphy->bands[band]; txrc.bss_conf = link->conf; txrc.skb = skb; txrc.reported_rate.idx = -1; if (sdata->beacon_rate_set && sdata->beacon_rateidx_mask[band]) txrc.rate_idx_mask = sdata->beacon_rateidx_mask[band]; else txrc.rate_idx_mask = sdata->rc_rateidx_mask[band]; txrc.bss = true; rate_control_get_rate(sdata, NULL, &txrc); info->control.vif = vif; info->control.flags |= u32_encode_bits(link->link_id, IEEE80211_TX_CTRL_MLO_LINK); info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT | IEEE80211_TX_CTL_ASSIGN_SEQ | IEEE80211_TX_CTL_FIRST_FRAGMENT; } static void ieee80211_beacon_add_mbssid(struct sk_buff *skb, struct beacon_data *beacon, u8 i) { if (!beacon->mbssid_ies || !beacon->mbssid_ies->cnt || i > beacon->mbssid_ies->cnt) return; if (i < beacon->mbssid_ies->cnt) { skb_put_data(skb, beacon->mbssid_ies->elem[i].data, beacon->mbssid_ies->elem[i].len); if (beacon->rnr_ies && beacon->rnr_ies->cnt) { skb_put_data(skb, beacon->rnr_ies->elem[i].data, beacon->rnr_ies->elem[i].len); for (i = beacon->mbssid_ies->cnt; i < beacon->rnr_ies->cnt; i++) skb_put_data(skb, beacon->rnr_ies->elem[i].data, beacon->rnr_ies->elem[i].len); } return; } /* i == beacon->mbssid_ies->cnt, include all MBSSID elements */ for (i = 0; i < beacon->mbssid_ies->cnt; i++) skb_put_data(skb, beacon->mbssid_ies->elem[i].data, beacon->mbssid_ies->elem[i].len); } static struct sk_buff * ieee80211_beacon_get_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_link_data *link, struct ieee80211_mutable_offsets *offs, bool is_template, struct beacon_data *beacon, struct ieee80211_chanctx_conf *chanctx_conf, u8 ema_index) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_if_ap *ap = &sdata->u.ap; struct sk_buff *skb = NULL; u16 csa_off_base = 0; int mbssid_len; if (beacon->cntdwn_counter_offsets[0]) { if (!is_template) ieee80211_beacon_update_cntdwn(vif, link->link_id); ieee80211_set_beacon_cntdwn(sdata, beacon, link); } /* headroom, head length, * tail length, maximum TIM length and multiple BSSID length */ mbssid_len = ieee80211_get_mbssid_beacon_len(beacon->mbssid_ies, beacon->rnr_ies, ema_index); skb = dev_alloc_skb(local->tx_headroom + beacon->head_len + beacon->tail_len + 256 + local->hw.extra_beacon_tailroom + mbssid_len); if (!skb) return NULL; skb_reserve(skb, local->tx_headroom); skb_put_data(skb, beacon->head, beacon->head_len); ieee80211_beacon_add_tim(sdata, link, &ap->ps, skb, is_template); if (offs) { offs->tim_offset = beacon->head_len; 
offs->tim_length = skb->len - beacon->head_len; offs->cntdwn_counter_offs[0] = beacon->cntdwn_counter_offsets[0]; if (mbssid_len) { ieee80211_beacon_add_mbssid(skb, beacon, ema_index); offs->mbssid_off = skb->len - mbssid_len; } /* for AP the csa offsets are from tail */ csa_off_base = skb->len; } if (beacon->tail) skb_put_data(skb, beacon->tail, beacon->tail_len); if (ieee80211_beacon_protect(skb, local, sdata, link) < 0) { dev_kfree_skb(skb); return NULL; } ieee80211_beacon_get_finish(hw, vif, link, offs, beacon, skb, chanctx_conf, csa_off_base); return skb; } static struct ieee80211_ema_beacons * ieee80211_beacon_get_ap_ema_list(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_link_data *link, struct ieee80211_mutable_offsets *offs, bool is_template, struct beacon_data *beacon, struct ieee80211_chanctx_conf *chanctx_conf) { struct ieee80211_ema_beacons *ema = NULL; if (!beacon->mbssid_ies || !beacon->mbssid_ies->cnt) return NULL; ema = kzalloc(struct_size(ema, bcn, beacon->mbssid_ies->cnt), GFP_ATOMIC); if (!ema) return NULL; for (ema->cnt = 0; ema->cnt < beacon->mbssid_ies->cnt; ema->cnt++) { ema->bcn[ema->cnt].skb = ieee80211_beacon_get_ap(hw, vif, link, &ema->bcn[ema->cnt].offs, is_template, beacon, chanctx_conf, ema->cnt); if (!ema->bcn[ema->cnt].skb) break; } if (ema->cnt == beacon->mbssid_ies->cnt) return ema; ieee80211_beacon_free_ema_list(ema); return NULL; } #define IEEE80211_INCLUDE_ALL_MBSSID_ELEMS -1 static struct sk_buff * __ieee80211_beacon_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_mutable_offsets *offs, bool is_template, unsigned int link_id, int ema_index, struct ieee80211_ema_beacons **ema_beacons) { struct ieee80211_local *local = hw_to_local(hw); struct beacon_data *beacon = NULL; struct sk_buff *skb = NULL; struct ieee80211_sub_if_data *sdata = NULL; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_link_data *link; rcu_read_lock(); sdata = vif_to_sdata(vif); link = rcu_dereference(sdata->link[link_id]); if (!link) goto out; chanctx_conf = rcu_dereference(link->conf->chanctx_conf); if (!ieee80211_sdata_running(sdata) || !chanctx_conf) goto out; if (offs) memset(offs, 0, sizeof(*offs)); if (sdata->vif.type == NL80211_IFTYPE_AP) { beacon = rcu_dereference(link->u.ap.beacon); if (!beacon) goto out; if (ema_beacons) { *ema_beacons = ieee80211_beacon_get_ap_ema_list(hw, vif, link, offs, is_template, beacon, chanctx_conf); } else { if (beacon->mbssid_ies && beacon->mbssid_ies->cnt) { if (ema_index >= beacon->mbssid_ies->cnt) goto out; /* End of MBSSID elements */ if (ema_index <= IEEE80211_INCLUDE_ALL_MBSSID_ELEMS) ema_index = beacon->mbssid_ies->cnt; } else { ema_index = 0; } skb = ieee80211_beacon_get_ap(hw, vif, link, offs, is_template, beacon, chanctx_conf, ema_index); } } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_hdr *hdr; beacon = rcu_dereference(ifibss->presp); if (!beacon) goto out; if (beacon->cntdwn_counter_offsets[0]) { if (!is_template) __ieee80211_beacon_update_cntdwn(beacon); ieee80211_set_beacon_cntdwn(sdata, beacon, link); } skb = dev_alloc_skb(local->tx_headroom + beacon->head_len + local->hw.extra_beacon_tailroom); if (!skb) goto out; skb_reserve(skb, local->tx_headroom); skb_put_data(skb, beacon->head, beacon->head_len); hdr = (struct ieee80211_hdr *) skb->data; hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON); ieee80211_beacon_get_finish(hw, vif, link, offs, beacon, skb, chanctx_conf, 
0); } else if (ieee80211_vif_is_mesh(&sdata->vif)) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; beacon = rcu_dereference(ifmsh->beacon); if (!beacon) goto out; if (beacon->cntdwn_counter_offsets[0]) { if (!is_template) /* TODO: For mesh csa_counter is in TU, so * decrementing it by one isn't correct, but * for now we leave it consistent with overall * mac80211's behavior. */ __ieee80211_beacon_update_cntdwn(beacon); ieee80211_set_beacon_cntdwn(sdata, beacon, link); } if (ifmsh->sync_ops) ifmsh->sync_ops->adjust_tsf(sdata, beacon); skb = dev_alloc_skb(local->tx_headroom + beacon->head_len + 256 + /* TIM IE */ beacon->tail_len + local->hw.extra_beacon_tailroom); if (!skb) goto out; skb_reserve(skb, local->tx_headroom); skb_put_data(skb, beacon->head, beacon->head_len); ieee80211_beacon_add_tim(sdata, link, &ifmsh->ps, skb, is_template); if (offs) { offs->tim_offset = beacon->head_len; offs->tim_length = skb->len - beacon->head_len; } skb_put_data(skb, beacon->tail, beacon->tail_len); ieee80211_beacon_get_finish(hw, vif, link, offs, beacon, skb, chanctx_conf, 0); } else { WARN_ON(1); goto out; } out: rcu_read_unlock(); return skb; } struct sk_buff * ieee80211_beacon_get_template(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_mutable_offsets *offs, unsigned int link_id) { return __ieee80211_beacon_get(hw, vif, offs, true, link_id, IEEE80211_INCLUDE_ALL_MBSSID_ELEMS, NULL); } EXPORT_SYMBOL(ieee80211_beacon_get_template); struct sk_buff * ieee80211_beacon_get_template_ema_index(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_mutable_offsets *offs, unsigned int link_id, u8 ema_index) { return __ieee80211_beacon_get(hw, vif, offs, true, link_id, ema_index, NULL); } EXPORT_SYMBOL(ieee80211_beacon_get_template_ema_index); void ieee80211_beacon_free_ema_list(struct ieee80211_ema_beacons *ema_beacons) { u8 i; if (!ema_beacons) return; for (i = 0; i < ema_beacons->cnt; i++) kfree_skb(ema_beacons->bcn[i].skb); kfree(ema_beacons); } EXPORT_SYMBOL(ieee80211_beacon_free_ema_list); struct ieee80211_ema_beacons * ieee80211_beacon_get_template_ema_list(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int link_id) { struct ieee80211_ema_beacons *ema_beacons = NULL; WARN_ON(__ieee80211_beacon_get(hw, vif, NULL, true, link_id, 0, &ema_beacons)); return ema_beacons; } EXPORT_SYMBOL(ieee80211_beacon_get_template_ema_list); struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 *tim_offset, u16 *tim_length, unsigned int link_id) { struct ieee80211_mutable_offsets offs = {}; struct sk_buff *bcn = __ieee80211_beacon_get(hw, vif, &offs, false, link_id, IEEE80211_INCLUDE_ALL_MBSSID_ELEMS, NULL); struct sk_buff *copy; if (!bcn) return bcn; if (tim_offset) *tim_offset = offs.tim_offset; if (tim_length) *tim_length = offs.tim_length; if (ieee80211_hw_check(hw, BEACON_TX_STATUS) || !hw_to_local(hw)->monitors) return bcn; /* send a copy to monitor interfaces */ copy = skb_copy(bcn, GFP_ATOMIC); if (!copy) return bcn; ieee80211_tx_monitor(hw_to_local(hw), copy, 1, NULL); return bcn; } EXPORT_SYMBOL(ieee80211_beacon_get_tim); struct sk_buff *ieee80211_proberesp_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct sk_buff *skb = NULL; struct probe_resp *presp = NULL; struct ieee80211_hdr *hdr; struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); if (sdata->vif.type != NL80211_IFTYPE_AP) return NULL; rcu_read_lock(); presp = rcu_dereference(sdata->deflink.u.ap.probe_resp); if (!presp) goto out; skb = 
dev_alloc_skb(presp->len); if (!skb) goto out; skb_put_data(skb, presp->data, presp->len); hdr = (struct ieee80211_hdr *) skb->data; memset(hdr->addr1, 0, sizeof(hdr->addr1)); out: rcu_read_unlock(); return skb; } EXPORT_SYMBOL(ieee80211_proberesp_get); struct sk_buff *ieee80211_get_fils_discovery_tmpl(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct sk_buff *skb = NULL; struct fils_discovery_data *tmpl = NULL; struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); if (sdata->vif.type != NL80211_IFTYPE_AP) return NULL; rcu_read_lock(); tmpl = rcu_dereference(sdata->deflink.u.ap.fils_discovery); if (!tmpl) { rcu_read_unlock(); return NULL; } skb = dev_alloc_skb(sdata->local->hw.extra_tx_headroom + tmpl->len); if (skb) { skb_reserve(skb, sdata->local->hw.extra_tx_headroom); skb_put_data(skb, tmpl->data, tmpl->len); } rcu_read_unlock(); return skb; } EXPORT_SYMBOL(ieee80211_get_fils_discovery_tmpl); struct sk_buff * ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct sk_buff *skb = NULL; struct unsol_bcast_probe_resp_data *tmpl = NULL; struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); if (sdata->vif.type != NL80211_IFTYPE_AP) return NULL; rcu_read_lock(); tmpl = rcu_dereference(sdata->deflink.u.ap.unsol_bcast_probe_resp); if (!tmpl) { rcu_read_unlock(); return NULL; } skb = dev_alloc_skb(sdata->local->hw.extra_tx_headroom + tmpl->len); if (skb) { skb_reserve(skb, sdata->local->hw.extra_tx_headroom); skb_put_data(skb, tmpl->data, tmpl->len); } rcu_read_unlock(); return skb; } EXPORT_SYMBOL(ieee80211_get_unsol_bcast_probe_resp_tmpl); struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata; struct ieee80211_pspoll *pspoll; struct ieee80211_local *local; struct sk_buff *skb; if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) return NULL; sdata = vif_to_sdata(vif); local = sdata->local; skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*pspoll)); if (!skb) return NULL; skb_reserve(skb, local->hw.extra_tx_headroom); pspoll = skb_put_zero(skb, sizeof(*pspoll)); pspoll->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_PSPOLL); pspoll->aid = cpu_to_le16(sdata->vif.cfg.aid); /* aid in PS-Poll has its two MSBs each set to 1 */ pspoll->aid |= cpu_to_le16(1 << 15 | 1 << 14); memcpy(pspoll->bssid, sdata->deflink.u.mgd.bssid, ETH_ALEN); memcpy(pspoll->ta, vif->addr, ETH_ALEN); return skb; } EXPORT_SYMBOL(ieee80211_pspoll_get); struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, int link_id, bool qos_ok) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; struct ieee80211_link_data *link = NULL; struct ieee80211_hdr_3addr *nullfunc; struct sk_buff *skb; bool qos = false; if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) return NULL; skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*nullfunc) + 2); if (!skb) return NULL; rcu_read_lock(); if (qos_ok) { struct sta_info *sta; sta = sta_info_get(sdata, vif->cfg.ap_addr); qos = sta && sta->sta.wme; } if (link_id >= 0) { link = rcu_dereference(sdata->link[link_id]); if (WARN_ON_ONCE(!link)) { rcu_read_unlock(); kfree_skb(skb); return NULL; } } skb_reserve(skb, local->hw.extra_tx_headroom); nullfunc = skb_put_zero(skb, sizeof(*nullfunc)); nullfunc->frame_control = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | IEEE80211_FCTL_TODS); if (qos) { __le16 qoshdr = cpu_to_le16(7); 
BUILD_BUG_ON((IEEE80211_STYPE_QOS_NULLFUNC | IEEE80211_STYPE_NULLFUNC) != IEEE80211_STYPE_QOS_NULLFUNC); nullfunc->frame_control |= cpu_to_le16(IEEE80211_STYPE_QOS_NULLFUNC); skb->priority = 7; skb_set_queue_mapping(skb, IEEE80211_AC_VO); skb_put_data(skb, &qoshdr, sizeof(qoshdr)); } if (link) { memcpy(nullfunc->addr1, link->conf->bssid, ETH_ALEN); memcpy(nullfunc->addr2, link->conf->addr, ETH_ALEN); memcpy(nullfunc->addr3, link->conf->bssid, ETH_ALEN); } else { memcpy(nullfunc->addr1, vif->cfg.ap_addr, ETH_ALEN); memcpy(nullfunc->addr2, vif->addr, ETH_ALEN); memcpy(nullfunc->addr3, vif->cfg.ap_addr, ETH_ALEN); } rcu_read_unlock(); return skb; } EXPORT_SYMBOL(ieee80211_nullfunc_get); struct sk_buff *ieee80211_probereq_get(struct ieee80211_hw *hw, const u8 *src_addr, const u8 *ssid, size_t ssid_len, size_t tailroom) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_hdr_3addr *hdr; struct sk_buff *skb; size_t ie_ssid_len; u8 *pos; ie_ssid_len = 2 + ssid_len; skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*hdr) + ie_ssid_len + tailroom); if (!skb) return NULL; skb_reserve(skb, local->hw.extra_tx_headroom); hdr = skb_put_zero(skb, sizeof(*hdr)); hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ); eth_broadcast_addr(hdr->addr1); memcpy(hdr->addr2, src_addr, ETH_ALEN); eth_broadcast_addr(hdr->addr3); pos = skb_put(skb, ie_ssid_len); *pos++ = WLAN_EID_SSID; *pos++ = ssid_len; if (ssid_len) memcpy(pos, ssid, ssid_len); pos += ssid_len; return skb; } EXPORT_SYMBOL(ieee80211_probereq_get); void ieee80211_rts_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const void *frame, size_t frame_len, const struct ieee80211_tx_info *frame_txctl, struct ieee80211_rts *rts) { const struct ieee80211_hdr *hdr = frame; rts->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_RTS); rts->duration = ieee80211_rts_duration(hw, vif, frame_len, frame_txctl); memcpy(rts->ra, hdr->addr1, sizeof(rts->ra)); memcpy(rts->ta, hdr->addr2, sizeof(rts->ta)); } EXPORT_SYMBOL(ieee80211_rts_get); void ieee80211_ctstoself_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const void *frame, size_t frame_len, const struct ieee80211_tx_info *frame_txctl, struct ieee80211_cts *cts) { const struct ieee80211_hdr *hdr = frame; cts->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_CTS); cts->duration = ieee80211_ctstoself_duration(hw, vif, frame_len, frame_txctl); memcpy(cts->ra, hdr->addr1, sizeof(cts->ra)); } EXPORT_SYMBOL(ieee80211_ctstoself_get); struct sk_buff * ieee80211_get_buffered_bc(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct ieee80211_local *local = hw_to_local(hw); struct sk_buff *skb = NULL; struct ieee80211_tx_data tx; struct ieee80211_sub_if_data *sdata; struct ps_data *ps; struct ieee80211_tx_info *info; struct ieee80211_chanctx_conf *chanctx_conf; sdata = vif_to_sdata(vif); rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (!chanctx_conf) goto out; if (sdata->vif.type == NL80211_IFTYPE_AP) { struct beacon_data *beacon = rcu_dereference(sdata->deflink.u.ap.beacon); if (!beacon || !beacon->head) goto out; ps = &sdata->u.ap.ps; } else if (ieee80211_vif_is_mesh(&sdata->vif)) { ps = &sdata->u.mesh.ps; } else { goto out; } if (ps->dtim_count != 0 || !ps->dtim_bc_mc) goto out; /* send buffered bc/mc only after DTIM beacon */ while (1) { skb = skb_dequeue(&ps->bc_buf); if (!skb) goto out; local->total_ps_buffered--; if (!skb_queue_empty(&ps->bc_buf) && skb->len >= 2) { struct 
ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; /* more buffered multicast/broadcast frames ==> set * MoreData flag in IEEE 802.11 header to inform PS * STAs */ hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); } if (sdata->vif.type == NL80211_IFTYPE_AP) sdata = IEEE80211_DEV_TO_SUB_IF(skb->dev); if (!ieee80211_tx_prepare(sdata, &tx, NULL, skb)) break; ieee80211_free_txskb(hw, skb); } info = IEEE80211_SKB_CB(skb); tx.flags |= IEEE80211_TX_PS_BUFFERED; info->band = chanctx_conf->def.chan->band; if (invoke_tx_handlers(&tx)) skb = NULL; out: rcu_read_unlock(); return skb; } EXPORT_SYMBOL(ieee80211_get_buffered_bc); int ieee80211_reserve_tid(struct ieee80211_sta *pubsta, u8 tid) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; int ret; u32 queues; lockdep_assert_wiphy(local->hw.wiphy); /* only some cases are supported right now */ switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: break; default: WARN_ON(1); return -EINVAL; } if (WARN_ON(tid >= IEEE80211_NUM_UPS)) return -EINVAL; if (sta->reserved_tid == tid) { ret = 0; goto out; } if (sta->reserved_tid != IEEE80211_TID_UNRESERVED) { sdata_err(sdata, "TID reservation already active\n"); ret = -EALREADY; goto out; } ieee80211_stop_vif_queues(sdata->local, sdata, IEEE80211_QUEUE_STOP_REASON_RESERVE_TID); synchronize_net(); /* Tear down BA sessions so we stop aggregating on this TID */ if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) { set_sta_flag(sta, WLAN_STA_BLOCK_BA); __ieee80211_stop_tx_ba_session(sta, tid, AGG_STOP_LOCAL_REQUEST); } queues = BIT(sdata->vif.hw_queue[ieee802_1d_to_ac[tid]]); __ieee80211_flush_queues(local, sdata, queues, false); sta->reserved_tid = tid; ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_RESERVE_TID); if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ret = 0; out: return ret; } EXPORT_SYMBOL(ieee80211_reserve_tid); void ieee80211_unreserve_tid(struct ieee80211_sta *pubsta, u8 tid) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_sub_if_data *sdata = sta->sdata; lockdep_assert_wiphy(sdata->local->hw.wiphy); /* only some cases are supported right now */ switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: break; default: WARN_ON(1); return; } if (tid != sta->reserved_tid) { sdata_err(sdata, "TID to unreserve (%d) isn't reserved\n", tid); return; } sta->reserved_tid = IEEE80211_TID_UNRESERVED; } EXPORT_SYMBOL(ieee80211_unreserve_tid); void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, int link_id, enum nl80211_band band) { const struct ieee80211_hdr *hdr = (void *)skb->data; int ac = ieee80211_ac_from_tid(tid); unsigned int link; skb_reset_mac_header(skb); skb_set_queue_mapping(skb, ac); skb->priority = tid; skb->dev = sdata->dev; BUILD_BUG_ON(IEEE80211_LINK_UNSPECIFIED < IEEE80211_MLD_MAX_NUM_LINKS); BUILD_BUG_ON(!FIELD_FIT(IEEE80211_TX_CTRL_MLO_LINK, IEEE80211_LINK_UNSPECIFIED)); if (!ieee80211_vif_is_mld(&sdata->vif)) { link = 0; } else if (link_id >= 0) { link = link_id; } else if (memcmp(sdata->vif.addr, hdr->addr2, ETH_ALEN) == 0) { /* address from the MLD */ link = IEEE80211_LINK_UNSPECIFIED; } else { /* otherwise must be addressed from a link */ rcu_read_lock(); for (link = 0; link < 
ARRAY_SIZE(sdata->vif.link_conf); link++) { struct ieee80211_bss_conf *link_conf; link_conf = rcu_dereference(sdata->vif.link_conf[link]); if (!link_conf) continue; if (memcmp(link_conf->addr, hdr->addr2, ETH_ALEN) == 0) break; } rcu_read_unlock(); if (WARN_ON_ONCE(link == ARRAY_SIZE(sdata->vif.link_conf))) link = ffs(sdata->vif.active_links) - 1; } IEEE80211_SKB_CB(skb)->control.flags |= u32_encode_bits(link, IEEE80211_TX_CTRL_MLO_LINK); /* * The other path calling ieee80211_xmit is from the tasklet, * and while we can handle concurrent transmissions locking * requirements are that we do not come into tx with bhs on. */ local_bh_disable(); IEEE80211_SKB_CB(skb)->band = band; ieee80211_xmit(sdata, NULL, skb); local_bh_enable(); } void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, int link_id) { struct ieee80211_chanctx_conf *chanctx_conf; enum nl80211_band band; rcu_read_lock(); if (!ieee80211_vif_is_mld(&sdata->vif)) { WARN_ON(link_id >= 0); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (WARN_ON(!chanctx_conf)) { rcu_read_unlock(); kfree_skb(skb); return; } band = chanctx_conf->def.chan->band; } else { WARN_ON(link_id >= 0 && !(sdata->vif.active_links & BIT(link_id))); /* MLD transmissions must not rely on the band */ band = 0; } __ieee80211_tx_skb_tid_band(sdata, skb, tid, link_id, band); rcu_read_unlock(); } int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, const u8 *dest, __be16 proto, bool unencrypted, int link_id, u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct sta_info *sta; struct sk_buff *skb; struct ethhdr *ehdr; u32 ctrl_flags = 0; u32 flags = 0; int err; /* mutex lock is only needed for incrementing the cookie counter */ lockdep_assert_wiphy(local->hw.wiphy); /* Only accept CONTROL_PORT_PROTOCOL configured in CONNECT/ASSOCIATE * or Pre-Authentication */ if (proto != sdata->control_port_protocol && proto != cpu_to_be16(ETH_P_PREAUTH)) return -EINVAL; if (proto == sdata->control_port_protocol) ctrl_flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO | IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP; if (unencrypted) flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; if (cookie) ctrl_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; flags |= IEEE80211_TX_INTFL_NL80211_FRAME_TX; skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(struct ethhdr) + len); if (!skb) return -ENOMEM; skb_reserve(skb, local->hw.extra_tx_headroom + sizeof(struct ethhdr)); skb_put_data(skb, buf, len); ehdr = skb_push(skb, sizeof(struct ethhdr)); memcpy(ehdr->h_dest, dest, ETH_ALEN); /* we may override the SA for MLO STA later */ if (link_id < 0) { ctrl_flags |= u32_encode_bits(IEEE80211_LINK_UNSPECIFIED, IEEE80211_TX_CTRL_MLO_LINK); memcpy(ehdr->h_source, sdata->vif.addr, ETH_ALEN); } else { struct ieee80211_bss_conf *link_conf; ctrl_flags |= u32_encode_bits(link_id, IEEE80211_TX_CTRL_MLO_LINK); rcu_read_lock(); link_conf = rcu_dereference(sdata->vif.link_conf[link_id]); if (!link_conf) { dev_kfree_skb(skb); rcu_read_unlock(); return -ENOLINK; } memcpy(ehdr->h_source, link_conf->addr, ETH_ALEN); rcu_read_unlock(); } ehdr->h_proto = proto; skb->dev = dev; skb->protocol = proto; skb_reset_network_header(skb); skb_reset_mac_header(skb); if (local->hw.queues < IEEE80211_NUM_ACS) goto start_xmit; /* update QoS header to prioritize control port frames if possible, * prioritization also happens for control port frames send over * AF_PACKET */ 
rcu_read_lock(); err = ieee80211_lookup_ra_sta(sdata, skb, &sta); if (err) { dev_kfree_skb(skb); rcu_read_unlock(); return err; } if (!IS_ERR(sta)) { u16 queue = ieee80211_select_queue(sdata, sta, skb); skb_set_queue_mapping(skb, queue); /* * for MLO STA, the SA should be the AP MLD address, but * the link ID has been selected already */ if (sta && sta->sta.mlo) memcpy(ehdr->h_source, sdata->vif.addr, ETH_ALEN); } rcu_read_unlock(); start_xmit: local_bh_disable(); __ieee80211_subif_start_xmit(skb, skb->dev, flags, ctrl_flags, cookie); local_bh_enable(); return 0; } int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct sk_buff *skb; skb = dev_alloc_skb(local->hw.extra_tx_headroom + len + 30 + /* header size */ 18); /* 11s header size */ if (!skb) return -ENOMEM; skb_reserve(skb, local->hw.extra_tx_headroom); skb_put_data(skb, buf, len); skb->dev = dev; skb->protocol = htons(ETH_P_802_3); skb_reset_network_header(skb); skb_reset_mac_header(skb); local_bh_disable(); __ieee80211_subif_start_xmit(skb, skb->dev, 0, IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP, NULL); local_bh_enable(); return 0; }
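As a usage illustration of the beacon getters above (not part of the source), here is a minimal, hypothetical driver sketch that fetches the next beacon together with its TIM offsets; mydrv_queue_beacon() is an assumed driver helper, and link_id 0 stands in for the default link of a non-MLD interface.

static void mydrv_beacon_update(struct ieee80211_hw *hw,
				struct ieee80211_vif *vif)
{
	u16 tim_offset, tim_length;
	struct sk_buff *skb;

	/* fetch a freshly built beacon; the TIM element was added above */
	skb = ieee80211_beacon_get_tim(hw, vif, &tim_offset, &tim_length, 0);
	if (!skb)
		return;

	/* hypothetical helper: hand the frame (and the TIM window that the
	 * hardware may rewrite) to the device's beacon slot */
	mydrv_queue_beacon(hw, skb, tim_offset, tim_length);
}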
// SPDX-License-Identifier: GPL-2.0
/*
 * Disk events - monitor disk events like media change and eject request.
 */
#include <linux/export.h>
#include <linux/moduleparam.h>
#include <linux/blkdev.h>
#include "blk.h"

struct disk_events {
	struct list_head	node;		/* all disk_event's */
	struct gendisk		*disk;		/* the associated disk */
	spinlock_t		lock;

	struct mutex		block_mutex;	/* protects blocking */
	int			block;		/* event blocking depth */
	unsigned int		pending;	/* events already sent out */
	unsigned int		clearing;	/* events being cleared */

	long			poll_msecs;	/* interval, -1 for default */
	struct delayed_work	dwork;
};

static const char *disk_events_strs[] = {
	[ilog2(DISK_EVENT_MEDIA_CHANGE)]	= "media_change",
	[ilog2(DISK_EVENT_EJECT_REQUEST)]	= "eject_request",
};

static char *disk_uevents[] = {
	[ilog2(DISK_EVENT_MEDIA_CHANGE)]	= "DISK_MEDIA_CHANGE=1",
	[ilog2(DISK_EVENT_EJECT_REQUEST)]	= "DISK_EJECT_REQUEST=1",
};

/* list of all disk_events */
static DEFINE_MUTEX(disk_events_mutex);
static LIST_HEAD(disk_events);

/* disable in-kernel polling by default */
static unsigned long disk_events_dfl_poll_msecs;

static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
{
	struct disk_events *ev = disk->ev;
	long intv_msecs = 0;

	/*
	 * If device-specific poll interval is set, always use it.  If
	 * the default is being used, poll if the POLL flag is set.
*/ if (ev->poll_msecs >= 0) intv_msecs = ev->poll_msecs; else if (disk->event_flags & DISK_EVENT_FLAG_POLL) intv_msecs = disk_events_dfl_poll_msecs; return msecs_to_jiffies(intv_msecs); } /** * disk_block_events - block and flush disk event checking * @disk: disk to block events for * * On return from this function, it is guaranteed that event checking * isn't in progress and won't happen until unblocked by * disk_unblock_events(). Events blocking is counted and the actual * unblocking happens after the matching number of unblocks are done. * * Note that this intentionally does not block event checking from * disk_clear_events(). * * CONTEXT: * Might sleep. */ void disk_block_events(struct gendisk *disk) { struct disk_events *ev = disk->ev; unsigned long flags; bool cancel; if (!ev) return; /* * Outer mutex ensures that the first blocker completes canceling * the event work before further blockers are allowed to finish. */ mutex_lock(&ev->block_mutex); spin_lock_irqsave(&ev->lock, flags); cancel = !ev->block++; spin_unlock_irqrestore(&ev->lock, flags); if (cancel) cancel_delayed_work_sync(&disk->ev->dwork); mutex_unlock(&ev->block_mutex); } static void __disk_unblock_events(struct gendisk *disk, bool check_now) { struct disk_events *ev = disk->ev; unsigned long intv; unsigned long flags; spin_lock_irqsave(&ev->lock, flags); if (WARN_ON_ONCE(ev->block <= 0)) goto out_unlock; if (--ev->block) goto out_unlock; intv = disk_events_poll_jiffies(disk); if (check_now) queue_delayed_work(system_freezable_power_efficient_wq, &ev->dwork, 0); else if (intv) queue_delayed_work(system_freezable_power_efficient_wq, &ev->dwork, intv); out_unlock: spin_unlock_irqrestore(&ev->lock, flags); } /** * disk_unblock_events - unblock disk event checking * @disk: disk to unblock events for * * Undo disk_block_events(). When the block count reaches zero, it * starts events polling if configured. * * CONTEXT: * Don't care. Safe to call from irq context. */ void disk_unblock_events(struct gendisk *disk) { if (disk->ev) __disk_unblock_events(disk, false); } /** * disk_flush_events - schedule immediate event checking and flushing * @disk: disk to check and flush events for * @mask: events to flush * * Schedule immediate event checking on @disk if not blocked. Events in * @mask are scheduled to be cleared from the driver. Note that this * doesn't clear the events from @disk->ev. * * CONTEXT: * If @mask is non-zero must be called with disk->open_mutex held. */ void disk_flush_events(struct gendisk *disk, unsigned int mask) { struct disk_events *ev = disk->ev; if (!ev) return; spin_lock_irq(&ev->lock); ev->clearing |= mask; if (!ev->block) mod_delayed_work(system_freezable_power_efficient_wq, &ev->dwork, 0); spin_unlock_irq(&ev->lock); } /* * Tell userland about new events. Only the events listed in @disk->events are * reported, and only if DISK_EVENT_FLAG_UEVENT is set. Otherwise, events are * processed internally but never get reported to userland. 
*/ static void disk_event_uevent(struct gendisk *disk, unsigned int events) { char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; int nr_events = 0, i; for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) if (events & disk->events & (1 << i)) envp[nr_events++] = disk_uevents[i]; if (nr_events) kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); } static void disk_check_events(struct disk_events *ev, unsigned int *clearing_ptr) { struct gendisk *disk = ev->disk; unsigned int clearing = *clearing_ptr; unsigned int events; unsigned long intv; /* check events */ events = disk->fops->check_events(disk, clearing); /* accumulate pending events and schedule next poll if necessary */ spin_lock_irq(&ev->lock); events &= ~ev->pending; ev->pending |= events; *clearing_ptr &= ~clearing; intv = disk_events_poll_jiffies(disk); if (!ev->block && intv) queue_delayed_work(system_freezable_power_efficient_wq, &ev->dwork, intv); spin_unlock_irq(&ev->lock); if (events & DISK_EVENT_MEDIA_CHANGE) inc_diskseq(disk); if (disk->event_flags & DISK_EVENT_FLAG_UEVENT) disk_event_uevent(disk, events); } /** * disk_clear_events - synchronously check, clear and return pending events * @disk: disk to fetch and clear events from * @mask: mask of events to be fetched and cleared * * Disk events are synchronously checked and pending events in @mask * are cleared and returned. This ignores the block count. * * CONTEXT: * Might sleep. */ static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) { struct disk_events *ev = disk->ev; unsigned int pending; unsigned int clearing = mask; if (!ev) return 0; disk_block_events(disk); /* * store the union of mask and ev->clearing on the stack so that the * race with disk_flush_events does not cause ambiguity (ev->clearing * can still be modified even if events are blocked). */ spin_lock_irq(&ev->lock); clearing |= ev->clearing; ev->clearing = 0; spin_unlock_irq(&ev->lock); disk_check_events(ev, &clearing); /* * if ev->clearing is not 0, the disk_flush_events got called in the * middle of this function, so we want to run the workfn without delay. */ __disk_unblock_events(disk, ev->clearing ? true : false); /* then, fetch and clear pending events */ spin_lock_irq(&ev->lock); pending = ev->pending & mask; ev->pending &= ~mask; spin_unlock_irq(&ev->lock); WARN_ON_ONCE(clearing & mask); return pending; } /** * disk_check_media_change - check if a removable media has been changed * @disk: gendisk to check * * Returns %true and marks the disk for a partition rescan whether a removable * media has been changed, and %false if the media did not change. */ bool disk_check_media_change(struct gendisk *disk) { unsigned int events; events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST); if (events & DISK_EVENT_MEDIA_CHANGE) { set_bit(GD_NEED_PART_SCAN, &disk->state); return true; } return false; } EXPORT_SYMBOL(disk_check_media_change); /** * disk_force_media_change - force a media change event * @disk: the disk which will raise the event * * Should be called when the media changes for @disk. Generates a uevent * and attempts to free all dentries and inodes and invalidates all block * device page cache entries in that case. 
*/ void disk_force_media_change(struct gendisk *disk) { disk_event_uevent(disk, DISK_EVENT_MEDIA_CHANGE); inc_diskseq(disk); bdev_mark_dead(disk->part0, true); set_bit(GD_NEED_PART_SCAN, &disk->state); } EXPORT_SYMBOL_GPL(disk_force_media_change); /* * Separate this part out so that a different pointer for clearing_ptr can be * passed in for disk_clear_events. */ static void disk_events_workfn(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct disk_events *ev = container_of(dwork, struct disk_events, dwork); disk_check_events(ev, &ev->clearing); } /* * A disk events enabled device has the following sysfs nodes under * its /sys/block/X/ directory. * * events : list of all supported events * events_async : list of events which can be detected w/o polling * (always empty, only for backwards compatibility) * events_poll_msecs : polling interval, 0: disable, -1: system default */ static ssize_t __disk_events_show(unsigned int events, char *buf) { const char *delim = ""; ssize_t pos = 0; int i; for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++) if (events & (1 << i)) { pos += sprintf(buf + pos, "%s%s", delim, disk_events_strs[i]); delim = " "; } if (pos) pos += sprintf(buf + pos, "\n"); return pos; } static ssize_t disk_events_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT)) return 0; return __disk_events_show(disk->events, buf); } static ssize_t disk_events_async_show(struct device *dev, struct device_attribute *attr, char *buf) { return 0; } static ssize_t disk_events_poll_msecs_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); if (!disk->ev) return sprintf(buf, "-1\n"); return sprintf(buf, "%ld\n", disk->ev->poll_msecs); } static ssize_t disk_events_poll_msecs_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct gendisk *disk = dev_to_disk(dev); long intv; if (!count || !sscanf(buf, "%ld", &intv)) return -EINVAL; if (intv < 0 && intv != -1) return -EINVAL; if (!disk->ev) return -ENODEV; disk_block_events(disk); disk->ev->poll_msecs = intv; __disk_unblock_events(disk, true); return count; } DEVICE_ATTR(events, 0444, disk_events_show, NULL); DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL); DEVICE_ATTR(events_poll_msecs, 0644, disk_events_poll_msecs_show, disk_events_poll_msecs_store); /* * The default polling interval can be specified by the kernel * parameter block.events_dfl_poll_msecs which defaults to 0 * (disable). This can also be modified runtime by writing to * /sys/module/block/parameters/events_dfl_poll_msecs. */ static int disk_events_set_dfl_poll_msecs(const char *val, const struct kernel_param *kp) { struct disk_events *ev; int ret; ret = param_set_ulong(val, kp); if (ret < 0) return ret; mutex_lock(&disk_events_mutex); list_for_each_entry(ev, &disk_events, node) disk_flush_events(ev->disk, 0); mutex_unlock(&disk_events_mutex); return 0; } static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = { .set = disk_events_set_dfl_poll_msecs, .get = param_get_ulong, }; #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "block." module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, &disk_events_dfl_poll_msecs, 0644); /* * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. 
*/ int disk_alloc_events(struct gendisk *disk) { struct disk_events *ev; if (!disk->fops->check_events || !disk->events) return 0; ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) { pr_warn("%s: failed to initialize events\n", disk->disk_name); return -ENOMEM; } INIT_LIST_HEAD(&ev->node); ev->disk = disk; spin_lock_init(&ev->lock); mutex_init(&ev->block_mutex); ev->block = 1; ev->poll_msecs = -1; INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); disk->ev = ev; return 0; } void disk_add_events(struct gendisk *disk) { if (!disk->ev) return; mutex_lock(&disk_events_mutex); list_add_tail(&disk->ev->node, &disk_events); mutex_unlock(&disk_events_mutex); /* * Block count is initialized to 1 and the following initial * unblock kicks it into action. */ __disk_unblock_events(disk, true); } void disk_del_events(struct gendisk *disk) { if (disk->ev) { disk_block_events(disk); mutex_lock(&disk_events_mutex); list_del_init(&disk->ev->node); mutex_unlock(&disk_events_mutex); } } void disk_release_events(struct gendisk *disk) { /* the block count should be 1 from disk_del_events() */ WARN_ON_ONCE(disk->ev && disk->ev->block != 1); kfree(disk->ev); }
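To show how a driver opts into this machinery, here is a hedged sketch of a removable-media driver wiring up ->check_events; mydrv_media_changed() and the other mydrv_* names are assumptions for illustration, not kernel APIs.

/* hypothetical driver-side event source polled by disk_check_events() */
static unsigned int mydrv_check_events(struct gendisk *disk,
				       unsigned int clearing)
{
	/* mydrv_media_changed() is an assumed helper querying the device */
	return mydrv_media_changed(disk->private_data) ?
		DISK_EVENT_MEDIA_CHANGE : 0;
}

static const struct block_device_operations mydrv_fops = {
	.owner		= THIS_MODULE,
	.check_events	= mydrv_check_events,
};

/* called before add_disk() in the hypothetical probe path */
static void mydrv_setup_events(struct gendisk *disk)
{
	disk->fops = &mydrv_fops;
	disk->events = DISK_EVENT_MEDIA_CHANGE;		/* events we report */
	disk->event_flags = DISK_EVENT_FLAG_UEVENT |	/* forward to udev */
			    DISK_EVENT_FLAG_POLL;	/* honor default polling */
}

With DISK_EVENT_FLAG_POLL set, the block.events_dfl_poll_msecs parameter described above controls the polling cadence for this disk unless a per-disk interval is written to events_poll_msecs.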
// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/once.h>
#include <linux/random.h>
#include <linux/module.h>

struct once_work {
	struct work_struct work;
	struct static_key_true *key;
	struct module *module;
};

static void once_deferred(struct work_struct *w)
{
	struct once_work *work;

	work = container_of(w, struct once_work, work);
	BUG_ON(!static_key_enabled(work->key));
	static_branch_disable(work->key);
	module_put(work->module);
	kfree(work);
}

static void once_disable_jump(struct static_key_true *key, struct module *mod)
{
	struct once_work *w;

	w = kmalloc(sizeof(*w), GFP_ATOMIC);
	if (!w)
		return;

	INIT_WORK(&w->work, once_deferred);
	w->key = key;
	w->module = mod;
	__module_get(mod);
	schedule_work(&w->work);
}

static DEFINE_SPINLOCK(once_lock);

bool __do_once_start(bool *done, unsigned long *flags)
	__acquires(once_lock)
{
	spin_lock_irqsave(&once_lock, *flags);
	if (*done) {
		spin_unlock_irqrestore(&once_lock, *flags);
		/* Keep sparse happy by restoring an even lock count on
		 * this lock. In case we return here, we don't call into
		 * __do_once_done but return early in the DO_ONCE() macro.
		 */
		__acquire(once_lock);
		return false;
	}

	return true;
}
EXPORT_SYMBOL(__do_once_start);

void __do_once_done(bool *done, struct static_key_true *once_key,
		    unsigned long *flags, struct module *mod)
	__releases(once_lock)
{
	*done = true;
	spin_unlock_irqrestore(&once_lock, *flags);
	once_disable_jump(once_key, mod);
}
EXPORT_SYMBOL(__do_once_done);

static DEFINE_MUTEX(once_mutex);

bool __do_once_sleepable_start(bool *done)
	__acquires(once_mutex)
{
	mutex_lock(&once_mutex);
	if (*done) {
		mutex_unlock(&once_mutex);
		/* Keep sparse happy by restoring an even lock count on
		 * this mutex. In case we return here, we don't call into
		 * __do_once_done but return early in the DO_ONCE_SLEEPABLE() macro.
		 */
		__acquire(once_mutex);
		return false;
	}

	return true;
}
EXPORT_SYMBOL(__do_once_sleepable_start);

void __do_once_sleepable_done(bool *done, struct static_key_true *once_key,
			      struct module *mod)
	__releases(once_mutex)
{
	*done = true;
	mutex_unlock(&once_mutex);
	once_disable_jump(once_key, mod);
}
EXPORT_SYMBOL(__do_once_sleepable_done);
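The exported helpers above back the DO_ONCE() macro from <linux/once.h>. A small, hypothetical use (the mydrv_* names are assumptions): only the first caller pays for seeding, after which once_disable_jump() patches the static branch out of the fast path.

#include <linux/once.h>
#include <linux/random.h>

static u32 mydrv_hash_seed;	/* hypothetical per-module seed */

static void mydrv_seed_init(u32 *seed)
{
	get_random_bytes(seed, sizeof(*seed));
}

static u32 mydrv_hash(u32 key)
{
	/* expands to __do_once_start()/__do_once_done() on the slow path */
	DO_ONCE(mydrv_seed_init, &mydrv_hash_seed);
	return key ^ mydrv_hash_seed;
}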
// SPDX-License-Identifier: GPL-2.0
/*
 * SHA1 routine optimized to do word accesses rather than byte accesses,
 * and to avoid unnecessary copies into the context array.
 *
 * This was based on the git SHA1 implementation.
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/module.h>
#include <linux/bitops.h>
#include <linux/string.h>
#include <crypto/sha1.h>
#include <linux/unaligned.h>

/*
 * If you have 32 registers or more, the compiler can (and should)
 * try to change the array[] accesses into registers. However, on
 * machines with less than ~25 registers, that won't really work,
 * and at least gcc will make an unholy mess of it.
 *
 * So to avoid that mess which just slows things down, we force
 * the stores to memory to actually happen (we might be better off
 * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as
 * suggested by Artur Skawina - that will also make gcc unable to
 * try to do the silly "optimize away loads" part because it won't
 * see what the value will be).
 *
 * Ben Herrenschmidt reports that on PPC, the C version comes close
 * to the optimized asm with this (ie on PPC you don't want that
 * 'volatile', since there are lots of registers).
 *
 * On ARM we get the best code generation by forcing a full memory barrier
 * between each SHA_ROUND, otherwise gcc happily get wild with spilling and
 * the stack frame size simply explode and performance goes down the drain.
 */

#ifdef CONFIG_X86
  #define setW(x, val) (*(volatile __u32 *)&W(x) = (val))
#elif defined(CONFIG_ARM)
  #define setW(x, val) do { W(x) = (val); __asm__("":::"memory"); } while (0)
#else
  #define setW(x, val) (W(x) = (val))
#endif

/* This "rolls" over the 512-bit array */
#define W(x) (array[(x)&15])

/*
 * Where do we get the source from? The first 16 iterations get it from
 * the input data, the next mix it from the 512-bit array.
 */
#define SHA_SRC(t) get_unaligned_be32((__u32 *)data + t)
#define SHA_MIX(t) rol32(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1)

#define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \
	__u32 TEMP = input(t); setW(t, TEMP); \
	E += TEMP + rol32(A,5) + (fn) + (constant); \
	B = ror32(B, 2); \
	TEMP = E; E = D; D = C; C = B; B = A; A = TEMP; } while (0)

#define T_0_15(t, A, B, C, D, E)  SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E )
#define T_16_19(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E )
#define T_20_39(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1, A, B, C, D, E )
#define T_40_59(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc, A, B, C, D, E )
#define T_60_79(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0xca62c1d6, A, B, C, D, E )

/**
 * sha1_transform - single block SHA1 transform (deprecated)
 *
 * @digest: 160 bit digest to update
 * @data:   512 bits of data to hash
 * @array:  16 words of workspace (see note)
 *
 * This function executes SHA-1's internal compression function.  It updates the
 * 160-bit internal state (@digest) with a single 512-bit data block (@data).
 *
 * Don't use this function.
SHA-1 is no longer considered secure. And even if * you do have to use SHA-1, this isn't the correct way to hash something with * SHA-1 as this doesn't handle padding and finalization. * * Note: If the hash is security sensitive, the caller should be sure * to clear the workspace. This is left to the caller to avoid * unnecessary clears between chained hashing operations. */ void sha1_transform(__u32 *digest, const char *data, __u32 *array) { __u32 A, B, C, D, E; unsigned int i = 0; A = digest[0]; B = digest[1]; C = digest[2]; D = digest[3]; E = digest[4]; /* Round 1 - iterations 0-16 take their input from 'data' */ for (; i < 16; ++i) T_0_15(i, A, B, C, D, E); /* Round 1 - tail. Input from 512-bit mixing array */ for (; i < 20; ++i) T_16_19(i, A, B, C, D, E); /* Round 2 */ for (; i < 40; ++i) T_20_39(i, A, B, C, D, E); /* Round 3 */ for (; i < 60; ++i) T_40_59(i, A, B, C, D, E); /* Round 4 */ for (; i < 80; ++i) T_60_79(i, A, B, C, D, E); digest[0] += A; digest[1] += B; digest[2] += C; digest[3] += D; digest[4] += E; } EXPORT_SYMBOL(sha1_transform); /** * sha1_init - initialize the vectors for a SHA1 digest * @buf: vector to initialize */ void sha1_init(__u32 *buf) { buf[0] = 0x67452301; buf[1] = 0xefcdab89; buf[2] = 0x98badcfe; buf[3] = 0x10325476; buf[4] = 0xc3d2e1f0; } EXPORT_SYMBOL(sha1_init); MODULE_DESCRIPTION("SHA-1 Algorithm"); MODULE_LICENSE("GPL");
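A hedged single-block example of the deprecated interface above, using the SHA1_* sizes from <crypto/sha1.h>; real callers must add the padding and length encoding themselves, which is exactly why the comment warns against this API. demo_sha1_one_block() is an illustrative name.

#include <crypto/sha1.h>
#include <linux/string.h>

static void demo_sha1_one_block(const u8 block[SHA1_BLOCK_SIZE],
				__u32 digest[SHA1_DIGEST_WORDS])
{
	__u32 workspace[SHA1_WORKSPACE_WORDS];

	sha1_init(digest);
	sha1_transform(digest, (const char *)block, workspace);
	/* per the note above: clear the workspace if the hash is sensitive */
	memzero_explicit(workspace, sizeof(workspace));
}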
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
 */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>

#include <linux/netfilter/xt_devgroup.h>
#include <linux/netfilter/x_tables.h>

MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Xtables: Device group match");
MODULE_ALIAS("ipt_devgroup");
MODULE_ALIAS("ip6t_devgroup");

static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
	const struct xt_devgroup_info *info = par->matchinfo;

	if (info->flags & XT_DEVGROUP_MATCH_SRC &&
	    (((info->src_group ^ xt_in(par)->group) & info->src_mask ? 1 : 0) ^
	     ((info->flags & XT_DEVGROUP_INVERT_SRC) ? 1 : 0)))
		return false;

	if (info->flags & XT_DEVGROUP_MATCH_DST &&
	    (((info->dst_group ^ xt_out(par)->group) & info->dst_mask ? 1 : 0) ^
	     ((info->flags & XT_DEVGROUP_INVERT_DST) ? 1 : 0)))
		return false;

	return true;
}

static int devgroup_mt_checkentry(const struct xt_mtchk_param *par)
{
	const struct xt_devgroup_info *info = par->matchinfo;

	if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC |
			    XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST))
		return -EINVAL;

	if (info->flags & XT_DEVGROUP_MATCH_SRC &&
	    par->hook_mask & ~((1 << NF_INET_PRE_ROUTING) |
			       (1 << NF_INET_LOCAL_IN) |
			       (1 << NF_INET_FORWARD)))
		return -EINVAL;

	if (info->flags & XT_DEVGROUP_MATCH_DST &&
	    par->hook_mask & ~((1 << NF_INET_FORWARD) |
			       (1 << NF_INET_LOCAL_OUT) |
			       (1 << NF_INET_POST_ROUTING)))
		return -EINVAL;

	return 0;
}

static struct xt_match devgroup_mt_reg __read_mostly = {
	.name		= "devgroup",
	.match		= devgroup_mt,
	.checkentry	= devgroup_mt_checkentry,
	.matchsize	= sizeof(struct xt_devgroup_info),
	.family		= NFPROTO_UNSPEC,
	.me		= THIS_MODULE
};

static int __init devgroup_mt_init(void)
{
	return xt_register_match(&devgroup_mt_reg);
}

static void __exit devgroup_mt_exit(void)
{
	xt_unregister_match(&devgroup_mt_reg);
}

module_init(devgroup_mt_init);
module_exit(devgroup_mt_exit);
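The double-XOR in devgroup_mt() is dense. Restated as a standalone predicate (an illustrative helper, not part of the module): a direction matches when the device group equals the configured group under the mask, unless the INVERT flag flips the sense.

/* illustrative restatement of one direction of devgroup_mt() */
static bool devgroup_dir_matches(u32 dev_group, u32 want_group, u32 mask,
				 bool invert)
{
	bool differs = (want_group ^ dev_group) & mask;

	return differs == invert;	/* equal-under-mask, unless inverted */
}

From userspace this is typically driven via iptables' -m devgroup match with --src-group and --dst-group value/mask pairs.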
// SPDX-License-Identifier: GPL-2.0-only
/*
 * LED support for the input layer
 *
 * Copyright 2010-2015 Samuel Thibault <samuel.thibault@ens-lyon.org>
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/leds.h>
#include <linux/input.h>

#if IS_ENABLED(CONFIG_VT)
#define VT_TRIGGER(_name)	.trigger = _name
#else
#define VT_TRIGGER(_name)	.trigger = NULL
#endif

#if IS_ENABLED(CONFIG_SND_CTL_LED)
#define AUDIO_TRIGGER(_name)	.trigger = _name
#else
#define AUDIO_TRIGGER(_name)	.trigger = NULL
#endif

static const struct {
	const char *name;
	const char *trigger;
} input_led_info[LED_CNT] = {
	[LED_NUML]	= { "numlock", VT_TRIGGER("kbd-numlock") },
	[LED_CAPSL]	= { "capslock", VT_TRIGGER("kbd-capslock") },
	[LED_SCROLLL]	= { "scrolllock", VT_TRIGGER("kbd-scrolllock") },
	[LED_COMPOSE]	= { "compose" },
	[LED_KANA]	= { "kana", VT_TRIGGER("kbd-kanalock") },
	[LED_SLEEP]	= { "sleep" },
	[LED_SUSPEND]	= { "suspend" },
	[LED_MUTE]	= { "mute", AUDIO_TRIGGER("audio-mute") },
	[LED_MISC]	= { "misc" },
	[LED_MAIL]	= { "mail" },
	[LED_CHARGING]	= { "charging" },
};

struct input_led {
	struct led_classdev cdev;
	struct input_handle *handle;
	unsigned int code;	/* One of LED_* constants */
};

struct input_leds {
	struct input_handle handle;
	unsigned int num_leds;
	struct input_led leds[] __counted_by(num_leds);
};

static enum led_brightness input_leds_brightness_get(struct led_classdev *cdev)
{
	struct input_led *led = container_of(cdev, struct input_led, cdev);
	struct input_dev *input = led->handle->dev;

	return test_bit(led->code, input->led) ?
cdev->max_brightness : 0; } static void input_leds_brightness_set(struct led_classdev *cdev, enum led_brightness brightness) { struct input_led *led = container_of(cdev, struct input_led, cdev); input_inject_event(led->handle, EV_LED, led->code, !!brightness); } static void input_leds_event(struct input_handle *handle, unsigned int type, unsigned int code, int value) { } static int input_leds_get_count(struct input_dev *dev) { unsigned int led_code; int count = 0; for_each_set_bit(led_code, dev->ledbit, LED_CNT) if (input_led_info[led_code].name) count++; return count; } static int input_leds_connect(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id) { struct input_leds *leds; struct input_led *led; unsigned int num_leds; unsigned int led_code; int led_no; int error; num_leds = input_leds_get_count(dev); if (!num_leds) return -ENXIO; leds = kzalloc(struct_size(leds, leds, num_leds), GFP_KERNEL); if (!leds) return -ENOMEM; leds->num_leds = num_leds; leds->handle.dev = dev; leds->handle.handler = handler; leds->handle.name = "leds"; leds->handle.private = leds; error = input_register_handle(&leds->handle); if (error) goto err_free_mem; error = input_open_device(&leds->handle); if (error) goto err_unregister_handle; led_no = 0; for_each_set_bit(led_code, dev->ledbit, LED_CNT) { if (!input_led_info[led_code].name) continue; led = &leds->leds[led_no]; led->handle = &leds->handle; led->code = led_code; led->cdev.name = kasprintf(GFP_KERNEL, "%s::%s", dev_name(&dev->dev), input_led_info[led_code].name); if (!led->cdev.name) { error = -ENOMEM; goto err_unregister_leds; } led->cdev.max_brightness = 1; led->cdev.brightness_get = input_leds_brightness_get; led->cdev.brightness_set = input_leds_brightness_set; led->cdev.default_trigger = input_led_info[led_code].trigger; error = led_classdev_register(&dev->dev, &led->cdev); if (error) { dev_err(&dev->dev, "failed to register LED %s: %d\n", led->cdev.name, error); kfree(led->cdev.name); goto err_unregister_leds; } led_no++; } return 0; err_unregister_leds: while (--led_no >= 0) { struct input_led *led = &leds->leds[led_no]; led_classdev_unregister(&led->cdev); kfree(led->cdev.name); } input_close_device(&leds->handle); err_unregister_handle: input_unregister_handle(&leds->handle); err_free_mem: kfree(leds); return error; } static void input_leds_disconnect(struct input_handle *handle) { struct input_leds *leds = handle->private; int i; for (i = 0; i < leds->num_leds; i++) { struct input_led *led = &leds->leds[i]; led_classdev_unregister(&led->cdev); kfree(led->cdev.name); } input_close_device(handle); input_unregister_handle(handle); kfree(leds); } static const struct input_device_id input_leds_ids[] = { { .flags = INPUT_DEVICE_ID_MATCH_EVBIT, .evbit = { BIT_MASK(EV_LED) }, }, { }, }; MODULE_DEVICE_TABLE(input, input_leds_ids); static struct input_handler input_leds_handler = { .event = input_leds_event, .connect = input_leds_connect, .disconnect = input_leds_disconnect, .name = "leds", .id_table = input_leds_ids, }; static int __init input_leds_init(void) { return input_register_handler(&input_leds_handler); } module_init(input_leds_init); static void __exit input_leds_exit(void) { input_unregister_handler(&input_leds_handler); } module_exit(input_leds_exit); MODULE_AUTHOR("Samuel Thibault <samuel.thibault@ens-lyon.org>"); MODULE_AUTHOR("Dmitry Torokhov <dmitry.torokhov@gmail.com>"); MODULE_DESCRIPTION("Input -> LEDs Bridge"); MODULE_LICENSE("GPL v2");
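For context, a keyboard driver only needs to advertise EV_LED capability for the handler above to attach and create per-LED class devices; the following is a hypothetical sketch (the mykbd_* names are assumptions).

static int mykbd_probe(struct device *parent)
{
	struct input_dev *input;

	input = devm_input_allocate_device(parent);
	if (!input)
		return -ENOMEM;

	input->name = "mykbd";
	__set_bit(EV_KEY, input->evbit);
	__set_bit(EV_LED, input->evbit);	/* matches input_leds_ids[] */
	__set_bit(LED_NUML, input->ledbit);	/* creates "inputN::numlock" */
	__set_bit(LED_CAPSL, input->ledbit);	/* creates "inputN::capslock" */

	return input_register_device(input);
}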
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  NET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Definitions for the Ethernet handlers.
 *
 * Version:	@(#)eth.h	1.0.4	05/13/93
 *
 * Authors:	Ross Biro
 *		Fred N.
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * * Relocated to include/linux where it belongs by Alan Cox * <gw4pts@gw4pts.ampr.org> */ #ifndef _LINUX_ETHERDEVICE_H #define _LINUX_ETHERDEVICE_H #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/random.h> #include <linux/crc32.h> #include <linux/unaligned.h> #include <asm/bitsperlong.h> #ifdef __KERNEL__ struct device; struct fwnode_handle; int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr); int platform_get_ethdev_address(struct device *dev, struct net_device *netdev); unsigned char *arch_get_platform_mac_address(void); int nvmem_get_mac_address(struct device *dev, void *addrbuf); int device_get_mac_address(struct device *dev, char *addr); int device_get_ethdev_address(struct device *dev, struct net_device *netdev); int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr); u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len); __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); extern const struct header_ops eth_header_ops; int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len); int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr); int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type); void eth_header_cache_update(struct hh_cache *hh, const struct net_device *dev, const unsigned char *haddr); __be16 eth_header_parse_protocol(const struct sk_buff *skb); int eth_prepare_mac_addr_change(struct net_device *dev, void *p); void eth_commit_mac_addr_change(struct net_device *dev, void *p); int eth_mac_addr(struct net_device *dev, void *p); int eth_validate_addr(struct net_device *dev); struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, unsigned int rxqs); #define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1) #define alloc_etherdev_mq(sizeof_priv, count) alloc_etherdev_mqs(sizeof_priv, count, count) struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, unsigned int txqs, unsigned int rxqs); #define devm_alloc_etherdev(dev, sizeof_priv) devm_alloc_etherdev_mqs(dev, sizeof_priv, 1, 1) struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb); int eth_gro_complete(struct sk_buff *skb, int nhoff); /* Reserved Ethernet Addresses per IEEE 802.1Q */ static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; #define eth_stp_addr eth_reserved_addr_base static const u8 eth_ipv4_mcast_addr_base[ETH_ALEN] __aligned(2) = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 }; static const u8 eth_ipv6_mcast_addr_base[ETH_ALEN] __aligned(2) = { 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 }; /** * is_link_local_ether_addr - Determine if given Ethernet address is link-local * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if address is link local reserved addr (01:80:c2:00:00:0X) per * IEEE 802.1Q 8.6.3 Frame filtering. * * Please note: addr must be aligned to u16. 
*/ static inline bool is_link_local_ether_addr(const u8 *addr) { __be16 *a = (__be16 *)addr; static const __be16 *b = (const __be16 *)eth_reserved_addr_base; static const __be16 m = cpu_to_be16(0xfff0); #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return (((*(const u32 *)addr) ^ (*(const u32 *)b)) | (__force int)((a[2] ^ b[2]) & m)) == 0; #else return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0; #endif } /** * is_zero_ether_addr - Determine if give Ethernet address is all zeros. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is all zeroes. * * Please note: addr must be aligned to u16. */ static inline bool is_zero_ether_addr(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return ((*(const u32 *)addr) | (*(const u16 *)(addr + 4))) == 0; #else return (*(const u16 *)(addr + 0) | *(const u16 *)(addr + 2) | *(const u16 *)(addr + 4)) == 0; #endif } /** * is_multicast_ether_addr - Determine if the Ethernet address is a multicast. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is a multicast address. * By definition the broadcast address is also a multicast address. */ static inline bool is_multicast_ether_addr(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) u32 a = *(const u32 *)addr; #else u16 a = *(const u16 *)addr; #endif #ifdef __BIG_ENDIAN return 0x01 & (a >> ((sizeof(a) * 8) - 8)); #else return 0x01 & a; #endif } static inline bool is_multicast_ether_addr_64bits(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 #ifdef __BIG_ENDIAN return 0x01 & ((*(const u64 *)addr) >> 56); #else return 0x01 & (*(const u64 *)addr); #endif #else return is_multicast_ether_addr(addr); #endif } /** * is_local_ether_addr - Determine if the Ethernet address is locally-assigned one (IEEE 802). * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is a local address. */ static inline bool is_local_ether_addr(const u8 *addr) { return 0x02 & addr[0]; } /** * is_broadcast_ether_addr - Determine if the Ethernet address is broadcast * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is the broadcast address. * * Please note: addr must be aligned to u16. */ static inline bool is_broadcast_ether_addr(const u8 *addr) { return (*(const u16 *)(addr + 0) & *(const u16 *)(addr + 2) & *(const u16 *)(addr + 4)) == 0xffff; } /** * is_unicast_ether_addr - Determine if the Ethernet address is unicast * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is a unicast address. */ static inline bool is_unicast_ether_addr(const u8 *addr) { return !is_multicast_ether_addr(addr); } /** * is_valid_ether_addr - Determine if the given Ethernet address is valid * @addr: Pointer to a six-byte array containing the Ethernet address * * Check that the Ethernet address (MAC) is not 00:00:00:00:00:00, is not * a multicast address, and is not FF:FF:FF:FF:FF:FF. * * Return: true if the address is valid. * * Please note: addr must be aligned to u16. */ static inline bool is_valid_ether_addr(const u8 *addr) { /* FF:FF:FF:FF:FF:FF is a multicast address so we don't need to * explicitly check for it here. 
*/ return !is_multicast_ether_addr(addr) && !is_zero_ether_addr(addr); } /** * eth_proto_is_802_3 - Determine if a given Ethertype/length is a protocol * @proto: Ethertype/length value to be tested * * Check that the value from the Ethertype/length field is a valid Ethertype. * * Return: true if the valid is an 802.3 supported Ethertype. */ static inline bool eth_proto_is_802_3(__be16 proto) { #ifndef __BIG_ENDIAN /* if CPU is little endian mask off bits representing LSB */ proto &= htons(0xFF00); #endif /* cast both to u16 and compare since LSB can be ignored */ return (__force u16)proto >= (__force u16)htons(ETH_P_802_3_MIN); } /** * eth_random_addr - Generate software assigned random Ethernet address * @addr: Pointer to a six-byte array containing the Ethernet address * * Generate a random Ethernet address (MAC) that is not multicast * and has the local assigned bit set. */ static inline void eth_random_addr(u8 *addr) { get_random_bytes(addr, ETH_ALEN); addr[0] &= 0xfe; /* clear multicast bit */ addr[0] |= 0x02; /* set local assignment bit (IEEE802) */ } /** * eth_broadcast_addr - Assign broadcast address * @addr: Pointer to a six-byte array containing the Ethernet address * * Assign the broadcast address to the given address array. */ static inline void eth_broadcast_addr(u8 *addr) { memset(addr, 0xff, ETH_ALEN); } /** * eth_zero_addr - Assign zero address * @addr: Pointer to a six-byte array containing the Ethernet address * * Assign the zero address to the given address array. */ static inline void eth_zero_addr(u8 *addr) { memset(addr, 0x00, ETH_ALEN); } /** * eth_hw_addr_random - Generate software assigned random Ethernet and * set device flag * @dev: pointer to net_device structure * * Generate a random Ethernet address (MAC) to be used by a net device * and set addr_assign_type so the state can be read by sysfs and be * used by userspace. */ static inline void eth_hw_addr_random(struct net_device *dev) { u8 addr[ETH_ALEN]; eth_random_addr(addr); __dev_addr_set(dev, addr, ETH_ALEN); dev->addr_assign_type = NET_ADDR_RANDOM; } /** * eth_hw_addr_crc - Calculate CRC from netdev_hw_addr * @ha: pointer to hardware address * * Calculate CRC from a hardware address as basis for filter hashes. */ static inline u32 eth_hw_addr_crc(struct netdev_hw_addr *ha) { return ether_crc(ETH_ALEN, ha->addr); } /** * ether_addr_copy - Copy an Ethernet address * @dst: Pointer to a six-byte array Ethernet address destination * @src: Pointer to a six-byte array Ethernet address source * * Please note: dst & src must both be aligned to u16. */ static inline void ether_addr_copy(u8 *dst, const u8 *src) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) *(u32 *)dst = *(const u32 *)src; *(u16 *)(dst + 4) = *(const u16 *)(src + 4); #else u16 *a = (u16 *)dst; const u16 *b = (const u16 *)src; a[0] = b[0]; a[1] = b[1]; a[2] = b[2]; #endif } /** * eth_hw_addr_set - Assign Ethernet address to a net_device * @dev: pointer to net_device structure * @addr: address to assign * * Assign given address to the net_device, addr_assign_type is not changed. */ static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) { __dev_addr_set(dev, addr, ETH_ALEN); } /** * eth_hw_addr_inherit - Copy dev_addr from another net_device * @dst: pointer to net_device to copy dev_addr to * @src: pointer to net_device to copy dev_addr from * * Copy the Ethernet address from one net_device to another along with * the address attributes (addr_assign_type). 
*/ static inline void eth_hw_addr_inherit(struct net_device *dst, struct net_device *src) { dst->addr_assign_type = src->addr_assign_type; eth_hw_addr_set(dst, src->dev_addr); } /** * ether_addr_equal - Compare two Ethernet addresses * @addr1: Pointer to a six-byte array containing the Ethernet address * @addr2: Pointer other six-byte array containing the Ethernet address * * Compare two Ethernet addresses, returns true if equal * * Please note: addr1 & addr2 must both be aligned to u16. */ static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) u32 fold = ((*(const u32 *)addr1) ^ (*(const u32 *)addr2)) | ((*(const u16 *)(addr1 + 4)) ^ (*(const u16 *)(addr2 + 4))); return fold == 0; #else const u16 *a = (const u16 *)addr1; const u16 *b = (const u16 *)addr2; return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) == 0; #endif } /** * ether_addr_equal_64bits - Compare two Ethernet addresses * @addr1: Pointer to an array of 8 bytes * @addr2: Pointer to an other array of 8 bytes * * Compare two Ethernet addresses, returns true if equal, false otherwise. * * The function doesn't need any conditional branches and possibly uses * word memory accesses on CPU allowing cheap unaligned memory reads. * arrays = { byte1, byte2, byte3, byte4, byte5, byte6, pad1, pad2 } * * Please note that alignment of addr1 & addr2 are only guaranteed to be 16 bits. */ static inline bool ether_addr_equal_64bits(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 fold = (*(const u64 *)addr1) ^ (*(const u64 *)addr2); #ifdef __BIG_ENDIAN return (fold >> 16) == 0; #else return (fold << 16) == 0; #endif #else return ether_addr_equal(addr1, addr2); #endif } /** * ether_addr_equal_unaligned - Compare two not u16 aligned Ethernet addresses * @addr1: Pointer to a six-byte array containing the Ethernet address * @addr2: Pointer other six-byte array containing the Ethernet address * * Compare two Ethernet addresses, returns true if equal * * Please note: Use only when any Ethernet address may not be u16 aligned. */ static inline bool ether_addr_equal_unaligned(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return ether_addr_equal(addr1, addr2); #else return memcmp(addr1, addr2, ETH_ALEN) == 0; #endif } /** * ether_addr_equal_masked - Compare two Ethernet addresses with a mask * @addr1: Pointer to a six-byte array containing the 1st Ethernet address * @addr2: Pointer to a six-byte array containing the 2nd Ethernet address * @mask: Pointer to a six-byte array containing the Ethernet address bitmask * * Compare two Ethernet addresses with a mask, returns true if for every bit * set in the bitmask the equivalent bits in the ethernet addresses are equal. * Using a mask with all bits set is a slower ether_addr_equal. 
static inline bool ether_addr_is_ipv4_mcast(const u8 *addr) { u8 mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0x80, 0x00, 0x00 }; return ether_addr_equal_masked(addr, eth_ipv4_mcast_addr_base, mask); } static inline bool ether_addr_is_ipv6_mcast(const u8 *addr) { u8 mask[ETH_ALEN] = { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }; return ether_addr_equal_masked(addr, eth_ipv6_mcast_addr_base, mask); } static inline bool ether_addr_is_ip_mcast(const u8 *addr) { return ether_addr_is_ipv4_mcast(addr) || ether_addr_is_ipv6_mcast(addr); } /** * ether_addr_to_u64 - Convert an Ethernet address into a u64 value. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: a u64 value of the address */ static inline u64 ether_addr_to_u64(const u8 *addr) { u64 u = 0; int i; for (i = 0; i < ETH_ALEN; i++) u = u << 8 | addr[i]; return u; } /** * u64_to_ether_addr - Convert a u64 to an Ethernet address. * @u: u64 to convert to an Ethernet MAC address * @addr: Pointer to a six-byte array to contain the Ethernet address */ static inline void u64_to_ether_addr(u64 u, u8 *addr) { int i; for (i = ETH_ALEN - 1; i >= 0; i--) { addr[i] = u & 0xff; u = u >> 8; } } /** * eth_addr_dec - Decrement the given MAC address * * @addr: Pointer to a six-byte array containing Ethernet address to decrement */ static inline void eth_addr_dec(u8 *addr) { u64 u = ether_addr_to_u64(addr); u--; u64_to_ether_addr(u, addr); } /** * eth_addr_inc() - Increment the given MAC address. * @addr: Pointer to a six-byte array containing Ethernet address to increment. */ static inline void eth_addr_inc(u8 *addr) { u64 u = ether_addr_to_u64(addr); u++; u64_to_ether_addr(u, addr); } /** * eth_addr_add() - Add (or subtract) an offset to/from the given MAC address. * * @offset: Offset to add. * @addr: Pointer to a six-byte array containing Ethernet address to increment. */ static inline void eth_addr_add(u8 *addr, long offset) { u64 u = ether_addr_to_u64(addr); u += offset; u64_to_ether_addr(u, addr); } /** * is_etherdev_addr - Tell if given Ethernet address belongs to the device. * @dev: Pointer to a device structure * @addr: Pointer to a six-byte array containing the Ethernet address * * Compare passed address with all addresses of the device. Return true if the * address is one of the device addresses. * * Note that this function calls ether_addr_equal_64bits() so take care of * the right padding. */ static inline bool is_etherdev_addr(const struct net_device *dev, const u8 addr[6 + 2]) { struct netdev_hw_addr *ha; bool res = false; rcu_read_lock(); for_each_dev_addr(dev, ha) { res = ether_addr_equal_64bits(addr, ha->addr); if (res) break; } rcu_read_unlock(); return res; } #endif /* __KERNEL__ */ /** * compare_ether_header - Compare two Ethernet headers * @a: Pointer to Ethernet header * @b: Pointer to Ethernet header * * Compare two Ethernet headers, returns 0 if equal. * This assumes that the network header (i.e., IP header) is 4-byte * aligned OR the platform can handle unaligned access. This is the * case for all packets coming into netif_receive_skb or similar * entry points. */ static inline unsigned long compare_ether_header(const void *a, const void *b) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 unsigned long fold; /* * We want to compare 14 bytes: * [a0 ... a13] ^ [b0 ...
b13] * Use two long XOR, ORed together, with an overlap of two bytes. * [a0 a1 a2 a3 a4 a5 a6 a7 ] ^ [b0 b1 b2 b3 b4 b5 b6 b7 ] | * [a6 a7 a8 a9 a10 a11 a12 a13] ^ [b6 b7 b8 b9 b10 b11 b12 b13] * This means the [a6 a7] ^ [b6 b7] part is done two times. */ fold = *(unsigned long *)a ^ *(unsigned long *)b; fold |= *(unsigned long *)(a + 6) ^ *(unsigned long *)(b + 6); return fold; #else u32 *a32 = (u32 *)((u8 *)a + 2); u32 *b32 = (u32 *)((u8 *)b + 2); return (*(u16 *)a ^ *(u16 *)b) | (a32[0] ^ b32[0]) | (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]); #endif } /** * eth_hw_addr_gen - Generate and assign Ethernet address to a port * @dev: pointer to port's net_device structure * @base_addr: base Ethernet address * @id: offset to add to the base address * * Generate a MAC address using a base address and an offset and assign it * to a net_device. Commonly used by switch drivers which need to compute * addresses for all their ports. addr_assign_type is not changed. */ static inline void eth_hw_addr_gen(struct net_device *dev, const u8 *base_addr, unsigned int id) { u64 u = ether_addr_to_u64(base_addr); u8 addr[ETH_ALEN]; u += id; u64_to_ether_addr(u, addr); eth_hw_addr_set(dev, addr); } /** * eth_skb_pkt_type - Assign packet type if destination address does not match * @skb: Buffer that is assigned a packet type if its destination address does not match the @dev address * @dev: Network device used to compare packet address against * * If the destination MAC address of the packet does not match the network * device address, assign an appropriate packet type. */ static inline void eth_skb_pkt_type(struct sk_buff *skb, const struct net_device *dev) { const struct ethhdr *eth = eth_hdr(skb); if (unlikely(!ether_addr_equal_64bits(eth->h_dest, dev->dev_addr))) { if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) skb->pkt_type = PACKET_BROADCAST; else skb->pkt_type = PACKET_MULTICAST; } else { skb->pkt_type = PACKET_OTHERHOST; } } } static inline struct ethhdr *eth_skb_pull_mac(struct sk_buff *skb) { struct ethhdr *eth = (struct ethhdr *)skb->data; skb_pull_inline(skb, ETH_HLEN); return eth; } /** * eth_skb_pad - Pad buffer to minimum number of octets for Ethernet frame * @skb: Buffer to pad * * An Ethernet frame should have a minimum size of 60 bytes. This function * takes short frames and pads them with zeros up to the 60 byte limit. */ static inline int eth_skb_pad(struct sk_buff *skb) { return skb_put_padto(skb, ETH_ZLEN); } #endif /* _LINUX_ETHERDEVICE_H */
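/*
 * Example (illustrative sketch): deriving per-port MAC addresses from a base
 * address, as a switch driver might do. The function and parameter names are
 * hypothetical; eth_hw_addr_gen() is the helper defined above.
 */
static inline void example_assign_port_addrs(struct net_device **port_devs,
					     unsigned int nports,
					     const u8 *base_addr)
{
	unsigned int i;

	/* Port i gets base_addr + i, via the u64 round-trip shown above. */
	for (i = 0; i < nports; i++)
		eth_hw_addr_gen(port_devs[i], base_addr, i);
}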
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_CONNTRACK_ZONES_H #define _NF_CONNTRACK_ZONES_H #include <linux/netfilter/nf_conntrack_zones_common.h> #include <net/netfilter/nf_conntrack.h> static inline const struct nf_conntrack_zone * nf_ct_zone(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_ZONES return &ct->zone; #else return &nf_ct_zone_dflt; #endif } static inline const struct nf_conntrack_zone * nf_ct_zone_init(struct nf_conntrack_zone *zone, u16 id, u8 dir, u8 flags) { zone->id = id; zone->flags = flags; zone->dir = dir; return zone; } static inline const struct nf_conntrack_zone * nf_ct_zone_tmpl(const struct nf_conn *tmpl, const struct sk_buff *skb, struct nf_conntrack_zone *tmp) { #ifdef CONFIG_NF_CONNTRACK_ZONES if (!tmpl) return &nf_ct_zone_dflt; if (tmpl->zone.flags & NF_CT_FLAG_MARK) return nf_ct_zone_init(tmp, skb->mark, tmpl->zone.dir, 0); #endif return nf_ct_zone(tmpl); } static inline void nf_ct_zone_add(struct nf_conn *ct, const struct nf_conntrack_zone *zone) { #ifdef CONFIG_NF_CONNTRACK_ZONES ct->zone = *zone; #endif } static inline bool nf_ct_zone_matches_dir(const struct nf_conntrack_zone *zone, enum ip_conntrack_dir dir) { return zone->dir & (1 << dir); } static inline u16 nf_ct_zone_id(const struct nf_conntrack_zone *zone, enum ip_conntrack_dir dir) { #ifdef CONFIG_NF_CONNTRACK_ZONES return nf_ct_zone_matches_dir(zone, dir) ? zone->id : NF_CT_DEFAULT_ZONE_ID; #else return NF_CT_DEFAULT_ZONE_ID; #endif } static inline bool nf_ct_zone_equal(const struct nf_conn *a, const struct nf_conntrack_zone *b, enum ip_conntrack_dir dir) { #ifdef CONFIG_NF_CONNTRACK_ZONES return nf_ct_zone_id(nf_ct_zone(a), dir) == nf_ct_zone_id(b, dir); #else return true; #endif } static inline bool nf_ct_zone_equal_any(const struct nf_conn *a, const struct nf_conntrack_zone *b) { #ifdef CONFIG_NF_CONNTRACK_ZONES return nf_ct_zone(a)->id == b->id; #else return true; #endif } #endif /* _NF_CONNTRACK_ZONES_H */
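/*
 * Example (illustrative sketch): how a zone confined to one direction
 * resolves per direction. The zone id is arbitrary and the function name is
 * hypothetical; it assumes CONFIG_NF_CONNTRACK_ZONES is enabled and uses only
 * the helpers defined above.
 */
static inline bool example_zone_dir(void)
{
	struct nf_conntrack_zone zone;

	/* Zone 7, valid only for packets in the original direction. */
	nf_ct_zone_init(&zone, 7, 1 << IP_CT_DIR_ORIGINAL, 0);

	/* Resolves to 7 for the original direction ... */
	return nf_ct_zone_id(&zone, IP_CT_DIR_ORIGINAL) == 7 &&
	       /* ... but to the default zone for the reply direction. */
	       nf_ct_zone_id(&zone, IP_CT_DIR_REPLY) == NF_CT_DEFAULT_ZONE_ID;
}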
/* SPDX-License-Identifier: GPL-2.0+ */ #ifndef _LINUX_XARRAY_H #define _LINUX_XARRAY_H /* * eXtensible Arrays * Copyright (c) 2017 Microsoft Corporation * Author: Matthew Wilcox <willy@infradead.org> * * See Documentation/core-api/xarray.rst for how to use the XArray. */ #include <linux/bitmap.h> #include <linux/bug.h> #include <linux/compiler.h> #include <linux/err.h> #include <linux/gfp.h> #include <linux/kconfig.h> #include <linux/limits.h> #include <linux/lockdep.h> #include <linux/rcupdate.h> #include <linux/sched/mm.h> #include <linux/spinlock.h> #include <linux/types.h> struct list_lru; /* * The bottom two bits of the entry determine how the XArray interprets * the contents: * * 00: Pointer entry * 10: Internal entry * x1: Value entry or tagged pointer * * Attempting to store internal entries in the XArray is a bug. * * Most internal entries are pointers to the next node in the tree. * The following internal entries have a special meaning: * * 0-62: Sibling entries * 256: Retry entry * 257: Zero entry * * Errors are also represented as internal entries, but use the negative * space (-4094 to -2). They're never stored in the slots array; only * returned by the normal API. */ #define BITS_PER_XA_VALUE (BITS_PER_LONG - 1) /** * xa_mk_value() - Create an XArray entry from an integer. * @v: Value to store in XArray. * * Context: Any context. * Return: An entry suitable for storing in the XArray. */ static inline void *xa_mk_value(unsigned long v) { WARN_ON((long)v < 0); return (void *)((v << 1) | 1); } /** * xa_to_value() - Get value stored in an XArray entry. * @entry: XArray entry. * * Context: Any context. * Return: The value stored in the XArray entry. */ static inline unsigned long xa_to_value(const void *entry) { return (unsigned long)entry >> 1; } /** * xa_is_value() - Determine if an entry is a value. * @entry: XArray entry. * * Context: Any context. * Return: True if the entry is a value, false if it is a pointer. */ static inline bool xa_is_value(const void *entry) { return (unsigned long)entry & 1; }
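/*
 * Example (illustrative sketch): storing a small integer as a value entry
 * rather than a pointer. xa_store()/xa_load() are declared further down this
 * header; the array and function names here are arbitrary.
 */
static DEFINE_XARRAY(example_xa);

static inline void example_value_entry(void)
{
	void *entry;

	xa_store(&example_xa, 5, xa_mk_value(42), GFP_KERNEL);
	entry = xa_load(&example_xa, 5);
	if (xa_is_value(entry))
		pr_info("stored %lu\n", xa_to_value(entry)); /* prints 42 */
}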
/** * xa_tag_pointer() - Create an XArray entry for a tagged pointer. * @p: Plain pointer. * @tag: Tag value (0, 1 or 3). * * If the user of the XArray prefers, they can tag their pointers instead * of storing value entries. Three tags are available (0, 1 and 3). * These are distinct from the xa_mark_t as they are not replicated up * through the array and cannot be searched for. * * Context: Any context. * Return: An XArray entry. */ static inline void *xa_tag_pointer(void *p, unsigned long tag) { return (void *)((unsigned long)p | tag); } /** * xa_untag_pointer() - Turn an XArray entry into a plain pointer. * @entry: XArray entry. * * If you have stored a tagged pointer in the XArray, call this function * to get the untagged version of the pointer. * * Context: Any context. * Return: A pointer. */ static inline void *xa_untag_pointer(void *entry) { return (void *)((unsigned long)entry & ~3UL); } /** * xa_pointer_tag() - Get the tag stored in an XArray entry. * @entry: XArray entry. * * If you have stored a tagged pointer in the XArray, call this function * to get the tag of that pointer. * * Context: Any context. * Return: A tag. */ static inline unsigned int xa_pointer_tag(void *entry) { return (unsigned long)entry & 3UL; } /* * xa_mk_internal() - Create an internal entry. * @v: Value to turn into an internal entry. * * Internal entries are used for a number of purposes. Entries 0-255 are * used for sibling entries (only 0-62 are used by the current code). 256 * is used for the retry entry. 257 is used for the reserved / zero entry. * Negative internal entries are used to represent errnos. Node pointers * are also tagged as internal entries in some situations. * * Context: Any context. * Return: An XArray internal entry corresponding to this value. */ static inline void *xa_mk_internal(unsigned long v) { return (void *)((v << 2) | 2); } /* * xa_to_internal() - Extract the value from an internal entry. * @entry: XArray entry. * * Context: Any context. * Return: The value which was stored in the internal entry. */ static inline unsigned long xa_to_internal(const void *entry) { return (unsigned long)entry >> 2; } /* * xa_is_internal() - Is the entry an internal entry? * @entry: XArray entry. * * Context: Any context. * Return: %true if the entry is an internal entry. */ static inline bool xa_is_internal(const void *entry) { return ((unsigned long)entry & 3) == 2; } #define XA_ZERO_ENTRY xa_mk_internal(257) /** * xa_is_zero() - Is the entry a zero entry? * @entry: Entry retrieved from the XArray * * The normal API will return NULL as the contents of a slot containing * a zero entry. You can only see zero entries by using the advanced API. * * Return: %true if the entry is a zero entry. */ static inline bool xa_is_zero(const void *entry) { return unlikely(entry == XA_ZERO_ENTRY); } /** * xa_is_err() - Report whether an XArray operation returned an error * @entry: Result from calling an XArray function * * If an XArray operation cannot complete, it will return * a special value indicating an error. This function tells you * whether an error occurred; xa_err() tells you which error occurred. * * Context: Any context. * Return: %true if the entry indicates an error. */ static inline bool xa_is_err(const void *entry) { return unlikely(xa_is_internal(entry) && entry >= xa_mk_internal(-MAX_ERRNO)); } /** * xa_err() - Turn an XArray result into an errno.
* @entry: Result from calling an XArray function. * * If an XArray operation cannot complete, it will return * a special pointer value which encodes an errno. This function extracts * the errno from the pointer value, or returns 0 if the pointer does not * represent an errno. * * Context: Any context. * Return: A negative errno or 0. */ static inline int xa_err(void *entry) { /* xa_to_internal() would not do sign extension. */ if (xa_is_err(entry)) return (long)entry >> 2; return 0; } /** * struct xa_limit - Represents a range of IDs. * @min: The lowest ID to allocate (inclusive). * @max: The maximum ID to allocate (inclusive). * * This structure is used either directly or via the XA_LIMIT() macro * to communicate the range of IDs that are valid for allocation. * Three common ranges are predefined for you: * * xa_limit_32b - [0 - UINT_MAX] * * xa_limit_31b - [0 - INT_MAX] * * xa_limit_16b - [0 - USHRT_MAX] */ struct xa_limit { u32 max; u32 min; }; #define XA_LIMIT(_min, _max) (struct xa_limit) { .min = _min, .max = _max } #define xa_limit_32b XA_LIMIT(0, UINT_MAX) #define xa_limit_31b XA_LIMIT(0, INT_MAX) #define xa_limit_16b XA_LIMIT(0, USHRT_MAX) typedef unsigned __bitwise xa_mark_t; #define XA_MARK_0 ((__force xa_mark_t)0U) #define XA_MARK_1 ((__force xa_mark_t)1U) #define XA_MARK_2 ((__force xa_mark_t)2U) #define XA_PRESENT ((__force xa_mark_t)8U) #define XA_MARK_MAX XA_MARK_2 #define XA_FREE_MARK XA_MARK_0 enum xa_lock_type { XA_LOCK_IRQ = 1, XA_LOCK_BH = 2, }; /* * Values for xa_flags. The radix tree stores its GFP flags in the xa_flags, * and we remain compatible with that. */ #define XA_FLAGS_LOCK_IRQ ((__force gfp_t)XA_LOCK_IRQ) #define XA_FLAGS_LOCK_BH ((__force gfp_t)XA_LOCK_BH) #define XA_FLAGS_TRACK_FREE ((__force gfp_t)4U) #define XA_FLAGS_ZERO_BUSY ((__force gfp_t)8U) #define XA_FLAGS_ALLOC_WRAPPED ((__force gfp_t)16U) #define XA_FLAGS_ACCOUNT ((__force gfp_t)32U) #define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \ (__force unsigned)(mark))) /* ALLOC is for a normal 0-based alloc. ALLOC1 is for a 1-based alloc */ #define XA_FLAGS_ALLOC (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK)) #define XA_FLAGS_ALLOC1 (XA_FLAGS_TRACK_FREE | XA_FLAGS_ZERO_BUSY) /** * struct xarray - The anchor of the XArray. * @xa_lock: Lock that protects the contents of the XArray. * * To use the xarray, define it statically or embed it in your data structure. * It is a very small data structure, so it does not usually make sense to * allocate it separately and keep a pointer to it in your data structure. * * You may use the xa_lock to protect your own data structures as well. */ /* * If all of the entries in the array are NULL, @xa_head is a NULL pointer. * If the only non-NULL entry in the array is at index 0, @xa_head is that * entry. If any other entry in the array is non-NULL, @xa_head points * to an @xa_node. */ struct xarray { spinlock_t xa_lock; /* private: The rest of the data structure is not to be used directly. */ gfp_t xa_flags; void __rcu * xa_head; }; #define XARRAY_INIT(name, flags) { \ .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \ .xa_flags = flags, \ .xa_head = NULL, \ } /** * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags. * @name: A string that names your XArray. * @flags: XA_FLAG values. * * This is intended for file scope definitions of XArrays. It declares * and initialises an empty XArray with the chosen name and flags. It is * equivalent to calling xa_init_flags() on the array, but it does the * initialisation at compile time instead of runtime. */ #define DEFINE_XARRAY_FLAGS(name, flags) \ struct xarray name = XARRAY_INIT(name, flags) /** * DEFINE_XARRAY() - Define an XArray. * @name: A string that names your XArray. * * This is intended for file scope definitions of XArrays. It declares * and initialises an empty XArray with the chosen name. It is equivalent * to calling xa_init() on the array, but it does the initialisation at * compile time instead of runtime. */ #define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0)
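/*
 * Example (illustrative sketch): a file-scope XArray holding pointers, using
 * the error convention described above. The object type and function names
 * are hypothetical; xa_store() and xa_err() are part of this header's API.
 */
DEFINE_XARRAY(example_objects);

struct example_obj { int id; };

static inline int example_insert(struct example_obj *obj, unsigned long index)
{
	/* xa_store() returns the previous entry, or an error entry on failure. */
	void *old = xa_store(&example_objects, index, obj, GFP_KERNEL);

	return xa_err(old); /* 0 on success, negative errno (e.g. -ENOMEM) */
}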
/** * DEFINE_XARRAY_ALLOC() - Define an XArray which allocates IDs starting at 0. * @name: A string that names your XArray. * * This is intended for file scope definitions of allocating XArrays. * See also DEFINE_XARRAY(). */ #define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC) /** * DEFINE_XARRAY_ALLOC1() - Define an XArray which allocates IDs starting at 1. * @name: A string that names your XArray. * * This is intended for file scope definitions of allocating XArrays. * See also DEFINE_XARRAY(). */ #define DEFINE_XARRAY_ALLOC1(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC1) void *xa_load(struct xarray *, unsigned long index); void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *xa_erase(struct xarray *, unsigned long index); void *xa_store_range(struct xarray *, unsigned long first, unsigned long last, void *entry, gfp_t); bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); void *xa_find(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); void *xa_find_after(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); unsigned int xa_extract(struct xarray *, void **dst, unsigned long start, unsigned long max, unsigned int n, xa_mark_t); void xa_destroy(struct xarray *); /** * xa_init_flags() - Initialise an empty XArray with flags. * @xa: XArray. * @flags: XA_FLAG values. * * If you need to initialise an XArray with special flags (e.g. you need * to take the lock from interrupt context), use this function instead * of xa_init(). * * Context: Any context. */ static inline void xa_init_flags(struct xarray *xa, gfp_t flags) { spin_lock_init(&xa->xa_lock); xa->xa_flags = flags; xa->xa_head = NULL; } /** * xa_init() - Initialise an empty XArray. * @xa: XArray. * * An empty XArray is full of NULL entries. * * Context: Any context. */ static inline void xa_init(struct xarray *xa) { xa_init_flags(xa, 0); } /** * xa_empty() - Determine if an array has any present entries. * @xa: XArray. * * Context: Any context. * Return: %true if the array contains only NULL pointers. */ static inline bool xa_empty(const struct xarray *xa) { return xa->xa_head == NULL; } /** * xa_marked() - Inquire whether any entry in this array has a mark set * @xa: Array * @mark: Mark value * * Context: Any context. * Return: %true if any entry has this mark set. */ static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) { return xa->xa_flags & XA_FLAGS_MARK(mark); } /** * xa_for_each_range() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @start: First index to retrieve from array. * @last: Last index to retrieve from array.
* * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you * want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set * to NULL and @index will have a value less than or equal to max. * * xa_for_each_range() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). * xa_for_each_range() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each() iterator instead. * The xas_for_each() iterator will expand into more inline code than * xa_for_each_range(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_range(xa, index, entry, start, last) \ for (index = start, \ entry = xa_find(xa, &index, last, XA_PRESENT); \ entry; \ entry = xa_find_after(xa, &index, last, XA_PRESENT)) /** * xa_for_each_start() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @start: First index to retrieve from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you * want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set * to NULL and @index will have a value less than or equal to max. * * xa_for_each_start() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). * xa_for_each_start() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each() iterator instead. * The xas_for_each() iterator will expand into more inline code than * xa_for_each_start(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_start(xa, index, entry, start) \ xa_for_each_range(xa, index, entry, start, ULONG_MAX) /** * xa_for_each() - Iterate over present entries in an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you want * to skip or reprocess indices. It is safe to modify the array during the * iteration. At the end of the iteration, @entry will be set to NULL and * @index will have a value less than or equal to max. * * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). xa_for_each() * will spin if it hits a retry entry; if you intend to see retry entries, * you should use the xas_for_each() iterator instead. The xas_for_each() * iterator will expand into more inline code than xa_for_each(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each(xa, index, entry) \ xa_for_each_start(xa, index, entry, 0) /** * xa_for_each_marked() - Iterate over marked entries in an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @filter: Selection criterion. 
* * During the iteration, @entry will have the value of the entry stored * in @xa at @index. The iteration will skip all entries in the array * which do not match @filter. You may modify @index during the iteration * if you want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set to * NULL and @index will have a value less than or equal to max. * * xa_for_each_marked() is O(n.log(n)) while xas_for_each_marked() is O(n). * You have to handle your own locking with xas_for_each(), and if you have * to unlock after each iteration, it will also end up being O(n.log(n)). * xa_for_each_marked() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each_marked() iterator * instead. The xas_for_each_marked() iterator will expand into more inline * code than xa_for_each_marked(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_marked(xa, index, entry, filter) \ for (index = 0, entry = xa_find(xa, &index, ULONG_MAX, filter); \ entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter)) #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) #define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) #define xa_lock_bh(xa) spin_lock_bh(&(xa)->xa_lock) #define xa_unlock_bh(xa) spin_unlock_bh(&(xa)->xa_lock) #define xa_lock_irq(xa) spin_lock_irq(&(xa)->xa_lock) #define xa_unlock_irq(xa) spin_unlock_irq(&(xa)->xa_lock) #define xa_lock_irqsave(xa, flags) \ spin_lock_irqsave(&(xa)->xa_lock, flags) #define xa_unlock_irqrestore(xa, flags) \ spin_unlock_irqrestore(&(xa)->xa_lock, flags) #define xa_lock_nested(xa, subclass) \ spin_lock_nested(&(xa)->xa_lock, subclass) #define xa_lock_bh_nested(xa, subclass) \ spin_lock_bh_nested(&(xa)->xa_lock, subclass) #define xa_lock_irq_nested(xa, subclass) \ spin_lock_irq_nested(&(xa)->xa_lock, subclass) #define xa_lock_irqsave_nested(xa, flags, subclass) \ spin_lock_irqsave_nested(&(xa)->xa_lock, flags, subclass) /* * Versions of the normal API which require the caller to hold the * xa_lock. If the GFP flags allow it, they will drop the lock to * allocate memory, then reacquire it afterwards. These functions * may also re-enable interrupts if the XArray flags indicate the * locking should be interrupt safe. */ void *__xa_erase(struct xarray *, unsigned long index); void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old, void *entry, gfp_t); int __must_check __xa_insert(struct xarray *, unsigned long index, void *entry, gfp_t); int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry, struct xa_limit, gfp_t); int __must_check __xa_alloc_cyclic(struct xarray *, u32 *id, void *entry, struct xa_limit, u32 *next, gfp_t); void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); /** * xa_store_bh() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * This function is like calling xa_store() except it disables softirqs * while holding the array lock. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The old entry at this index or xa_err() if an error happened. 
*/ static inline void *xa_store_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_bh(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_bh(xa); return curr; } /** * xa_store_irq() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * This function is like calling xa_store() except it disables interrupts * while holding the array lock. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_irq(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_irq(xa); return curr; } /** * xa_erase_bh() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The entry which used to be at this index. */ static inline void *xa_erase_bh(struct xarray *xa, unsigned long index) { void *entry; xa_lock_bh(xa); entry = __xa_erase(xa, index); xa_unlock_bh(xa); return entry; } /** * xa_erase_irq() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: The entry which used to be at this index. */ static inline void *xa_erase_irq(struct xarray *xa, unsigned long index) { void *entry; xa_lock_irq(xa); entry = __xa_erase(xa, index); xa_unlock_irq(xa); return entry; } /** * xa_cmpxchg() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * If the entry at @index is the same as @old, replace it with @entry. * If the return value is equal to @old, then the exchange was successful. * * Context: Any context. Takes and releases the xa_lock. May sleep * if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock(xa); return curr; } /** * xa_cmpxchg_bh() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * This function is like calling xa_cmpxchg() except it disables softirqs * while holding the array lock. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. 
*/ static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_bh(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_bh(xa); return curr; } /** * xa_cmpxchg_irq() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * This function is like calling xa_cmpxchg() except it disables interrupts * while holding the array lock. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_irq(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_irq(xa); return curr; } /** * xa_insert() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock(xa); return err; } /** * xa_insert_bh() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_bh(xa); return err; } /** * xa_insert_irq() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. 
*/ static inline int __must_check xa_insert_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_irq(xa); return err; } /** * xa_alloc() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline __must_check int xa_alloc(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock(xa); return err; } /** * xa_alloc_bh() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_bh(xa); return err; } /** * xa_alloc_irq() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_irq(xa); return err; } /** * xa_alloc_cyclic() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. 
A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock(xa); return err; } /** * xa_alloc_cyclic_bh() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_bh(xa); return err; } /** * xa_alloc_cyclic_irq() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_irq(xa); return err; } /** * xa_reserve() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * Ensures there is somewhere to store an entry at @index in the array. * If there is already something stored at @index, this function does * nothing. 
If there was nothing there, the entry is marked as reserved. * Loading from a reserved entry returns a %NULL pointer. * * If you do not use the entry that you have reserved, call xa_release() * or xa_erase() to free any unnecessary memory. * * Context: Any context. Takes and releases the xa_lock. * May sleep if the @gfp flags permit. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_reserve_bh() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * A softirq-disabling version of xa_reserve(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg_bh(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_reserve_irq() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * An interrupt-disabling version of xa_reserve(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg_irq(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_release() - Release a reserved entry. * @xa: XArray. * @index: Index of entry. * * After calling xa_reserve(), you can call this function to release the * reservation. If the entry at @index has been stored to, this function * will do nothing. */ static inline void xa_release(struct xarray *xa, unsigned long index) { xa_cmpxchg(xa, index, XA_ZERO_ENTRY, NULL, 0); } /* Everything below here is the Advanced API. Proceed with caution. */ /* * The xarray is constructed out of a set of 'chunks' of pointers. Choosing * the best chunk size requires some tradeoffs. A power of two recommends * itself so that we can walk the tree based purely on shifts and masks. * Generally, the larger the better; as the number of slots per level of the * tree increases, the less tall the tree needs to be. But that needs to be * balanced against the memory consumption of each node. On a 64-bit system, * xa_node is currently 576 bytes, and we get 7 of them per 4kB page. If we * doubled the number of slots per node, we'd get only 3 nodes per 4kB page. */ #ifndef XA_CHUNK_SHIFT #define XA_CHUNK_SHIFT (IS_ENABLED(CONFIG_BASE_SMALL) ? 4 : 6) #endif #define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT) #define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1) #define XA_MAX_MARKS 3 #define XA_MARK_LONGS BITS_TO_LONGS(XA_CHUNK_SIZE) /* * @count is the count of every non-NULL element in the ->slots array * whether that is a value entry, a retry entry, a user pointer, * a sibling entry or a pointer to the next level of the tree. * @nr_values is the count of every element in ->slots which is * either a value entry or a sibling of a value entry. 
*/ struct xa_node { unsigned char shift; /* Bits remaining in each slot */ unsigned char offset; /* Slot offset in parent */ unsigned char count; /* Total entry count */ unsigned char nr_values; /* Value entry count */ struct xa_node __rcu *parent; /* NULL at top of tree */ struct xarray *array; /* The array we belong to */ union { struct list_head private_list; /* For tree user */ struct rcu_head rcu_head; /* Used when freeing node */ }; void __rcu *slots[XA_CHUNK_SIZE]; union { unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS]; unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS]; }; }; void xa_dump(const struct xarray *); void xa_dump_node(const struct xa_node *); #ifdef XA_DEBUG #define XA_BUG_ON(xa, x) do { \ if (x) { \ xa_dump(xa); \ BUG(); \ } \ } while (0) #define XA_NODE_BUG_ON(node, x) do { \ if (x) { \ if (node) xa_dump_node(node); \ BUG(); \ } \ } while (0) #else #define XA_BUG_ON(xa, x) do { } while (0) #define XA_NODE_BUG_ON(node, x) do { } while (0) #endif /* Private */ static inline void *xa_head(const struct xarray *xa) { return rcu_dereference_check(xa->xa_head, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_head_locked(const struct xarray *xa) { return rcu_dereference_protected(xa->xa_head, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_entry(const struct xarray *xa, const struct xa_node *node, unsigned int offset) { XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); return rcu_dereference_check(node->slots[offset], lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_entry_locked(const struct xarray *xa, const struct xa_node *node, unsigned int offset) { XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); return rcu_dereference_protected(node->slots[offset], lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline struct xa_node *xa_parent(const struct xarray *xa, const struct xa_node *node) { return rcu_dereference_check(node->parent, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline struct xa_node *xa_parent_locked(const struct xarray *xa, const struct xa_node *node) { return rcu_dereference_protected(node->parent, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_mk_node(const struct xa_node *node) { return (void *)((unsigned long)node | 2); } /* Private */ static inline struct xa_node *xa_to_node(const void *entry) { return (struct xa_node *)((unsigned long)entry - 2); } /* Private */ static inline bool xa_is_node(const void *entry) { return xa_is_internal(entry) && (unsigned long)entry > 4096; } /* Private */ static inline void *xa_mk_sibling(unsigned int offset) { return xa_mk_internal(offset); } /* Private */ static inline unsigned long xa_to_sibling(const void *entry) { return xa_to_internal(entry); } /** * xa_is_sibling() - Is the entry a sibling entry? * @entry: Entry retrieved from the XArray * * Return: %true if the entry is a sibling entry. */ static inline bool xa_is_sibling(const void *entry) { return IS_ENABLED(CONFIG_XARRAY_MULTI) && xa_is_internal(entry) && (entry < xa_mk_sibling(XA_CHUNK_SIZE - 1)); } #define XA_RETRY_ENTRY xa_mk_internal(256) /** * xa_is_retry() - Is the entry a retry entry? * @entry: Entry retrieved from the XArray * * Return: %true if the entry is a retry entry. */ static inline bool xa_is_retry(const void *entry) { return unlikely(entry == XA_RETRY_ENTRY); } /** * xa_is_advanced() - Is the entry only permitted for the advanced API? * @entry: Entry to be stored in the XArray. * * Return: %true if the entry cannot be stored by the normal API. 
*/ static inline bool xa_is_advanced(const void *entry) { return xa_is_internal(entry) && (entry <= XA_RETRY_ENTRY); } /** * typedef xa_update_node_t - A callback function from the XArray. * @node: The node which is being processed * * This function is called every time the XArray updates the count of * present and value entries in a node. It allows advanced users to * maintain the private_list in the node. * * Context: The xa_lock is held and interrupts may be disabled. * Implementations should not drop the xa_lock, nor re-enable * interrupts. */ typedef void (*xa_update_node_t)(struct xa_node *node); void xa_delete_node(struct xa_node *, xa_update_node_t); /* * The xa_state is opaque to its users. It contains various different pieces * of state involved in the current operation on the XArray. It should be * declared on the stack and passed between the various internal routines. * The various elements in it should not be accessed directly, but only * through the provided accessor functions. The below documentation is for * the benefit of those working on the code, not for users of the XArray. * * @xa_node usually points to the xa_node containing the slot we're operating * on (and @xa_offset is the offset in the slots array). If there is a * single entry in the array at index 0, there are no allocated xa_nodes to * point to, and so we store %NULL in @xa_node. @xa_node is set to * the value %XAS_RESTART if the xa_state is not walked to the correct * position in the tree of nodes for this operation. If an error occurs * during an operation, it is set to an %XAS_ERROR value. If we run off the * end of the allocated nodes, it is set to %XAS_BOUNDS. */ struct xa_state { struct xarray *xa; unsigned long xa_index; unsigned char xa_shift; unsigned char xa_sibs; unsigned char xa_offset; unsigned char xa_pad; /* Helps gcc generate better code */ struct xa_node *xa_node; struct xa_node *xa_alloc; xa_update_node_t xa_update; struct list_lru *xa_lru; }; /* * We encode errnos in the xas->xa_node. If an error has happened, we need to * drop the lock to fix it, and once we've done so the xa_state is invalid. */ #define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)errno << 2) | 2UL)) #define XAS_BOUNDS ((struct xa_node *)1UL) #define XAS_RESTART ((struct xa_node *)3UL) #define __XA_STATE(array, index, shift, sibs) { \ .xa = array, \ .xa_index = index, \ .xa_shift = shift, \ .xa_sibs = sibs, \ .xa_offset = 0, \ .xa_pad = 0, \ .xa_node = XAS_RESTART, \ .xa_alloc = NULL, \ .xa_update = NULL, \ .xa_lru = NULL, \ } /** * XA_STATE() - Declare an XArray operation state. * @name: Name of this operation state (usually xas). * @array: Array to operate on. * @index: Initial index of interest. * * Declare and initialise an xa_state on the stack. */ #define XA_STATE(name, array, index) \ struct xa_state name = __XA_STATE(array, index, 0, 0) /** * XA_STATE_ORDER() - Declare an XArray operation state. * @name: Name of this operation state (usually xas). * @array: Array to operate on. * @index: Initial index of interest. * @order: Order of entry. * * Declare and initialise an xa_state on the stack. 
This variant of * XA_STATE() allows you to specify the 'order' of the element you * want to operate on. */ #define XA_STATE_ORDER(name, array, index, order) \ struct xa_state name = __XA_STATE(array, \ (index >> order) << order, \ order - (order % XA_CHUNK_SHIFT), \ (1U << (order % XA_CHUNK_SHIFT)) - 1) #define xas_marked(xas, mark) xa_marked((xas)->xa, (mark)) #define xas_trylock(xas) xa_trylock((xas)->xa) #define xas_lock(xas) xa_lock((xas)->xa) #define xas_unlock(xas) xa_unlock((xas)->xa) #define xas_lock_bh(xas) xa_lock_bh((xas)->xa) #define xas_unlock_bh(xas) xa_unlock_bh((xas)->xa) #define xas_lock_irq(xas) xa_lock_irq((xas)->xa) #define xas_unlock_irq(xas) xa_unlock_irq((xas)->xa) #define xas_lock_irqsave(xas, flags) \ xa_lock_irqsave((xas)->xa, flags) #define xas_unlock_irqrestore(xas, flags) \ xa_unlock_irqrestore((xas)->xa, flags) /** * xas_error() - Return an errno stored in the xa_state. * @xas: XArray operation state. * * Return: 0 if no error has been noted. A negative errno if one has. */ static inline int xas_error(const struct xa_state *xas) { return xa_err(xas->xa_node); } /** * xas_set_err() - Note an error in the xa_state. * @xas: XArray operation state. * @err: Negative error number. * * Only call this function with a negative @err; zero or positive errors * will probably not behave the way you think they should. If you want * to clear the error from an xa_state, use xas_reset(). */ static inline void xas_set_err(struct xa_state *xas, long err) { xas->xa_node = XA_ERROR(err); } /** * xas_invalid() - Is the xas in a retry or error state? * @xas: XArray operation state. * * Return: %true if the xas cannot be used for operations. */ static inline bool xas_invalid(const struct xa_state *xas) { return (unsigned long)xas->xa_node & 3; } /** * xas_valid() - Is the xas a valid cursor into the array? * @xas: XArray operation state. * * Return: %true if the xas can be used for operations. */ static inline bool xas_valid(const struct xa_state *xas) { return !xas_invalid(xas); } /** * xas_is_node() - Does the xas point to a node? * @xas: XArray operation state. * * Return: %true if the xas currently references a node. */ static inline bool xas_is_node(const struct xa_state *xas) { return xas_valid(xas) && xas->xa_node; } /* True if the pointer is something other than a node */ static inline bool xas_not_node(struct xa_node *node) { return ((unsigned long)node & 3) || !node; } /* True if the node represents RESTART or an error */ static inline bool xas_frozen(struct xa_node *node) { return (unsigned long)node & 2; } /* True if the node represents head-of-tree, RESTART or BOUNDS */ static inline bool xas_top(struct xa_node *node) { return node <= XAS_RESTART; } /** * xas_reset() - Reset an XArray operation state. * @xas: XArray operation state. * * Resets the error or walk state of the @xas so future walks of the * array will start from the root. Use this if you have dropped the * xarray lock and want to reuse the xa_state. * * Context: Any context. */ static inline void xas_reset(struct xa_state *xas) { xas->xa_node = XAS_RESTART; } /** * xas_retry() - Retry the operation if appropriate. * @xas: XArray operation state. * @entry: Entry from xarray. * * The advanced functions may sometimes return an internal entry, such as * a retry entry or a zero entry. This function sets up the @xas to restart * the walk from the head of the array if needed. * * Context: Any context. * Return: true if the operation needs to be retried. 
*/ static inline bool xas_retry(struct xa_state *xas, const void *entry) { if (xa_is_zero(entry)) return true; if (!xa_is_retry(entry)) return false; xas_reset(xas); return true; } void *xas_load(struct xa_state *); void *xas_store(struct xa_state *, void *entry); void *xas_find(struct xa_state *, unsigned long max); void *xas_find_conflict(struct xa_state *); bool xas_get_mark(const struct xa_state *, xa_mark_t); void xas_set_mark(const struct xa_state *, xa_mark_t); void xas_clear_mark(const struct xa_state *, xa_mark_t); void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t); void xas_init_marks(const struct xa_state *); bool xas_nomem(struct xa_state *, gfp_t); void xas_destroy(struct xa_state *); void xas_pause(struct xa_state *); void xas_create_range(struct xa_state *); #ifdef CONFIG_XARRAY_MULTI int xa_get_order(struct xarray *, unsigned long index); int xas_get_order(struct xa_state *xas); void xas_split(struct xa_state *, void *entry, unsigned int order); void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t); void xas_try_split(struct xa_state *xas, void *entry, unsigned int order); unsigned int xas_try_split_min_order(unsigned int order); #else static inline int xa_get_order(struct xarray *xa, unsigned long index) { return 0; } static inline int xas_get_order(struct xa_state *xas) { return 0; } static inline void xas_split(struct xa_state *xas, void *entry, unsigned int order) { xas_store(xas, entry); } static inline void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, gfp_t gfp) { } static inline void xas_try_split(struct xa_state *xas, void *entry, unsigned int order) { } static inline unsigned int xas_try_split_min_order(unsigned int order) { return 0; } #endif /** * xas_reload() - Refetch an entry from the xarray. * @xas: XArray operation state. * * Use this function to check that a previously loaded entry still has * the same value. This is useful for the lockless pagecache lookup where * we walk the array with only the RCU lock to protect us, lock the page, * then check that the page hasn't moved since we looked it up. * * The caller guarantees that @xas is still valid. If it may be in an * error or restart state, call xas_load() instead. * * Return: The entry at this location in the xarray. */ static inline void *xas_reload(struct xa_state *xas) { struct xa_node *node = xas->xa_node; void *entry; char offset; if (!node) return xa_head(xas->xa); if (IS_ENABLED(CONFIG_XARRAY_MULTI)) { offset = (xas->xa_index >> node->shift) & XA_CHUNK_MASK; entry = xa_entry(xas->xa, node, offset); if (!xa_is_sibling(entry)) return entry; offset = xa_to_sibling(entry); } else { offset = xas->xa_offset; } return xa_entry(xas->xa, node, offset); } /** * xas_set() - Set up XArray operation state for a different index. * @xas: XArray operation state. * @index: New index into the XArray. * * Move the operation state to refer to a different index. This will * have the effect of starting a walk from the top; see xas_next() * to move to an adjacent index. */ static inline void xas_set(struct xa_state *xas, unsigned long index) { xas->xa_index = index; xas->xa_node = XAS_RESTART; } /** * xas_advance() - Skip over sibling entries. * @xas: XArray operation state. * @index: Index of last sibling entry. * * Move the operation state to refer to the last sibling entry. * This is useful for loops that normally want to see sibling * entries but sometimes want to skip them. 
Use xas_set() if you * want to move to an index which is not part of this entry. */ static inline void xas_advance(struct xa_state *xas, unsigned long index) { unsigned char shift = xas_is_node(xas) ? xas->xa_node->shift : 0; xas->xa_index = index; xas->xa_offset = (index >> shift) & XA_CHUNK_MASK; } /** * xas_set_order() - Set up XArray operation state for a multislot entry. * @xas: XArray operation state. * @index: Target of the operation. * @order: Entry occupies 2^@order indices. */ static inline void xas_set_order(struct xa_state *xas, unsigned long index, unsigned int order) { #ifdef CONFIG_XARRAY_MULTI xas->xa_index = order < BITS_PER_LONG ? (index >> order) << order : 0; xas->xa_shift = order - (order % XA_CHUNK_SHIFT); xas->xa_sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; xas->xa_node = XAS_RESTART; #else BUG_ON(order > 0); xas_set(xas, index); #endif } /** * xas_set_update() - Set up XArray operation state for a callback. * @xas: XArray operation state. * @update: Function to call when updating a node. * * The XArray can notify a caller after it has updated an xa_node. * This is advanced functionality and is only needed by the page * cache and swap cache. */ static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update) { xas->xa_update = update; } static inline void xas_set_lru(struct xa_state *xas, struct list_lru *lru) { xas->xa_lru = lru; } /** * xas_next_entry() - Advance iterator to next present entry. * @xas: XArray operation state. * @max: Highest index to return. * * xas_next_entry() is an inline function to optimise xarray traversal for * speed. It is equivalent to calling xas_find(), and will call xas_find() * for all the hard cases. * * Return: The next present entry after the one currently referred to by @xas. */ static inline void *xas_next_entry(struct xa_state *xas, unsigned long max) { struct xa_node *node = xas->xa_node; void *entry; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK))) return xas_find(xas, max); do { if (unlikely(xas->xa_index >= max)) return xas_find(xas, max); if (unlikely(xas->xa_offset == XA_CHUNK_MASK)) return xas_find(xas, max); entry = xa_entry(xas->xa, node, xas->xa_offset + 1); if (unlikely(xa_is_internal(entry))) return xas_find(xas, max); xas->xa_offset++; xas->xa_index++; } while (!entry); return entry; } /* Private */ static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance, xa_mark_t mark) { unsigned long *addr = xas->xa_node->marks[(__force unsigned)mark]; unsigned int offset = xas->xa_offset; if (advance) offset++; if (XA_CHUNK_SIZE == BITS_PER_LONG) { if (offset < XA_CHUNK_SIZE) { unsigned long data = *addr & (~0UL << offset); if (data) return __ffs(data); } return XA_CHUNK_SIZE; } return find_next_bit(addr, XA_CHUNK_SIZE, offset); } /** * xas_next_marked() - Advance iterator to next marked entry. * @xas: XArray operation state. * @max: Highest index to return. * @mark: Mark to search for. * * xas_next_marked() is an inline function to optimise xarray traversal for * speed. It is equivalent to calling xas_find_marked(), and will call * xas_find_marked() for all the hard cases. * * Return: The next marked entry after the one currently referred to by @xas. 
*/ static inline void *xas_next_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) { struct xa_node *node = xas->xa_node; void *entry; unsigned int offset; if (unlikely(xas_not_node(node) || node->shift)) return xas_find_marked(xas, max, mark); offset = xas_find_chunk(xas, true, mark); xas->xa_offset = offset; xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + offset; if (xas->xa_index > max) return NULL; if (offset == XA_CHUNK_SIZE) return xas_find_marked(xas, max, mark); entry = xa_entry(xas->xa, node, offset); if (!entry) return xas_find_marked(xas, max, mark); return entry; } /* * If iterating while holding a lock, drop the lock and reschedule * every %XA_CHECK_SCHED loops. */ enum { XA_CHECK_SCHED = 4096, }; /** * xas_for_each() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * @max: Maximum index to retrieve from array. * * The loop body will be executed for each entry present in the xarray * between the current xas position and @max. @entry will be set to * the entry retrieved from the xarray. It is safe to delete entries * from the array in the loop body. You should hold either the RCU lock * or the xa_lock while iterating. If you need to drop the lock, call * xas_pause() first. */ #define xas_for_each(xas, entry, max) \ for (entry = xas_find(xas, max); entry; \ entry = xas_next_entry(xas, max)) /** * xas_for_each_marked() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * @max: Maximum index to retrieve from array. * @mark: Mark to search for. * * The loop body will be executed for each marked entry in the xarray * between the current xas position and @max. @entry will be set to * the entry retrieved from the xarray. It is safe to delete entries * from the array in the loop body. You should hold either the RCU lock * or the xa_lock while iterating. If you need to drop the lock, call * xas_pause() first. */ #define xas_for_each_marked(xas, entry, max, mark) \ for (entry = xas_find_marked(xas, max, mark); entry; \ entry = xas_next_marked(xas, max, mark)) /** * xas_for_each_conflict() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * * The loop body will be executed for each entry in the XArray that * lies within the range specified by @xas. If the loop terminates * normally, @entry will be %NULL. The user may break out of the loop, * which will leave @entry set to the conflicting entry. The caller * may also call xas_set_err() to exit the loop while setting an error * to record the reason. */ #define xas_for_each_conflict(xas, entry) \ while ((entry = xas_find_conflict(xas))) void *__xas_next(struct xa_state *); void *__xas_prev(struct xa_state *); /** * xas_prev() - Move iterator to previous index. * @xas: XArray operation state. * * If the @xas was in an error state, it will remain in an error state * and this function will return %NULL. If the @xas has never been walked, * it will have the effect of calling xas_load(). Otherwise one will be * subtracted from the index and the state will be walked to the correct * location in the array for the next operation. * * If the iterator was referencing index 0, this function wraps * around to %ULONG_MAX. * * Return: The entry at the new index. This may be %NULL or an internal * entry. 
*/ static inline void *xas_prev(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset == 0)) return __xas_prev(xas); xas->xa_index--; xas->xa_offset--; return xa_entry(xas->xa, node, xas->xa_offset); } /** * xas_next() - Move state to next index. * @xas: XArray operation state. * * If the @xas was in an error state, it will remain in an error state * and this function will return %NULL. If the @xas has never been walked, * it will have the effect of calling xas_load(). Otherwise one will be * added to the index and the state will be walked to the correct * location in the array for the next operation. * * If the iterator was referencing index %ULONG_MAX, this function wraps * around to 0. * * Return: The entry at the new index. This may be %NULL or an internal * entry. */ static inline void *xas_next(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset == XA_CHUNK_MASK)) return __xas_next(xas); xas->xa_index++; xas->xa_offset++; return xa_entry(xas->xa, node, xas->xa_offset); } #endif /* _LINUX_XARRAY_H */
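/*
 * Illustrative sketch (not part of xarray.h, placed after the header for
 * exposition): walking every present entry with the advanced API declared
 * above. Under the RCU lock the iteration may observe internal retry
 * entries, which xas_retry() handles by resetting the walk, exactly as
 * its kernel-doc describes.
 */
static void example_dump_entries(struct xarray *xa)
{
	XA_STATE(xas, xa, 0);	/* cursor starting at index 0 */
	void *entry;

	rcu_read_lock();
	xas_for_each(&xas, entry, ULONG_MAX) {
		if (xas_retry(&xas, entry))	/* restart after a retry entry */
			continue;
		pr_info("index %lu: entry %p\n", xas.xa_index, entry);
	}
	rcu_read_unlock();
}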
// SPDX-License-Identifier: GPL-2.0-only /* * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS * * Authors: Julian Anastasov <ja@ssi.bg>, February 2002 * Wensong Zhang <wensong@linuxvirtualserver.org> */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/in.h> #include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <net/ip_vs.h> /* TODO: struct isakmp_hdr { __u8 icookie[8]; __u8 rcookie[8]; __u8 np; __u8 version; __u8 xchgtype; __u8 flags; __u32 msgid; __u32 length; }; */ #define PORT_ISAKMP 500 static void ah_esp_conn_fill_param_proto(struct netns_ipvs *ipvs, int af, const struct ip_vs_iphdr *iph, struct ip_vs_conn_param *p) { if (likely(!ip_vs_iph_inverse(iph))) ip_vs_conn_fill_param(ipvs, af, IPPROTO_UDP, &iph->saddr, htons(PORT_ISAKMP), &iph->daddr, htons(PORT_ISAKMP), p); else ip_vs_conn_fill_param(ipvs, af, IPPROTO_UDP, &iph->daddr, htons(PORT_ISAKMP), &iph->saddr, htons(PORT_ISAKMP), p); } static struct ip_vs_conn * ah_esp_conn_in_get(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph) { struct ip_vs_conn *cp; struct ip_vs_conn_param p; ah_esp_conn_fill_param_proto(ipvs, af, iph, &p); cp = ip_vs_conn_in_get(&p); if (!cp) { /* * We are not sure if the packet is from our * service, so our conn_schedule hook should return NF_ACCEPT */ IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " "%s%s %s->%s\n", ip_vs_iph_icmp(iph) ? "ICMP+" : "", ip_vs_proto_get(iph->protocol)->name, IP_VS_DBG_ADDR(af, &iph->saddr), IP_VS_DBG_ADDR(af, &iph->daddr)); } return cp; } static struct ip_vs_conn * ah_esp_conn_out_get(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph) { struct ip_vs_conn *cp; struct ip_vs_conn_param p; ah_esp_conn_fill_param_proto(ipvs, af, iph, &p); cp = ip_vs_conn_out_get(&p); if (!cp) { IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " "%s%s %s->%s\n", ip_vs_iph_icmp(iph) ? "ICMP+" : "", ip_vs_proto_get(iph->protocol)->name, IP_VS_DBG_ADDR(af, &iph->saddr), IP_VS_DBG_ADDR(af, &iph->daddr)); } return cp; } static int ah_esp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { /* * AH/ESP is only ever related traffic. Pass the packet to the IP stack. 
*/ *verdict = NF_ACCEPT; return 0; } #ifdef CONFIG_IP_VS_PROTO_AH struct ip_vs_protocol ip_vs_protocol_ah = { .name = "AH", .protocol = IPPROTO_AH, .num_states = 1, .dont_defrag = 1, .init = NULL, .exit = NULL, .conn_schedule = ah_esp_conn_schedule, .conn_in_get = ah_esp_conn_in_get, .conn_out_get = ah_esp_conn_out_get, .snat_handler = NULL, .dnat_handler = NULL, .state_transition = NULL, .register_app = NULL, .unregister_app = NULL, .app_conn_bind = NULL, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, /* ISAKMP */ }; #endif #ifdef CONFIG_IP_VS_PROTO_ESP struct ip_vs_protocol ip_vs_protocol_esp = { .name = "ESP", .protocol = IPPROTO_ESP, .num_states = 1, .dont_defrag = 1, .init = NULL, .exit = NULL, .conn_schedule = ah_esp_conn_schedule, .conn_in_get = ah_esp_conn_in_get, .conn_out_get = ah_esp_conn_out_get, .snat_handler = NULL, .dnat_handler = NULL, .state_transition = NULL, .register_app = NULL, .unregister_app = NULL, .app_conn_bind = NULL, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, /* ISAKMP */ }; #endif
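/*
 * Illustrative sketch (hypothetical helper, not part of IPVS): because
 * ah_esp_conn_fill_param_proto() above keys both directions of AH/ESP
 * traffic to the fixed UDP/500 tuple, any AH or ESP packet between two
 * peers resolves to the connection entry created for their ISAKMP
 * exchange, regardless of SPI:
 */
static struct ip_vs_conn *
example_ipsec_conn_lookup(struct netns_ipvs *ipvs, int af,
			  const struct ip_vs_iphdr *iph)
{
	struct ip_vs_conn_param p;

	ah_esp_conn_fill_param_proto(ipvs, af, iph, &p);
	return ip_vs_conn_in_get(&p); /* NULL if no ISAKMP conn was scheduled */
}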
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _XFRM_HASH_H #define _XFRM_HASH_H #include <linux/xfrm.h> #include <linux/socket.h> #include <linux/jhash.h> static inline unsigned int __xfrm4_addr_hash(const xfrm_address_t *addr) { return ntohl(addr->a4); } static inline unsigned int __xfrm6_addr_hash(const xfrm_address_t *addr) { return jhash2((__force u32 *)addr->a6, 4, 0); } static inline unsigned int __xfrm4_daddr_saddr_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr) { u32 sum = (__force u32)daddr->a4 + (__force u32)saddr->a4; return ntohl((__force __be32)sum); } static inline unsigned int __xfrm6_daddr_saddr_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr) { return __xfrm6_addr_hash(daddr) ^ __xfrm6_addr_hash(saddr); } static inline u32 __bits2mask32(__u8 bits) { u32 mask32 = 0xffffffff; if (bits == 0) mask32 = 0; else if (bits < 32) mask32 <<= (32 - bits); return mask32; } static inline unsigned int __xfrm4_dpref_spref_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, __u8 dbits, __u8 sbits) { return jhash_2words(ntohl(daddr->a4) & __bits2mask32(dbits), ntohl(saddr->a4) & __bits2mask32(sbits), 0); } static inline unsigned int __xfrm6_pref_hash(const xfrm_address_t *addr, __u8 prefixlen) { unsigned int pdw; unsigned int pbi; u32 initval = 0; pdw = prefixlen >> 5; /* num of whole u32 in prefix */ pbi = prefixlen & 0x1f; /* num of bits in incomplete u32 in prefix */ if (pbi) { __be32 mask; mask = htonl((0xffffffff) << (32 - pbi)); initval = (__force u32)(addr->a6[pdw] & mask); } return jhash2((__force u32 *)addr->a6, pdw, initval); } static inline unsigned int __xfrm6_dpref_spref_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, __u8 dbits, __u8 sbits) { return __xfrm6_pref_hash(daddr, dbits) ^ __xfrm6_pref_hash(saddr, sbits); } static inline unsigned int __xfrm_dst_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, u32 reqid, unsigned short family, unsigned int hmask) { unsigned int h = family ^ reqid; switch (family) { case AF_INET: h ^= __xfrm4_daddr_saddr_hash(daddr, saddr); break; case AF_INET6: h ^= __xfrm6_daddr_saddr_hash(daddr, saddr); break; } return (h ^ (h >> 16)) & hmask; } static inline unsigned int __xfrm_src_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family, unsigned int hmask) { unsigned int h = family; switch (family) { case AF_INET: h ^= __xfrm4_daddr_saddr_hash(daddr, saddr); break; case AF_INET6: h ^= __xfrm6_daddr_saddr_hash(daddr, saddr); break; } return (h ^ (h >> 16)) & hmask; } static inline unsigned int __xfrm_spi_hash(const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family, unsigned int hmask) { unsigned int h = (__force u32)spi ^ proto; switch (family) { case 
AF_INET: h ^= __xfrm4_addr_hash(daddr); break; case AF_INET6: h ^= __xfrm6_addr_hash(daddr); break; } return (h ^ (h >> 10) ^ (h >> 20)) & hmask; } static inline unsigned int __xfrm_seq_hash(u32 seq, unsigned int hmask) { unsigned int h = seq; return (h ^ (h >> 10) ^ (h >> 20)) & hmask; } static inline unsigned int __idx_hash(u32 index, unsigned int hmask) { return (index ^ (index >> 8)) & hmask; } static inline unsigned int __sel_hash(const struct xfrm_selector *sel, unsigned short family, unsigned int hmask, u8 dbits, u8 sbits) { const xfrm_address_t *daddr = &sel->daddr; const xfrm_address_t *saddr = &sel->saddr; unsigned int h = 0; switch (family) { case AF_INET: if (sel->prefixlen_d < dbits || sel->prefixlen_s < sbits) return hmask + 1; h = __xfrm4_dpref_spref_hash(daddr, saddr, dbits, sbits); break; case AF_INET6: if (sel->prefixlen_d < dbits || sel->prefixlen_s < sbits) return hmask + 1; h = __xfrm6_dpref_spref_hash(daddr, saddr, dbits, sbits); break; } h ^= (h >> 16); return h & hmask; } static inline unsigned int __addr_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family, unsigned int hmask, u8 dbits, u8 sbits) { unsigned int h = 0; switch (family) { case AF_INET: h = __xfrm4_dpref_spref_hash(daddr, saddr, dbits, sbits); break; case AF_INET6: h = __xfrm6_dpref_spref_hash(daddr, saddr, dbits, sbits); break; } h ^= (h >> 16); return h & hmask; } struct hlist_head *xfrm_hash_alloc(unsigned int sz); void xfrm_hash_free(struct hlist_head *n, unsigned int sz); #endif /* _XFRM_HASH_H */
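/*
 * Illustrative sketch (hypothetical helper, not part of this header): the
 * helpers above reduce addresses to a bucket index with a power-of-two
 * mask, so for a hash table of size "sz" (a power of two) the caller
 * passes hmask = sz - 1. Selecting the chain for an IPv4 SA keyed by
 * (daddr, saddr, reqid) would look like this:
 */
static inline struct hlist_head *
example_dst_chain(struct hlist_head *table, unsigned int sz,
		  const xfrm_address_t *daddr, const xfrm_address_t *saddr,
		  u32 reqid)
{
	return table + __xfrm_dst_hash(daddr, saddr, reqid, AF_INET, sz - 1);
}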
/* * INETPEER - A storage for permanent information about peers * * This source is covered by the GNU GPL, the same as all kernel sources. * * Authors: Andrey V. Savochkin <saw@msu.ru> */ #include <linux/cache.h> #include <linux/module.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/random.h> #include <linux/timer.h> #include <linux/time.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/net.h> #include <linux/workqueue.h> #include <net/ip.h> #include <net/inetpeer.h> #include <net/secure_seq.h> /* * Theory of operations. * We keep one entry for each peer IP address. Each node contains long-living * information about the peer which doesn't depend on routes. * * Nodes are removed only when their reference counter goes to 0. * Once that has happened, the node may be removed when sufficient time * has passed since its last use. The less-recently-used entry can * also be removed if the pool is overloaded, i.e. if the total number of * entries is greater than or equal to the threshold. * * The node pool is organised as an RB tree. * Such an implementation has been chosen not just for fun. It's a way to * prevent easy and efficient DoS attacks by creating hash collisions. A huge * number of long-living nodes in a single hash slot would significantly delay * lookups performed with BHs disabled. * * Serialisation issues. * 1. Nodes may appear in the tree only with the pool lock held. * 2. Nodes may disappear from the tree only with the pool lock held * AND reference count being 0. * 3. Global variable peer_total is modified under the pool lock. * 4. struct inet_peer fields modification: * rb_node: pool lock * refcnt: atomically against modifications on other CPUs; * usually under some other lock to prevent node disappearing * daddr: unchangeable */ static struct kmem_cache *peer_cachep __ro_after_init; void inet_peer_base_init(struct inet_peer_base *bp) { bp->rb_root = RB_ROOT; seqlock_init(&bp->lock); bp->total = 0; } EXPORT_IPV6_MOD_GPL(inet_peer_base_init); #define PEER_MAX_GC 32 /* Exported for sysctl_net_ipv4. 
*/ int inet_peer_threshold __read_mostly; /* start to throw entries more * aggressively at this stage */ int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ /* Called from ip_output.c:ip_init */ void __init inet_initpeers(void) { u64 nr_entries; /* 1% of physical memory */ nr_entries = div64_ul((u64)totalram_pages() << PAGE_SHIFT, 100 * L1_CACHE_ALIGN(sizeof(struct inet_peer))); inet_peer_threshold = clamp_val(nr_entries, 4096, 65536 + 128); peer_cachep = KMEM_CACHE(inet_peer, SLAB_HWCACHE_ALIGN | SLAB_PANIC); } /* Called with rcu_read_lock() or base->lock held */ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, struct inet_peer_base *base, unsigned int seq, struct inet_peer *gc_stack[], unsigned int *gc_cnt, struct rb_node **parent_p, struct rb_node ***pp_p) { struct rb_node **pp, *parent, *next; struct inet_peer *p; u32 now; pp = &base->rb_root.rb_node; parent = NULL; while (1) { int cmp; next = rcu_dereference_raw(*pp); if (!next) break; parent = next; p = rb_entry(parent, struct inet_peer, rb_node); cmp = inetpeer_addr_cmp(daddr, &p->daddr); if (cmp == 0) { now = jiffies; if (READ_ONCE(p->dtime) != now) WRITE_ONCE(p->dtime, now); return p; } if (gc_stack) { if (*gc_cnt < PEER_MAX_GC) gc_stack[(*gc_cnt)++] = p; } else if (unlikely(read_seqretry(&base->lock, seq))) { break; } if (cmp == -1) pp = &next->rb_left; else pp = &next->rb_right; } *parent_p = parent; *pp_p = pp; return NULL; } /* perform garbage collection on all items stacked during a lookup */ static void inet_peer_gc(struct inet_peer_base *base, struct inet_peer *gc_stack[], unsigned int gc_cnt) { int peer_threshold, peer_maxttl, peer_minttl; struct inet_peer *p; __u32 delta, ttl; int i; peer_threshold = READ_ONCE(inet_peer_threshold); peer_maxttl = READ_ONCE(inet_peer_maxttl); peer_minttl = READ_ONCE(inet_peer_minttl); if (base->total >= peer_threshold) ttl = 0; /* be aggressive */ else ttl = peer_maxttl - (peer_maxttl - peer_minttl) / HZ * base->total / peer_threshold * HZ; for (i = 0; i < gc_cnt; i++) { p = gc_stack[i]; delta = (__u32)jiffies - READ_ONCE(p->dtime); if (delta < ttl || !refcount_dec_if_one(&p->refcnt)) gc_stack[i] = NULL; } for (i = 0; i < gc_cnt; i++) { p = gc_stack[i]; if (p) { rb_erase(&p->rb_node, &base->rb_root); base->total--; kfree_rcu(p, rcu); } } } /* Must be called under RCU : No refcount change is done here. */ struct inet_peer *inet_getpeer(struct inet_peer_base *base, const struct inetpeer_addr *daddr) { struct inet_peer *p, *gc_stack[PEER_MAX_GC]; struct rb_node **pp, *parent; unsigned int gc_cnt, seq; /* Attempt a lockless lookup first. * Because of a concurrent writer, we might not find an existing entry. */ seq = read_seqbegin(&base->lock); p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp); if (p) return p; /* retry an exact lookup, taking the lock before. * At least, nodes should be hot in our cache. */ parent = NULL; write_seqlock_bh(&base->lock); gc_cnt = 0; p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp); if (!p) { p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); if (p) { p->daddr = *daddr; p->dtime = (__u32)jiffies; refcount_set(&p->refcnt, 1); atomic_set(&p->rid, 0); p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; p->n_redirects = 0; /* 60*HZ is arbitrary, but chosen high enough so that the first * calculation of tokens is at its maximum. 
*/ p->rate_last = jiffies - 60*HZ; rb_link_node(&p->rb_node, parent, pp); rb_insert_color(&p->rb_node, &base->rb_root); base->total++; } } if (gc_cnt) inet_peer_gc(base, gc_stack, gc_cnt); write_sequnlock_bh(&base->lock); return p; } EXPORT_IPV6_MOD_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { if (refcount_dec_and_test(&p->refcnt)) kfree_rcu(p, rcu); } /* * Check transmit rate limitation for given message. * The rate information is held in the inet_peer entries now. * This function is generic and could be used for other purposes * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. * * Note that the same inet_peer fields are modified by functions in * route.c too, but these work for packet destinations while xrlim_allow * works for icmp destinations. This means the rate limiting information * for one "ip object" is shared - and these ICMPs are twice limited: * by source and by destination. * * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate * SHOULD allow setting of rate limits * * Shared between ICMPv4 and ICMPv6. */ #define XRLIM_BURST_FACTOR 6 bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) { unsigned long now, token, otoken, delta; bool rc = false; if (!peer) return true; token = otoken = READ_ONCE(peer->rate_tokens); now = jiffies; delta = now - READ_ONCE(peer->rate_last); if (delta) { WRITE_ONCE(peer->rate_last, now); token += delta; if (token > XRLIM_BURST_FACTOR * timeout) token = XRLIM_BURST_FACTOR * timeout; } if (token >= timeout) { token -= timeout; rc = true; } if (token != otoken) WRITE_ONCE(peer->rate_tokens, token); return rc; } EXPORT_IPV6_MOD(inet_peer_xrlim_allow); void inetpeer_invalidate_tree(struct inet_peer_base *base) { struct rb_node *p = rb_first(&base->rb_root); while (p) { struct inet_peer *peer = rb_entry(p, struct inet_peer, rb_node); p = rb_next(p); rb_erase(&peer->rb_node, &base->rb_root); inet_putpeer(peer); cond_resched(); } base->total = 0; } EXPORT_IPV6_MOD(inetpeer_invalidate_tree);
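/*
 * Illustrative sketch (hypothetical caller, not part of this file): with
 * timeout = HZ, inet_peer_xrlim_allow() above refills one message's worth
 * of tokens per second of idle time and caps the bucket at
 * XRLIM_BURST_FACTOR * timeout, so a peer that has been quiet can burst
 * six messages and is then limited to roughly one per second. Note that
 * it also returns true when no peer entry exists.
 */
static bool example_icmp_ratelimit_ok(struct inet_peer *peer)
{
	return inet_peer_xrlim_allow(peer, HZ);
}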
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) /* Copyright (C) 2019 Netronome Systems, Inc. 
*/ #include <linux/if_arp.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/mpls.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/tc_act/tc_mpls.h> #include <net/mpls.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_mpls.h> #include <net/tc_wrapper.h> static struct tc_action_ops act_mpls_ops; #define ACT_MPLS_TTL_DEFAULT 255 static __be32 tcf_mpls_get_lse(struct mpls_shim_hdr *lse, struct tcf_mpls_params *p, bool set_bos) { u32 new_lse = 0; if (lse) new_lse = be32_to_cpu(lse->label_stack_entry); if (p->tcfm_label != ACT_MPLS_LABEL_NOT_SET) { new_lse &= ~MPLS_LS_LABEL_MASK; new_lse |= p->tcfm_label << MPLS_LS_LABEL_SHIFT; } if (p->tcfm_ttl) { new_lse &= ~MPLS_LS_TTL_MASK; new_lse |= p->tcfm_ttl << MPLS_LS_TTL_SHIFT; } if (p->tcfm_tc != ACT_MPLS_TC_NOT_SET) { new_lse &= ~MPLS_LS_TC_MASK; new_lse |= p->tcfm_tc << MPLS_LS_TC_SHIFT; } if (p->tcfm_bos != ACT_MPLS_BOS_NOT_SET) { new_lse &= ~MPLS_LS_S_MASK; new_lse |= p->tcfm_bos << MPLS_LS_S_SHIFT; } else if (set_bos) { new_lse |= 1 << MPLS_LS_S_SHIFT; } return cpu_to_be32(new_lse); } TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_mpls *m = to_mpls(a); struct tcf_mpls_params *p; __be32 new_lse; int ret, mac_len; tcf_lastuse_update(&m->tcf_tm); bstats_update(this_cpu_ptr(m->common.cpu_bstats), skb); /* Ensure 'data' points at mac_header prior to calling MPLS manipulating * functions. */ if (skb_at_tc_ingress(skb)) { skb_push_rcsum(skb, skb->mac_len); mac_len = skb->mac_len; } else { mac_len = skb_network_offset(skb); } ret = READ_ONCE(m->tcf_action); p = rcu_dereference_bh(m->mpls_p); switch (p->tcfm_action) { case TCA_MPLS_ACT_POP: if (skb_mpls_pop(skb, p->tcfm_proto, mac_len, skb->dev && skb->dev->type == ARPHRD_ETHER)) goto drop; break; case TCA_MPLS_ACT_PUSH: new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb_protocol(skb, true))); if (skb_mpls_push(skb, new_lse, p->tcfm_proto, mac_len, skb->dev && skb->dev->type == ARPHRD_ETHER)) goto drop; break; case TCA_MPLS_ACT_MAC_PUSH: if (skb_vlan_tag_present(skb)) { if (__vlan_insert_inner_tag(skb, skb->vlan_proto, skb_vlan_tag_get(skb), ETH_HLEN) < 0) goto drop; skb->protocol = skb->vlan_proto; __vlan_hwaccel_clear_tag(skb); } new_lse = tcf_mpls_get_lse(NULL, p, mac_len || !eth_p_mpls(skb->protocol)); if (skb_mpls_push(skb, new_lse, p->tcfm_proto, 0, false)) goto drop; break; case TCA_MPLS_ACT_MODIFY: if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) goto drop; new_lse = tcf_mpls_get_lse(mpls_hdr(skb), p, false); if (skb_mpls_update_lse(skb, new_lse)) goto drop; break; case TCA_MPLS_ACT_DEC_TTL: if (skb_mpls_dec_ttl(skb)) goto drop; break; } if (skb_at_tc_ingress(skb)) skb_pull_rcsum(skb, skb->mac_len); return ret; drop: qstats_drop_inc(this_cpu_ptr(m->common.cpu_qstats)); return TC_ACT_SHOT; } static int valid_label(const struct nlattr *attr, struct netlink_ext_ack *extack) { const u32 *label = nla_data(attr); if (nla_len(attr) != sizeof(*label)) { NL_SET_ERR_MSG_MOD(extack, "Invalid MPLS label length"); return -EINVAL; } if (*label & ~MPLS_LABEL_MASK || *label == MPLS_LABEL_IMPLNULL) { NL_SET_ERR_MSG_MOD(extack, "MPLS label out of range"); return -EINVAL; } return 0; } static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = { [TCA_MPLS_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_mpls)), [TCA_MPLS_PROTO] = { .type = NLA_U16 }, [TCA_MPLS_LABEL] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, 
valid_label), [TCA_MPLS_TC] = NLA_POLICY_RANGE(NLA_U8, 0, 7), [TCA_MPLS_TTL] = NLA_POLICY_MIN(NLA_U8, 1), [TCA_MPLS_BOS] = NLA_POLICY_RANGE(NLA_U8, 0, 1), }; static int tcf_mpls_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_mpls_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_MPLS_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tcf_mpls_params *p; struct tc_mpls *parm; bool exists = false; struct tcf_mpls *m; int ret = 0, err; u8 mpls_ttl = 0; u32 index; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "Missing netlink attributes"); return -EINVAL; } err = nla_parse_nested(tb, TCA_MPLS_MAX, nla, mpls_policy, extack); if (err < 0) return err; if (!tb[TCA_MPLS_PARMS]) { NL_SET_ERR_MSG_MOD(extack, "No MPLS params"); return -EINVAL; } parm = nla_data(tb[TCA_MPLS_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return ACT_P_BOUND; if (!exists) { ret = tcf_idr_create(tn, index, est, a, &act_mpls_ops, bind, true, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } /* Verify parameters against action type. */ switch (parm->m_action) { case TCA_MPLS_ACT_POP: if (!tb[TCA_MPLS_PROTO]) { NL_SET_ERR_MSG_MOD(extack, "Protocol must be set for MPLS pop"); err = -EINVAL; goto release_idr; } if (!eth_proto_is_802_3(nla_get_be16(tb[TCA_MPLS_PROTO]))) { NL_SET_ERR_MSG_MOD(extack, "Invalid protocol type for MPLS pop"); err = -EINVAL; goto release_idr; } if (tb[TCA_MPLS_LABEL] || tb[TCA_MPLS_TTL] || tb[TCA_MPLS_TC] || tb[TCA_MPLS_BOS]) { NL_SET_ERR_MSG_MOD(extack, "Label, TTL, TC or BOS cannot be used with MPLS pop"); err = -EINVAL; goto release_idr; } break; case TCA_MPLS_ACT_DEC_TTL: if (tb[TCA_MPLS_PROTO] || tb[TCA_MPLS_LABEL] || tb[TCA_MPLS_TTL] || tb[TCA_MPLS_TC] || tb[TCA_MPLS_BOS]) { NL_SET_ERR_MSG_MOD(extack, "Label, TTL, TC, BOS or protocol cannot be used with MPLS dec_ttl"); err = -EINVAL; goto release_idr; } break; case TCA_MPLS_ACT_PUSH: case TCA_MPLS_ACT_MAC_PUSH: if (!tb[TCA_MPLS_LABEL]) { NL_SET_ERR_MSG_MOD(extack, "Label is required for MPLS push"); err = -EINVAL; goto release_idr; } if (tb[TCA_MPLS_PROTO] && !eth_p_mpls(nla_get_be16(tb[TCA_MPLS_PROTO]))) { NL_SET_ERR_MSG_MOD(extack, "Protocol must be an MPLS type for MPLS push"); err = -EPROTONOSUPPORT; goto release_idr; } /* Push needs a TTL - if not specified, set a default value. */ if (!tb[TCA_MPLS_TTL]) { #if IS_ENABLED(CONFIG_MPLS) mpls_ttl = net->mpls.default_ttl ? 
net->mpls.default_ttl : ACT_MPLS_TTL_DEFAULT; #else mpls_ttl = ACT_MPLS_TTL_DEFAULT; #endif } break; case TCA_MPLS_ACT_MODIFY: if (tb[TCA_MPLS_PROTO]) { NL_SET_ERR_MSG_MOD(extack, "Protocol cannot be used with MPLS modify"); err = -EINVAL; goto release_idr; } break; default: NL_SET_ERR_MSG_MOD(extack, "Unknown MPLS action"); err = -EINVAL; goto release_idr; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; m = to_mpls(*a); p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) { err = -ENOMEM; goto put_chain; } p->tcfm_action = parm->m_action; p->tcfm_label = nla_get_u32_default(tb[TCA_MPLS_LABEL], ACT_MPLS_LABEL_NOT_SET); p->tcfm_tc = nla_get_u8_default(tb[TCA_MPLS_TC], ACT_MPLS_TC_NOT_SET); p->tcfm_ttl = nla_get_u8_default(tb[TCA_MPLS_TTL], mpls_ttl); p->tcfm_bos = nla_get_u8_default(tb[TCA_MPLS_BOS], ACT_MPLS_BOS_NOT_SET); p->tcfm_proto = nla_get_be16_default(tb[TCA_MPLS_PROTO], htons(ETH_P_MPLS_UC)); spin_lock_bh(&m->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); p = rcu_replace_pointer(m->mpls_p, p, lockdep_is_held(&m->tcf_lock)); spin_unlock_bh(&m->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); if (p) kfree_rcu(p, rcu); return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*a, bind); return err; } static void tcf_mpls_cleanup(struct tc_action *a) { struct tcf_mpls *m = to_mpls(a); struct tcf_mpls_params *p; p = rcu_dereference_protected(m->mpls_p, 1); if (p) kfree_rcu(p, rcu); } static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_mpls *m = to_mpls(a); struct tcf_mpls_params *p; struct tc_mpls opt = { .index = m->tcf_index, .refcnt = refcount_read(&m->tcf_refcnt) - ref, .bindcnt = atomic_read(&m->tcf_bindcnt) - bind, }; struct tcf_t t; spin_lock_bh(&m->tcf_lock); opt.action = m->tcf_action; p = rcu_dereference_protected(m->mpls_p, lockdep_is_held(&m->tcf_lock)); opt.m_action = p->tcfm_action; if (nla_put(skb, TCA_MPLS_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if (p->tcfm_label != ACT_MPLS_LABEL_NOT_SET && nla_put_u32(skb, TCA_MPLS_LABEL, p->tcfm_label)) goto nla_put_failure; if (p->tcfm_tc != ACT_MPLS_TC_NOT_SET && nla_put_u8(skb, TCA_MPLS_TC, p->tcfm_tc)) goto nla_put_failure; if (p->tcfm_ttl && nla_put_u8(skb, TCA_MPLS_TTL, p->tcfm_ttl)) goto nla_put_failure; if (p->tcfm_bos != ACT_MPLS_BOS_NOT_SET && nla_put_u8(skb, TCA_MPLS_BOS, p->tcfm_bos)) goto nla_put_failure; if (nla_put_be16(skb, TCA_MPLS_PROTO, p->tcfm_proto)) goto nla_put_failure; tcf_tm_dump(&t, &m->tcf_tm); if (nla_put_64bit(skb, TCA_MPLS_TM, sizeof(t), &t, TCA_MPLS_PAD)) goto nla_put_failure; spin_unlock_bh(&m->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&m->tcf_lock); nlmsg_trim(skb, b); return -EMSGSIZE; } static int tcf_mpls_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { if (bind) { struct flow_action_entry *entry = entry_data; switch (tcf_mpls_action(act)) { case TCA_MPLS_ACT_PUSH: entry->id = FLOW_ACTION_MPLS_PUSH; entry->mpls_push.proto = tcf_mpls_proto(act); entry->mpls_push.label = tcf_mpls_label(act); entry->mpls_push.tc = tcf_mpls_tc(act); entry->mpls_push.bos = tcf_mpls_bos(act); entry->mpls_push.ttl = tcf_mpls_ttl(act); break; case TCA_MPLS_ACT_POP: entry->id = FLOW_ACTION_MPLS_POP; entry->mpls_pop.proto = tcf_mpls_proto(act); break; case TCA_MPLS_ACT_MODIFY: entry->id = FLOW_ACTION_MPLS_MANGLE; 
entry->mpls_mangle.label = tcf_mpls_label(act); entry->mpls_mangle.tc = tcf_mpls_tc(act); entry->mpls_mangle.bos = tcf_mpls_bos(act); entry->mpls_mangle.ttl = tcf_mpls_ttl(act); break; case TCA_MPLS_ACT_DEC_TTL: NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"dec_ttl\" option is used"); return -EOPNOTSUPP; case TCA_MPLS_ACT_MAC_PUSH: NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"mac_push\" option is used"); return -EOPNOTSUPP; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported MPLS mode offload"); return -EOPNOTSUPP; } *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; switch (tcf_mpls_action(act)) { case TCA_MPLS_ACT_PUSH: fl_action->id = FLOW_ACTION_MPLS_PUSH; break; case TCA_MPLS_ACT_POP: fl_action->id = FLOW_ACTION_MPLS_POP; break; case TCA_MPLS_ACT_MODIFY: fl_action->id = FLOW_ACTION_MPLS_MANGLE; break; default: return -EOPNOTSUPP; } } return 0; } static struct tc_action_ops act_mpls_ops = { .kind = "mpls", .id = TCA_ID_MPLS, .owner = THIS_MODULE, .act = tcf_mpls_act, .dump = tcf_mpls_dump, .init = tcf_mpls_init, .cleanup = tcf_mpls_cleanup, .offload_act_setup = tcf_mpls_offload_act_setup, .size = sizeof(struct tcf_mpls), }; MODULE_ALIAS_NET_ACT("mpls"); static __net_init int mpls_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_mpls_ops.net_id); return tc_action_net_init(net, tn, &act_mpls_ops); } static void __net_exit mpls_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_mpls_ops.net_id); } static struct pernet_operations mpls_net_ops = { .init = mpls_init_net, .exit_batch = mpls_exit_net, .id = &act_mpls_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init mpls_init_module(void) { return tcf_register_action(&act_mpls_ops, &mpls_net_ops); } static void __exit mpls_cleanup_module(void) { tcf_unregister_action(&act_mpls_ops, &mpls_net_ops); } module_init(mpls_init_module); module_exit(mpls_cleanup_module); MODULE_SOFTDEP("post: mpls_gso"); MODULE_AUTHOR("Netronome Systems <oss-drivers@netronome.com>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MPLS manipulation actions");
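/*
 * Illustrative usage (user space, not part of this module): the action
 * implemented above is normally attached to a classifier with tc(8).
 * Device and match below are hypothetical; this exercises the
 * TCA_MPLS_ACT_PUSH path of tcf_mpls_act():
 *
 *   tc filter add dev eth0 protocol ip parent ffff: flower \
 *           action mpls push protocol mpls_uc label 20 ttl 64
 */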
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <net/switchdev.h>

#include "br_private.h"
#include "br_private_tunnel.h"

static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid);

static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg,
			      const void *ptr)
{
	const struct net_bridge_vlan *vle = ptr;
	u16 vid = *(u16 *)arg->key;

	return vle->vid != vid;
}

static const struct rhashtable_params br_vlan_rht_params = {
	.head_offset = offsetof(struct net_bridge_vlan, vnode),
	.key_offset = offsetof(struct net_bridge_vlan, vid),
	.key_len = sizeof(u16),
	.nelem_hint = 3,
	.max_size = VLAN_N_VID,
	.obj_cmpfn = br_vlan_cmp,
	.automatic_shrinking = true,
};

static struct net_bridge_vlan *br_vlan_lookup(struct rhashtable *tbl, u16 vid)
{
	return rhashtable_lookup_fast(tbl, &vid, br_vlan_rht_params);
}

static void __vlan_add_pvid(struct net_bridge_vlan_group *vg,
			    const struct net_bridge_vlan *v)
{
	if (vg->pvid == v->vid)
		return;

	smp_wmb();
	br_vlan_set_pvid_state(vg, v->state);
	vg->pvid = v->vid;
}

static void __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid)
{
	if (vg->pvid != vid)
		return;

	smp_wmb();
	vg->pvid = 0;
}

/* Update the BRIDGE_VLAN_INFO_PVID and BRIDGE_VLAN_INFO_UNTAGGED flags of @v.
 * If @commit is false, return just whether the BRIDGE_VLAN_INFO_PVID and
 * BRIDGE_VLAN_INFO_UNTAGGED bits of @flags would produce any change onto @v.
 */
static bool __vlan_flags_update(struct net_bridge_vlan *v, u16 flags,
				bool commit)
{
	struct net_bridge_vlan_group *vg;
	bool change;

	if (br_vlan_is_master(v))
		vg = br_vlan_group(v->br);
	else
		vg = nbp_vlan_group(v->port);

	/* check if anything would be changed on commit */
	change = !!(flags & BRIDGE_VLAN_INFO_PVID) == !!(vg->pvid != v->vid) ||
		 ((flags ^ v->flags) & BRIDGE_VLAN_INFO_UNTAGGED);

	if (!commit)
		goto out;

	if (flags & BRIDGE_VLAN_INFO_PVID)
		__vlan_add_pvid(vg, v);
	else
		__vlan_delete_pvid(vg, v->vid);

	if (flags & BRIDGE_VLAN_INFO_UNTAGGED)
		v->flags |= BRIDGE_VLAN_INFO_UNTAGGED;
	else
		v->flags &= ~BRIDGE_VLAN_INFO_UNTAGGED;

out:
	return change;
}

static bool __vlan_flags_would_change(struct net_bridge_vlan *v, u16 flags)
{
	return __vlan_flags_update(v, flags, false);
}

static void __vlan_flags_commit(struct net_bridge_vlan *v, u16 flags)
{
	__vlan_flags_update(v, flags, true);
}
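/* Illustrative sketch (editor's addition, not part of the original file):
 * callers pair __vlan_flags_would_change() with __vlan_flags_commit() so
 * that hardware (switchdev) can be told about a flag change before the
 * software state is mutated, e.g.:
 *
 *	bool would_change = __vlan_flags_would_change(vlan, flags);
 *
 *	if (would_change) {
 *		// propagate to hardware first; may fail with -EOPNOTSUPP
 *		ret = br_switchdev_port_vlan_add(port->dev, vid, flags,
 *						 true, extack);
 *		if (ret && ret != -EOPNOTSUPP)
 *			return ret;
 *	}
 *	__vlan_flags_commit(vlan, flags);
 *
 * This mirrors the sequence used by nbp_vlan_add() further below.
 */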
static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br,
			  struct net_bridge_vlan *v, u16 flags,
			  struct netlink_ext_ack *extack)
{
	int err;

	/* Try switchdev op first. In case it is not supported, fallback to
	 * 8021q add.
	 */
	err = br_switchdev_port_vlan_add(dev, v->vid, flags, false, extack);
	if (err == -EOPNOTSUPP)
		return vlan_vid_add(dev, br->vlan_proto, v->vid);
	v->priv_flags |= BR_VLFLAG_ADDED_BY_SWITCHDEV;
	return err;
}

static void __vlan_add_list(struct net_bridge_vlan *v)
{
	struct net_bridge_vlan_group *vg;
	struct list_head *headp, *hpos;
	struct net_bridge_vlan *vent;

	if (br_vlan_is_master(v))
		vg = br_vlan_group(v->br);
	else
		vg = nbp_vlan_group(v->port);

	headp = &vg->vlan_list;
	list_for_each_prev(hpos, headp) {
		vent = list_entry(hpos, struct net_bridge_vlan, vlist);
		if (v->vid >= vent->vid)
			break;
	}
	list_add_rcu(&v->vlist, hpos);
}

static void __vlan_del_list(struct net_bridge_vlan *v)
{
	list_del_rcu(&v->vlist);
}

static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br,
			  const struct net_bridge_vlan *v)
{
	int err;

	/* Try switchdev op first. In case it is not supported, fallback to
	 * 8021q del.
	 */
	err = br_switchdev_port_vlan_del(dev, v->vid);
	if (!(v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV))
		vlan_vid_del(dev, br->vlan_proto, v->vid);
	return err == -EOPNOTSUPP ? 0 : err;
}

/* Returns a master vlan, if it didn't exist it gets created. In all cases
 * a reference is taken to the master vlan before returning.
 */
static struct net_bridge_vlan *
br_vlan_get_master(struct net_bridge *br, u16 vid,
		   struct netlink_ext_ack *extack)
{
	struct net_bridge_vlan_group *vg;
	struct net_bridge_vlan *masterv;

	vg = br_vlan_group(br);
	masterv = br_vlan_find(vg, vid);
	if (!masterv) {
		bool changed;

		/* missing global ctx, create it now */
		if (br_vlan_add(br, vid, 0, &changed, extack))
			return NULL;
		masterv = br_vlan_find(vg, vid);
		if (WARN_ON(!masterv))
			return NULL;
		refcount_set(&masterv->refcnt, 1);
		return masterv;
	}
	refcount_inc(&masterv->refcnt);

	return masterv;
}

static void br_master_vlan_rcu_free(struct rcu_head *rcu)
{
	struct net_bridge_vlan *v;

	v = container_of(rcu, struct net_bridge_vlan, rcu);
	WARN_ON(!br_vlan_is_master(v));
	free_percpu(v->stats);
	v->stats = NULL;
	kfree(v);
}

static void br_vlan_put_master(struct net_bridge_vlan *masterv)
{
	struct net_bridge_vlan_group *vg;

	if (!br_vlan_is_master(masterv))
		return;

	vg = br_vlan_group(masterv->br);
	if (refcount_dec_and_test(&masterv->refcnt)) {
		rhashtable_remove_fast(&vg->vlan_hash,
				       &masterv->vnode, br_vlan_rht_params);
		__vlan_del_list(masterv);
		br_multicast_toggle_one_vlan(masterv, false);
		br_multicast_ctx_deinit(&masterv->br_mcast_ctx);
		call_rcu(&masterv->rcu, br_master_vlan_rcu_free);
	}
}

static void nbp_vlan_rcu_free(struct rcu_head *rcu)
{
	struct net_bridge_vlan *v;

	v = container_of(rcu, struct net_bridge_vlan, rcu);
	WARN_ON(br_vlan_is_master(v));
	/* if we had per-port stats configured then free them here */
	if (v->priv_flags & BR_VLFLAG_PER_PORT_STATS)
		free_percpu(v->stats);
	v->stats = NULL;
	kfree(v);
}

static void br_vlan_init_state(struct net_bridge_vlan *v)
{
	struct net_bridge *br;

	if (br_vlan_is_master(v))
		br = v->br;
	else
		br = v->port->br;

	if (br_opt_get(br, BROPT_MST_ENABLED)) {
		br_mst_vlan_init_state(v);
		return;
	}

	v->state = BR_STATE_FORWARDING;
	v->msti = 0;
}
/* This is the shared VLAN add function which works for both ports and bridge
 * devices. There are four possible calls to this function in terms of the
 * vlan entry type:
 * 1. vlan is being added on a port (no master flags, global entry exists)
 * 2. vlan is being added on a bridge (both master and brentry flags)
 * 3. vlan is being added on a port, but a global entry didn't exist which
 *    is being created right now (master flag set, brentry flag unset), the
 *    global entry is used for global per-vlan features, but not for filtering
 * 4. same as 3 but with both master and brentry flags set so the entry
 *    will be used for filtering in both the port and the bridge
 */
static int __vlan_add(struct net_bridge_vlan *v, u16 flags,
		      struct netlink_ext_ack *extack)
{
	struct net_bridge_vlan *masterv = NULL;
	struct net_bridge_port *p = NULL;
	struct net_bridge_vlan_group *vg;
	struct net_device *dev;
	struct net_bridge *br;
	int err;

	if (br_vlan_is_master(v)) {
		br = v->br;
		dev = br->dev;
		vg = br_vlan_group(br);
	} else {
		p = v->port;
		br = p->br;
		dev = p->dev;
		vg = nbp_vlan_group(p);
	}

	if (p) {
		/* Add VLAN to the device filter if it is supported.
		 * This ensures tagged traffic enters the bridge when
		 * promiscuous mode is disabled by br_manage_promisc().
		 */
		err = __vlan_vid_add(dev, br, v, flags, extack);
		if (err)
			goto out;

		/* need to work on the master vlan too */
		if (flags & BRIDGE_VLAN_INFO_MASTER) {
			bool changed;

			err = br_vlan_add(br, v->vid,
					  flags | BRIDGE_VLAN_INFO_BRENTRY,
					  &changed, extack);
			if (err)
				goto out_filt;

			if (changed)
				br_vlan_notify(br, NULL, v->vid, 0,
					       RTM_NEWVLAN);
		}

		masterv = br_vlan_get_master(br, v->vid, extack);
		if (!masterv) {
			err = -ENOMEM;
			goto out_filt;
		}
		v->brvlan = masterv;
		if (br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)) {
			v->stats =
			     netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
			if (!v->stats) {
				err = -ENOMEM;
				goto out_filt;
			}
			v->priv_flags |= BR_VLFLAG_PER_PORT_STATS;
		} else {
			v->stats = masterv->stats;
		}
		br_multicast_port_ctx_init(p, v, &v->port_mcast_ctx);
	} else {
		if (br_vlan_should_use(v)) {
			err = br_switchdev_port_vlan_add(dev, v->vid, flags,
							 false, extack);
			if (err && err != -EOPNOTSUPP)
				goto out;
		}
		br_multicast_ctx_init(br, v, &v->br_mcast_ctx);
		v->priv_flags |= BR_VLFLAG_GLOBAL_MCAST_ENABLED;
	}

	/* Add the dev mac and count the vlan only if it's usable */
	if (br_vlan_should_use(v)) {
		err = br_fdb_add_local(br, p, dev->dev_addr, v->vid);
		if (err) {
			br_err(br, "failed to insert local address into bridge forwarding table\n");
			goto out_filt;
		}
		vg->num_vlans++;
	}

	/* set the state before publishing */
	br_vlan_init_state(v);

	err = rhashtable_lookup_insert_fast(&vg->vlan_hash, &v->vnode,
					    br_vlan_rht_params);
	if (err)
		goto out_fdb_insert;

	__vlan_add_list(v);
	__vlan_flags_commit(v, flags);
	br_multicast_toggle_one_vlan(v, true);

	if (p)
		nbp_vlan_set_vlan_dev_state(p, v->vid);
out:
	return err;

out_fdb_insert:
	if (br_vlan_should_use(v)) {
		br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid);
		vg->num_vlans--;
	}

out_filt:
	if (p) {
		__vlan_vid_del(dev, br, v);
		if (masterv) {
			if (v->stats && masterv->stats != v->stats)
				free_percpu(v->stats);
			v->stats = NULL;

			br_vlan_put_master(masterv);
			v->brvlan = NULL;
		}
	} else {
		br_switchdev_port_vlan_del(dev, v->vid);
	}

	goto out;
}
static int __vlan_del(struct net_bridge_vlan *v)
{
	struct net_bridge_vlan *masterv = v;
	struct net_bridge_vlan_group *vg;
	struct net_bridge_port *p = NULL;
	int err = 0;

	if (br_vlan_is_master(v)) {
		vg = br_vlan_group(v->br);
	} else {
		p = v->port;
		vg = nbp_vlan_group(v->port);
		masterv = v->brvlan;
	}

	__vlan_delete_pvid(vg, v->vid);
	if (p) {
		err = __vlan_vid_del(p->dev, p->br, v);
		if (err)
			goto out;
	} else {
		err = br_switchdev_port_vlan_del(v->br->dev, v->vid);
		if (err && err != -EOPNOTSUPP)
			goto out;
		err = 0;
	}

	if (br_vlan_should_use(v)) {
		v->flags &= ~BRIDGE_VLAN_INFO_BRENTRY;
		vg->num_vlans--;
	}

	if (masterv != v) {
		vlan_tunnel_info_del(vg, v);
		rhashtable_remove_fast(&vg->vlan_hash, &v->vnode,
				       br_vlan_rht_params);
		__vlan_del_list(v);
		nbp_vlan_set_vlan_dev_state(p, v->vid);
		br_multicast_toggle_one_vlan(v, false);
		br_multicast_port_ctx_deinit(&v->port_mcast_ctx);
		call_rcu(&v->rcu, nbp_vlan_rcu_free);
	}

	br_vlan_put_master(masterv);
out:
	return err;
}

static void __vlan_group_free(struct net_bridge_vlan_group *vg)
{
	WARN_ON(!list_empty(&vg->vlan_list));
	rhashtable_destroy(&vg->vlan_hash);
	vlan_tunnel_deinit(vg);
	kfree(vg);
}

static void __vlan_flush(const struct net_bridge *br,
			 const struct net_bridge_port *p,
			 struct net_bridge_vlan_group *vg)
{
	struct net_bridge_vlan *vlan, *tmp;
	u16 v_start = 0, v_end = 0;
	int err;

	__vlan_delete_pvid(vg, vg->pvid);
	list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) {
		/* take care of disjoint ranges */
		if (!v_start) {
			v_start = vlan->vid;
		} else if (vlan->vid - v_end != 1) {
			/* found range end, notify and start next one */
			br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN);
			v_start = vlan->vid;
		}
		v_end = vlan->vid;

		err = __vlan_del(vlan);
		if (err) {
			br_err(br,
			       "port %u(%s) failed to delete vlan %d: %pe\n",
			       (unsigned int) p->port_no, p->dev->name,
			       vlan->vid, ERR_PTR(err));
		}
	}

	/* notify about the last/whole vlan range */
	if (v_start)
		br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN);
}

struct sk_buff *br_handle_vlan(struct net_bridge *br,
			       const struct net_bridge_port *p,
			       struct net_bridge_vlan_group *vg,
			       struct sk_buff *skb)
{
	struct pcpu_sw_netstats *stats;
	struct net_bridge_vlan *v;
	u16 vid;

	/* If this packet was not filtered at input, let it pass */
	if (!BR_INPUT_SKB_CB(skb)->vlan_filtered)
		goto out;

	/* At this point, we know that the frame was filtered and contains
	 * a valid vlan id.  If the vlan id has untagged flag set,
	 * send untagged; otherwise, send tagged.
	 */
	br_vlan_get_tag(skb, &vid);
	v = br_vlan_find(vg, vid);
	/* Vlan entry must be configured at this point.  The
	 * only exception is the bridge is set in promisc mode and the
	 * packet is destined for the bridge device.  In this case
	 * pass the packet as is.
	 */
	if (!v || !br_vlan_should_use(v)) {
		if ((br->dev->flags & IFF_PROMISC) && skb->dev == br->dev) {
			goto out;
		} else {
			kfree_skb(skb);
			return NULL;
		}
	}
	if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) {
		stats = this_cpu_ptr(v->stats);
		u64_stats_update_begin(&stats->syncp);
		u64_stats_add(&stats->tx_bytes, skb->len);
		u64_stats_inc(&stats->tx_packets);
		u64_stats_update_end(&stats->syncp);
	}

	/* If the skb will be sent using forwarding offload, the assumption is
	 * that the switchdev will inject the packet into hardware together
	 * with the bridge VLAN, so that it can be forwarded according to that
	 * VLAN. The switchdev should deal with popping the VLAN header in
	 * hardware on each egress port as appropriate. So only strip the
	 * VLAN header if forwarding offload is not being used.
	 */
	if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED &&
	    !br_switchdev_frame_uses_tx_fwd_offload(skb))
		__vlan_hwaccel_clear_tag(skb);

	if (p && (p->flags & BR_VLAN_TUNNEL) &&
	    br_handle_egress_vlan_tunnel(skb, v)) {
		kfree_skb(skb);
		return NULL;
	}
out:
	return skb;
}
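/* Worked example (editor's illustration, not in the original source): with
 * "bridge vlan add dev eth0 vid 10 untagged" configured, a frame classified
 * into vlan 10 leaves eth0 with its tag stripped by the
 * __vlan_hwaccel_clear_tag() call in br_handle_vlan() above, while ports
 * where vid 10 is tagged transmit it with the 802.1Q header intact.
 */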
/* Called under RCU */
static bool __allowed_ingress(const struct net_bridge *br,
			      struct net_bridge_vlan_group *vg,
			      struct sk_buff *skb, u16 *vid,
			      u8 *state,
			      struct net_bridge_vlan **vlan)
{
	struct pcpu_sw_netstats *stats;
	struct net_bridge_vlan *v;
	bool tagged;

	BR_INPUT_SKB_CB(skb)->vlan_filtered = true;
	/* If vlan tx offload is disabled on bridge device and frame was
	 * sent from vlan device on the bridge device, it does not have
	 * HW accelerated vlan tag.
	 */
	if (unlikely(!skb_vlan_tag_present(skb) &&
		     skb->protocol == br->vlan_proto)) {
		skb = skb_vlan_untag(skb);
		if (unlikely(!skb))
			return false;
	}

	if (!br_vlan_get_tag(skb, vid)) {
		/* Tagged frame */
		if (skb->vlan_proto != br->vlan_proto) {
			/* Protocol-mismatch, empty out vlan_tci for new tag */
			skb_push(skb, ETH_HLEN);
			skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
							skb_vlan_tag_get(skb));
			if (unlikely(!skb))
				return false;

			skb_pull(skb, ETH_HLEN);
			skb_reset_mac_len(skb);
			*vid = 0;
			tagged = false;
		} else {
			tagged = true;
		}
	} else {
		/* Untagged frame */
		tagged = false;
	}

	if (!*vid) {
		u16 pvid = br_get_pvid(vg);

		/* Frame had a tag with VID 0 or did not have a tag.
		 * See if pvid is set on this port.  That tells us which
		 * vlan untagged or priority-tagged traffic belongs to.
		 */
		if (!pvid)
			goto drop;

		/* PVID is set on this port.  Any untagged or priority-tagged
		 * ingress frame is considered to belong to this vlan.
		 */
		*vid = pvid;
		if (likely(!tagged))
			/* Untagged Frame. */
			__vlan_hwaccel_put_tag(skb, br->vlan_proto, pvid);
		else
			/* Priority-tagged Frame.
			 * At this point, we know that skb->vlan_tci VID
			 * field was 0.
			 * We update only VID field and preserve PCP field.
			 */
			skb->vlan_tci |= pvid;

		/* if snooping and stats are disabled we can avoid the lookup */
		if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) &&
		    !br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) {
			if (*state == BR_STATE_FORWARDING) {
				*state = br_vlan_get_pvid_state(vg);
				if (!br_vlan_state_allowed(*state, true))
					goto drop;
			}
			return true;
		}
	}
	v = br_vlan_find(vg, *vid);
	if (!v || !br_vlan_should_use(v))
		goto drop;

	if (*state == BR_STATE_FORWARDING) {
		*state = br_vlan_get_state(v);
		if (!br_vlan_state_allowed(*state, true))
			goto drop;
	}

	if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) {
		stats = this_cpu_ptr(v->stats);
		u64_stats_update_begin(&stats->syncp);
		u64_stats_add(&stats->rx_bytes, skb->len);
		u64_stats_inc(&stats->rx_packets);
		u64_stats_update_end(&stats->syncp);
	}

	*vlan = v;

	return true;

drop:
	kfree_skb(skb);
	return false;
}

bool br_allowed_ingress(const struct net_bridge *br,
			struct net_bridge_vlan_group *vg, struct sk_buff *skb,
			u16 *vid, u8 *state,
			struct net_bridge_vlan **vlan)
{
	/* If VLAN filtering is disabled on the bridge, all packets are
	 * permitted.
	 */
	*vlan = NULL;
	if (!br_opt_get(br, BROPT_VLAN_ENABLED)) {
		BR_INPUT_SKB_CB(skb)->vlan_filtered = false;
		return true;
	}

	return __allowed_ingress(br, vg, skb, vid, state, vlan);
}

/* Called under RCU. */
bool br_allowed_egress(struct net_bridge_vlan_group *vg,
		       const struct sk_buff *skb)
{
	const struct net_bridge_vlan *v;
	u16 vid;

	/* If this packet was not filtered at input, let it pass */
	if (!BR_INPUT_SKB_CB(skb)->vlan_filtered)
		return true;

	br_vlan_get_tag(skb, &vid);
	v = br_vlan_find(vg, vid);
	if (v && br_vlan_should_use(v) &&
	    br_vlan_state_allowed(br_vlan_get_state(v), false))
		return true;

	return false;
}
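/* Worked example (editor's illustration, not in the original source): on a
 * port with pvid 10, an untagged frame takes the !*vid branch in
 * __allowed_ingress() and is assigned vid 10 via __vlan_hwaccel_put_tag();
 * a priority-tagged frame (VID 0, PCP set) keeps its PCP bits and only has
 * the VID field of vlan_tci filled in with the pvid.
 */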
/* Called under RCU */
bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid)
{
	struct net_bridge_vlan_group *vg;
	struct net_bridge *br = p->br;
	struct net_bridge_vlan *v;

	/* If filtering was disabled at input, let it pass. */
	if (!br_opt_get(br, BROPT_VLAN_ENABLED))
		return true;

	vg = nbp_vlan_group_rcu(p);
	if (!vg || !vg->num_vlans)
		return false;

	if (!br_vlan_get_tag(skb, vid) && skb->vlan_proto != br->vlan_proto)
		*vid = 0;

	if (!*vid) {
		*vid = br_get_pvid(vg);
		if (!*vid ||
		    !br_vlan_state_allowed(br_vlan_get_pvid_state(vg), true))
			return false;

		return true;
	}

	v = br_vlan_find(vg, *vid);
	if (v && br_vlan_state_allowed(br_vlan_get_state(v), true))
		return true;

	return false;
}

static int br_vlan_add_existing(struct net_bridge *br,
				struct net_bridge_vlan_group *vg,
				struct net_bridge_vlan *vlan,
				u16 flags, bool *changed,
				struct netlink_ext_ack *extack)
{
	bool becomes_brentry = false;
	bool would_change = false;
	int err;

	if (!br_vlan_is_brentry(vlan)) {
		/* Trying to change flags of non-existent bridge vlan */
		if (!(flags & BRIDGE_VLAN_INFO_BRENTRY))
			return -EINVAL;

		becomes_brentry = true;
	} else {
		would_change = __vlan_flags_would_change(vlan, flags);
	}

	/* Master VLANs that aren't brentries weren't notified before,
	 * time to notify them now.
	 */
	if (becomes_brentry || would_change) {
		err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags,
						 would_change, extack);
		if (err && err != -EOPNOTSUPP)
			return err;
	}

	if (becomes_brentry) {
		/* It was only kept for port vlans, now make it real */
		err = br_fdb_add_local(br, NULL, br->dev->dev_addr, vlan->vid);
		if (err) {
			br_err(br, "failed to insert local address into bridge forwarding table\n");
			goto err_fdb_insert;
		}

		refcount_inc(&vlan->refcnt);
		vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY;
		vg->num_vlans++;
		*changed = true;
		br_multicast_toggle_one_vlan(vlan, true);
	}

	__vlan_flags_commit(vlan, flags);
	if (would_change)
		*changed = true;

	return 0;

err_fdb_insert:
	br_switchdev_port_vlan_del(br->dev, vlan->vid);
	return err;
}

/* Must be protected by RTNL.
 * Must be called with vid in range from 1 to 4094 inclusive.
 * changed must be true only if the vlan was created or updated
 */
int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed,
		struct netlink_ext_ack *extack)
{
	struct net_bridge_vlan_group *vg;
	struct net_bridge_vlan *vlan;
	int ret;

	ASSERT_RTNL();

	*changed = false;
	vg = br_vlan_group(br);
	vlan = br_vlan_find(vg, vid);
	if (vlan)
		return br_vlan_add_existing(br, vg, vlan, flags, changed,
					    extack);

	vlan = kzalloc(sizeof(*vlan), GFP_KERNEL);
	if (!vlan)
		return -ENOMEM;

	vlan->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!vlan->stats) {
		kfree(vlan);
		return -ENOMEM;
	}
	vlan->vid = vid;
	vlan->flags = flags | BRIDGE_VLAN_INFO_MASTER;
	vlan->flags &= ~BRIDGE_VLAN_INFO_PVID;
	vlan->br = br;
	if (flags & BRIDGE_VLAN_INFO_BRENTRY)
		refcount_set(&vlan->refcnt, 1);
	ret = __vlan_add(vlan, flags, extack);
	if (ret) {
		free_percpu(vlan->stats);
		kfree(vlan);
	} else {
		*changed = true;
	}

	return ret;
}
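/* Illustrative sketch (editor's addition): an in-kernel caller creating a
 * filtering bridge vlan would do so under RTNL, e.g.:
 *
 *	bool changed;
 *	int err;
 *
 *	rtnl_lock();
 *	err = br_vlan_add(br, 10,
 *			  BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED |
 *			  BRIDGE_VLAN_INFO_BRENTRY, &changed, NULL);
 *	rtnl_unlock();
 *
 * This matches how br_vlan_bridge_event() further below installs the
 * default pvid on NETDEV_REGISTER.
 */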
/* Must be protected by RTNL.
 * Must be called with vid in range from 1 to 4094 inclusive.
 */
int br_vlan_delete(struct net_bridge *br, u16 vid)
{
	struct net_bridge_vlan_group *vg;
	struct net_bridge_vlan *v;

	ASSERT_RTNL();

	vg = br_vlan_group(br);
	v = br_vlan_find(vg, vid);
	if (!v || !br_vlan_is_brentry(v))
		return -ENOENT;

	br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid);
	br_fdb_delete_by_port(br, NULL, vid, 0);

	vlan_tunnel_info_del(vg, v);

	return __vlan_del(v);
}

void br_vlan_flush(struct net_bridge *br)
{
	struct net_bridge_vlan_group *vg;

	ASSERT_RTNL();

	vg = br_vlan_group(br);
	__vlan_flush(br, NULL, vg);
	RCU_INIT_POINTER(br->vlgrp, NULL);
	synchronize_net();
	__vlan_group_free(vg);
}

struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid)
{
	if (!vg)
		return NULL;

	return br_vlan_lookup(&vg->vlan_hash, vid);
}

/* Must be protected by RTNL. */
static void recalculate_group_addr(struct net_bridge *br)
{
	if (br_opt_get(br, BROPT_GROUP_ADDR_SET))
		return;

	spin_lock_bh(&br->lock);
	if (!br_opt_get(br, BROPT_VLAN_ENABLED) ||
	    br->vlan_proto == htons(ETH_P_8021Q)) {
		/* Bridge Group Address */
		br->group_addr[5] = 0x00;
	} else { /* vlan_enabled && ETH_P_8021AD */
		/* Provider Bridge Group Address */
		br->group_addr[5] = 0x08;
	}
	spin_unlock_bh(&br->lock);
}

/* Must be protected by RTNL. */
void br_recalculate_fwd_mask(struct net_bridge *br)
{
	if (!br_opt_get(br, BROPT_VLAN_ENABLED) ||
	    br->vlan_proto == htons(ETH_P_8021Q))
		br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT;
	else /* vlan_enabled && ETH_P_8021AD */
		br->group_fwd_mask_required = BR_GROUPFWD_8021AD &
					      ~(1u << br->group_addr[5]);
}

int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val,
			  struct netlink_ext_ack *extack)
{
	struct switchdev_attr attr = {
		.orig_dev = br->dev,
		.id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
		.flags = SWITCHDEV_F_SKIP_EOPNOTSUPP,
		.u.vlan_filtering = val,
	};
	int err;

	if (br_opt_get(br, BROPT_VLAN_ENABLED) == !!val)
		return 0;

	br_opt_toggle(br, BROPT_VLAN_ENABLED, !!val);

	err = switchdev_port_attr_set(br->dev, &attr, extack);
	if (err && err != -EOPNOTSUPP) {
		br_opt_toggle(br, BROPT_VLAN_ENABLED, !val);
		return err;
	}

	br_manage_promisc(br);
	recalculate_group_addr(br);
	br_recalculate_fwd_mask(br);
	if (!val && br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) {
		br_info(br, "vlan filtering disabled, automatically disabling multicast vlan snooping\n");
		br_multicast_toggle_vlan_snooping(br, false, NULL);
	}

	return 0;
}

bool br_vlan_enabled(const struct net_device *dev)
{
	struct net_bridge *br = netdev_priv(dev);

	return br_opt_get(br, BROPT_VLAN_ENABLED);
}
EXPORT_SYMBOL_GPL(br_vlan_enabled);

int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto)
{
	struct net_bridge *br = netdev_priv(dev);

	*p_proto = ntohs(br->vlan_proto);

	return 0;
}
EXPORT_SYMBOL_GPL(br_vlan_get_proto);
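/* Usage note (editor's addition): br_vlan_filter_toggle() above is reached
 * from userspace via the IFLA_BR_VLAN_FILTERING netlink attribute, e.g.
 * with iproute2:
 *
 *	ip link set dev br0 type bridge vlan_filtering 1
 */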
int __br_vlan_set_proto(struct net_bridge *br, __be16 proto,
			struct netlink_ext_ack *extack)
{
	struct switchdev_attr attr = {
		.orig_dev = br->dev,
		.id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL,
		.flags = SWITCHDEV_F_SKIP_EOPNOTSUPP,
		.u.vlan_protocol = ntohs(proto),
	};
	int err = 0;
	struct net_bridge_port *p;
	struct net_bridge_vlan *vlan;
	struct net_bridge_vlan_group *vg;
	__be16 oldproto = br->vlan_proto;

	if (br->vlan_proto == proto)
		return 0;

	err = switchdev_port_attr_set(br->dev, &attr, extack);
	if (err && err != -EOPNOTSUPP)
		return err;

	/* Add VLANs for the new proto to the device filter. */
	list_for_each_entry(p, &br->port_list, list) {
		vg = nbp_vlan_group(p);
		list_for_each_entry(vlan, &vg->vlan_list, vlist) {
			if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)
				continue;
			err = vlan_vid_add(p->dev, proto, vlan->vid);
			if (err)
				goto err_filt;
		}
	}

	br->vlan_proto = proto;

	recalculate_group_addr(br);
	br_recalculate_fwd_mask(br);

	/* Delete VLANs for the old proto from the device filter. */
	list_for_each_entry(p, &br->port_list, list) {
		vg = nbp_vlan_group(p);
		list_for_each_entry(vlan, &vg->vlan_list, vlist) {
			if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)
				continue;
			vlan_vid_del(p->dev, oldproto, vlan->vid);
		}
	}

	return 0;

err_filt:
	attr.u.vlan_protocol = ntohs(oldproto);
	switchdev_port_attr_set(br->dev, &attr, NULL);

	list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist) {
		if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)
			continue;
		vlan_vid_del(p->dev, proto, vlan->vid);
	}

	list_for_each_entry_continue_reverse(p, &br->port_list, list) {
		vg = nbp_vlan_group(p);
		list_for_each_entry(vlan, &vg->vlan_list, vlist) {
			if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)
				continue;
			vlan_vid_del(p->dev, proto, vlan->vid);
		}
	}

	return err;
}

int br_vlan_set_proto(struct net_bridge *br, unsigned long val,
		      struct netlink_ext_ack *extack)
{
	if (!eth_type_vlan(htons(val)))
		return -EPROTONOSUPPORT;

	return __br_vlan_set_proto(br, htons(val), extack);
}

int br_vlan_set_stats(struct net_bridge *br, unsigned long val)
{
	switch (val) {
	case 0:
	case 1:
		br_opt_toggle(br, BROPT_VLAN_STATS_ENABLED, !!val);
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val)
{
	struct net_bridge_port *p;

	/* allow to change the option if there are no port vlans configured */
	list_for_each_entry(p, &br->port_list, list) {
		struct net_bridge_vlan_group *vg = nbp_vlan_group(p);

		if (vg->num_vlans)
			return -EBUSY;
	}

	switch (val) {
	case 0:
	case 1:
		br_opt_toggle(br, BROPT_VLAN_STATS_PER_PORT, !!val);
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

static bool vlan_default_pvid(struct net_bridge_vlan_group *vg, u16 vid)
{
	struct net_bridge_vlan *v;

	if (vid != vg->pvid)
		return false;

	v = br_vlan_lookup(&vg->vlan_hash, vid);
	if (v && br_vlan_should_use(v) &&
	    (v->flags & BRIDGE_VLAN_INFO_UNTAGGED))
		return true;

	return false;
}

static void br_vlan_disable_default_pvid(struct net_bridge *br)
{
	struct net_bridge_port *p;
	u16 pvid = br->default_pvid;

	/* Disable default_pvid on all ports where it is still
	 * configured.
	 */
	if (vlan_default_pvid(br_vlan_group(br), pvid)) {
		if (!br_vlan_delete(br, pvid))
			br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN);
	}

	list_for_each_entry(p, &br->port_list, list) {
		if (vlan_default_pvid(nbp_vlan_group(p), pvid) &&
		    !nbp_vlan_delete(p, pvid))
			br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN);
	}

	br->default_pvid = 0;
}
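/* Usage note (editor's addition): br_vlan_set_stats() and
 * br_vlan_set_stats_per_port() back the IFLA_BR_VLAN_STATS_ENABLED and
 * IFLA_BR_VLAN_STATS_PER_PORT attributes, e.g. with iproute2:
 *
 *	ip link set dev br0 type bridge vlan_stats_enabled 1
 *	ip link set dev br0 type bridge vlan_stats_per_port 1
 *
 * The per-port variant can only be flipped while no port vlans exist.
 */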
int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid,
			       struct netlink_ext_ack *extack)
{
	const struct net_bridge_vlan *pvent;
	struct net_bridge_vlan_group *vg;
	struct net_bridge_port *p;
	unsigned long *changed;
	bool vlchange;
	u16 old_pvid;
	int err = 0;

	if (!pvid) {
		br_vlan_disable_default_pvid(br);
		return 0;
	}

	changed = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL);
	if (!changed)
		return -ENOMEM;

	old_pvid = br->default_pvid;

	/* Update default_pvid config only if we do not conflict with
	 * user configuration.
	 */
	vg = br_vlan_group(br);
	pvent = br_vlan_find(vg, pvid);
	if ((!old_pvid || vlan_default_pvid(vg, old_pvid)) &&
	    (!pvent || !br_vlan_should_use(pvent))) {
		err = br_vlan_add(br, pvid,
				  BRIDGE_VLAN_INFO_PVID |
				  BRIDGE_VLAN_INFO_UNTAGGED |
				  BRIDGE_VLAN_INFO_BRENTRY,
				  &vlchange, extack);
		if (err)
			goto out;

		if (br_vlan_delete(br, old_pvid))
			br_vlan_notify(br, NULL, old_pvid, 0, RTM_DELVLAN);
		br_vlan_notify(br, NULL, pvid, 0, RTM_NEWVLAN);
		__set_bit(0, changed);
	}

	list_for_each_entry(p, &br->port_list, list) {
		/* Update default_pvid config only if we do not conflict with
		 * user configuration.
		 */
		vg = nbp_vlan_group(p);
		if ((old_pvid &&
		     !vlan_default_pvid(vg, old_pvid)) ||
		    br_vlan_find(vg, pvid))
			continue;

		err = nbp_vlan_add(p, pvid,
				   BRIDGE_VLAN_INFO_PVID |
				   BRIDGE_VLAN_INFO_UNTAGGED,
				   &vlchange, extack);
		if (err)
			goto err_port;
		if (nbp_vlan_delete(p, old_pvid))
			br_vlan_notify(br, p, old_pvid, 0, RTM_DELVLAN);
		br_vlan_notify(p->br, p, pvid, 0, RTM_NEWVLAN);
		__set_bit(p->port_no, changed);
	}

	br->default_pvid = pvid;

out:
	bitmap_free(changed);
	return err;

err_port:
	list_for_each_entry_continue_reverse(p, &br->port_list, list) {
		if (!test_bit(p->port_no, changed))
			continue;

		if (old_pvid) {
			nbp_vlan_add(p, old_pvid,
				     BRIDGE_VLAN_INFO_PVID |
				     BRIDGE_VLAN_INFO_UNTAGGED,
				     &vlchange, NULL);
			br_vlan_notify(p->br, p, old_pvid, 0, RTM_NEWVLAN);
		}
		nbp_vlan_delete(p, pvid);
		br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN);
	}

	if (test_bit(0, changed)) {
		if (old_pvid) {
			br_vlan_add(br, old_pvid,
				    BRIDGE_VLAN_INFO_PVID |
				    BRIDGE_VLAN_INFO_UNTAGGED |
				    BRIDGE_VLAN_INFO_BRENTRY,
				    &vlchange, NULL);
			br_vlan_notify(br, NULL, old_pvid, 0, RTM_NEWVLAN);
		}
		br_vlan_delete(br, pvid);
		br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN);
	}

	goto out;
}

int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val,
			     struct netlink_ext_ack *extack)
{
	u16 pvid = val;
	int err = 0;

	if (val >= VLAN_VID_MASK)
		return -EINVAL;

	if (pvid == br->default_pvid)
		goto out;

	/* Only allow default pvid change when filtering is disabled */
	if (br_opt_get(br, BROPT_VLAN_ENABLED)) {
		pr_info_once("Please disable vlan filtering to change default_pvid\n");
		err = -EPERM;
		goto out;
	}
	err = __br_vlan_set_default_pvid(br, pvid, extack);
out:
	return err;
}

int br_vlan_init(struct net_bridge *br)
{
	struct net_bridge_vlan_group *vg;
	int ret = -ENOMEM;

	vg = kzalloc(sizeof(*vg), GFP_KERNEL);
	if (!vg)
		goto out;
	ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params);
	if (ret)
		goto err_rhtbl;
	ret = vlan_tunnel_init(vg);
	if (ret)
		goto err_tunnel_init;
	INIT_LIST_HEAD(&vg->vlan_list);
	br->vlan_proto = htons(ETH_P_8021Q);
	br->default_pvid = 1;
	rcu_assign_pointer(br->vlgrp, vg);

out:
	return ret;

err_tunnel_init:
	rhashtable_destroy(&vg->vlan_hash);
err_rhtbl:
	kfree(vg);

	goto out;
}
int nbp_vlan_init(struct net_bridge_port *p, struct netlink_ext_ack *extack)
{
	struct switchdev_attr attr = {
		.orig_dev = p->br->dev,
		.id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
		.flags = SWITCHDEV_F_SKIP_EOPNOTSUPP,
		.u.vlan_filtering = br_opt_get(p->br, BROPT_VLAN_ENABLED),
	};
	struct net_bridge_vlan_group *vg;
	int ret = -ENOMEM;

	vg = kzalloc(sizeof(struct net_bridge_vlan_group), GFP_KERNEL);
	if (!vg)
		goto out;

	ret = switchdev_port_attr_set(p->dev, &attr, extack);
	if (ret && ret != -EOPNOTSUPP)
		goto err_vlan_enabled;

	ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params);
	if (ret)
		goto err_rhtbl;
	ret = vlan_tunnel_init(vg);
	if (ret)
		goto err_tunnel_init;
	INIT_LIST_HEAD(&vg->vlan_list);
	rcu_assign_pointer(p->vlgrp, vg);
	if (p->br->default_pvid) {
		bool changed;

		ret = nbp_vlan_add(p, p->br->default_pvid,
				   BRIDGE_VLAN_INFO_PVID |
				   BRIDGE_VLAN_INFO_UNTAGGED,
				   &changed, extack);
		if (ret)
			goto err_vlan_add;
		br_vlan_notify(p->br, p, p->br->default_pvid, 0, RTM_NEWVLAN);
	}
out:
	return ret;

err_vlan_add:
	RCU_INIT_POINTER(p->vlgrp, NULL);
	synchronize_rcu();
	vlan_tunnel_deinit(vg);
err_tunnel_init:
	rhashtable_destroy(&vg->vlan_hash);
err_rhtbl:
err_vlan_enabled:
	kfree(vg);

	goto out;
}

/* Must be protected by RTNL.
 * Must be called with vid in range from 1 to 4094 inclusive.
 * changed must be true only if the vlan was created or updated
 */
int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
		 bool *changed, struct netlink_ext_ack *extack)
{
	struct net_bridge_vlan *vlan;
	int ret;

	ASSERT_RTNL();

	*changed = false;
	vlan = br_vlan_find(nbp_vlan_group(port), vid);
	if (vlan) {
		bool would_change = __vlan_flags_would_change(vlan, flags);

		if (would_change) {
			/* Pass the flags to the hardware bridge */
			ret = br_switchdev_port_vlan_add(port->dev, vid, flags,
							 true, extack);
			if (ret && ret != -EOPNOTSUPP)
				return ret;
		}

		__vlan_flags_commit(vlan, flags);
		*changed = would_change;

		return 0;
	}

	vlan = kzalloc(sizeof(*vlan), GFP_KERNEL);
	if (!vlan)
		return -ENOMEM;

	vlan->vid = vid;
	vlan->port = port;
	ret = __vlan_add(vlan, flags, extack);
	if (ret)
		kfree(vlan);
	else
		*changed = true;

	return ret;
}

/* Must be protected by RTNL.
 * Must be called with vid in range from 1 to 4094 inclusive.
 */
int nbp_vlan_delete(struct net_bridge_port *port, u16 vid)
{
	struct net_bridge_vlan *v;

	ASSERT_RTNL();

	v = br_vlan_find(nbp_vlan_group(port), vid);
	if (!v)
		return -ENOENT;
	br_fdb_find_delete_local(port->br, port, port->dev->dev_addr, vid);
	br_fdb_delete_by_port(port->br, port, vid, 0);

	return __vlan_del(v);
}

void nbp_vlan_flush(struct net_bridge_port *port)
{
	struct net_bridge_vlan_group *vg;

	ASSERT_RTNL();

	vg = nbp_vlan_group(port);
	__vlan_flush(port->br, port, vg);
	RCU_INIT_POINTER(port->vlgrp, NULL);
	synchronize_net();
	__vlan_group_free(vg);
}

void br_vlan_get_stats(const struct net_bridge_vlan *v,
		       struct pcpu_sw_netstats *stats)
{
	int i;

	memset(stats, 0, sizeof(*stats));
	for_each_possible_cpu(i) {
		u64 rxpackets, rxbytes, txpackets, txbytes;
		struct pcpu_sw_netstats *cpu_stats;
		unsigned int start;

		cpu_stats = per_cpu_ptr(v->stats, i);
		do {
			start = u64_stats_fetch_begin(&cpu_stats->syncp);
			rxpackets = u64_stats_read(&cpu_stats->rx_packets);
			rxbytes = u64_stats_read(&cpu_stats->rx_bytes);
			txbytes = u64_stats_read(&cpu_stats->tx_bytes);
			txpackets = u64_stats_read(&cpu_stats->tx_packets);
		} while (u64_stats_fetch_retry(&cpu_stats->syncp, start));

		u64_stats_add(&stats->rx_packets, rxpackets);
		u64_stats_add(&stats->rx_bytes, rxbytes);
		u64_stats_add(&stats->tx_bytes, txbytes);
		u64_stats_add(&stats->tx_packets, txpackets);
	}
}

int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
{
	struct net_bridge_vlan_group *vg;
	struct net_bridge_port *p;

	ASSERT_RTNL();
	p = br_port_get_check_rtnl(dev);
	if (p)
		vg = nbp_vlan_group(p);
	else if (netif_is_bridge_master(dev))
		vg = br_vlan_group(netdev_priv(dev));
	else
		return -EINVAL;

	*p_pvid = br_get_pvid(vg);
	return 0;
}
EXPORT_SYMBOL_GPL(br_vlan_get_pvid);

int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid)
{
	struct net_bridge_vlan_group *vg;
	struct net_bridge_port *p;

	p = br_port_get_check_rcu(dev);
	if (p)
		vg = nbp_vlan_group_rcu(p);
	else if (netif_is_bridge_master(dev))
		vg = br_vlan_group_rcu(netdev_priv(dev));
	else
		return -EINVAL;

	*p_pvid = br_get_pvid(vg);
	return 0;
}
EXPORT_SYMBOL_GPL(br_vlan_get_pvid_rcu);
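/* Editor's note on the folding loop in br_vlan_get_stats() above: on 32-bit
 * SMP kernels the u64_stats_fetch_begin()/u64_stats_fetch_retry() pair
 * rereads a per-cpu snapshot until it observes a consistent one, since a
 * writer may be inside u64_stats_update_begin()/_end() concurrently; on
 * 64-bit kernels the reads are plain loads and the loop runs exactly once.
 */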
void br_vlan_fill_forward_path_pvid(struct net_bridge *br,
				    struct net_device_path_ctx *ctx,
				    struct net_device_path *path)
{
	struct net_bridge_vlan_group *vg;
	int idx = ctx->num_vlans - 1;
	u16 vid;

	path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP;

	if (!br_opt_get(br, BROPT_VLAN_ENABLED))
		return;

	vg = br_vlan_group(br);

	if (idx >= 0 &&
	    ctx->vlan[idx].proto == br->vlan_proto) {
		vid = ctx->vlan[idx].id;
	} else {
		path->bridge.vlan_mode = DEV_PATH_BR_VLAN_TAG;
		vid = br_get_pvid(vg);
	}

	path->bridge.vlan_id = vid;
	path->bridge.vlan_proto = br->vlan_proto;
}

int br_vlan_fill_forward_path_mode(struct net_bridge *br,
				   struct net_bridge_port *dst,
				   struct net_device_path *path)
{
	struct net_bridge_vlan_group *vg;
	struct net_bridge_vlan *v;

	if (!br_opt_get(br, BROPT_VLAN_ENABLED))
		return 0;

	vg = nbp_vlan_group_rcu(dst);
	v = br_vlan_find(vg, path->bridge.vlan_id);
	if (!v || !br_vlan_should_use(v))
		return -EINVAL;

	if (!(v->flags & BRIDGE_VLAN_INFO_UNTAGGED))
		return 0;

	if (path->bridge.vlan_mode == DEV_PATH_BR_VLAN_TAG)
		path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP;
	else if (v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)
		path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG_HW;
	else
		path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG;

	return 0;
}

int br_vlan_get_info(const struct net_device *dev, u16 vid,
		     struct bridge_vlan_info *p_vinfo)
{
	struct net_bridge_vlan_group *vg;
	struct net_bridge_vlan *v;
	struct net_bridge_port *p;

	ASSERT_RTNL();
	p = br_port_get_check_rtnl(dev);
	if (p)
		vg = nbp_vlan_group(p);
	else if (netif_is_bridge_master(dev))
		vg = br_vlan_group(netdev_priv(dev));
	else
		return -EINVAL;

	v = br_vlan_find(vg, vid);
	if (!v)
		return -ENOENT;

	p_vinfo->vid = vid;
	p_vinfo->flags = v->flags;
	if (vid == br_get_pvid(vg))
		p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID;
	return 0;
}
EXPORT_SYMBOL_GPL(br_vlan_get_info);

int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid,
			 struct bridge_vlan_info *p_vinfo)
{
	struct net_bridge_vlan_group *vg;
	struct net_bridge_vlan *v;
	struct net_bridge_port *p;

	p = br_port_get_check_rcu(dev);
	if (p)
		vg = nbp_vlan_group_rcu(p);
	else if (netif_is_bridge_master(dev))
		vg = br_vlan_group_rcu(netdev_priv(dev));
	else
		return -EINVAL;

	v = br_vlan_find(vg, vid);
	if (!v)
		return -ENOENT;

	p_vinfo->vid = vid;
	p_vinfo->flags = v->flags;
	if (vid == br_get_pvid(vg))
		p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID;
	return 0;
}
EXPORT_SYMBOL_GPL(br_vlan_get_info_rcu);

static int br_vlan_is_bind_vlan_dev(const struct net_device *dev)
{
	return is_vlan_dev(dev) &&
		!!(vlan_dev_priv(dev)->flags & VLAN_FLAG_BRIDGE_BINDING);
}

static int br_vlan_is_bind_vlan_dev_fn(struct net_device *dev,
			       __always_unused struct netdev_nested_priv *priv)
{
	return br_vlan_is_bind_vlan_dev(dev);
}

static bool br_vlan_has_upper_bind_vlan_dev(struct net_device *dev)
{
	int found;

	rcu_read_lock();
	found = netdev_walk_all_upper_dev_rcu(dev, br_vlan_is_bind_vlan_dev_fn,
					      NULL);
	rcu_read_unlock();

	return !!found;
}

struct br_vlan_bind_walk_data {
	u16 vid;
	struct net_device *result;
};

static int br_vlan_match_bind_vlan_dev_fn(struct net_device *dev,
					  struct netdev_nested_priv *priv)
{
	struct br_vlan_bind_walk_data *data = priv->data;
	int found = 0;

	if (br_vlan_is_bind_vlan_dev(dev) &&
	    vlan_dev_priv(dev)->vlan_id == data->vid) {
		data->result = dev;
		found = 1;
	}

	return found;
}

static struct net_device *
br_vlan_get_upper_bind_vlan_dev(struct net_device *dev, u16 vid)
{
	struct br_vlan_bind_walk_data data = {
		.vid = vid,
	};
	struct netdev_nested_priv priv = {
		.data = (void *)&data,
	};

	rcu_read_lock();
	netdev_walk_all_upper_dev_rcu(dev, br_vlan_match_bind_vlan_dev_fn,
				      &priv);
	rcu_read_unlock();

	return data.result;
}
static bool br_vlan_is_dev_up(const struct net_device *dev)
{
	return !!(dev->flags & IFF_UP) && netif_oper_up(dev);
}

static void br_vlan_set_vlan_dev_state(const struct net_bridge *br,
				       struct net_device *vlan_dev)
{
	u16 vid = vlan_dev_priv(vlan_dev)->vlan_id;
	struct net_bridge_vlan_group *vg;
	struct net_bridge_port *p;
	bool has_carrier = false;

	if (!netif_carrier_ok(br->dev)) {
		netif_carrier_off(vlan_dev);
		return;
	}

	list_for_each_entry(p, &br->port_list, list) {
		vg = nbp_vlan_group(p);
		if (br_vlan_find(vg, vid) && br_vlan_is_dev_up(p->dev)) {
			has_carrier = true;
			break;
		}
	}

	if (has_carrier)
		netif_carrier_on(vlan_dev);
	else
		netif_carrier_off(vlan_dev);
}

static void br_vlan_set_all_vlan_dev_state(struct net_bridge_port *p)
{
	struct net_bridge_vlan_group *vg = nbp_vlan_group(p);
	struct net_bridge_vlan *vlan;
	struct net_device *vlan_dev;

	list_for_each_entry(vlan, &vg->vlan_list, vlist) {
		vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev,
							   vlan->vid);
		if (vlan_dev) {
			if (br_vlan_is_dev_up(p->dev)) {
				if (netif_carrier_ok(p->br->dev))
					netif_carrier_on(vlan_dev);
			} else {
				br_vlan_set_vlan_dev_state(p->br, vlan_dev);
			}
		}
	}
}

static void br_vlan_toggle_bridge_binding(struct net_device *br_dev,
					  bool enable)
{
	struct net_bridge *br = netdev_priv(br_dev);

	if (enable)
		br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, true);
	else
		br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING,
			      br_vlan_has_upper_bind_vlan_dev(br_dev));
}

static void br_vlan_upper_change(struct net_device *dev,
				 struct net_device *upper_dev,
				 bool linking)
{
	struct net_bridge *br = netdev_priv(dev);

	if (!br_vlan_is_bind_vlan_dev(upper_dev))
		return;

	br_vlan_toggle_bridge_binding(dev, linking);
	if (linking)
		br_vlan_set_vlan_dev_state(br, upper_dev);
}

struct br_vlan_link_state_walk_data {
	struct net_bridge *br;
};

static int br_vlan_link_state_change_fn(struct net_device *vlan_dev,
					struct netdev_nested_priv *priv)
{
	struct br_vlan_link_state_walk_data *data = priv->data;

	if (br_vlan_is_bind_vlan_dev(vlan_dev))
		br_vlan_set_vlan_dev_state(data->br, vlan_dev);

	return 0;
}

static void br_vlan_link_state_change(struct net_device *dev,
				      struct net_bridge *br)
{
	struct br_vlan_link_state_walk_data data = {
		.br = br
	};
	struct netdev_nested_priv priv = {
		.data = (void *)&data,
	};

	rcu_read_lock();
	netdev_walk_all_upper_dev_rcu(dev, br_vlan_link_state_change_fn,
				      &priv);
	rcu_read_unlock();
}

/* Must be protected by RTNL. */
static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid)
{
	struct net_device *vlan_dev;

	if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING))
		return;

	vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev, vid);
	if (vlan_dev)
		br_vlan_set_vlan_dev_state(p->br, vlan_dev);
}
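/* Usage note (editor's addition): the bridge_binding machinery above ties
 * the carrier of a vlan device stacked on the bridge to the state of the
 * ports carrying that vlan, e.g. with iproute2:
 *
 *	ip link add link br0 name br0.10 type vlan id 10 bridge_binding on
 *
 * br0.10 then loses carrier while no bridge port with vid 10 is up.
 */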
/* Must be protected by RTNL. */
int br_vlan_bridge_event(struct net_device *dev, unsigned long event,
			 void *ptr)
{
	struct netdev_notifier_changeupper_info *info;
	struct net_bridge *br = netdev_priv(dev);
	int vlcmd = 0, ret = 0;
	bool changed = false;

	switch (event) {
	case NETDEV_REGISTER:
		ret = br_vlan_add(br, br->default_pvid,
				  BRIDGE_VLAN_INFO_PVID |
				  BRIDGE_VLAN_INFO_UNTAGGED |
				  BRIDGE_VLAN_INFO_BRENTRY, &changed, NULL);
		vlcmd = RTM_NEWVLAN;
		break;
	case NETDEV_UNREGISTER:
		changed = !br_vlan_delete(br, br->default_pvid);
		vlcmd = RTM_DELVLAN;
		break;
	case NETDEV_CHANGEUPPER:
		info = ptr;
		br_vlan_upper_change(dev, info->upper_dev, info->linking);
		break;

	case NETDEV_CHANGE:
	case NETDEV_UP:
		if (!br_opt_get(br, BROPT_VLAN_BRIDGE_BINDING))
			break;
		br_vlan_link_state_change(dev, br);
		break;
	}
	if (changed)
		br_vlan_notify(br, NULL, br->default_pvid, 0, vlcmd);

	return ret;
}

void br_vlan_vlan_upper_event(struct net_device *br_dev,
			      struct net_device *vlan_dev,
			      unsigned long event)
{
	struct vlan_dev_priv *vlan = vlan_dev_priv(vlan_dev);
	struct net_bridge *br = netdev_priv(br_dev);
	bool bridge_binding;

	switch (event) {
	case NETDEV_CHANGE:
	case NETDEV_UP:
		break;
	default:
		return;
	}

	bridge_binding = vlan->flags & VLAN_FLAG_BRIDGE_BINDING;
	br_vlan_toggle_bridge_binding(br_dev, bridge_binding);
	if (bridge_binding)
		br_vlan_set_vlan_dev_state(br, vlan_dev);
	else if (!bridge_binding && netif_carrier_ok(br_dev))
		netif_carrier_on(vlan_dev);
}

/* Must be protected by RTNL. */
void br_vlan_port_event(struct net_bridge_port *p, unsigned long event)
{
	if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING))
		return;

	switch (event) {
	case NETDEV_CHANGE:
	case NETDEV_DOWN:
	case NETDEV_UP:
		br_vlan_set_all_vlan_dev_state(p);
		break;
	}
}

static bool br_vlan_stats_fill(struct sk_buff *skb,
			       const struct net_bridge_vlan *v)
{
	struct pcpu_sw_netstats stats;
	struct nlattr *nest;

	nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY_STATS);
	if (!nest)
		return false;

	br_vlan_get_stats(v, &stats);
	if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_BYTES,
			      u64_stats_read(&stats.rx_bytes),
			      BRIDGE_VLANDB_STATS_PAD) ||
	    nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_PACKETS,
			      u64_stats_read(&stats.rx_packets),
			      BRIDGE_VLANDB_STATS_PAD) ||
	    nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_BYTES,
			      u64_stats_read(&stats.tx_bytes),
			      BRIDGE_VLANDB_STATS_PAD) ||
	    nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_PACKETS,
			      u64_stats_read(&stats.tx_packets),
			      BRIDGE_VLANDB_STATS_PAD))
		goto out_err;

	nla_nest_end(skb, nest);

	return true;

out_err:
	nla_nest_cancel(skb, nest);
	return false;
}

/* v_opts is used to dump the options which must be equal in the whole range */
static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range,
			      const struct net_bridge_vlan *v_opts,
			      const struct net_bridge_port *p,
			      u16 flags,
			      bool dump_stats)
{
	struct bridge_vlan_info info;
	struct nlattr *nest;

	nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY);
	if (!nest)
		return false;

	memset(&info, 0, sizeof(info));
	info.vid = vid;
	if (flags & BRIDGE_VLAN_INFO_UNTAGGED)
		info.flags |= BRIDGE_VLAN_INFO_UNTAGGED;
	if (flags & BRIDGE_VLAN_INFO_PVID)
		info.flags |= BRIDGE_VLAN_INFO_PVID;

	if (nla_put(skb, BRIDGE_VLANDB_ENTRY_INFO, sizeof(info), &info))
		goto out_err;

	if (vid_range && vid < vid_range &&
	    !(flags & BRIDGE_VLAN_INFO_PVID) &&
	    nla_put_u16(skb, BRIDGE_VLANDB_ENTRY_RANGE, vid_range))
		goto out_err;

	if (v_opts) {
		if (!br_vlan_opts_fill(skb, v_opts, p))
			goto out_err;

		if (dump_stats && !br_vlan_stats_fill(skb, v_opts))
			goto out_err;
	}

	nla_nest_end(skb, nest);

	return true;

out_err:
	nla_nest_cancel(skb, nest);
	return false;
}
static size_t rtnl_vlan_nlmsg_size(void)
{
	return NLMSG_ALIGN(sizeof(struct br_vlan_msg))
		+ nla_total_size(0) /* BRIDGE_VLANDB_ENTRY */
		+ nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_ENTRY_RANGE */
		+ nla_total_size(sizeof(struct bridge_vlan_info)) /* BRIDGE_VLANDB_ENTRY_INFO */
		+ br_vlan_opts_nl_size(); /* bridge vlan options */
}

void br_vlan_notify(const struct net_bridge *br,
		    const struct net_bridge_port *p,
		    u16 vid, u16 vid_range,
		    int cmd)
{
	struct net_bridge_vlan_group *vg;
	struct net_bridge_vlan *v = NULL;
	struct br_vlan_msg *bvm;
	struct nlmsghdr *nlh;
	struct sk_buff *skb;
	int err = -ENOBUFS;
	struct net *net;
	u16 flags = 0;
	int ifindex;

	/* right now notifications are done only with rtnl held */
	ASSERT_RTNL();

	if (p) {
		ifindex = p->dev->ifindex;
		vg = nbp_vlan_group(p);
		net = dev_net(p->dev);
	} else {
		ifindex = br->dev->ifindex;
		vg = br_vlan_group(br);
		net = dev_net(br->dev);
	}

	skb = nlmsg_new(rtnl_vlan_nlmsg_size(), GFP_KERNEL);
	if (!skb)
		goto out_err;

	err = -EMSGSIZE;
	nlh = nlmsg_put(skb, 0, 0, cmd, sizeof(*bvm), 0);
	if (!nlh)
		goto out_err;
	bvm = nlmsg_data(nlh);
	memset(bvm, 0, sizeof(*bvm));
	bvm->family = AF_BRIDGE;
	bvm->ifindex = ifindex;

	switch (cmd) {
	case RTM_NEWVLAN:
		/* need to find the vlan due to flags/options */
		v = br_vlan_find(vg, vid);
		if (!v || !br_vlan_should_use(v))
			goto out_kfree;

		flags = v->flags;
		if (br_get_pvid(vg) == v->vid)
			flags |= BRIDGE_VLAN_INFO_PVID;
		break;
	case RTM_DELVLAN:
		break;
	default:
		goto out_kfree;
	}

	if (!br_vlan_fill_vids(skb, vid, vid_range, v, p, flags, false))
		goto out_err;

	nlmsg_end(skb, nlh);
	rtnl_notify(skb, net, 0, RTNLGRP_BRVLAN, NULL, GFP_KERNEL);
	return;

out_err:
	rtnl_set_sk_err(net, RTNLGRP_BRVLAN, err);
out_kfree:
	kfree_skb(skb);
}

/* check if v_curr can enter a range ending in range_end */
bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr,
			     const struct net_bridge_vlan *range_end)
{
	return v_curr->vid - range_end->vid == 1 &&
	       range_end->flags == v_curr->flags &&
	       br_vlan_opts_eq_range(v_curr, range_end);
}
static int br_vlan_dump_dev(const struct net_device *dev,
			    struct sk_buff *skb,
			    struct netlink_callback *cb,
			    u32 dump_flags)
{
	struct net_bridge_vlan *v, *range_start = NULL, *range_end = NULL;
	bool dump_global = !!(dump_flags & BRIDGE_VLANDB_DUMPF_GLOBAL);
	bool dump_stats = !!(dump_flags & BRIDGE_VLANDB_DUMPF_STATS);
	struct net_bridge_vlan_group *vg;
	int idx = 0, s_idx = cb->args[1];
	struct nlmsghdr *nlh = NULL;
	struct net_bridge_port *p;
	struct br_vlan_msg *bvm;
	struct net_bridge *br;
	int err = 0;
	u16 pvid;

	if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev))
		return -EINVAL;

	if (netif_is_bridge_master(dev)) {
		br = netdev_priv(dev);
		vg = br_vlan_group_rcu(br);
		p = NULL;
	} else {
		/* global options are dumped only for bridge devices */
		if (dump_global)
			return 0;

		p = br_port_get_rcu(dev);
		if (WARN_ON(!p))
			return -EINVAL;
		vg = nbp_vlan_group_rcu(p);
		br = p->br;
	}

	if (!vg)
		return 0;

	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
			RTM_NEWVLAN, sizeof(*bvm), NLM_F_MULTI);
	if (!nlh)
		return -EMSGSIZE;
	bvm = nlmsg_data(nlh);
	memset(bvm, 0, sizeof(*bvm));
	bvm->family = PF_BRIDGE;
	bvm->ifindex = dev->ifindex;
	pvid = br_get_pvid(vg);

	/* idx must stay at range's beginning until it is filled in */
	list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
		if (!dump_global && !br_vlan_should_use(v))
			continue;
		if (idx < s_idx) {
			idx++;
			continue;
		}

		if (!range_start) {
			range_start = v;
			range_end = v;
			continue;
		}

		if (dump_global) {
			if (br_vlan_global_opts_can_enter_range(v, range_end))
				goto update_end;
			if (!br_vlan_global_opts_fill(skb, range_start->vid,
						      range_end->vid,
						      range_start)) {
				err = -EMSGSIZE;
				break;
			}
			/* advance number of filled vlans */
			idx += range_end->vid - range_start->vid + 1;

			range_start = v;
		} else if (dump_stats || v->vid == pvid ||
			   !br_vlan_can_enter_range(v, range_end)) {
			u16 vlan_flags = br_vlan_flags(range_start, pvid);

			if (!br_vlan_fill_vids(skb, range_start->vid,
					       range_end->vid, range_start,
					       p, vlan_flags, dump_stats)) {
				err = -EMSGSIZE;
				break;
			}
			/* advance number of filled vlans */
			idx += range_end->vid - range_start->vid + 1;

			range_start = v;
		}
update_end:
		range_end = v;
	}

	/* err will be 0 and range_start will be set in 3 cases here:
	 * - first vlan (range_start == range_end)
	 * - last vlan (range_start == range_end, not in range)
	 * - last vlan range (range_start != range_end, in range)
	 */
	if (!err && range_start) {
		if (dump_global &&
		    !br_vlan_global_opts_fill(skb, range_start->vid,
					      range_end->vid, range_start))
			err = -EMSGSIZE;
		else if (!dump_global &&
			 !br_vlan_fill_vids(skb, range_start->vid,
					    range_end->vid, range_start,
					    p, br_vlan_flags(range_start, pvid),
					    dump_stats))
			err = -EMSGSIZE;
	}

	cb->args[1] = err ? idx : 0;

	nlmsg_end(skb, nlh);

	return err;
}

static const struct nla_policy br_vlan_db_dump_pol[BRIDGE_VLANDB_DUMP_MAX + 1] = {
	[BRIDGE_VLANDB_DUMP_FLAGS] = { .type = NLA_U32 },
};

static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct nlattr *dtb[BRIDGE_VLANDB_DUMP_MAX + 1];
	int idx = 0, err = 0, s_idx = cb->args[0];
	struct net *net = sock_net(skb->sk);
	struct br_vlan_msg *bvm;
	struct net_device *dev;
	u32 dump_flags = 0;

	err = nlmsg_parse(cb->nlh, sizeof(*bvm), dtb, BRIDGE_VLANDB_DUMP_MAX,
			  br_vlan_db_dump_pol, cb->extack);
	if (err < 0)
		return err;

	bvm = nlmsg_data(cb->nlh);
	if (dtb[BRIDGE_VLANDB_DUMP_FLAGS])
		dump_flags = nla_get_u32(dtb[BRIDGE_VLANDB_DUMP_FLAGS]);

	rcu_read_lock();
	if (bvm->ifindex) {
		dev = dev_get_by_index_rcu(net, bvm->ifindex);
		if (!dev) {
			err = -ENODEV;
			goto out_err;
		}
		err = br_vlan_dump_dev(dev, skb, cb, dump_flags);
		/* if the dump completed without an error we return 0 here */
		if (err != -EMSGSIZE)
			goto out_err;
	} else {
		for_each_netdev_rcu(net, dev) {
			if (idx < s_idx)
				goto skip;

			err = br_vlan_dump_dev(dev, skb, cb, dump_flags);
			if (err == -EMSGSIZE)
				break;
skip:
			idx++;
		}
	}
	cb->args[0] = idx;
	rcu_read_unlock();

	return skb->len;

out_err:
	rcu_read_unlock();

	return err;
}

static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = {
	[BRIDGE_VLANDB_ENTRY_INFO]	=
		NLA_POLICY_EXACT_LEN(sizeof(struct bridge_vlan_info)),
	[BRIDGE_VLANDB_ENTRY_RANGE]	= { .type = NLA_U16 },
	[BRIDGE_VLANDB_ENTRY_STATE]	= { .type = NLA_U8 },
	[BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { .type = NLA_NESTED },
	[BRIDGE_VLANDB_ENTRY_MCAST_ROUTER]	= { .type = NLA_U8 },
	[BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS]	= { .type = NLA_REJECT },
	[BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS]	= { .type = NLA_U32 },
	[BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS] = NLA_POLICY_MAX(NLA_U8, 1),
};
static int br_vlan_rtm_process_one(struct net_device *dev,
				   const struct nlattr *attr,
				   int cmd, struct netlink_ext_ack *extack)
{
	struct bridge_vlan_info *vinfo, vrange_end, *vinfo_last = NULL;
	struct nlattr *tb[BRIDGE_VLANDB_ENTRY_MAX + 1];
	bool changed = false, skip_processing = false;
	struct net_bridge_vlan_group *vg;
	struct net_bridge_port *p = NULL;
	int err = 0, cmdmap = 0;
	struct net_bridge *br;

	if (netif_is_bridge_master(dev)) {
		br = netdev_priv(dev);
		vg = br_vlan_group(br);
	} else {
		p = br_port_get_rtnl(dev);
		if (WARN_ON(!p))
			return -ENODEV;
		br = p->br;
		vg = nbp_vlan_group(p);
	}

	if (WARN_ON(!vg))
		return -ENODEV;

	err = nla_parse_nested(tb, BRIDGE_VLANDB_ENTRY_MAX, attr,
			       br_vlan_db_policy, extack);
	if (err)
		return err;

	if (!tb[BRIDGE_VLANDB_ENTRY_INFO]) {
		NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry info");
		return -EINVAL;
	}
	memset(&vrange_end, 0, sizeof(vrange_end));

	vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]);
	if (vinfo->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN |
			    BRIDGE_VLAN_INFO_RANGE_END)) {
		NL_SET_ERR_MSG_MOD(extack, "Old-style vlan ranges are not allowed when using RTM vlan calls");
		return -EINVAL;
	}
	if (!br_vlan_valid_id(vinfo->vid, extack))
		return -EINVAL;

	if (tb[BRIDGE_VLANDB_ENTRY_RANGE]) {
		vrange_end.vid = nla_get_u16(tb[BRIDGE_VLANDB_ENTRY_RANGE]);
		/* validate user-provided flags without RANGE_BEGIN */
		vrange_end.flags = BRIDGE_VLAN_INFO_RANGE_END | vinfo->flags;
		vinfo->flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN;

		/* vinfo_last is the range start, vinfo the range end */
		vinfo_last = vinfo;
		vinfo = &vrange_end;

		if (!br_vlan_valid_id(vinfo->vid, extack) ||
		    !br_vlan_valid_range(vinfo, vinfo_last, extack))
			return -EINVAL;
	}

	switch (cmd) {
	case RTM_NEWVLAN:
		cmdmap = RTM_SETLINK;
		skip_processing = !!(vinfo->flags & BRIDGE_VLAN_INFO_ONLY_OPTS);
		break;
	case RTM_DELVLAN:
		cmdmap = RTM_DELLINK;
		break;
	}

	if (!skip_processing) {
		struct bridge_vlan_info *tmp_last = vinfo_last;

		/* br_process_vlan_info may overwrite vinfo_last */
		err = br_process_vlan_info(br, p, cmdmap, vinfo, &tmp_last,
					   &changed, extack);

		/* notify first if anything changed */
		if (changed)
			br_ifinfo_notify(cmdmap, br, p);

		if (err)
			return err;
	}

	/* deal with options */
	if (cmd == RTM_NEWVLAN) {
		struct net_bridge_vlan *range_start, *range_end;

		if (vinfo_last) {
			range_start = br_vlan_find(vg, vinfo_last->vid);
			range_end = br_vlan_find(vg, vinfo->vid);
		} else {
			range_start = br_vlan_find(vg, vinfo->vid);
			range_end = range_start;
		}

		err = br_vlan_process_options(br, p, range_start, range_end,
					      tb, extack);
	}

	return err;
}

static int br_vlan_rtm_process(struct sk_buff *skb, struct nlmsghdr *nlh,
			       struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct br_vlan_msg *bvm;
	struct net_device *dev;
	struct nlattr *attr;
	int err, vlans = 0;
	int rem;

	/* this should validate the header and check for remaining bytes */
	err = nlmsg_parse(nlh, sizeof(*bvm), NULL, BRIDGE_VLANDB_MAX, NULL,
			  extack);
	if (err < 0)
		return err;

	bvm = nlmsg_data(nlh);
	dev = __dev_get_by_index(net, bvm->ifindex);
	if (!dev)
		return -ENODEV;

	if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) {
		NL_SET_ERR_MSG_MOD(extack, "The device is not a valid bridge or bridge port");
		return -EINVAL;
	}

	nlmsg_for_each_attr(attr, nlh, sizeof(*bvm), rem) {
		switch (nla_type(attr)) {
		case BRIDGE_VLANDB_ENTRY:
			err = br_vlan_rtm_process_one(dev, attr,
						      nlh->nlmsg_type,
						      extack);
			break;
		case BRIDGE_VLANDB_GLOBAL_OPTIONS:
			err = br_vlan_rtm_process_global_options(dev, attr,
								 nlh->nlmsg_type,
								 extack);
			break;
		default:
			continue;
		}

		vlans++;
		if (err)
			break;
	}

	if (!vlans) {
		NL_SET_ERR_MSG_MOD(extack, "No vlans found to process");
		err = -EINVAL;
	}

	return err;
}

static const struct rtnl_msg_handler br_vlan_rtnl_msg_handlers[] = {
	{THIS_MODULE, PF_BRIDGE, RTM_NEWVLAN, br_vlan_rtm_process, NULL, 0},
	{THIS_MODULE, PF_BRIDGE, RTM_DELVLAN, br_vlan_rtm_process, NULL, 0},
	{THIS_MODULE, PF_BRIDGE, RTM_GETVLAN, NULL, br_vlan_rtm_dump, 0},
};

int br_vlan_rtnl_init(void)
{
	return rtnl_register_many(br_vlan_rtnl_msg_handlers);
}

void br_vlan_rtnl_uninit(void)
{
	rtnl_unregister_many(br_vlan_rtnl_msg_handlers);
}
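/* Usage note (editor's addition): the RTM_NEWVLAN/RTM_DELVLAN/RTM_GETVLAN
 * handlers registered above implement the bridge vlan database API, which
 * newer iproute2 exercises for per-vlan options and dumps, for example
 * (syntax per bridge(8)):
 *
 *	bridge vlan set dev eth0 vid 10 state forwarding
 *	bridge -d vlan show
 */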
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM workqueue

#if !defined(_TRACE_WORKQUEUE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_WORKQUEUE_H

#include <linux/tracepoint.h>
#include <linux/workqueue.h>

struct pool_workqueue;

/**
 * workqueue_queue_work - called when a work gets queued
 * @req_cpu:	the requested cpu
 * @pwq:	pointer to struct pool_workqueue
 * @work:	pointer to struct work_struct
 *
 * This event occurs when a work is queued immediately or once a
 * delayed work is actually queued on a workqueue (ie: once the delay
 * has been reached).
 */
TRACE_EVENT(workqueue_queue_work,

	TP_PROTO(int req_cpu, struct pool_workqueue *pwq,
		 struct work_struct *work),

	TP_ARGS(req_cpu, pwq, work),

	TP_STRUCT__entry(
		__field(	void *,	work		)
		__field(	void *,	function	)
		__string(	workqueue, pwq->wq->name)
		__field(	int,	req_cpu		)
		__field(	int,	cpu		)
	),

	TP_fast_assign(
		__entry->work		= work;
		__entry->function	= work->func;
		__assign_str(workqueue);
		__entry->req_cpu	= req_cpu;
		__entry->cpu		= pwq->pool->cpu;
	),

	TP_printk("work struct=%p function=%ps workqueue=%s req_cpu=%d cpu=%d",
		  __entry->work, __entry->function, __get_str(workqueue),
		  __entry->req_cpu, __entry->cpu)
);

/**
 * workqueue_activate_work - called when a work gets activated
 * @work:	pointer to struct work_struct
 *
 * This event occurs when a queued work is put on the active queue,
 * which happens immediately after queueing unless @max_active limit
 * is reached.
 */
TRACE_EVENT(workqueue_activate_work,

	TP_PROTO(struct work_struct *work),

	TP_ARGS(work),

	TP_STRUCT__entry(
		__field(	void *,	work		)
		__field(	void *,	function	)
	),

	TP_fast_assign(
		__entry->work		= work;
		__entry->function	= work->func;
	),

	TP_printk("work struct %p function=%ps ",
		  __entry->work, __entry->function)
);

/**
 * workqueue_execute_start - called immediately before the workqueue callback
 * @work:	pointer to struct work_struct
 *
 * Allows to track workqueue execution.
 */
TRACE_EVENT(workqueue_execute_start,

	TP_PROTO(struct work_struct *work),

	TP_ARGS(work),

	TP_STRUCT__entry(
		__field(	void *,	work		)
		__field(	void *,	function	)
	),

	TP_fast_assign(
		__entry->work		= work;
		__entry->function	= work->func;
	),

	TP_printk("work struct %p: function %ps",
		  __entry->work, __entry->function)
);

/**
 * workqueue_execute_end - called immediately after the workqueue callback
 * @work:	pointer to struct work_struct
 * @function:	pointer to worker function
 *
 * Allows to track workqueue execution.
 */
TRACE_EVENT(workqueue_execute_end,

	TP_PROTO(struct work_struct *work, work_func_t function),

	TP_ARGS(work, function),

	TP_STRUCT__entry(
		__field(	void *,	work		)
		__field(	void *,	function	)
	),

	TP_fast_assign(
		__entry->work		= work;
		__entry->function	= function;
	),

	TP_printk("work struct %p: function %ps",
		  __entry->work, __entry->function)
);

#endif /* _TRACE_WORKQUEUE_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
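/* Usage note (editor's addition): these tracepoints can be enabled at
 * runtime through tracefs or perf, e.g.:
 *
 *	echo 1 > /sys/kernel/tracing/events/workqueue/workqueue_queue_work/enable
 *	cat /sys/kernel/tracing/trace_pipe
 *
 *	perf record -e workqueue:workqueue_execute_start -a sleep 1
 */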
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/kernfs/file.c - kernfs file implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
 */

#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/pagemap.h>
#include <linux/sched/mm.h>
#include <linux/fsnotify.h>
#include <linux/uio.h>

#include "kernfs-internal.h"

struct kernfs_open_node {
	struct rcu_head		rcu_head;
	atomic_t		event;
	wait_queue_head_t	poll;
	struct list_head	files; /* goes through kernfs_open_file.list */
	unsigned int		nr_mmapped;
	unsigned int		nr_to_release;
};

/*
 * kernfs_notify() may be called from any context and bounces notifications
 * through a work item. To minimize space overhead in kernfs_node, the
 * pending queue is implemented as a singly linked list of kernfs_nodes.
 * The list is terminated with the self pointer so that whether a
 * kernfs_node is on the list or not can be determined by testing the next
 * pointer for %NULL.
 */
#define KERNFS_NOTIFY_EOL		((void *)&kernfs_notify_list)

static DEFINE_SPINLOCK(kernfs_notify_lock);
static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;

static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node *kn)
{
	int idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS);

	return &kernfs_locks->open_file_mutex[idx];
}

static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn)
{
	struct mutex *lock;

	lock = kernfs_open_file_mutex_ptr(kn);

	mutex_lock(lock);

	return lock;
}

/**
 * of_on - Get the kernfs_open_node of the specified kernfs_open_file
 * @of: target kernfs_open_file
 *
 * Return: the kernfs_open_node of the kernfs_open_file
 */
static struct kernfs_open_node *of_on(struct kernfs_open_file *of)
{
	return rcu_dereference_protected(of->kn->attr.open,
					 !list_empty(&of->list));
}

/**
 * kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn
 *
 * @kn: target kernfs_node.
 *
 * Fetch and return ->attr.open of @kn when caller holds the
 * kernfs_open_file_mutex_ptr(kn).
 *
 * Update of ->attr.open happens under kernfs_open_file_mutex_ptr(kn). So when
 * the caller guarantees that this mutex is being held, other updaters can't
 * change ->attr.open and this means that we can safely deref ->attr.open
 * outside RCU read-side critical section.
 *
 * The caller needs to make sure that kernfs_open_file_mutex is held.
 *
 * Return: @kn->attr.open when kernfs_open_file_mutex is held.
*/ static struct kernfs_open_node * kernfs_deref_open_node_locked(struct kernfs_node *kn) { return rcu_dereference_protected(kn->attr.open, lockdep_is_held(kernfs_open_file_mutex_ptr(kn))); } static struct kernfs_open_file *kernfs_of(struct file *file) { return ((struct seq_file *)file->private_data)->private; } /* * Determine the kernfs_ops for the given kernfs_node. This function must * be called while holding an active reference. */ static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn) { if (kn->flags & KERNFS_LOCKDEP) lockdep_assert_held(kn); return kn->attr.ops; } /* * As kernfs_seq_stop() is also called after kernfs_seq_start() or * kernfs_seq_next() failure, it needs to distinguish whether it's stopping * a seq_file iteration which is fully initialized with an active reference * or an aborted kernfs_seq_start() due to get_active failure. The * position pointer is the only context for each seq_file iteration and * thus the stop condition should be encoded in it. As the return value is * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable * choice to indicate get_active failure. * * Unfortunately, this is complicated due to the optional custom seq_file * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop() * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or * custom seq_file operations and thus can't decide whether put_active * should be performed or not only on ERR_PTR(-ENODEV). * * This is worked around by factoring out the custom seq_stop() and * put_active part into kernfs_seq_stop_active(), skipping it from * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures * that kernfs_seq_stop_active() is skipped only after get_active failure. */ static void kernfs_seq_stop_active(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_stop) ops->seq_stop(sf, v); kernfs_put_active(of->kn); } static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops; /* * @of->mutex nests outside active ref and is primarily to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) return ERR_PTR(-ENODEV); ops = kernfs_ops(of->kn); if (ops->seq_start) { void *next = ops->seq_start(sf, ppos); /* see the comment above kernfs_seq_stop_active() */ if (next == ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, next); return next; } return single_start(sf, ppos); } static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_next) { void *next = ops->seq_next(sf, v, ppos); /* see the comment above kernfs_seq_stop_active() */ if (next == ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, next); return next; } else { /* * The same behavior and code as single_open(), always * terminate after the initial read. 
*/ ++*ppos; return NULL; } } static void kernfs_seq_stop(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; if (v != ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, v); mutex_unlock(&of->mutex); } static int kernfs_seq_show(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; of->event = atomic_read(&of_on(of)->event); return of->kn->attr.ops->seq_show(sf, v); } static const struct seq_operations kernfs_seq_ops = { .start = kernfs_seq_start, .next = kernfs_seq_next, .stop = kernfs_seq_stop, .show = kernfs_seq_show, }; /* * As reading a bin file can have side-effects, the exact offset and bytes * specified in read(2) call should be passed to the read callback making * it difficult to use seq_file. Implement simplistic custom buffering for * bin files. */ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE); const struct kernfs_ops *ops; char *buf; buf = of->prealloc_buf; if (buf) mutex_lock(&of->prealloc_mutex); else buf = kmalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; /* * @of->mutex nests outside active ref and is used both to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { len = -ENODEV; mutex_unlock(&of->mutex); goto out_free; } of->event = atomic_read(&of_on(of)->event); ops = kernfs_ops(of->kn); if (ops->read) len = ops->read(of, buf, len, iocb->ki_pos); else len = -EINVAL; kernfs_put_active(of->kn); mutex_unlock(&of->mutex); if (len < 0) goto out_free; if (copy_to_iter(buf, len, iter) != len) { len = -EFAULT; goto out_free; } iocb->ki_pos += len; out_free: if (buf == of->prealloc_buf) mutex_unlock(&of->prealloc_mutex); else kfree(buf); return len; } static ssize_t kernfs_fop_read_iter(struct kiocb *iocb, struct iov_iter *iter) { if (kernfs_of(iocb->ki_filp)->kn->flags & KERNFS_HAS_SEQ_SHOW) return seq_read_iter(iocb, iter); return kernfs_file_read_iter(iocb, iter); } /* * Copy data in from userland and pass it to the matching kernfs write * operation. * * There is no easy way for us to know if userspace is only doing a partial * write, so we don't support them. We expect the entire buffer to come on * the first write. Hint: if you're writing a value, first read the file, * modify only the value you're changing, then write entire buffer * back. */ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter) { struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); ssize_t len = iov_iter_count(iter); const struct kernfs_ops *ops; char *buf; if (of->atomic_write_len) { if (len > of->atomic_write_len) return -E2BIG; } else { len = min_t(size_t, len, PAGE_SIZE); } buf = of->prealloc_buf; if (buf) mutex_lock(&of->prealloc_mutex); else buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; if (copy_from_iter(buf, len, iter) != len) { len = -EFAULT; goto out_free; } buf[len] = '\0'; /* guarantee string termination */ /* * @of->mutex nests outside active ref and is used both to ensure that * the ops aren't called concurrently for the same open file. 
*/ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { mutex_unlock(&of->mutex); len = -ENODEV; goto out_free; } ops = kernfs_ops(of->kn); if (ops->write) len = ops->write(of, buf, len, iocb->ki_pos); else len = -EINVAL; kernfs_put_active(of->kn); mutex_unlock(&of->mutex); if (len > 0) iocb->ki_pos += len; out_free: if (buf == of->prealloc_buf) mutex_unlock(&of->prealloc_mutex); else kfree(buf); return len; } static void kernfs_vma_open(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); if (!of->vm_ops) return; if (!kernfs_get_active(of->kn)) return; if (of->vm_ops->open) of->vm_ops->open(vma); kernfs_put_active(of->kn); } static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); vm_fault_t ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; if (!kernfs_get_active(of->kn)) return VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS; if (of->vm_ops->fault) ret = of->vm_ops->fault(vmf); kernfs_put_active(of->kn); return ret; } static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); vm_fault_t ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; if (!kernfs_get_active(of->kn)) return VM_FAULT_SIGBUS; ret = 0; if (of->vm_ops->page_mkwrite) ret = of->vm_ops->page_mkwrite(vmf); else file_update_time(file); kernfs_put_active(of->kn); return ret; } static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); int ret; if (!of->vm_ops) return -EINVAL; if (!kernfs_get_active(of->kn)) return -EINVAL; ret = -EINVAL; if (of->vm_ops->access) ret = of->vm_ops->access(vma, addr, buf, len, write); kernfs_put_active(of->kn); return ret; } static const struct vm_operations_struct kernfs_vm_ops = { .open = kernfs_vma_open, .fault = kernfs_vma_fault, .page_mkwrite = kernfs_vma_page_mkwrite, .access = kernfs_vma_access, }; static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) { struct kernfs_open_file *of = kernfs_of(file); const struct kernfs_ops *ops; int rc; /* * mmap path and of->mutex are prone to triggering spurious lockdep * warnings and we don't want to add spurious locking dependency * between the two. Check whether mmap is actually implemented * without grabbing @of->mutex by testing HAS_MMAP flag. See the * comment in kernfs_fop_open() for more details. */ if (!(of->kn->flags & KERNFS_HAS_MMAP)) return -ENODEV; mutex_lock(&of->mutex); rc = -ENODEV; if (!kernfs_get_active(of->kn)) goto out_unlock; ops = kernfs_ops(of->kn); rc = ops->mmap(of, vma); if (rc) goto out_put; /* * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() * to satisfy versions of X which crash if the mmap fails: that * substitutes a new vm_file, and we don't then want bin_vm_ops. */ if (vma->vm_file != file) goto out_put; rc = -EINVAL; if (of->mmapped && of->vm_ops != vma->vm_ops) goto out_put; /* * It is not possible to successfully wrap close. * So error if someone is trying to use close. 
*/ if (vma->vm_ops && vma->vm_ops->close) goto out_put; rc = 0; if (!of->mmapped) { of->mmapped = true; of_on(of)->nr_mmapped++; of->vm_ops = vma->vm_ops; } vma->vm_ops = &kernfs_vm_ops; out_put: kernfs_put_active(of->kn); out_unlock: mutex_unlock(&of->mutex); return rc; } /** * kernfs_get_open_node - get or create kernfs_open_node * @kn: target kernfs_node * @of: kernfs_open_file for this instance of open * * If @kn->attr.open exists, increment its reference count; otherwise, * create one. @of is chained to the files list. * * Locking: * Kernel thread context (may sleep). * * Return: * %0 on success, -errno on failure. */ static int kernfs_get_open_node(struct kernfs_node *kn, struct kernfs_open_file *of) { struct kernfs_open_node *on; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { /* not there, initialize a new one */ on = kzalloc(sizeof(*on), GFP_KERNEL); if (!on) { mutex_unlock(mutex); return -ENOMEM; } atomic_set(&on->event, 1); init_waitqueue_head(&on->poll); INIT_LIST_HEAD(&on->files); rcu_assign_pointer(kn->attr.open, on); } list_add_tail(&of->list, &on->files); if (kn->flags & KERNFS_HAS_RELEASE) on->nr_to_release++; mutex_unlock(mutex); return 0; } /** * kernfs_unlink_open_file - Unlink @of from @kn. * * @kn: target kernfs_node * @of: associated kernfs_open_file * @open_failed: ->open() failed, cancel ->release() * * Unlink @of from list of @kn's associated open files. If list of * associated open files becomes empty, disassociate and free * kernfs_open_node. * * LOCKING: * None. */ static void kernfs_unlink_open_file(struct kernfs_node *kn, struct kernfs_open_file *of, bool open_failed) { struct kernfs_open_node *on; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { mutex_unlock(mutex); return; } if (of) { if (kn->flags & KERNFS_HAS_RELEASE) { WARN_ON_ONCE(of->released == open_failed); if (open_failed) on->nr_to_release--; } if (of->mmapped) on->nr_mmapped--; list_del(&of->list); } if (list_empty(&on->files)) { rcu_assign_pointer(kn->attr.open, NULL); kfree_rcu(on, rcu_head); } mutex_unlock(mutex); } static int kernfs_fop_open(struct inode *inode, struct file *file) { struct kernfs_node *kn = inode->i_private; struct kernfs_root *root = kernfs_root(kn); const struct kernfs_ops *ops; struct kernfs_open_file *of; bool has_read, has_write, has_mmap; int error = -EACCES; if (!kernfs_get_active(kn)) return -ENODEV; ops = kernfs_ops(kn); has_read = ops->seq_show || ops->read || ops->mmap; has_write = ops->write || ops->mmap; has_mmap = ops->mmap; /* see the flag definition for details */ if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) { if ((file->f_mode & FMODE_WRITE) && (!(inode->i_mode & S_IWUGO) || !has_write)) goto err_out; if ((file->f_mode & FMODE_READ) && (!(inode->i_mode & S_IRUGO) || !has_read)) goto err_out; } /* allocate a kernfs_open_file for the file */ error = -ENOMEM; of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL); if (!of) goto err_out; /* * The following is done to give a different lockdep key to * @of->mutex for files which implement mmap. This is a rather * crude way to avoid false positive lockdep warning around * mm->mmap_lock - mmap nests @of->mutex under mm->mmap_lock and * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under * which mm->mmap_lock nests, while holding @of->mutex. As each * open file has a separate mutex, it's okay as long as those don't * happen on the same file. 
 * At this point, we can't easily give
 * each file a separate locking class. Let's differentiate on
 * whether the file has mmap or not for now.
 *
 * For similar reasons, writable and readonly files are given different
 * lockdep keys, because the writable file /sys/power/resume may call vfs
 * lookup helpers for arbitrary paths and readonly files can be read by
 * overlayfs from vfs helpers when sysfs is a lower layer of overlayfs.
 *
 * All three cases look the same. They're supposed to
 * look that way and give @of->mutex different static lockdep keys.
 */
	if (has_mmap)
		mutex_init(&of->mutex);
	else if (file->f_mode & FMODE_WRITE)
		mutex_init(&of->mutex);
	else
		mutex_init(&of->mutex);

	of->kn = kn;
	of->file = file;

	/*
	 * Write path needs to fetch atomic_write_len outside active
	 * reference. Cache it in open_file. See kernfs_fop_write_iter() for
	 * details.
	 */
	of->atomic_write_len = ops->atomic_write_len;

	error = -EINVAL;
	/*
	 * ->seq_show is incompatible with ->prealloc,
	 * as seq_read does its own allocation.
	 * ->read must be used instead.
	 */
	if (ops->prealloc && ops->seq_show)
		goto err_free;
	if (ops->prealloc) {
		int len = of->atomic_write_len ?: PAGE_SIZE;

		of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL);
		error = -ENOMEM;
		if (!of->prealloc_buf)
			goto err_free;
		mutex_init(&of->prealloc_mutex);
	}

	/*
	 * Always instantiate seq_file even if read access doesn't use
	 * seq_file or is not requested. This unifies private data access
	 * and readable regular files are the vast majority anyway.
	 */
	if (ops->seq_show)
		error = seq_open(file, &kernfs_seq_ops);
	else
		error = seq_open(file, NULL);
	if (error)
		goto err_free;

	of->seq_file = file->private_data;
	of->seq_file->private = of;

	/* seq_file clears PWRITE unconditionally, restore it if WRITE */
	if (file->f_mode & FMODE_WRITE)
		file->f_mode |= FMODE_PWRITE;

	/* make sure we have open node struct */
	error = kernfs_get_open_node(kn, of);
	if (error)
		goto err_seq_release;

	if (ops->open) {
		/* nobody has access to @of yet, skip @of->mutex */
		error = ops->open(of);
		if (error)
			goto err_put_node;
	}

	/* open succeeded, put active references */
	kernfs_put_active(kn);
	return 0;

err_put_node:
	kernfs_unlink_open_file(kn, of, true);
err_seq_release:
	seq_release(inode, file);
err_free:
	kfree(of->prealloc_buf);
	kfree(of);
err_out:
	kernfs_put_active(kn);
	return error;
}

/* used from release/drain to ensure that ->release() is called exactly once */
static void kernfs_release_file(struct kernfs_node *kn,
				struct kernfs_open_file *of)
{
	/*
	 * @of is guaranteed to have no other file operations in flight and
	 * we just want to synchronize release and drain paths.
	 * @kernfs_open_file_mutex_ptr(kn) is enough. @of->mutex can't be used
	 * here because drain path may be called from places which can
	 * cause circular dependency.
	 */
	lockdep_assert_held(kernfs_open_file_mutex_ptr(kn));

	if (!of->released) {
		/*
		 * A file is never detached without being released and we
		 * need to be able to release files which are deactivated
		 * and being drained. Don't use kernfs_ops().
*/ kn->attr.ops->release(of); of->released = true; of_on(of)->nr_to_release--; } } static int kernfs_fop_release(struct inode *inode, struct file *filp) { struct kernfs_node *kn = inode->i_private; struct kernfs_open_file *of = kernfs_of(filp); if (kn->flags & KERNFS_HAS_RELEASE) { struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); kernfs_release_file(kn, of); mutex_unlock(mutex); } kernfs_unlink_open_file(kn, of, false); seq_release(inode, filp); kfree(of->prealloc_buf); kfree(of); return 0; } bool kernfs_should_drain_open_files(struct kernfs_node *kn) { struct kernfs_open_node *on; bool ret; /* * @kn being deactivated guarantees that @kn->attr.open can't change * beneath us making the lockless test below safe. */ WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); rcu_read_lock(); on = rcu_dereference(kn->attr.open); ret = on && (on->nr_mmapped || on->nr_to_release); rcu_read_unlock(); return ret; } void kernfs_drain_open_files(struct kernfs_node *kn) { struct kernfs_open_node *on; struct kernfs_open_file *of; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { mutex_unlock(mutex); return; } list_for_each_entry(of, &on->files, list) { struct inode *inode = file_inode(of->file); if (of->mmapped) { unmap_mapping_range(inode->i_mapping, 0, 0, 1); of->mmapped = false; on->nr_mmapped--; } if (kn->flags & KERNFS_HAS_RELEASE) kernfs_release_file(kn, of); } WARN_ON_ONCE(on->nr_mmapped || on->nr_to_release); mutex_unlock(mutex); } /* * Kernfs attribute files are pollable. The idea is that you read * the content and then you use 'poll' or 'select' to wait for * the content to change. When the content changes (assuming the * manager for the kobject supports notification), poll will * return EPOLLERR|EPOLLPRI, and select will return the fd whether * it is waiting for read, write, or exceptions. * Once poll/select indicates that the value has changed, you * need to close and re-open the file, or seek to 0 and read again. * Reminder: this only works for attributes which actively support * it, and it is not possible to test an attribute from userspace * to see if it supports poll (Neither 'poll' nor 'select' return * an appropriate error code). When in doubt, set a suitable timeout value. */ __poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait) { struct kernfs_open_node *on = of_on(of); poll_wait(of->file, &on->poll, wait); if (of->event != atomic_read(&on->event)) return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; return DEFAULT_POLLMASK; } static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) { struct kernfs_open_file *of = kernfs_of(filp); struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); __poll_t ret; if (!kernfs_get_active(kn)) return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; if (kn->attr.ops->poll) ret = kn->attr.ops->poll(of, wait); else ret = kernfs_generic_poll(of, wait); kernfs_put_active(kn); return ret; } static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence) { struct kernfs_open_file *of = kernfs_of(file); const struct kernfs_ops *ops; loff_t ret; /* * @of->mutex nests outside active ref and is primarily to ensure that * the ops aren't called concurrently for the same open file. 
	 */
	mutex_lock(&of->mutex);
	if (!kernfs_get_active(of->kn)) {
		mutex_unlock(&of->mutex);
		return -ENODEV;
	}

	ops = kernfs_ops(of->kn);
	if (ops->llseek)
		ret = ops->llseek(of, offset, whence);
	else
		ret = generic_file_llseek(file, offset, whence);

	kernfs_put_active(of->kn);
	mutex_unlock(&of->mutex);
	return ret;
}

static void kernfs_notify_workfn(struct work_struct *work)
{
	struct kernfs_node *kn;
	struct kernfs_super_info *info;
	struct kernfs_root *root;
repeat:
	/* pop one off the notify_list */
	spin_lock_irq(&kernfs_notify_lock);
	kn = kernfs_notify_list;
	if (kn == KERNFS_NOTIFY_EOL) {
		spin_unlock_irq(&kernfs_notify_lock);
		return;
	}
	kernfs_notify_list = kn->attr.notify_next;
	kn->attr.notify_next = NULL;
	spin_unlock_irq(&kernfs_notify_lock);

	root = kernfs_root(kn);
	/* kick fsnotify */
	down_read(&root->kernfs_supers_rwsem);
	down_read(&root->kernfs_rwsem);

	list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
		struct kernfs_node *parent;
		struct inode *p_inode = NULL;
		const char *kn_name;
		struct inode *inode;
		struct qstr name;

		/*
		 * We want fsnotify_modify() on @kn but as the
		 * modifications aren't originating from userland don't
		 * have the matching @file available. Look up the inodes
		 * and generate the events manually.
		 */
		inode = ilookup(info->sb, kernfs_ino(kn));
		if (!inode)
			continue;

		kn_name = kernfs_rcu_name(kn);
		name = QSTR(kn_name);
		parent = kernfs_get_parent(kn);
		if (parent) {
			p_inode = ilookup(info->sb, kernfs_ino(parent));
			if (p_inode) {
				fsnotify(FS_MODIFY | FS_EVENT_ON_CHILD,
					 inode, FSNOTIFY_EVENT_INODE,
					 p_inode, &name, inode, 0);
				iput(p_inode);
			}

			kernfs_put(parent);
		}

		if (!p_inode)
			fsnotify_inode(inode, FS_MODIFY);

		iput(inode);
	}

	up_read(&root->kernfs_rwsem);
	up_read(&root->kernfs_supers_rwsem);
	kernfs_put(kn);
	goto repeat;
}

/**
 * kernfs_notify - notify a kernfs file
 * @kn: file to notify
 *
 * Notify @kn such that poll(2) on @kn wakes up. May be called from any
 * context.
 */
void kernfs_notify(struct kernfs_node *kn)
{
	static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn);
	unsigned long flags;
	struct kernfs_open_node *on;

	if (WARN_ON(kernfs_type(kn) != KERNFS_FILE))
		return;

	/* kick poll immediately */
	rcu_read_lock();
	on = rcu_dereference(kn->attr.open);
	if (on) {
		atomic_inc(&on->event);
		wake_up_interruptible(&on->poll);
	}
	rcu_read_unlock();

	/* schedule work to kick fsnotify */
	spin_lock_irqsave(&kernfs_notify_lock, flags);
	if (!kn->attr.notify_next) {
		kernfs_get(kn);
		kn->attr.notify_next = kernfs_notify_list;
		kernfs_notify_list = kn;
		schedule_work(&kernfs_notify_work);
	}
	spin_unlock_irqrestore(&kernfs_notify_lock, flags);
}
EXPORT_SYMBOL_GPL(kernfs_notify);

const struct file_operations kernfs_file_fops = {
	.read_iter	= kernfs_fop_read_iter,
	.write_iter	= kernfs_fop_write_iter,
	.llseek		= kernfs_fop_llseek,
	.mmap		= kernfs_fop_mmap,
	.open		= kernfs_fop_open,
	.release	= kernfs_fop_release,
	.poll		= kernfs_fop_poll,
	.fsync		= noop_fsync,
	.splice_read	= copy_splice_read,
	.splice_write	= iter_file_splice_write,
};

/**
 * __kernfs_create_file - kernfs internal function to create a file
 * @parent: directory to create the file in
 * @name: name of the file
 * @mode: mode of the file
 * @uid: uid of the file
 * @gid: gid of the file
 * @size: size of the file
 * @ops: kernfs operations for the file
 * @priv: private data for the file
 * @ns: optional namespace tag of the file
 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
 *
 * Return: the created node on success, ERR_PTR() value on error.
*/ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key) { struct kernfs_node *kn; unsigned flags; int rc; flags = KERNFS_FILE; kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, uid, gid, flags); if (!kn) return ERR_PTR(-ENOMEM); kn->attr.ops = ops; kn->attr.size = size; kn->ns = ns; kn->priv = priv; #ifdef CONFIG_DEBUG_LOCK_ALLOC if (key) { lockdep_init_map(&kn->dep_map, "kn->active", key, 0); kn->flags |= KERNFS_LOCKDEP; } #endif /* * kn->attr.ops is accessible only while holding active ref. We * need to know whether some ops are implemented outside active * ref. Cache their existence in flags. */ if (ops->seq_show) kn->flags |= KERNFS_HAS_SEQ_SHOW; if (ops->mmap) kn->flags |= KERNFS_HAS_MMAP; if (ops->release) kn->flags |= KERNFS_HAS_RELEASE; rc = kernfs_add_one(kn); if (rc) { kernfs_put(kn); return ERR_PTR(rc); } return kn; }
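/*
 * Editor's hedged sketch (not part of file.c): a minimal read-only kernfs
 * file wired up through __kernfs_create_file(). The parent node, the
 * "demo" name, and the demo_* helpers are illustrative assumptions; the
 * ->seq_show op is what makes the node take the seq_read_iter() path and
 * sets KERNFS_HAS_SEQ_SHOW above.
 */
#include <linux/kernfs.h>
#include <linux/seq_file.h>

static int demo_seq_show(struct seq_file *sf, void *v)
{
	seq_puts(sf, "hello from kernfs\n");
	return 0;
}

static const struct kernfs_ops demo_ops = {
	.seq_show = demo_seq_show,
};

static struct kernfs_node *demo_create(struct kernfs_node *parent)
{
	/* world-readable, no private data, no namespace tag, no lockdep key */
	return __kernfs_create_file(parent, "demo", 0444,
				    GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
				    0, &demo_ops, NULL, NULL, NULL);
}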
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
  Red Black Trees
  (C) 1999  Andrea Arcangeli <andrea@suse.de>

  linux/include/linux/rbtree.h

  To use rbtrees you'll have to implement your own insert and search cores.
  This avoids the use of callbacks, which would drop performance dramatically.
  I know it's not the cleanest way, but in C (not in C++) that's what it
  takes to get performance and genericity...

  See Documentation/core-api/rbtree.rst for documentation and samples.
*/ #ifndef _LINUX_RBTREE_H #define _LINUX_RBTREE_H #include <linux/container_of.h> #include <linux/rbtree_types.h> #include <linux/stddef.h> #include <linux/rcupdate.h> #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) /* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ #define RB_EMPTY_NODE(node) \ ((node)->__rb_parent_color == (unsigned long)(node)) #define RB_CLEAR_NODE(node) \ ((node)->__rb_parent_color = (unsigned long)(node)) extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); /* Postorder iteration - always visit the parent after its children */ extern struct rb_node *rb_first_postorder(const struct rb_root *); extern struct rb_node *rb_next_postorder(const struct rb_node *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, struct rb_root *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; *rb_link = node; } static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; rcu_assign_pointer(*rb_link, node); } #define rb_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? rb_entry(____ptr, type, member) : NULL; \ }) /** * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of * given type allowing the backing memory of @pos to be invalidated * * @pos: the 'type *' to use as a loop cursor. * @n: another 'type *' to use as temporary storage * @root: 'rb_root *' of the rbtree. * @field: the name of the rb_node field within 'type'. * * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as * list_for_each_entry_safe() and allows the iteration to continue independent * of changes to @pos by the body of the loop. * * Note, however, that it cannot handle other modifications that re-order the * rbtree it is iterating over. This includes calling rb_erase() on @pos, as * rb_erase() may rebalance the tree, causing us to miss some nodes. 
*/ #define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \ pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \ typeof(*pos), field); 1; }); \ pos = n) /* Same as rb_first(), but O(1) */ #define rb_first_cached(root) (root)->rb_leftmost static inline void rb_insert_color_cached(struct rb_node *node, struct rb_root_cached *root, bool leftmost) { if (leftmost) root->rb_leftmost = node; rb_insert_color(node, &root->rb_root); } static inline struct rb_node * rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) { struct rb_node *leftmost = NULL; if (root->rb_leftmost == node) leftmost = root->rb_leftmost = rb_next(node); rb_erase(node, &root->rb_root); return leftmost; } static inline void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, struct rb_root_cached *root) { if (root->rb_leftmost == victim) root->rb_leftmost = new; rb_replace_node(victim, new, &root->rb_root); } /* * The below helper functions use 2 operators with 3 different * calling conventions. The operators are related like: * * comp(a->key,b) < 0 := less(a,b) * comp(a->key,b) > 0 := less(b,a) * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) * * If these operators define a partial order on the elements we make no * guarantee on which of the elements matching the key is found. See * rb_find(). * * The reason for this is to allow the find() interface without requiring an * on-stack dummy object, which might not be feasible due to object size. */ /** * rb_add_cached() - insert @node into the leftmost cached tree @tree * @node: node to insert * @tree: leftmost cached tree to insert @node into * @less: operator defining the (partial) node order * * Returns @node when it is the new leftmost, or NULL. */ static __always_inline struct rb_node * rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; bool leftmost = true; while (*link) { parent = *link; if (less(node, parent)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false; } } rb_link_node(node, parent, link); rb_insert_color_cached(node, tree, leftmost); return leftmost ? node : NULL; } /** * rb_add() - insert @node into @tree * @node: node to insert * @tree: tree to insert @node into * @less: operator defining the (partial) node order */ static __always_inline void rb_add(struct rb_node *node, struct rb_root *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; while (*link) { parent = *link; if (less(node, parent)) link = &parent->rb_left; else link = &parent->rb_right; } rb_link_node(node, parent, link); rb_insert_color(node, tree); } /** * rb_find_add_cached() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. 
*/ static __always_inline struct rb_node * rb_find_add_cached(struct rb_node *node, struct rb_root_cached *tree, int (*cmp)(const struct rb_node *new, const struct rb_node *exist)) { bool leftmost = true; struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) { link = &parent->rb_left; } else if (c > 0) { link = &parent->rb_right; leftmost = false; } else { return parent; } } rb_link_node(node, parent, link); rb_insert_color_cached(node, tree, leftmost); return NULL; } /** * rb_find_add() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add(struct rb_node *node, struct rb_root *tree, int (*cmp)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) link = &parent->rb_left; else if (c > 0) link = &parent->rb_right; else return parent; } rb_link_node(node, parent, link); rb_insert_color(node, tree); return NULL; } /** * rb_find_add_rcu() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Adds a Store-Release for link_node. * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add_rcu(struct rb_node *node, struct rb_root *tree, int (*cmp)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) link = &parent->rb_left; else if (c > 0) link = &parent->rb_right; else return parent; } rb_link_node_rcu(node, parent, link); rb_insert_color(node, tree); return NULL; } /** * rb_find() - find @key in tree @tree * @key: key to match * @tree: tree to search * @cmp: operator defining the node order * * Returns the rb_node matching @key or NULL. */ static __always_inline struct rb_node * rb_find(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; while (node) { int c = cmp(key, node); if (c < 0) node = node->rb_left; else if (c > 0) node = node->rb_right; else return node; } return NULL; } /** * rb_find_rcu() - find @key in tree @tree * @key: key to match * @tree: tree to search * @cmp: operator defining the node order * * Notably, tree descent vs concurrent tree rotations is unsound and can result * in false-negatives. * * Returns the rb_node matching @key or NULL. */ static __always_inline struct rb_node * rb_find_rcu(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; while (node) { int c = cmp(key, node); if (c < 0) node = rcu_dereference_raw(node->rb_left); else if (c > 0) node = rcu_dereference_raw(node->rb_right); else return node; } return NULL; } /** * rb_find_first() - find the first @key in @tree * @key: key to match * @tree: tree to search * @cmp: operator defining node order * * Returns the leftmost node matching @key, or NULL. 
 */
static __always_inline struct rb_node *
rb_find_first(const void *key, const struct rb_root *tree,
	      int (*cmp)(const void *key, const struct rb_node *))
{
	struct rb_node *node = tree->rb_node;
	struct rb_node *match = NULL;

	while (node) {
		int c = cmp(key, node);

		if (c <= 0) {
			if (!c)
				match = node;
			node = node->rb_left;
		} else if (c > 0) {
			node = node->rb_right;
		}
	}

	return match;
}

/**
 * rb_next_match() - find the next @key in @tree
 * @key: key to match
 * @node: node returned by a previous lookup for @key
 * @cmp: operator defining node order
 *
 * Returns the next node matching @key, or NULL.
 */
static __always_inline struct rb_node *
rb_next_match(const void *key, struct rb_node *node,
	      int (*cmp)(const void *key, const struct rb_node *))
{
	node = rb_next(node);
	if (node && cmp(key, node))
		node = NULL;
	return node;
}

/**
 * rb_for_each() - iterates a subtree matching @key
 * @node: iterator
 * @key: key to match
 * @tree: tree to search
 * @cmp: operator defining node order
 */
#define rb_for_each(node, key, tree, cmp) \
	for ((node) = rb_find_first((key), (tree), (cmp)); \
	     (node); (node) = rb_next_match((key), (node), (cmp)))

#endif	/* _LINUX_RBTREE_H */
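/*
 * Editor's hedged sketch (not part of this header): an rb_root keyed by
 * an int, following the less()/cmp() conventions described in the
 * comparator comment above. All demo_* names are illustrative.
 */
#include <linux/rbtree.h>

struct demo_node {
	struct rb_node rb;
	int key;
};

/* less(a, b) for rb_add(): strict ordering on ->key */
static bool demo_less(struct rb_node *a, const struct rb_node *b)
{
	return rb_entry(a, struct demo_node, rb)->key <
	       rb_entry(b, struct demo_node, rb)->key;
}

/* cmp(key, node) for rb_find(): <0, 0, >0 like memcmp */
static int demo_cmp(const void *key, const struct rb_node *n)
{
	int k = *(const int *)key;
	const struct demo_node *d = rb_entry(n, struct demo_node, rb);

	return k < d->key ? -1 : k > d->key ? 1 : 0;
}

/* insert @dn, then look an element back up by @key */
static struct demo_node *demo_insert_find(struct rb_root *root,
					  struct demo_node *dn, int key)
{
	struct rb_node *n;

	rb_add(&dn->rb, root, demo_less);
	n = rb_find(&key, root, demo_cmp);
	return n ? rb_entry(n, struct demo_node, rb) : NULL;
}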
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2014 Fraunhofer ITWM
 *
 * Written by:
 * Phoebe Buckheister <phoebe.buckheister@itwm.fraunhofer.de>
 */

#include <linux/ieee802154.h>

#include <net/mac802154.h>
#include <net/ieee802154_netdev.h>

static int
ieee802154_hdr_push_addr(u8 *buf, const struct ieee802154_addr *addr,
			 bool omit_pan)
{
	int pos = 0;

	if (addr->mode == IEEE802154_ADDR_NONE)
		return 0;

	if (!omit_pan) {
		memcpy(buf + pos, &addr->pan_id, 2);
		pos += 2;
	}

	switch (addr->mode) {
	case IEEE802154_ADDR_SHORT:
		memcpy(buf + pos, &addr->short_addr, 2);
		pos += 2;
		break;

	case IEEE802154_ADDR_LONG:
		memcpy(buf + pos, &addr->extended_addr, IEEE802154_ADDR_LEN);
		pos += IEEE802154_ADDR_LEN;
		break;

	default:
		return -EINVAL;
	}

	return pos;
}

static int
ieee802154_hdr_push_sechdr(u8 *buf, const struct ieee802154_sechdr *hdr)
{
	int pos = 5;

	memcpy(buf, hdr, 1);
	memcpy(buf + 1, &hdr->frame_counter, 4);

	switch (hdr->key_id_mode) {
	case IEEE802154_SCF_KEY_IMPLICIT:
		return pos;

	case IEEE802154_SCF_KEY_INDEX:
		break;

	case IEEE802154_SCF_KEY_SHORT_INDEX:
		memcpy(buf + pos, &hdr->short_src, 4);
		pos += 4;
		break;

	case IEEE802154_SCF_KEY_HW_INDEX:
		memcpy(buf + pos, &hdr->extended_src, IEEE802154_ADDR_LEN);
		pos += IEEE802154_ADDR_LEN;
		break;
	}

	buf[pos++] = hdr->key_id;

	return pos;
}

int ieee802154_hdr_push(struct sk_buff *skb, struct ieee802154_hdr *hdr)
{
	u8 buf[IEEE802154_MAX_HEADER_LEN];
	int pos = 2;
	int rc;
	struct ieee802154_hdr_fc *fc = &hdr->fc;

	buf[pos++] = hdr->seq;

	fc->dest_addr_mode = hdr->dest.mode;

	rc = ieee802154_hdr_push_addr(buf + pos, &hdr->dest, false);
	if (rc < 0)
		return -EINVAL;
	pos += rc;

	fc->source_addr_mode = hdr->source.mode;

	if (hdr->source.pan_id == hdr->dest.pan_id &&
	    hdr->dest.mode != IEEE802154_ADDR_NONE)
		fc->intra_pan = true;

	rc = ieee802154_hdr_push_addr(buf + pos, &hdr->source, fc->intra_pan);
	if (rc < 0)
		return -EINVAL;
	pos += rc;

	if (fc->security_enabled) {
		fc->version = 1;

		rc = ieee802154_hdr_push_sechdr(buf + pos, &hdr->sec);
		if (rc < 0)
			return -EINVAL;

		pos += rc;
	}
memcpy(buf, fc, 2); memcpy(skb_push(skb, pos), buf, pos); return pos; } EXPORT_SYMBOL_GPL(ieee802154_hdr_push); int ieee802154_mac_cmd_push(struct sk_buff *skb, void *f, const void *pl, unsigned int pl_len) { struct ieee802154_mac_cmd_frame *frame = f; struct ieee802154_mac_cmd_pl *mac_pl = &frame->mac_pl; struct ieee802154_hdr *mhr = &frame->mhr; int ret; skb_reserve(skb, sizeof(*mhr)); ret = ieee802154_hdr_push(skb, mhr); if (ret < 0) return ret; skb_reset_mac_header(skb); skb->mac_len = ret; skb_put_data(skb, mac_pl, sizeof(*mac_pl)); skb_put_data(skb, pl, pl_len); return 0; } EXPORT_SYMBOL_GPL(ieee802154_mac_cmd_push); int ieee802154_beacon_push(struct sk_buff *skb, struct ieee802154_beacon_frame *beacon) { struct ieee802154_beacon_hdr *mac_pl = &beacon->mac_pl; struct ieee802154_hdr *mhr = &beacon->mhr; int ret; skb_reserve(skb, sizeof(*mhr)); ret = ieee802154_hdr_push(skb, mhr); if (ret < 0) return ret; skb_reset_mac_header(skb); skb->mac_len = ret; skb_put_data(skb, mac_pl, sizeof(*mac_pl)); if (mac_pl->pend_short_addr_count || mac_pl->pend_ext_addr_count) return -EOPNOTSUPP; return 0; } EXPORT_SYMBOL_GPL(ieee802154_beacon_push); static int ieee802154_hdr_get_addr(const u8 *buf, int mode, bool omit_pan, struct ieee802154_addr *addr) { int pos = 0; addr->mode = mode; if (mode == IEEE802154_ADDR_NONE) return 0; if (!omit_pan) { memcpy(&addr->pan_id, buf + pos, 2); pos += 2; } if (mode == IEEE802154_ADDR_SHORT) { memcpy(&addr->short_addr, buf + pos, 2); return pos + 2; } else { memcpy(&addr->extended_addr, buf + pos, IEEE802154_ADDR_LEN); return pos + IEEE802154_ADDR_LEN; } } static int ieee802154_hdr_addr_len(int mode, bool omit_pan) { int pan_len = omit_pan ? 0 : 2; switch (mode) { case IEEE802154_ADDR_NONE: return 0; case IEEE802154_ADDR_SHORT: return 2 + pan_len; case IEEE802154_ADDR_LONG: return IEEE802154_ADDR_LEN + pan_len; default: return -EINVAL; } } static int ieee802154_hdr_get_sechdr(const u8 *buf, struct ieee802154_sechdr *hdr) { int pos = 5; memcpy(hdr, buf, 1); memcpy(&hdr->frame_counter, buf + 1, 4); switch (hdr->key_id_mode) { case IEEE802154_SCF_KEY_IMPLICIT: return pos; case IEEE802154_SCF_KEY_INDEX: break; case IEEE802154_SCF_KEY_SHORT_INDEX: memcpy(&hdr->short_src, buf + pos, 4); pos += 4; break; case IEEE802154_SCF_KEY_HW_INDEX: memcpy(&hdr->extended_src, buf + pos, IEEE802154_ADDR_LEN); pos += IEEE802154_ADDR_LEN; break; } hdr->key_id = buf[pos++]; return pos; } static int ieee802154_sechdr_lengths[4] = { [IEEE802154_SCF_KEY_IMPLICIT] = 5, [IEEE802154_SCF_KEY_INDEX] = 6, [IEEE802154_SCF_KEY_SHORT_INDEX] = 10, [IEEE802154_SCF_KEY_HW_INDEX] = 14, }; static int ieee802154_hdr_sechdr_len(u8 sc) { return ieee802154_sechdr_lengths[IEEE802154_SCF_KEY_ID_MODE(sc)]; } static int ieee802154_hdr_minlen(const struct ieee802154_hdr *hdr) { int dlen, slen; dlen = ieee802154_hdr_addr_len(hdr->fc.dest_addr_mode, false); slen = ieee802154_hdr_addr_len(hdr->fc.source_addr_mode, hdr->fc.intra_pan); if (slen < 0 || dlen < 0) return -EINVAL; return 3 + dlen + slen + hdr->fc.security_enabled; } static int ieee802154_hdr_get_addrs(const u8 *buf, struct ieee802154_hdr *hdr) { int pos = 0; pos += ieee802154_hdr_get_addr(buf + pos, hdr->fc.dest_addr_mode, false, &hdr->dest); pos += ieee802154_hdr_get_addr(buf + pos, hdr->fc.source_addr_mode, hdr->fc.intra_pan, &hdr->source); if (hdr->fc.intra_pan) hdr->source.pan_id = hdr->dest.pan_id; return pos; } int ieee802154_hdr_pull(struct sk_buff *skb, struct ieee802154_hdr *hdr) { int pos = 3, rc; if (!pskb_may_pull(skb, 3)) return -EINVAL; 
memcpy(hdr, skb->data, 3); rc = ieee802154_hdr_minlen(hdr); if (rc < 0 || !pskb_may_pull(skb, rc)) return -EINVAL; pos += ieee802154_hdr_get_addrs(skb->data + pos, hdr); if (hdr->fc.security_enabled) { int want = pos + ieee802154_hdr_sechdr_len(skb->data[pos]); if (!pskb_may_pull(skb, want)) return -EINVAL; pos += ieee802154_hdr_get_sechdr(skb->data + pos, &hdr->sec); } skb_pull(skb, pos); return pos; } EXPORT_SYMBOL_GPL(ieee802154_hdr_pull); int ieee802154_mac_cmd_pl_pull(struct sk_buff *skb, struct ieee802154_mac_cmd_pl *mac_pl) { if (!pskb_may_pull(skb, sizeof(*mac_pl))) return -EINVAL; memcpy(mac_pl, skb->data, sizeof(*mac_pl)); skb_pull(skb, sizeof(*mac_pl)); return 0; } EXPORT_SYMBOL_GPL(ieee802154_mac_cmd_pl_pull); int ieee802154_hdr_peek_addrs(const struct sk_buff *skb, struct ieee802154_hdr *hdr) { const u8 *buf = skb_mac_header(skb); int pos = 3, rc; if (buf + 3 > skb_tail_pointer(skb)) return -EINVAL; memcpy(hdr, buf, 3); rc = ieee802154_hdr_minlen(hdr); if (rc < 0 || buf + rc > skb_tail_pointer(skb)) return -EINVAL; pos += ieee802154_hdr_get_addrs(buf + pos, hdr); return pos; } EXPORT_SYMBOL_GPL(ieee802154_hdr_peek_addrs); int ieee802154_hdr_peek(const struct sk_buff *skb, struct ieee802154_hdr *hdr) { const u8 *buf = skb_mac_header(skb); int pos; pos = ieee802154_hdr_peek_addrs(skb, hdr); if (pos < 0) return -EINVAL; if (hdr->fc.security_enabled) { u8 key_id_mode = IEEE802154_SCF_KEY_ID_MODE(*(buf + pos)); int want = pos + ieee802154_sechdr_lengths[key_id_mode]; if (buf + want > skb_tail_pointer(skb)) return -EINVAL; pos += ieee802154_hdr_get_sechdr(buf + pos, &hdr->sec); } return pos; } EXPORT_SYMBOL_GPL(ieee802154_hdr_peek); int ieee802154_max_payload(const struct ieee802154_hdr *hdr) { int hlen = ieee802154_hdr_minlen(hdr); if (hdr->fc.security_enabled) { hlen += ieee802154_sechdr_lengths[hdr->sec.key_id_mode] - 1; hlen += ieee802154_sechdr_authtag_len(&hdr->sec); } return IEEE802154_MTU - hlen - IEEE802154_MFR_SIZE; } EXPORT_SYMBOL_GPL(ieee802154_max_payload);
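/*
 * Editor's hedged sketch (not part of this file): filling in a minimal
 * ieee802154_hdr for a data frame and pushing it with
 * ieee802154_hdr_push() above. The address values and the demo_push_hdr()
 * helper are illustrative assumptions.
 */
#include <linux/ieee802154.h>
#include <net/ieee802154_netdev.h>

static int demo_push_hdr(struct sk_buff *skb, __le16 pan, __le16 dst,
			 __le16 src)
{
	struct ieee802154_hdr hdr = {
		.fc.type = IEEE802154_FC_TYPE_DATA,
		.seq = 1,
	};

	hdr.dest.mode = IEEE802154_ADDR_SHORT;
	hdr.dest.pan_id = pan;
	hdr.dest.short_addr = dst;
	hdr.source.mode = IEEE802154_ADDR_SHORT;
	hdr.source.pan_id = pan;	/* same PAN: push sets fc.intra_pan */
	hdr.source.short_addr = src;

	/* returns the number of bytes pushed, or -EINVAL on a bad mode */
	return ieee802154_hdr_push(skb, &hdr);
}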
/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#ifndef _SIW_MEM_H
#define _SIW_MEM_H

struct siw_umem *siw_umem_get(struct ib_device *base_dev, u64 start,
			      u64 len, int rights);
void siw_umem_release(struct siw_umem *umem);
struct siw_pbl *siw_pbl_alloc(u32 num_buf);
dma_addr_t siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx);
struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index);
int siw_mem_add(struct siw_device *sdev, struct siw_mem *m);
int siw_invalidate_stag(struct ib_pd *pd, u32 stag);
int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
		  enum ib_access_flags perms, int len);
int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge,
		  struct siw_mem *mem[], enum ib_access_flags perms,
		  u32 off, int len);
void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op);
int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj,
		   u64 start, u64 len, int rights);
void siw_mr_drop_mem(struct siw_mr *mr);
void siw_free_mem(struct kref *ref);

static inline void siw_mem_put(struct siw_mem *mem)
{
	kref_put(&mem->ref, siw_free_mem);
}

static inline void siw_unref_mem_sgl(struct siw_mem **mem,
				     unsigned int num_sge)
{
	while (num_sge) {
		if (*mem == NULL)
			break;

		siw_mem_put(*mem);
		*mem = NULL;
		mem++;
		num_sge--;
	}
}

#define CHUNK_SHIFT 9 /* sets number of pages per chunk */
#define PAGES_PER_CHUNK (_AC(1, UL) << CHUNK_SHIFT)
#define CHUNK_MASK (~(PAGES_PER_CHUNK - 1))
#define PAGE_CHUNK_SIZE (PAGES_PER_CHUNK * sizeof(struct page *))

/*
 * siw_get_upage()
 *
 * Get page pointer for address on given umem.
 *
 * @umem: two dimensional list of page pointers
 * @addr: user virtual address
 */
static inline struct page *siw_get_upage(struct siw_umem *umem, u64 addr)
{
	unsigned int page_idx = (addr - umem->fp_addr) >> PAGE_SHIFT,
		     chunk_idx = page_idx >> CHUNK_SHIFT,
		     page_in_chunk = page_idx & ~CHUNK_MASK;

	if (likely(page_idx < umem->num_pages))
		return umem->page_chunk[chunk_idx].plist[page_in_chunk];

	return NULL;
}
#endif
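/*
 * Editor's worked example (not part of this header) of the chunk
 * arithmetic in siw_get_upage(): with CHUNK_SHIFT = 9 each chunk holds
 * 512 page pointers, so for the 1000th page of a umem:
 *
 *	page_idx      = 1000
 *	chunk_idx     = 1000 >> 9  = 1	(second chunk)
 *	page_in_chunk = 1000 & 511 = 488
 *
 * i.e. the pointer lives at umem->page_chunk[1].plist[488].
 */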
/* SPDX-License-Identifier: GPL-2.0-or-later */

#include <net/inet_common.h>

enum linux_mptcp_mib_field {
	MPTCP_MIB_NUM = 0,
	MPTCP_MIB_MPCAPABLEPASSIVE,	/* Received SYN with MP_CAPABLE */
	MPTCP_MIB_MPCAPABLEACTIVE,	/* Sent SYN with MP_CAPABLE */
	MPTCP_MIB_MPCAPABLEACTIVEACK,	/* Received SYN/ACK with MP_CAPABLE */
	MPTCP_MIB_MPCAPABLEPASSIVEACK,	/* Received third ACK with MP_CAPABLE */
	MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,/* Server-side fallback during 3-way handshake */
	MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */
	MPTCP_MIB_MPCAPABLEACTIVEDROP,	/* Client-side fallback due to a MPC drop */
	MPTCP_MIB_MPCAPABLEACTIVEDISABLED, /* Client-side disabled due to past issues */
	MPTCP_MIB_MPCAPABLEENDPATTEMPT,	/* Prohibited MPC to port-based endp */
	MPTCP_MIB_TOKENFALLBACKINIT,	/* Could not init/allocate token */
	MPTCP_MIB_RETRANSSEGS,		/* Segments retransmitted at the MPTCP-level */
	MPTCP_MIB_JOINNOTOKEN,		/* Received MP_JOIN but the token was not found */
	MPTCP_MIB_JOINSYNRX,		/* Received a SYN + MP_JOIN */
	MPTCP_MIB_JOINSYNBACKUPRX,	/* Received a SYN + MP_JOIN + backup flag */
	MPTCP_MIB_JOINSYNACKRX,		/* Received a SYN/ACK + MP_JOIN */
	MPTCP_MIB_JOINSYNACKBACKUPRX,	/* Received a SYN/ACK + MP_JOIN + backup flag */
	MPTCP_MIB_JOINSYNACKMAC,	/* HMAC was wrong on SYN/ACK + MP_JOIN */
	MPTCP_MIB_JOINACKRX,		/* Received an ACK + MP_JOIN */
	MPTCP_MIB_JOINACKMAC,		/* HMAC was wrong on ACK + MP_JOIN */
	MPTCP_MIB_JOINREJECTED,		/* The PM rejected the JOIN request */
	MPTCP_MIB_JOINSYNTX,		/* Sending a SYN + MP_JOIN */
	MPTCP_MIB_JOINSYNTXCREATSKERR,	/* Not able to create a socket when sending a SYN + MP_JOIN */
	MPTCP_MIB_JOINSYNTXBINDERR,	/* Not able to bind() the address when sending a SYN + MP_JOIN */
	MPTCP_MIB_JOINSYNTXCONNECTERR,	/* Not able to connect() when sending a SYN + MP_JOIN */
	MPTCP_MIB_DSSNOMATCH,		/* Received a new mapping that did not match the previous one */
	MPTCP_MIB_DSSCORRUPTIONFALLBACK,/* DSS corruption detected, fallback */
	MPTCP_MIB_DSSCORRUPTIONRESET,	/* DSS corruption detected, MPJ subflow reset */
	MPTCP_MIB_INFINITEMAPTX,	/* Sent an infinite mapping */
	MPTCP_MIB_INFINITEMAPRX,	/* Received an infinite mapping */
	MPTCP_MIB_DSSTCPMISMATCH,	/* DSS-mapping did not map with TCP's sequence numbers */
	MPTCP_MIB_DATACSUMERR,		/* The data checksum failed */
	MPTCP_MIB_OFOQUEUETAIL,		/* Segments inserted into OoO queue tail */
	MPTCP_MIB_OFOQUEUE,		/* Segments inserted into OoO queue */
	MPTCP_MIB_OFOMERGE,		/* Segments merged in OoO queue */
	MPTCP_MIB_NODSSWINDOW,		/* Segments not in MPTCP windows */
	MPTCP_MIB_DUPDATA,		/* Segments discarded due to duplicate DSS */
	MPTCP_MIB_ADDADDR,		/* Received ADD_ADDR with echo-flag=0 */
	MPTCP_MIB_ADDADDRTX,		/* Sent ADD_ADDR with echo-flag=0 */
	MPTCP_MIB_ADDADDRTXDROP,	/* ADD_ADDR with echo-flag=0 not sent due to
					 * resource exhaustion
					 */
	MPTCP_MIB_ECHOADD,		/* Received ADD_ADDR with echo-flag=1 */
	MPTCP_MIB_ECHOADDTX,		/* Sent ADD_ADDR with echo-flag=1 */
	MPTCP_MIB_ECHOADDTXDROP,	/* ADD_ADDR with echo-flag=1 not sent due
					 * to resource exhaustion
					 */
	MPTCP_MIB_PORTADD,		/* Received ADD_ADDR with a port-number */
	MPTCP_MIB_ADDADDRDROP,		/* Dropped incoming ADD_ADDR */
MPTCP_MIB_JOINPORTSYNRX, /* Received a SYN MP_JOIN with a different port-number */ MPTCP_MIB_JOINPORTSYNACKRX, /* Received a SYNACK MP_JOIN with a different port-number */ MPTCP_MIB_JOINPORTACKRX, /* Received an ACK MP_JOIN with a different port-number */ MPTCP_MIB_MISMATCHPORTSYNRX, /* Received a SYN MP_JOIN with a mismatched port-number */ MPTCP_MIB_MISMATCHPORTACKRX, /* Received an ACK MP_JOIN with a mismatched port-number */ MPTCP_MIB_RMADDR, /* Received RM_ADDR */ MPTCP_MIB_RMADDRDROP, /* Dropped incoming RM_ADDR */ MPTCP_MIB_RMADDRTX, /* Sent RM_ADDR */ MPTCP_MIB_RMADDRTXDROP, /* RM_ADDR not sent due to resource exhaustion */ MPTCP_MIB_RMSUBFLOW, /* Remove a subflow */ MPTCP_MIB_MPPRIOTX, /* Transmit a MP_PRIO */ MPTCP_MIB_MPPRIORX, /* Received a MP_PRIO */ MPTCP_MIB_MPFAILTX, /* Transmit a MP_FAIL */ MPTCP_MIB_MPFAILRX, /* Received a MP_FAIL */ MPTCP_MIB_MPFASTCLOSETX, /* Transmit a MP_FASTCLOSE */ MPTCP_MIB_MPFASTCLOSERX, /* Received a MP_FASTCLOSE */ MPTCP_MIB_MPRSTTX, /* Transmit a MP_RST */ MPTCP_MIB_MPRSTRX, /* Received a MP_RST */ MPTCP_MIB_RCVPRUNED, /* Incoming packet dropped due to memory limit */ MPTCP_MIB_SUBFLOWSTALE, /* Subflows entered 'stale' status */ MPTCP_MIB_SUBFLOWRECOVER, /* Subflows returned to active status after being stale */ MPTCP_MIB_SNDWNDSHARED, /* Subflow snd wnd is overridden by msk's one */ MPTCP_MIB_RCVWNDSHARED, /* Subflow rcv wnd is overridden by msk's one */ MPTCP_MIB_RCVWNDCONFLICTUPDATE, /* subflow rcv wnd is overridden by msk's one due to * conflict with another subflow while updating msk rcv wnd */ MPTCP_MIB_RCVWNDCONFLICT, /* Conflict while updating msk rcv wnd */ MPTCP_MIB_CURRESTAB, /* Current established MPTCP connections */ MPTCP_MIB_BLACKHOLE, /* A blackhole has been detected */ __MPTCP_MIB_MAX }; #define LINUX_MIB_MPTCP_MAX __MPTCP_MIB_MAX struct mptcp_mib { unsigned long mibs[LINUX_MIB_MPTCP_MAX]; }; static inline void MPTCP_ADD_STATS(struct net *net, enum linux_mptcp_mib_field field, int val) { if (likely(net->mib.mptcp_statistics)) SNMP_ADD_STATS(net->mib.mptcp_statistics, field, val); } static inline void MPTCP_INC_STATS(struct net *net, enum linux_mptcp_mib_field field) { if (likely(net->mib.mptcp_statistics)) SNMP_INC_STATS(net->mib.mptcp_statistics, field); } static inline void __MPTCP_INC_STATS(struct net *net, enum linux_mptcp_mib_field field) { if (likely(net->mib.mptcp_statistics)) __SNMP_INC_STATS(net->mib.mptcp_statistics, field); } static inline void MPTCP_DEC_STATS(struct net *net, enum linux_mptcp_mib_field field) { if (likely(net->mib.mptcp_statistics)) SNMP_DEC_STATS(net->mib.mptcp_statistics, field); } bool mptcp_mib_alloc(struct net *net);
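All of the wrappers above test net->mib.mptcp_statistics before touching it because the per-netns MIB block is allocated lazily by mptcp_mib_alloc() and may not exist yet. A hedged sketch of what a call site looks like (the function name and the choice of event are invented for illustration; sock_net() comes from <net/sock.h>):

/* hypothetical call site: bump a counter for the socket's netns */
static void example_count_mp_capable_syn(struct sock *sk)
{
	/* safe even if mptcp_mib_alloc() has not run for this netns yet */
	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVE);
}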
// SPDX-License-Identifier: GPL-2.0-or-later /* * Handle firewalling * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> * Bart De Schuymer <bdschuym@pandora.be> * * Lennert dedicates this file to Kerstin Wurdinger. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/in_route.h> #include <linux/inetdevice.h> #include <net/route.h> #include "br_private.h" #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { } static void fake_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { } static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old) { return NULL; } static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { return NULL; } static unsigned int fake_mtu(const struct dst_entry *dst) { return dst->dev->mtu; } static struct dst_ops fake_dst_ops = { .family = AF_INET, .update_pmtu = fake_update_pmtu, .redirect = fake_redirect, .cow_metrics = fake_cow_metrics, .neigh_lookup = fake_neigh_lookup, .mtu = fake_mtu, }; /* * Initialize bogus route table used to keep netfilter happy. * Currently, we fill in the PMTU entry because netfilter * refragmentation needs it, and the rt_flags entry because * ipt_REJECT needs it. Future netfilter modules might * require us to fill additional fields. */ static const u32 br_dst_default_metrics[RTAX_MAX] = { [RTAX_MTU - 1] = 1500, }; void br_netfilter_rtable_init(struct net_bridge *br) { struct rtable *rt = &br->fake_rtable; rcuref_init(&rt->dst.__rcuref, 1); rt->dst.dev = br->dev; dst_init_metrics(&rt->dst, br_dst_default_metrics, true); rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; rt->dst.ops = &fake_dst_ops; } int __init br_nf_core_init(void) { return dst_entries_init(&fake_dst_ops); } void br_nf_core_fini(void) { dst_entries_destroy(&fake_dst_ops); }
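Because dst_init_metrics(..., true) installs br_dst_default_metrics as a read-only metrics array, any raw metric query on the bridge's fake rtable resolves to the defaults above (an MTU of 1500), while dst_mtu() goes through the .mtu stub instead. A hypothetical helper, shown only to illustrate that effect; it is not part of the bridge code:

/* illustrative only: reads br_dst_default_metrics[RTAX_MTU - 1], i.e. 1500 */
static unsigned int example_fake_rtable_metric_mtu(const struct net_bridge *br)
{
	return dst_metric_raw(&br->fake_rtable.dst, RTAX_MTU);
}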
// SPDX-License-Identifier: GPL-2.0 /* * consolidates trace point definitions * * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com> */ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/string.h> #include <linux/if_arp.h> #include <linux/inetdevice.h> #include <linux/inet.h> #include <linux/interrupt.h> #include <linux/export.h> #include <linux/netpoll.h> #include <linux/sched.h> #include <linux/delay.h> #include <linux/rcupdate.h> #include <linux/types.h> #include <linux/workqueue.h> #include <linux/netlink.h> #include <linux/net_dropmon.h> #include <linux/slab.h> #include <linux/unaligned.h> #include <asm/bitops.h> #define CREATE_TRACE_POINTS #include <trace/events/skb.h> #include <trace/events/net.h> #include <trace/events/napi.h> #include <trace/events/sock.h> #include <trace/events/udp.h> #include <trace/events/tcp.h> #include <trace/events/fib.h> #include <trace/events/qdisc.h> #if IS_ENABLED(CONFIG_BRIDGE) #include <trace/events/bridge.h> EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add); EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_external_learn_add); EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete); EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update); EXPORT_TRACEPOINT_SYMBOL_GPL(br_mdb_full); #endif #if IS_ENABLED(CONFIG_PAGE_POOL) #include <trace/events/page_pool.h> #endif #include <trace/events/neigh.h> EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update); EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update_done); EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_timer_handler); EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_event_send_done); EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_event_send_dead); EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_cleanup_and_release); EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset); EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_bad_csum); EXPORT_TRACEPOINT_SYMBOL_GPL(udp_fail_queue_rcv_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(sk_data_ready);
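Exporting these tracepoint symbols is what lets GPL modules attach probes at runtime. A minimal sketch of a module hooking the exported napi_poll tracepoint follows; the module and probe names are invented, and the probe prototype must match trace_napi_poll()'s arguments (struct napi_struct *, int work, int budget) for the running kernel:

#include <linux/module.h>
#include <trace/events/napi.h>

/* first argument is the opaque data pointer passed at registration */
static void example_napi_poll_probe(void *ignore, struct napi_struct *napi,
				    int work, int budget)
{
	pr_debug("napi on %s: work=%d budget=%d\n",
		 napi->dev ? napi->dev->name : "(none)", work, budget);
}

static int __init example_init(void)
{
	return register_trace_napi_poll(example_napi_poll_probe, NULL);
}

static void __exit example_exit(void)
{
	unregister_trace_napi_poll(example_napi_poll_probe, NULL);
	tracepoint_synchronize_unregister(); /* wait out in-flight probes */
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL"); /* required for the _GPL tracepoint exports */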
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2005,2006,2007,2008 IBM Corporation * * Authors: * Serge Hallyn <serue@us.ibm.com> * Reiner Sailer <sailer@watson.ibm.com> * Mimi Zohar <zohar@us.ibm.com> * * File: ima_queue.c * Implements queues that store template measurements and * maintains aggregate over the stored measurements * in the pre-configured TPM PCR (if available). * The measurement list is append-only. No entry is * ever removed or changed during the boot-cycle. */ #include <linux/rculist.h> #include <linux/reboot.h> #include <linux/slab.h> #include "ima.h" #define AUDIT_CAUSE_LEN_MAX 32 /* pre-allocated array of tpm_digest structures to extend a PCR */ static struct tpm_digest *digests; LIST_HEAD(ima_measurements); /* list of all measurements */ #ifdef CONFIG_IMA_KEXEC static unsigned long binary_runtime_size; #else static unsigned long binary_runtime_size = ULONG_MAX; #endif /* key: inode (before secure-hashing a file) */ struct ima_h_table ima_htable = { .len = ATOMIC_LONG_INIT(0), .violations = ATOMIC_LONG_INIT(0), .queue[0 ... IMA_MEASURE_HTABLE_SIZE - 1] = HLIST_HEAD_INIT }; /* mutex protects atomicity of extending measurement list * and extending the TPM PCR aggregate. Since tpm_extend can take * long (and the tpm driver uses a mutex), we can't use the spinlock. */ static DEFINE_MUTEX(ima_extend_list_mutex); /* * Used internally by the kernel to suspend measurements. * Protected by ima_extend_list_mutex. */ static bool ima_measurements_suspended; /* look up the digest value in the hash table, and return the entry */ static struct ima_queue_entry *ima_lookup_digest_entry(u8 *digest_value, int pcr) { struct ima_queue_entry *qe, *ret = NULL; unsigned int key; int rc; key = ima_hash_key(digest_value); rcu_read_lock(); hlist_for_each_entry_rcu(qe, &ima_htable.queue[key], hnext) { rc = memcmp(qe->entry->digests[ima_hash_algo_idx].digest, digest_value, hash_digest_size[ima_hash_algo]); if ((rc == 0) && (qe->entry->pcr == pcr)) { ret = qe; break; } } rcu_read_unlock(); return ret; } /* * Calculate the memory required for serializing a single * binary_runtime_measurement list entry, which contains a * couple of variable length fields (e.g. template name and data).
*/ static int get_binary_runtime_size(struct ima_template_entry *entry) { int size = 0; size += sizeof(u32); /* pcr */ size += TPM_DIGEST_SIZE; size += sizeof(int); /* template name size field */ size += strlen(entry->template_desc->name); size += sizeof(entry->template_data_len); size += entry->template_data_len; return size; } /* ima_add_template_entry helper function: * - Add template entry to the measurement list and hash table, for * all entries except those carried across kexec. * * (Called with ima_extend_list_mutex held.) */ static int ima_add_digest_entry(struct ima_template_entry *entry, bool update_htable) { struct ima_queue_entry *qe; unsigned int key; qe = kmalloc(sizeof(*qe), GFP_KERNEL); if (qe == NULL) { pr_err("OUT OF MEMORY ERROR creating queue entry\n"); return -ENOMEM; } qe->entry = entry; INIT_LIST_HEAD(&qe->later); list_add_tail_rcu(&qe->later, &ima_measurements); atomic_long_inc(&ima_htable.len); if (update_htable) { key = ima_hash_key(entry->digests[ima_hash_algo_idx].digest); hlist_add_head_rcu(&qe->hnext, &ima_htable.queue[key]); } if (binary_runtime_size != ULONG_MAX) { int size; size = get_binary_runtime_size(entry); binary_runtime_size = (binary_runtime_size < ULONG_MAX - size) ? binary_runtime_size + size : ULONG_MAX; } return 0; } /* * Return the amount of memory required for serializing the * entire binary_runtime_measurement list, including the ima_kexec_hdr * structure. */ unsigned long ima_get_binary_runtime_size(void) { if (binary_runtime_size >= (ULONG_MAX - sizeof(struct ima_kexec_hdr))) return ULONG_MAX; else return binary_runtime_size + sizeof(struct ima_kexec_hdr); } static int ima_pcr_extend(struct tpm_digest *digests_arg, int pcr) { int result = 0; if (!ima_tpm_chip) return result; result = tpm_pcr_extend(ima_tpm_chip, pcr, digests_arg); if (result != 0) pr_err("Error Communicating to TPM chip, result: %d\n", result); return result; } /* * Add template entry to the measurement list and hash table, and * extend the pcr. * * On systems which support carrying the IMA measurement list across * kexec, maintain the total memory size required for serializing the * binary_runtime_measurements. */ int ima_add_template_entry(struct ima_template_entry *entry, int violation, const char *op, struct inode *inode, const unsigned char *filename) { u8 *digest = entry->digests[ima_hash_algo_idx].digest; struct tpm_digest *digests_arg = entry->digests; const char *audit_cause = "hash_added"; char tpm_audit_cause[AUDIT_CAUSE_LEN_MAX]; int audit_info = 1; int result = 0, tpmresult = 0; mutex_lock(&ima_extend_list_mutex); /* * Avoid appending to the measurement log when the TPM subsystem has * been shut down while preparing for system reboot. 
*/ if (ima_measurements_suspended) { audit_cause = "measurements_suspended"; audit_info = 0; result = -ENODEV; goto out; } if (!violation && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE)) { if (ima_lookup_digest_entry(digest, entry->pcr)) { audit_cause = "hash_exists"; result = -EEXIST; goto out; } } result = ima_add_digest_entry(entry, !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE)); if (result < 0) { audit_cause = "ENOMEM"; audit_info = 0; goto out; } if (violation) /* invalidate pcr */ digests_arg = digests; tpmresult = ima_pcr_extend(digests_arg, entry->pcr); if (tpmresult != 0) { snprintf(tpm_audit_cause, AUDIT_CAUSE_LEN_MAX, "TPM_error(%d)", tpmresult); audit_cause = tpm_audit_cause; audit_info = 0; } out: mutex_unlock(&ima_extend_list_mutex); integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename, op, audit_cause, result, audit_info); return result; } int ima_restore_measurement_entry(struct ima_template_entry *entry) { int result = 0; mutex_lock(&ima_extend_list_mutex); result = ima_add_digest_entry(entry, 0); mutex_unlock(&ima_extend_list_mutex); return result; } static void ima_measurements_suspend(void) { mutex_lock(&ima_extend_list_mutex); ima_measurements_suspended = true; mutex_unlock(&ima_extend_list_mutex); } static int ima_reboot_notifier(struct notifier_block *nb, unsigned long action, void *data) { ima_measurements_suspend(); return NOTIFY_DONE; } static struct notifier_block ima_reboot_nb = { .notifier_call = ima_reboot_notifier, }; void __init ima_init_reboot_notifier(void) { register_reboot_notifier(&ima_reboot_nb); } int __init ima_init_digests(void) { u16 digest_size; u16 crypto_id; int i; if (!ima_tpm_chip) return 0; digests = kcalloc(ima_tpm_chip->nr_allocated_banks, sizeof(*digests), GFP_NOFS); if (!digests) return -ENOMEM; for (i = 0; i < ima_tpm_chip->nr_allocated_banks; i++) { digests[i].alg_id = ima_tpm_chip->allocated_banks[i].alg_id; digest_size = ima_tpm_chip->allocated_banks[i].digest_size; crypto_id = ima_tpm_chip->allocated_banks[i].crypto_id; /* for unmapped TPM algorithms digest is still a padded SHA1 */ if (crypto_id == HASH_ALGO__LAST) digest_size = SHA1_DIGEST_SIZE; memset(digests[i].digest, 0xff, digest_size); } return 0; }
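One detail worth calling out in ima_add_digest_entry() above: binary_runtime_size grows with saturating arithmetic, so an overflow degrades to "unknown/too large" (ULONG_MAX) rather than wrapping around. A standalone userspace sketch of the same pattern, with illustrative values:

#include <limits.h>
#include <stdio.h>

/* add size to total, sticking at ULONG_MAX instead of wrapping */
static unsigned long sat_add(unsigned long total, unsigned long size)
{
	return (total < ULONG_MAX - size) ? total + size : ULONG_MAX;
}

int main(void)
{
	unsigned long total = 0;

	total = sat_add(total, 128);            /* normal growth */
	total = sat_add(total, ULONG_MAX - 64); /* would overflow: saturates */
	printf("%lu\n", total);                 /* prints ULONG_MAX */
	return 0;
}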
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2019 Mellanox Technologies. All rights reserved.
*/ #include <rdma/ib_verbs.h> #include <rdma/rdma_counter.h> #include "core_priv.h" #include "restrack.h" #define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE | RDMA_COUNTER_MASK_PID) static int __counter_set_mode(struct rdma_port_counter *port_counter, enum rdma_nl_counter_mode new_mode, enum rdma_nl_counter_mask new_mask, bool bind_opcnt) { if (new_mode == RDMA_COUNTER_MODE_AUTO) { if (new_mask & (~ALL_AUTO_MODE_MASKS)) return -EINVAL; if (port_counter->num_counters) return -EBUSY; } port_counter->mode.mode = new_mode; port_counter->mode.mask = new_mask; port_counter->mode.bind_opcnt = bind_opcnt; return 0; } /* * rdma_counter_set_auto_mode() - Turn on/off per-port auto mode * * @dev: Device to operate * @port: Port to use * @mask: Mask to configure * @extack: Message to the user * * Return 0 on success. If counter mode wasn't changed then it is considered * as success as well. * Return -EBUSY when changing to auto mode while there are bounded counters. * */ int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port, enum rdma_nl_counter_mask mask, bool bind_opcnt, struct netlink_ext_ack *extack) { struct rdma_port_counter *port_counter; enum rdma_nl_counter_mode mode; int ret; port_counter = &dev->port_data[port].port_counter; if (!port_counter->hstats) return -EOPNOTSUPP; mutex_lock(&port_counter->lock); if (mask) mode = RDMA_COUNTER_MODE_AUTO; else mode = (port_counter->num_counters) ? RDMA_COUNTER_MODE_MANUAL : RDMA_COUNTER_MODE_NONE; if (port_counter->mode.mode == mode && port_counter->mode.mask == mask && port_counter->mode.bind_opcnt == bind_opcnt) { ret = 0; goto out; } ret = __counter_set_mode(port_counter, mode, mask, bind_opcnt); out: mutex_unlock(&port_counter->lock); if (ret == -EBUSY) NL_SET_ERR_MSG( extack, "Modifying auto mode is not allowed when there is a bound QP"); return ret; } static void auto_mode_init_counter(struct rdma_counter *counter, const struct ib_qp *qp, enum rdma_nl_counter_mask new_mask) { struct auto_mode_param *param = &counter->mode.param; counter->mode.mode = RDMA_COUNTER_MODE_AUTO; counter->mode.mask = new_mask; if (new_mask & RDMA_COUNTER_MASK_QP_TYPE) param->qp_type = qp->qp_type; } static int __rdma_counter_bind_qp(struct rdma_counter *counter, struct ib_qp *qp, u32 port) { int ret; if (qp->counter) return -EINVAL; if (!qp->device->ops.counter_bind_qp) return -EOPNOTSUPP; mutex_lock(&counter->lock); ret = qp->device->ops.counter_bind_qp(counter, qp, port); mutex_unlock(&counter->lock); return ret; } int rdma_counter_modify(struct ib_device *dev, u32 port, unsigned int index, bool enable) { struct rdma_hw_stats *stats; int ret = 0; if (!dev->ops.modify_hw_stat) return -EOPNOTSUPP; stats = ib_get_hw_stats_port(dev, port); if (!stats || index >= stats->num_counters || !(stats->descs[index].flags & IB_STAT_FLAG_OPTIONAL)) return -EINVAL; mutex_lock(&stats->lock); if (enable != test_bit(index, stats->is_disabled)) goto out; ret = dev->ops.modify_hw_stat(dev, port, index, enable); if (ret) goto out; if (enable) clear_bit(index, stats->is_disabled); else set_bit(index, stats->is_disabled); out: mutex_unlock(&stats->lock); return ret; } static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port, struct ib_qp *qp, enum rdma_nl_counter_mode mode, bool bind_opcnt) { struct rdma_port_counter *port_counter; struct rdma_counter *counter; int ret; if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats) return NULL; counter = rdma_zalloc_drv_obj(dev, rdma_counter); if (!counter) return NULL; counter->device = dev; counter->port = 
port; dev->ops.counter_init(counter); rdma_restrack_new(&counter->res, RDMA_RESTRACK_COUNTER); counter->stats = dev->ops.counter_alloc_stats(counter); if (!counter->stats) goto err_stats; port_counter = &dev->port_data[port].port_counter; mutex_lock(&port_counter->lock); switch (mode) { case RDMA_COUNTER_MODE_MANUAL: ret = __counter_set_mode(port_counter, RDMA_COUNTER_MODE_MANUAL, 0, bind_opcnt); if (ret) { mutex_unlock(&port_counter->lock); goto err_mode; } break; case RDMA_COUNTER_MODE_AUTO: auto_mode_init_counter(counter, qp, port_counter->mode.mask); break; default: ret = -EOPNOTSUPP; mutex_unlock(&port_counter->lock); goto err_mode; } port_counter->num_counters++; mutex_unlock(&port_counter->lock); counter->mode.mode = mode; counter->mode.bind_opcnt = bind_opcnt; kref_init(&counter->kref); mutex_init(&counter->lock); ret = __rdma_counter_bind_qp(counter, qp, port); if (ret) goto err_mode; rdma_restrack_parent_name(&counter->res, &qp->res); rdma_restrack_add(&counter->res); return counter; err_mode: rdma_free_hw_stats_struct(counter->stats); err_stats: rdma_restrack_put(&counter->res); kfree(counter); return NULL; } static void rdma_counter_free(struct rdma_counter *counter) { struct rdma_port_counter *port_counter; port_counter = &counter->device->port_data[counter->port].port_counter; mutex_lock(&port_counter->lock); port_counter->num_counters--; if (!port_counter->num_counters && (port_counter->mode.mode == RDMA_COUNTER_MODE_MANUAL)) __counter_set_mode(port_counter, RDMA_COUNTER_MODE_NONE, 0, false); mutex_unlock(&port_counter->lock); rdma_restrack_del(&counter->res); rdma_free_hw_stats_struct(counter->stats); kfree(counter); } static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter, enum rdma_nl_counter_mask auto_mask) { struct auto_mode_param *param = &counter->mode.param; bool match = true; if (auto_mask & RDMA_COUNTER_MASK_QP_TYPE) match &= (param->qp_type == qp->qp_type); if (auto_mask & RDMA_COUNTER_MASK_PID) match &= (task_pid_nr(counter->res.task) == task_pid_nr(qp->res.task)); return match; } static int __rdma_counter_unbind_qp(struct ib_qp *qp, u32 port) { struct rdma_counter *counter = qp->counter; int ret; if (!qp->device->ops.counter_unbind_qp) return -EOPNOTSUPP; mutex_lock(&counter->lock); ret = qp->device->ops.counter_unbind_qp(qp, port); mutex_unlock(&counter->lock); return ret; } static void counter_history_stat_update(struct rdma_counter *counter) { struct ib_device *dev = counter->device; struct rdma_port_counter *port_counter; int i; port_counter = &dev->port_data[counter->port].port_counter; if (!port_counter->hstats) return; rdma_counter_query_stats(counter); for (i = 0; i < counter->stats->num_counters; i++) port_counter->hstats->value[i] += counter->stats->value[i]; } /* * rdma_get_counter_auto_mode - Find the counter that @qp should be bound * with in auto mode * * Return: The counter (with ref-count increased) if found */ static struct rdma_counter *rdma_get_counter_auto_mode(struct ib_qp *qp, u32 port) { struct rdma_port_counter *port_counter; struct rdma_counter *counter = NULL; struct ib_device *dev = qp->device; struct rdma_restrack_entry *res; struct rdma_restrack_root *rt; unsigned long id = 0; port_counter = &dev->port_data[port].port_counter; rt = &dev->res[RDMA_RESTRACK_COUNTER]; xa_lock(&rt->xa); xa_for_each(&rt->xa, id, res) { counter = container_of(res, struct rdma_counter, res); if ((counter->device != qp->device) || (counter->port != port)) goto next; if (auto_mode_match(qp, counter, port_counter->mode.mask)) break; next: 
counter = NULL; } if (counter && !kref_get_unless_zero(&counter->kref)) counter = NULL; xa_unlock(&rt->xa); return counter; } static void counter_release(struct kref *kref) { struct rdma_counter *counter; counter = container_of(kref, struct rdma_counter, kref); counter_history_stat_update(counter); counter->device->ops.counter_dealloc(counter); rdma_counter_free(counter); } /* * rdma_counter_bind_qp_auto - Check and bind the QP to a counter base on * the auto-mode rule */ int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port) { struct rdma_port_counter *port_counter; struct ib_device *dev = qp->device; struct rdma_counter *counter; int ret; if (!rdma_restrack_is_tracked(&qp->res) || rdma_is_kernel_res(&qp->res)) return 0; if (!rdma_is_port_valid(dev, port)) return -EINVAL; port_counter = &dev->port_data[port].port_counter; if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO) return 0; counter = rdma_get_counter_auto_mode(qp, port); if (counter) { ret = __rdma_counter_bind_qp(counter, qp, port); if (ret) { kref_put(&counter->kref, counter_release); return ret; } } else { counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_AUTO, port_counter->mode.bind_opcnt); if (!counter) return -ENOMEM; } return 0; } /* * rdma_counter_unbind_qp - Unbind a qp from a counter * @force: * true - Decrease the counter ref-count anyway (e.g., qp destroy) */ int rdma_counter_unbind_qp(struct ib_qp *qp, u32 port, bool force) { struct rdma_counter *counter = qp->counter; int ret; if (!counter) return -EINVAL; ret = __rdma_counter_unbind_qp(qp, port); if (ret && !force) return ret; kref_put(&counter->kref, counter_release); return 0; } int rdma_counter_query_stats(struct rdma_counter *counter) { struct ib_device *dev = counter->device; int ret; if (!dev->ops.counter_update_stats) return -EINVAL; mutex_lock(&counter->lock); ret = dev->ops.counter_update_stats(counter); mutex_unlock(&counter->lock); return ret; } static u64 get_running_counters_hwstat_sum(struct ib_device *dev, u32 port, u32 index) { struct rdma_restrack_entry *res; struct rdma_restrack_root *rt; struct rdma_counter *counter; unsigned long id = 0; u64 sum = 0; rt = &dev->res[RDMA_RESTRACK_COUNTER]; xa_lock(&rt->xa); xa_for_each(&rt->xa, id, res) { if (!rdma_restrack_get(res)) continue; xa_unlock(&rt->xa); counter = container_of(res, struct rdma_counter, res); if ((counter->device != dev) || (counter->port != port) || rdma_counter_query_stats(counter)) goto next; sum += counter->stats->value[index]; next: xa_lock(&rt->xa); rdma_restrack_put(res); } xa_unlock(&rt->xa); return sum; } /* * rdma_counter_get_hwstat_value() - Get the sum value of all counters on a * specific port, including the running ones and history data */ u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u32 port, u32 index) { struct rdma_port_counter *port_counter; u64 sum; port_counter = &dev->port_data[port].port_counter; if (!port_counter->hstats) return 0; sum = get_running_counters_hwstat_sum(dev, port, index); sum += port_counter->hstats->value[index]; return sum; } static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num) { struct rdma_restrack_entry *res = NULL; struct ib_qp *qp = NULL; res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_QP, qp_num); if (IS_ERR(res)) return NULL; qp = container_of(res, struct ib_qp, res); if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) goto err; return qp; err: rdma_restrack_put(res); return NULL; } static struct rdma_counter *rdma_get_counter_by_id(struct ib_device *dev, u32 counter_id) { struct 
rdma_restrack_entry *res; struct rdma_counter *counter; res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_COUNTER, counter_id); if (IS_ERR(res)) return NULL; counter = container_of(res, struct rdma_counter, res); kref_get(&counter->kref); rdma_restrack_put(res); return counter; } /* * rdma_counter_bind_qpn() - Bind QP @qp_num to counter @counter_id */ int rdma_counter_bind_qpn(struct ib_device *dev, u32 port, u32 qp_num, u32 counter_id) { struct rdma_port_counter *port_counter; struct rdma_counter *counter; struct ib_qp *qp; int ret; port_counter = &dev->port_data[port].port_counter; if (port_counter->mode.mode == RDMA_COUNTER_MODE_AUTO) return -EINVAL; qp = rdma_counter_get_qp(dev, qp_num); if (!qp) return -ENOENT; counter = rdma_get_counter_by_id(dev, counter_id); if (!counter) { ret = -ENOENT; goto err; } if (rdma_is_kernel_res(&counter->res) != rdma_is_kernel_res(&qp->res)) { ret = -EINVAL; goto err_task; } if ((counter->device != qp->device) || (counter->port != qp->port)) { ret = -EINVAL; goto err_task; } ret = __rdma_counter_bind_qp(counter, qp, port); if (ret) goto err_task; rdma_restrack_put(&qp->res); return 0; err_task: kref_put(&counter->kref, counter_release); err: rdma_restrack_put(&qp->res); return ret; } /* * rdma_counter_bind_qpn_alloc() - Alloc a counter and bind QP @qp_num to it * The id of new counter is returned in @counter_id */ int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u32 port, u32 qp_num, u32 *counter_id) { struct rdma_port_counter *port_counter; struct rdma_counter *counter; struct ib_qp *qp; int ret; if (!rdma_is_port_valid(dev, port)) return -EINVAL; port_counter = &dev->port_data[port].port_counter; if (!port_counter->hstats) return -EOPNOTSUPP; if (port_counter->mode.mode == RDMA_COUNTER_MODE_AUTO) return -EINVAL; qp = rdma_counter_get_qp(dev, qp_num); if (!qp) return -ENOENT; if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) { ret = -EINVAL; goto err; } counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_MANUAL, true); if (!counter) { ret = -ENOMEM; goto err; } if (counter_id) *counter_id = counter->id; rdma_restrack_put(&qp->res); return 0; err: rdma_restrack_put(&qp->res); return ret; } /* * rdma_counter_unbind_qpn() - Unbind QP @qp_num from a counter */ int rdma_counter_unbind_qpn(struct ib_device *dev, u32 port, u32 qp_num, u32 counter_id) { struct rdma_port_counter *port_counter; struct ib_qp *qp; int ret; if (!rdma_is_port_valid(dev, port)) return -EINVAL; qp = rdma_counter_get_qp(dev, qp_num); if (!qp) return -ENOENT; if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) { ret = -EINVAL; goto out; } port_counter = &dev->port_data[port].port_counter; if (!qp->counter || qp->counter->id != counter_id || port_counter->mode.mode != RDMA_COUNTER_MODE_MANUAL) { ret = -EINVAL; goto out; } ret = rdma_counter_unbind_qp(qp, port, false); out: rdma_restrack_put(&qp->res); return ret; } int rdma_counter_get_mode(struct ib_device *dev, u32 port, enum rdma_nl_counter_mode *mode, enum rdma_nl_counter_mask *mask, bool *opcnt) { struct rdma_port_counter *port_counter; port_counter = &dev->port_data[port].port_counter; *mode = port_counter->mode.mode; *mask = port_counter->mode.mask; *opcnt = port_counter->mode.bind_opcnt; return 0; } void rdma_counter_init(struct ib_device *dev) { struct rdma_port_counter *port_counter; u32 port, i; if (!dev->port_data) return; rdma_for_each_port(dev, port) { port_counter = &dev->port_data[port].port_counter; port_counter->mode.mode = RDMA_COUNTER_MODE_NONE; mutex_init(&port_counter->lock); if 
(!dev->ops.alloc_hw_port_stats) continue; port_counter->hstats = dev->ops.alloc_hw_port_stats(dev, port); if (!port_counter->hstats) goto fail; } return; fail: for (i = port; i >= rdma_start_port(dev); i--) { port_counter = &dev->port_data[port].port_counter; rdma_free_hw_stats_struct(port_counter->hstats); port_counter->hstats = NULL; mutex_destroy(&port_counter->lock); } } void rdma_counter_release(struct ib_device *dev) { struct rdma_port_counter *port_counter; u32 port; rdma_for_each_port(dev, port) { port_counter = &dev->port_data[port].port_counter; rdma_free_hw_stats_struct(port_counter->hstats); mutex_destroy(&port_counter->lock); } }
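In auto mode, QPs are grouped onto shared counters by the criteria selected in the configured mask; auto_mode_match() above requires every selected criterion to agree. A reduced userspace sketch of that rule, with invented types and mask bits standing in for RDMA_COUNTER_MASK_QP_TYPE and RDMA_COUNTER_MASK_PID:

#include <stdbool.h>

#define MASK_QP_TYPE 0x1
#define MASK_PID     0x2

struct probe { int qp_type; int pid; }; /* illustrative stand-in */

static bool example_match(const struct probe *qp,
			  const struct probe *counter, unsigned int mask)
{
	bool match = true;

	/* every criterion selected in the mask must agree */
	if (mask & MASK_QP_TYPE)
		match &= (qp->qp_type == counter->qp_type);
	if (mask & MASK_PID)
		match &= (qp->pid == counter->pid);
	return match;
}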
/* SPDX-License-Identifier: GPL-2.0-only */ /* * AppArmor security module * * This file contains AppArmor lib definitions * * 2017 Canonical Ltd. */ #ifndef __AA_LIB_H #define __AA_LIB_H #include <linux/slab.h> #include <linux/fs.h> #include <linux/lsm_hooks.h> #include "match.h" extern struct aa_dfa *stacksplitdfa; /* * DEBUG remains global (no per profile flag) since it is mostly used in sysctl * which is not related to profile accesses. */ #define DEBUG_ON (aa_g_debug) /* * split individual debug cases out in preparation for finer grained * debug controls in the future. */ #define AA_DEBUG_LABEL DEBUG_ON #define dbg_printk(__fmt, __args...) pr_debug(__fmt, ##__args) #define AA_DEBUG(fmt, args...) \ do { \ if (DEBUG_ON) \ pr_debug_ratelimited("AppArmor: " fmt, ##args); \ } while (0) #define AA_WARN(X) WARN((X), "APPARMOR WARN %s: %s\n", __func__, #X) #define AA_BUG(X, args...) \ do { \ _Pragma("GCC diagnostic ignored \"-Wformat-zero-length\""); \ AA_BUG_FMT((X), "" args); \ _Pragma("GCC diagnostic warning \"-Wformat-zero-length\""); \ } while (0) #ifdef CONFIG_SECURITY_APPARMOR_DEBUG_ASSERTS #define AA_BUG_FMT(X, fmt, args...) \ WARN((X), "AppArmor WARN %s: (" #X "): " fmt, __func__, ##args) #else #define AA_BUG_FMT(X, fmt, args...) no_printk(fmt, ##args) #endif #define AA_ERROR(fmt, args...)
\ pr_err_ratelimited("AppArmor: " fmt, ##args) /* Flag indicating whether initialization completed */ extern int apparmor_initialized; /* fn's in lib */ const char *skipn_spaces(const char *str, size_t n); const char *aa_splitn_fqname(const char *fqname, size_t n, const char **ns_name, size_t *ns_len); void aa_info_message(const char *str); /* Security blob offsets */ extern struct lsm_blob_sizes apparmor_blob_sizes; /** * aa_strneq - compare null terminated @str to a non null terminated substring * @str: a null terminated string * @sub: a substring, not necessarily null terminated * @len: length of @sub to compare * * The @str string must be fully consumed for this to be considered a match */ static inline bool aa_strneq(const char *str, const char *sub, int len) { return !strncmp(str, sub, len) && !str[len]; } /** * aa_dfa_null_transition - step to next state after null character * @dfa: the dfa to match against * @start: the state of the dfa to start matching in * * aa_dfa_null_transition transitions to the next state after a null * character which is not used in standard matching and is only * used to separate pairs. */ static inline aa_state_t aa_dfa_null_transition(struct aa_dfa *dfa, aa_state_t start) { /* the null transition only needs the string's null terminator byte */ return aa_dfa_next(dfa, start, 0); } static inline bool path_mediated_fs(struct dentry *dentry) { return !(dentry->d_sb->s_flags & SB_NOUSER); } struct aa_str_table { int size; char **table; }; void aa_free_str_table(struct aa_str_table *table); struct counted_str { struct kref count; char name[]; }; #define str_to_counted(str) \ ((struct counted_str *)(str - offsetof(struct counted_str, name))) #define __counted /* atm just a notation */ void aa_str_kref(struct kref *kref); char *aa_str_alloc(int size, gfp_t gfp); static inline __counted char *aa_get_str(__counted char *str) { if (str) kref_get(&(str_to_counted(str)->count)); return str; } static inline void aa_put_str(__counted char *str) { if (str) kref_put(&str_to_counted(str)->count, aa_str_kref); } /* struct aa_policy - common part of both namespaces and profiles * @name: name of the object * @hname - The hierarchical name * @list: list policy object is on * @profiles: head of the profiles list contained in the object */ struct aa_policy { const char *name; __counted char *hname; struct list_head list; struct list_head profiles; }; /** * basename - find the last component of an hname * @name: hname to find the base profile name component of (NOT NULL) * * Returns: the tail (base profile name) name component of an hname */ static inline const char *basename(const char *hname) { char *split; hname = strim((char *)hname); for (split = strstr(hname, "//"); split; split = strstr(hname, "//")) hname = split + 2; return hname; } /** * __policy_find - find a policy by @name on a policy list * @head: list to search (NOT NULL) * @name: name to search for (NOT NULL) * * Requires: rcu_read_lock be held * * Returns: unrefcounted policy that matches @name or NULL if not found */ static inline struct aa_policy *__policy_find(struct list_head *head, const char *name) { struct aa_policy *policy; list_for_each_entry_rcu(policy, head, list) { if (!strcmp(policy->name, name)) return policy; } return NULL; } /** * __policy_strn_find - find a policy whose name matches @len chars of @str * @head: list to search (NOT NULL) * @str: string to search for (NOT NULL) * @len: length of match required * * Requires: rcu_read_lock be held * * Returns: unrefcounted policy that matches @str or
NULL if not found * * if @len == strlen(@str) then this is equiv to __policy_find * otherwise it allows searching for policy by a partial match of name */ static inline struct aa_policy *__policy_strn_find(struct list_head *head, const char *str, int len) { struct aa_policy *policy; list_for_each_entry_rcu(policy, head, list) { if (aa_strneq(policy->name, str, len)) return policy; } return NULL; } bool aa_policy_init(struct aa_policy *policy, const char *prefix, const char *name, gfp_t gfp); void aa_policy_destroy(struct aa_policy *policy); /* * fn_label_build - abstract out the build of a label transition * @L: label the transition is being computed for * @P: profile parameter derived from L by this macro, can be passed to FN * @GFP: memory allocation type to use * @FN: fn to call for each profile transition. @P is set to the profile * * Returns: new label on success * ERR_PTR if build @FN fails * NULL if label_build fails due to low memory conditions * * @FN must return a label or ERR_PTR on failure. NULL is not allowed */ #define fn_label_build(L, P, GFP, FN) \ ({ \ __label__ __do_cleanup, __done; \ struct aa_label *__new_; \ \ if ((L)->size > 1) { \ /* TODO: add cache of transitions already done */ \ struct label_it __i; \ int __j, __k, __count; \ DEFINE_VEC(label, __lvec); \ DEFINE_VEC(profile, __pvec); \ if (vec_setup(label, __lvec, (L)->size, (GFP))) { \ __new_ = NULL; \ goto __done; \ } \ __j = 0; \ label_for_each(__i, (L), (P)) { \ __new_ = (FN); \ AA_BUG(!__new_); \ if (IS_ERR(__new_)) \ goto __do_cleanup; \ __lvec[__j++] = __new_; \ } \ for (__j = __count = 0; __j < (L)->size; __j++) \ __count += __lvec[__j]->size; \ if (!vec_setup(profile, __pvec, __count, (GFP))) { \ for (__j = __k = 0; __j < (L)->size; __j++) { \ label_for_each(__i, __lvec[__j], (P)) \ __pvec[__k++] = aa_get_profile(P); \ } \ __count -= aa_vec_unique(__pvec, __count, 0); \ if (__count > 1) { \ __new_ = aa_vec_find_or_create_label(__pvec,\ __count, (GFP)); \ /* only fails if out of Mem */ \ if (!__new_) \ __new_ = NULL; \ } else \ __new_ = aa_get_label(&__pvec[0]->label); \ vec_cleanup(profile, __pvec, __count); \ } else \ __new_ = NULL; \ __do_cleanup: \ vec_cleanup(label, __lvec, (L)->size); \ } else { \ (P) = labels_profile(L); \ __new_ = (FN); \ } \ __done: \ if (!__new_) \ AA_DEBUG("label build failed\n"); \ (__new_); \ }) #define __fn_build_in_ns(NS, P, NS_FN, OTHER_FN) \ ({ \ struct aa_label *__new; \ if ((P)->ns != (NS)) \ __new = (OTHER_FN); \ else \ __new = (NS_FN); \ (__new); \ }) #define fn_label_build_in_ns(L, P, GFP, NS_FN, OTHER_FN) \ ({ \ fn_label_build((L), (P), (GFP), \ __fn_build_in_ns(labels_ns(L), (P), (NS_FN), (OTHER_FN))); \ }) #endif /* __AA_LIB_H */
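The aa_strneq() contract documented above (the null-terminated @str must be fully consumed by the match) is easy to get wrong with a bare strncmp(). A small userspace sketch with invented test strings:

#include <stdbool.h>
#include <string.h>
#include <assert.h>

/* same semantics as aa_strneq(): prefix match AND @str ends at @len */
static bool strneq(const char *str, const char *sub, int len)
{
	return !strncmp(str, sub, len) && !str[len];
}

int main(void)
{
	assert(strneq("profile", "profile//hat", 7));   /* str fully consumed */
	assert(!strneq("profiles", "profile//hat", 7)); /* trailing 's' fails */
	return 0;
}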
// SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/common.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include <linux/uaccess.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/string_helpers.h> #include "common.h" /* String table for operation mode.
*/ const char * const tomoyo_mode[TOMOYO_CONFIG_MAX_MODE] = { [TOMOYO_CONFIG_DISABLED] = "disabled", [TOMOYO_CONFIG_LEARNING] = "learning", [TOMOYO_CONFIG_PERMISSIVE] = "permissive", [TOMOYO_CONFIG_ENFORCING] = "enforcing" }; /* String table for /sys/kernel/security/tomoyo/profile */ const char * const tomoyo_mac_keywords[TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX] = { /* CONFIG::file group */ [TOMOYO_MAC_FILE_EXECUTE] = "execute", [TOMOYO_MAC_FILE_OPEN] = "open", [TOMOYO_MAC_FILE_CREATE] = "create", [TOMOYO_MAC_FILE_UNLINK] = "unlink", [TOMOYO_MAC_FILE_GETATTR] = "getattr", [TOMOYO_MAC_FILE_MKDIR] = "mkdir", [TOMOYO_MAC_FILE_RMDIR] = "rmdir", [TOMOYO_MAC_FILE_MKFIFO] = "mkfifo", [TOMOYO_MAC_FILE_MKSOCK] = "mksock", [TOMOYO_MAC_FILE_TRUNCATE] = "truncate", [TOMOYO_MAC_FILE_SYMLINK] = "symlink", [TOMOYO_MAC_FILE_MKBLOCK] = "mkblock", [TOMOYO_MAC_FILE_MKCHAR] = "mkchar", [TOMOYO_MAC_FILE_LINK] = "link", [TOMOYO_MAC_FILE_RENAME] = "rename", [TOMOYO_MAC_FILE_CHMOD] = "chmod", [TOMOYO_MAC_FILE_CHOWN] = "chown", [TOMOYO_MAC_FILE_CHGRP] = "chgrp", [TOMOYO_MAC_FILE_IOCTL] = "ioctl", [TOMOYO_MAC_FILE_CHROOT] = "chroot", [TOMOYO_MAC_FILE_MOUNT] = "mount", [TOMOYO_MAC_FILE_UMOUNT] = "unmount", [TOMOYO_MAC_FILE_PIVOT_ROOT] = "pivot_root", /* CONFIG::network group */ [TOMOYO_MAC_NETWORK_INET_STREAM_BIND] = "inet_stream_bind", [TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN] = "inet_stream_listen", [TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT] = "inet_stream_connect", [TOMOYO_MAC_NETWORK_INET_DGRAM_BIND] = "inet_dgram_bind", [TOMOYO_MAC_NETWORK_INET_DGRAM_SEND] = "inet_dgram_send", [TOMOYO_MAC_NETWORK_INET_RAW_BIND] = "inet_raw_bind", [TOMOYO_MAC_NETWORK_INET_RAW_SEND] = "inet_raw_send", [TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND] = "unix_stream_bind", [TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN] = "unix_stream_listen", [TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT] = "unix_stream_connect", [TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND] = "unix_dgram_bind", [TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND] = "unix_dgram_send", [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND] = "unix_seqpacket_bind", [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN] = "unix_seqpacket_listen", [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT] = "unix_seqpacket_connect", /* CONFIG::misc group */ [TOMOYO_MAC_ENVIRON] = "env", /* CONFIG group */ [TOMOYO_MAX_MAC_INDEX + TOMOYO_MAC_CATEGORY_FILE] = "file", [TOMOYO_MAX_MAC_INDEX + TOMOYO_MAC_CATEGORY_NETWORK] = "network", [TOMOYO_MAX_MAC_INDEX + TOMOYO_MAC_CATEGORY_MISC] = "misc", }; /* String table for conditions. 
*/ const char * const tomoyo_condition_keyword[TOMOYO_MAX_CONDITION_KEYWORD] = { [TOMOYO_TASK_UID] = "task.uid", [TOMOYO_TASK_EUID] = "task.euid", [TOMOYO_TASK_SUID] = "task.suid", [TOMOYO_TASK_FSUID] = "task.fsuid", [TOMOYO_TASK_GID] = "task.gid", [TOMOYO_TASK_EGID] = "task.egid", [TOMOYO_TASK_SGID] = "task.sgid", [TOMOYO_TASK_FSGID] = "task.fsgid", [TOMOYO_TASK_PID] = "task.pid", [TOMOYO_TASK_PPID] = "task.ppid", [TOMOYO_EXEC_ARGC] = "exec.argc", [TOMOYO_EXEC_ENVC] = "exec.envc", [TOMOYO_TYPE_IS_SOCKET] = "socket", [TOMOYO_TYPE_IS_SYMLINK] = "symlink", [TOMOYO_TYPE_IS_FILE] = "file", [TOMOYO_TYPE_IS_BLOCK_DEV] = "block", [TOMOYO_TYPE_IS_DIRECTORY] = "directory", [TOMOYO_TYPE_IS_CHAR_DEV] = "char", [TOMOYO_TYPE_IS_FIFO] = "fifo", [TOMOYO_MODE_SETUID] = "setuid", [TOMOYO_MODE_SETGID] = "setgid", [TOMOYO_MODE_STICKY] = "sticky", [TOMOYO_MODE_OWNER_READ] = "owner_read", [TOMOYO_MODE_OWNER_WRITE] = "owner_write", [TOMOYO_MODE_OWNER_EXECUTE] = "owner_execute", [TOMOYO_MODE_GROUP_READ] = "group_read", [TOMOYO_MODE_GROUP_WRITE] = "group_write", [TOMOYO_MODE_GROUP_EXECUTE] = "group_execute", [TOMOYO_MODE_OTHERS_READ] = "others_read", [TOMOYO_MODE_OTHERS_WRITE] = "others_write", [TOMOYO_MODE_OTHERS_EXECUTE] = "others_execute", [TOMOYO_EXEC_REALPATH] = "exec.realpath", [TOMOYO_SYMLINK_TARGET] = "symlink.target", [TOMOYO_PATH1_UID] = "path1.uid", [TOMOYO_PATH1_GID] = "path1.gid", [TOMOYO_PATH1_INO] = "path1.ino", [TOMOYO_PATH1_MAJOR] = "path1.major", [TOMOYO_PATH1_MINOR] = "path1.minor", [TOMOYO_PATH1_PERM] = "path1.perm", [TOMOYO_PATH1_TYPE] = "path1.type", [TOMOYO_PATH1_DEV_MAJOR] = "path1.dev_major", [TOMOYO_PATH1_DEV_MINOR] = "path1.dev_minor", [TOMOYO_PATH2_UID] = "path2.uid", [TOMOYO_PATH2_GID] = "path2.gid", [TOMOYO_PATH2_INO] = "path2.ino", [TOMOYO_PATH2_MAJOR] = "path2.major", [TOMOYO_PATH2_MINOR] = "path2.minor", [TOMOYO_PATH2_PERM] = "path2.perm", [TOMOYO_PATH2_TYPE] = "path2.type", [TOMOYO_PATH2_DEV_MAJOR] = "path2.dev_major", [TOMOYO_PATH2_DEV_MINOR] = "path2.dev_minor", [TOMOYO_PATH1_PARENT_UID] = "path1.parent.uid", [TOMOYO_PATH1_PARENT_GID] = "path1.parent.gid", [TOMOYO_PATH1_PARENT_INO] = "path1.parent.ino", [TOMOYO_PATH1_PARENT_PERM] = "path1.parent.perm", [TOMOYO_PATH2_PARENT_UID] = "path2.parent.uid", [TOMOYO_PATH2_PARENT_GID] = "path2.parent.gid", [TOMOYO_PATH2_PARENT_INO] = "path2.parent.ino", [TOMOYO_PATH2_PARENT_PERM] = "path2.parent.perm", }; /* String table for PREFERENCE keyword. */ static const char * const tomoyo_pref_keywords[TOMOYO_MAX_PREF] = { [TOMOYO_PREF_MAX_AUDIT_LOG] = "max_audit_log", [TOMOYO_PREF_MAX_LEARNING_ENTRY] = "max_learning_entry", }; /* String table for path operation. */ const char * const tomoyo_path_keyword[TOMOYO_MAX_PATH_OPERATION] = { [TOMOYO_TYPE_EXECUTE] = "execute", [TOMOYO_TYPE_READ] = "read", [TOMOYO_TYPE_WRITE] = "write", [TOMOYO_TYPE_APPEND] = "append", [TOMOYO_TYPE_UNLINK] = "unlink", [TOMOYO_TYPE_GETATTR] = "getattr", [TOMOYO_TYPE_RMDIR] = "rmdir", [TOMOYO_TYPE_TRUNCATE] = "truncate", [TOMOYO_TYPE_SYMLINK] = "symlink", [TOMOYO_TYPE_CHROOT] = "chroot", [TOMOYO_TYPE_UMOUNT] = "unmount", }; /* String table for socket's operation. */ const char * const tomoyo_socket_keyword[TOMOYO_MAX_NETWORK_OPERATION] = { [TOMOYO_NETWORK_BIND] = "bind", [TOMOYO_NETWORK_LISTEN] = "listen", [TOMOYO_NETWORK_CONNECT] = "connect", [TOMOYO_NETWORK_SEND] = "send", }; /* String table for categories. 
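 * Category names prefix the per-functionality CONFIG lines handled by
 * tomoyo_set_mode(), e.g. "CONFIG::file::open" or
 * "CONFIG::network::inet_stream_connect" (examples illustrative).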
*/ static const char * const tomoyo_category_keywords [TOMOYO_MAX_MAC_CATEGORY_INDEX] = { [TOMOYO_MAC_CATEGORY_FILE] = "file", [TOMOYO_MAC_CATEGORY_NETWORK] = "network", [TOMOYO_MAC_CATEGORY_MISC] = "misc", }; /* Permit policy management by non-root user? */ static bool tomoyo_manage_by_non_root; /* Utility functions. */ /** * tomoyo_addprintf - strncat()-like-snprintf(). * * @buffer: Buffer to write to. Must be '\0'-terminated. * @len: Size of @buffer. * @fmt: The printf()'s format string, followed by parameters. * * Returns nothing. */ __printf(3, 4) static void tomoyo_addprintf(char *buffer, int len, const char *fmt, ...) { va_list args; const int pos = strlen(buffer); va_start(args, fmt); vsnprintf(buffer + pos, len - pos - 1, fmt, args); va_end(args); } /** * tomoyo_flush - Flush queued string to userspace's buffer. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns true if all data was flushed, false otherwise. */ static bool tomoyo_flush(struct tomoyo_io_buffer *head) { while (head->r.w_pos) { const char *w = head->r.w[0]; size_t len = strlen(w); if (len) { if (len > head->read_user_buf_avail) len = head->read_user_buf_avail; if (!len) return false; if (copy_to_user(head->read_user_buf, w, len)) return false; head->read_user_buf_avail -= len; head->read_user_buf += len; w += len; } head->r.w[0] = w; if (*w) return false; /* Add '\0' for audit logs and query. */ if (head->poll) { if (!head->read_user_buf_avail || copy_to_user(head->read_user_buf, "", 1)) return false; head->read_user_buf_avail--; head->read_user_buf++; } head->r.w_pos--; for (len = 0; len < head->r.w_pos; len++) head->r.w[len] = head->r.w[len + 1]; } head->r.avail = 0; return true; } /** * tomoyo_set_string - Queue string to "struct tomoyo_io_buffer" structure. * * @head: Pointer to "struct tomoyo_io_buffer". * @string: String to print. * * Note that @string has to be kept valid until @head is kfree()d. * This means that char[] allocated on stack memory cannot be passed to * this function. Use tomoyo_io_printf() for char[] allocated on stack memory. */ static void tomoyo_set_string(struct tomoyo_io_buffer *head, const char *string) { if (head->r.w_pos < TOMOYO_MAX_IO_READ_QUEUE) { head->r.w[head->r.w_pos++] = string; tomoyo_flush(head); } else WARN_ON(1); } static void tomoyo_io_printf(struct tomoyo_io_buffer *head, const char *fmt, ...) __printf(2, 3); /** * tomoyo_io_printf - printf() to "struct tomoyo_io_buffer" structure. * * @head: Pointer to "struct tomoyo_io_buffer". * @fmt: The printf()'s format string, followed by parameters. */ static void tomoyo_io_printf(struct tomoyo_io_buffer *head, const char *fmt, ...) { va_list args; size_t len; size_t pos = head->r.avail; int size = head->readbuf_size - pos; if (size <= 0) return; va_start(args, fmt); len = vsnprintf(head->read_buf + pos, size, fmt, args) + 1; va_end(args); if (pos + len >= head->readbuf_size) { WARN_ON(1); return; } head->r.avail += len; tomoyo_set_string(head, head->read_buf + pos); } /** * tomoyo_set_space - Put a space to "struct tomoyo_io_buffer" structure. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns nothing. */ static void tomoyo_set_space(struct tomoyo_io_buffer *head) { tomoyo_set_string(head, " "); } /** * tomoyo_set_lf - Put a line feed to "struct tomoyo_io_buffer" structure. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns nothing. 
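 * (More precisely, the function below returns true when everything queued,
 * including this line feed, has been flushed to the reader's buffer.)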
 */
static bool tomoyo_set_lf(struct tomoyo_io_buffer *head)
{
	tomoyo_set_string(head, "\n");
	return !head->r.w_pos;
}

/**
 * tomoyo_set_slash - Put a slash to "struct tomoyo_io_buffer" structure.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns nothing.
 */
static void tomoyo_set_slash(struct tomoyo_io_buffer *head)
{
	tomoyo_set_string(head, "/");
}

/* List of namespaces. */
LIST_HEAD(tomoyo_namespace_list);
/* True if namespace other than tomoyo_kernel_namespace is defined. */
static bool tomoyo_namespace_enabled;

/**
 * tomoyo_init_policy_namespace - Initialize namespace.
 *
 * @ns: Pointer to "struct tomoyo_policy_namespace".
 *
 * Returns nothing.
 */
void tomoyo_init_policy_namespace(struct tomoyo_policy_namespace *ns)
{
	unsigned int idx;

	for (idx = 0; idx < TOMOYO_MAX_ACL_GROUPS; idx++)
		INIT_LIST_HEAD(&ns->acl_group[idx]);
	for (idx = 0; idx < TOMOYO_MAX_GROUP; idx++)
		INIT_LIST_HEAD(&ns->group_list[idx]);
	for (idx = 0; idx < TOMOYO_MAX_POLICY; idx++)
		INIT_LIST_HEAD(&ns->policy_list[idx]);
	ns->profile_version = 20150505;
	tomoyo_namespace_enabled = !list_empty(&tomoyo_namespace_list);
	list_add_tail_rcu(&ns->namespace_list, &tomoyo_namespace_list);
}

/**
 * tomoyo_print_namespace - Print namespace header.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns nothing.
 */
static void tomoyo_print_namespace(struct tomoyo_io_buffer *head)
{
	if (!tomoyo_namespace_enabled)
		return;
	tomoyo_set_string(head,
			  container_of(head->r.ns,
				       struct tomoyo_policy_namespace,
				       namespace_list)->name);
	tomoyo_set_space(head);
}

/**
 * tomoyo_print_name_union - Print a tomoyo_name_union.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @ptr: Pointer to "struct tomoyo_name_union".
 */
static void tomoyo_print_name_union(struct tomoyo_io_buffer *head,
				    const struct tomoyo_name_union *ptr)
{
	tomoyo_set_space(head);
	if (ptr->group) {
		tomoyo_set_string(head, "@");
		tomoyo_set_string(head, ptr->group->group_name->name);
	} else {
		tomoyo_set_string(head, ptr->filename->name);
	}
}

/**
 * tomoyo_print_name_union_quoted - Print a tomoyo_name_union with a quote.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @ptr: Pointer to "struct tomoyo_name_union".
 *
 * Returns nothing.
 */
static void tomoyo_print_name_union_quoted(struct tomoyo_io_buffer *head,
					   const struct tomoyo_name_union *ptr)
{
	if (ptr->group) {
		tomoyo_set_string(head, "@");
		tomoyo_set_string(head, ptr->group->group_name->name);
	} else {
		tomoyo_set_string(head, "\"");
		tomoyo_set_string(head, ptr->filename->name);
		tomoyo_set_string(head, "\"");
	}
}

/**
 * tomoyo_print_number_union_nospace - Print a tomoyo_number_union without a space.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @ptr: Pointer to "struct tomoyo_number_union".
 *
 * Returns nothing.
*/ static void tomoyo_print_number_union_nospace (struct tomoyo_io_buffer *head, const struct tomoyo_number_union *ptr) { if (ptr->group) { tomoyo_set_string(head, "@"); tomoyo_set_string(head, ptr->group->group_name->name); } else { int i; unsigned long min = ptr->values[0]; const unsigned long max = ptr->values[1]; u8 min_type = ptr->value_type[0]; const u8 max_type = ptr->value_type[1]; char buffer[128]; buffer[0] = '\0'; for (i = 0; i < 2; i++) { switch (min_type) { case TOMOYO_VALUE_TYPE_HEXADECIMAL: tomoyo_addprintf(buffer, sizeof(buffer), "0x%lX", min); break; case TOMOYO_VALUE_TYPE_OCTAL: tomoyo_addprintf(buffer, sizeof(buffer), "0%lo", min); break; default: tomoyo_addprintf(buffer, sizeof(buffer), "%lu", min); break; } if (min == max && min_type == max_type) break; tomoyo_addprintf(buffer, sizeof(buffer), "-"); min_type = max_type; min = max; } tomoyo_io_printf(head, "%s", buffer); } } /** * tomoyo_print_number_union - Print a tomoyo_number_union. * * @head: Pointer to "struct tomoyo_io_buffer". * @ptr: Pointer to "struct tomoyo_number_union". * * Returns nothing. */ static void tomoyo_print_number_union(struct tomoyo_io_buffer *head, const struct tomoyo_number_union *ptr) { tomoyo_set_space(head); tomoyo_print_number_union_nospace(head, ptr); } /** * tomoyo_assign_profile - Create a new profile. * * @ns: Pointer to "struct tomoyo_policy_namespace". * @profile: Profile number to create. * * Returns pointer to "struct tomoyo_profile" on success, NULL otherwise. */ static struct tomoyo_profile *tomoyo_assign_profile (struct tomoyo_policy_namespace *ns, const unsigned int profile) { struct tomoyo_profile *ptr; struct tomoyo_profile *entry; if (profile >= TOMOYO_MAX_PROFILES) return NULL; ptr = ns->profile_ptr[profile]; if (ptr) return ptr; entry = kzalloc(sizeof(*entry), GFP_NOFS | __GFP_NOWARN); if (mutex_lock_interruptible(&tomoyo_policy_lock)) goto out; ptr = ns->profile_ptr[profile]; if (!ptr && tomoyo_memory_ok(entry)) { ptr = entry; ptr->default_config = TOMOYO_CONFIG_DISABLED | TOMOYO_CONFIG_WANT_GRANT_LOG | TOMOYO_CONFIG_WANT_REJECT_LOG; memset(ptr->config, TOMOYO_CONFIG_USE_DEFAULT, sizeof(ptr->config)); ptr->pref[TOMOYO_PREF_MAX_AUDIT_LOG] = CONFIG_SECURITY_TOMOYO_MAX_AUDIT_LOG; ptr->pref[TOMOYO_PREF_MAX_LEARNING_ENTRY] = CONFIG_SECURITY_TOMOYO_MAX_ACCEPT_ENTRY; mb(); /* Avoid out-of-order execution. */ ns->profile_ptr[profile] = ptr; entry = NULL; } mutex_unlock(&tomoyo_policy_lock); out: kfree(entry); return ptr; } /** * tomoyo_profile - Find a profile. * * @ns: Pointer to "struct tomoyo_policy_namespace". * @profile: Profile number to find. * * Returns pointer to "struct tomoyo_profile". */ struct tomoyo_profile *tomoyo_profile(const struct tomoyo_policy_namespace *ns, const u8 profile) { static struct tomoyo_profile tomoyo_null_profile; struct tomoyo_profile *ptr = ns->profile_ptr[profile]; if (!ptr) ptr = &tomoyo_null_profile; return ptr; } /** * tomoyo_find_yesno - Find values for specified keyword. * * @string: String to check. * @find: Name of keyword. * * Returns 1 if "@find=yes" was found, 0 if "@find=no" was found, -1 otherwise. */ static s8 tomoyo_find_yesno(const char *string, const char *find) { const char *cp = strstr(string, find); if (cp) { cp += strlen(find); if (!strncmp(cp, "=yes", 4)) return 1; else if (!strncmp(cp, "=no", 3)) return 0; } return -1; } /** * tomoyo_set_uint - Set value for specified preference. * * @i: Pointer to "unsigned int". * @string: String to check. * @find: Name of keyword. * * Returns nothing. 
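 *
 * Minimal usage sketch (string and keyword illustrative):
 *
 *	unsigned int v = 0;
 *	tomoyo_set_uint(&v, "{ max_audit_log=1024 }", "max_audit_log");
 *	-> v is now 1024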
*/ static void tomoyo_set_uint(unsigned int *i, const char *string, const char *find) { const char *cp = strstr(string, find); if (cp) sscanf(cp + strlen(find), "=%u", i); } /** * tomoyo_set_mode - Set mode for specified profile. * * @name: Name of functionality. * @value: Mode for @name. * @profile: Pointer to "struct tomoyo_profile". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_set_mode(char *name, const char *value, struct tomoyo_profile *profile) { u8 i; u8 config; if (!strcmp(name, "CONFIG")) { i = TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX; config = profile->default_config; } else if (tomoyo_str_starts(&name, "CONFIG::")) { config = 0; for (i = 0; i < TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX; i++) { int len = 0; if (i < TOMOYO_MAX_MAC_INDEX) { const u8 c = tomoyo_index2category[i]; const char *category = tomoyo_category_keywords[c]; len = strlen(category); if (strncmp(name, category, len) || name[len++] != ':' || name[len++] != ':') continue; } if (strcmp(name + len, tomoyo_mac_keywords[i])) continue; config = profile->config[i]; break; } if (i == TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX) return -EINVAL; } else { return -EINVAL; } if (strstr(value, "use_default")) { config = TOMOYO_CONFIG_USE_DEFAULT; } else { u8 mode; for (mode = 0; mode < 4; mode++) if (strstr(value, tomoyo_mode[mode])) /* * Update lower 3 bits in order to distinguish * 'config' from 'TOMOYO_CONFIG_USE_DEFAULT'. */ config = (config & ~7) | mode; if (config != TOMOYO_CONFIG_USE_DEFAULT) { switch (tomoyo_find_yesno(value, "grant_log")) { case 1: config |= TOMOYO_CONFIG_WANT_GRANT_LOG; break; case 0: config &= ~TOMOYO_CONFIG_WANT_GRANT_LOG; break; } switch (tomoyo_find_yesno(value, "reject_log")) { case 1: config |= TOMOYO_CONFIG_WANT_REJECT_LOG; break; case 0: config &= ~TOMOYO_CONFIG_WANT_REJECT_LOG; break; } } } if (i < TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX) profile->config[i] = config; else if (config != TOMOYO_CONFIG_USE_DEFAULT) profile->default_config = config; return 0; } /** * tomoyo_write_profile - Write profile table. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_write_profile(struct tomoyo_io_buffer *head) { char *data = head->write_buf; unsigned int i; char *cp; struct tomoyo_profile *profile; if (sscanf(data, "PROFILE_VERSION=%u", &head->w.ns->profile_version) == 1) return 0; i = simple_strtoul(data, &cp, 10); if (*cp != '-') return -EINVAL; data = cp + 1; profile = tomoyo_assign_profile(head->w.ns, i); if (!profile) return -EINVAL; cp = strchr(data, '='); if (!cp) return -EINVAL; *cp++ = '\0'; if (!strcmp(data, "COMMENT")) { static DEFINE_SPINLOCK(lock); const struct tomoyo_path_info *new_comment = tomoyo_get_name(cp); const struct tomoyo_path_info *old_comment; if (!new_comment) return -ENOMEM; spin_lock(&lock); old_comment = profile->comment; profile->comment = new_comment; spin_unlock(&lock); tomoyo_put_name(old_comment); return 0; } if (!strcmp(data, "PREFERENCE")) { for (i = 0; i < TOMOYO_MAX_PREF; i++) tomoyo_set_uint(&profile->pref[i], cp, tomoyo_pref_keywords[i]); return 0; } return tomoyo_set_mode(data, cp, profile); } /** * tomoyo_print_config - Print mode for specified functionality. * * @head: Pointer to "struct tomoyo_io_buffer". * @config: Mode for that functionality. * * Returns nothing. * * Caller prints functionality's name. 
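 *
 * Example of the emitted suffix (illustrative):
 *	={ mode=learning grant_log=no reject_log=yes }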
 */
static void tomoyo_print_config(struct tomoyo_io_buffer *head, const u8 config)
{
	tomoyo_io_printf(head, "={ mode=%s grant_log=%s reject_log=%s }\n",
			 tomoyo_mode[config & 3],
			 str_yes_no(config & TOMOYO_CONFIG_WANT_GRANT_LOG),
			 str_yes_no(config & TOMOYO_CONFIG_WANT_REJECT_LOG));
}

/**
 * tomoyo_read_profile - Read profile table.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns nothing.
 */
static void tomoyo_read_profile(struct tomoyo_io_buffer *head)
{
	u8 index;
	struct tomoyo_policy_namespace *ns =
		container_of(head->r.ns, typeof(*ns), namespace_list);
	const struct tomoyo_profile *profile;

	if (head->r.eof)
		return;
next:
	index = head->r.index;
	profile = ns->profile_ptr[index];
	switch (head->r.step) {
	case 0:
		tomoyo_print_namespace(head);
		tomoyo_io_printf(head, "PROFILE_VERSION=%u\n",
				 ns->profile_version);
		head->r.step++;
		break;
	case 1:
		for ( ; head->r.index < TOMOYO_MAX_PROFILES;
		     head->r.index++)
			if (ns->profile_ptr[head->r.index])
				break;
		if (head->r.index == TOMOYO_MAX_PROFILES) {
			head->r.eof = true;
			return;
		}
		head->r.step++;
		break;
	case 2:
		{
			u8 i;
			const struct tomoyo_path_info *comment =
				profile->comment;

			tomoyo_print_namespace(head);
			tomoyo_io_printf(head, "%u-COMMENT=", index);
			tomoyo_set_string(head, comment ? comment->name : "");
			tomoyo_set_lf(head);
			tomoyo_print_namespace(head);
			tomoyo_io_printf(head, "%u-PREFERENCE={ ", index);
			for (i = 0; i < TOMOYO_MAX_PREF; i++)
				tomoyo_io_printf(head, "%s=%u ",
						 tomoyo_pref_keywords[i],
						 profile->pref[i]);
			tomoyo_set_string(head, "}\n");
			head->r.step++;
		}
		break;
	case 3:
		{
			tomoyo_print_namespace(head);
			tomoyo_io_printf(head, "%u-%s", index, "CONFIG");
			tomoyo_print_config(head, profile->default_config);
			head->r.bit = 0;
			head->r.step++;
		}
		break;
	case 4:
		for ( ; head->r.bit < TOMOYO_MAX_MAC_INDEX
			      + TOMOYO_MAX_MAC_CATEGORY_INDEX; head->r.bit++) {
			const u8 i = head->r.bit;
			const u8 config = profile->config[i];

			if (config == TOMOYO_CONFIG_USE_DEFAULT)
				continue;
			tomoyo_print_namespace(head);
			if (i < TOMOYO_MAX_MAC_INDEX)
				tomoyo_io_printf(head, "%u-CONFIG::%s::%s",
						 index,
						 tomoyo_category_keywords
						 [tomoyo_index2category[i]],
						 tomoyo_mac_keywords[i]);
			else
				tomoyo_io_printf(head, "%u-CONFIG::%s", index,
						 tomoyo_mac_keywords[i]);
			tomoyo_print_config(head, config);
			head->r.bit++;
			break;
		}
		if (head->r.bit == TOMOYO_MAX_MAC_INDEX
		    + TOMOYO_MAX_MAC_CATEGORY_INDEX) {
			head->r.index++;
			head->r.step = 1;
		}
		break;
	}
	if (tomoyo_flush(head))
		goto next;
}

/**
 * tomoyo_same_manager - Check for duplicated "struct tomoyo_manager" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_head".
 * @b: Pointer to "struct tomoyo_acl_head".
 *
 * Returns true if @a == @b, false otherwise.
 */
static bool tomoyo_same_manager(const struct tomoyo_acl_head *a,
				const struct tomoyo_acl_head *b)
{
	return container_of(a, struct tomoyo_manager, head)->manager ==
		container_of(b, struct tomoyo_manager, head)->manager;
}

/**
 * tomoyo_update_manager_entry - Add a manager entry.
 *
 * @manager: The path to manager or the domainname.
 * @is_delete: True if it is a delete request.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_update_manager_entry(const char *manager,
				       const bool is_delete)
{
	struct tomoyo_manager e = { };
	struct tomoyo_acl_param param = {
		/* .ns = &tomoyo_kernel_namespace, */
		.is_delete = is_delete,
		.list = &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER],
	};
	int error = is_delete ?
-ENOENT : -ENOMEM; if (!tomoyo_correct_domain(manager) && !tomoyo_correct_word(manager)) return -EINVAL; e.manager = tomoyo_get_name(manager); if (e.manager) { error = tomoyo_update_policy(&e.head, sizeof(e), &param, tomoyo_same_manager); tomoyo_put_name(e.manager); } return error; } /** * tomoyo_write_manager - Write manager policy. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_write_manager(struct tomoyo_io_buffer *head) { char *data = head->write_buf; if (!strcmp(data, "manage_by_non_root")) { tomoyo_manage_by_non_root = !head->w.is_delete; return 0; } return tomoyo_update_manager_entry(data, head->w.is_delete); } /** * tomoyo_read_manager - Read manager policy. * * @head: Pointer to "struct tomoyo_io_buffer". * * Caller holds tomoyo_read_lock(). */ static void tomoyo_read_manager(struct tomoyo_io_buffer *head) { if (head->r.eof) return; list_for_each_cookie(head->r.acl, &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER]) { struct tomoyo_manager *ptr = list_entry(head->r.acl, typeof(*ptr), head.list); if (ptr->head.is_deleted) continue; if (!tomoyo_flush(head)) return; tomoyo_set_string(head, ptr->manager->name); tomoyo_set_lf(head); } head->r.eof = true; } /** * tomoyo_manager - Check whether the current process is a policy manager. * * Returns true if the current process is permitted to modify policy * via /sys/kernel/security/tomoyo/ interface. * * Caller holds tomoyo_read_lock(). */ static bool tomoyo_manager(void) { struct tomoyo_manager *ptr; const char *exe; const struct task_struct *task = current; const struct tomoyo_path_info *domainname = tomoyo_domain()->domainname; bool found = IS_ENABLED(CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING); if (!tomoyo_policy_loaded) return true; if (!tomoyo_manage_by_non_root && (!uid_eq(task->cred->uid, GLOBAL_ROOT_UID) || !uid_eq(task->cred->euid, GLOBAL_ROOT_UID))) return false; exe = tomoyo_get_exe(); if (!exe) return false; list_for_each_entry_rcu(ptr, &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER], head.list, srcu_read_lock_held(&tomoyo_ss)) { if (!ptr->head.is_deleted && (!tomoyo_pathcmp(domainname, ptr->manager) || !strcmp(exe, ptr->manager->name))) { found = true; break; } } if (!found) { /* Reduce error messages. */ static pid_t last_pid; const pid_t pid = current->pid; if (last_pid != pid) { pr_warn("%s ( %s ) is not permitted to update policies.\n", domainname->name, exe); last_pid = pid; } } kfree(exe); return found; } static struct tomoyo_domain_info *tomoyo_find_domain_by_qid (unsigned int serial); /** * tomoyo_select_domain - Parse select command. * * @head: Pointer to "struct tomoyo_io_buffer". * @data: String to parse. * * Returns true on success, false otherwise. * * Caller holds tomoyo_read_lock(). 
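 *
 * Accepted forms, per the parser below (PID and domain illustrative):
 *	select pid=1234
 *	select global-pid=1234
 *	select domain=<kernel> /usr/sbin/sshd
 *	select Q=10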
*/ static bool tomoyo_select_domain(struct tomoyo_io_buffer *head, const char *data) { unsigned int pid; struct tomoyo_domain_info *domain = NULL; bool global_pid = false; if (strncmp(data, "select ", 7)) return false; data += 7; if (sscanf(data, "pid=%u", &pid) == 1 || (global_pid = true, sscanf(data, "global-pid=%u", &pid) == 1)) { struct task_struct *p; rcu_read_lock(); if (global_pid) p = find_task_by_pid_ns(pid, &init_pid_ns); else p = find_task_by_vpid(pid); if (p) domain = tomoyo_task(p)->domain_info; rcu_read_unlock(); } else if (!strncmp(data, "domain=", 7)) { if (tomoyo_domain_def(data + 7)) domain = tomoyo_find_domain(data + 7); } else if (sscanf(data, "Q=%u", &pid) == 1) { domain = tomoyo_find_domain_by_qid(pid); } else return false; head->w.domain = domain; /* Accessing read_buf is safe because head->io_sem is held. */ if (!head->read_buf) return true; /* Do nothing if open(O_WRONLY). */ memset(&head->r, 0, sizeof(head->r)); head->r.print_this_domain_only = true; if (domain) head->r.domain = &domain->list; else head->r.eof = true; tomoyo_io_printf(head, "# select %s\n", data); if (domain && domain->is_deleted) tomoyo_io_printf(head, "# This is a deleted domain.\n"); return true; } /** * tomoyo_same_task_acl - Check for duplicated "struct tomoyo_task_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b, false otherwise. */ static bool tomoyo_same_task_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_task_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_task_acl *p2 = container_of(b, typeof(*p2), head); return p1->domainname == p2->domainname; } /** * tomoyo_write_task - Update task related list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_write_task(struct tomoyo_acl_param *param) { int error = -EINVAL; if (tomoyo_str_starts(&param->data, "manual_domain_transition ")) { struct tomoyo_task_acl e = { .head.type = TOMOYO_TYPE_MANUAL_TASK_ACL, .domainname = tomoyo_get_domainname(param), }; if (e.domainname) error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_task_acl, NULL); tomoyo_put_name(e.domainname); } return error; } /** * tomoyo_delete_domain - Delete a domain. * * @domainname: The name of domain. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_delete_domain(char *domainname) { struct tomoyo_domain_info *domain; struct tomoyo_path_info name; name.name = domainname; tomoyo_fill_path_info(&name); if (mutex_lock_interruptible(&tomoyo_policy_lock)) return -EINTR; /* Is there an active domain? */ list_for_each_entry_rcu(domain, &tomoyo_domain_list, list, srcu_read_lock_held(&tomoyo_ss)) { /* Never delete tomoyo_kernel_domain */ if (domain == &tomoyo_kernel_domain) continue; if (domain->is_deleted || tomoyo_pathcmp(domain->domainname, &name)) continue; domain->is_deleted = true; break; } mutex_unlock(&tomoyo_policy_lock); return 0; } /** * tomoyo_write_domain2 - Write domain policy. * * @ns: Pointer to "struct tomoyo_policy_namespace". * @list: Pointer to "struct list_head". * @data: Policy to be interpreted. * @is_delete: True if it is a delete request. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). 
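 *
 * Dispatch is by leading keyword, e.g. (arguments illustrative):
 *	"file read /etc/fstab"              -> tomoyo_write_file()
 *	"network inet stream connect ..."   -> tomoyo_write_inet_network()
 *	"task manual_domain_transition ..." -> tomoyo_write_task()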
*/ static int tomoyo_write_domain2(struct tomoyo_policy_namespace *ns, struct list_head *list, char *data, const bool is_delete) { struct tomoyo_acl_param param = { .ns = ns, .list = list, .data = data, .is_delete = is_delete, }; static const struct { const char *keyword; int (*write)(struct tomoyo_acl_param *param); } tomoyo_callback[5] = { { "file ", tomoyo_write_file }, { "network inet ", tomoyo_write_inet_network }, { "network unix ", tomoyo_write_unix_network }, { "misc ", tomoyo_write_misc }, { "task ", tomoyo_write_task }, }; u8 i; for (i = 0; i < ARRAY_SIZE(tomoyo_callback); i++) { if (!tomoyo_str_starts(&param.data, tomoyo_callback[i].keyword)) continue; return tomoyo_callback[i].write(&param); } return -EINVAL; } /* String table for domain flags. */ const char * const tomoyo_dif[TOMOYO_MAX_DOMAIN_INFO_FLAGS] = { [TOMOYO_DIF_QUOTA_WARNED] = "quota_exceeded\n", [TOMOYO_DIF_TRANSITION_FAILED] = "transition_failed\n", }; /** * tomoyo_write_domain - Write domain policy. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_write_domain(struct tomoyo_io_buffer *head) { char *data = head->write_buf; struct tomoyo_policy_namespace *ns; struct tomoyo_domain_info *domain = head->w.domain; const bool is_delete = head->w.is_delete; bool is_select = !is_delete && tomoyo_str_starts(&data, "select "); unsigned int idx; if (*data == '<') { int ret = 0; domain = NULL; if (is_delete) ret = tomoyo_delete_domain(data); else if (is_select) domain = tomoyo_find_domain(data); else domain = tomoyo_assign_domain(data, false); head->w.domain = domain; return ret; } if (!domain) return -EINVAL; ns = domain->ns; if (sscanf(data, "use_profile %u", &idx) == 1 && idx < TOMOYO_MAX_PROFILES) { if (!tomoyo_policy_loaded || ns->profile_ptr[idx]) if (!is_delete) domain->profile = (u8) idx; return 0; } if (sscanf(data, "use_group %u\n", &idx) == 1 && idx < TOMOYO_MAX_ACL_GROUPS) { if (!is_delete) set_bit(idx, domain->group); else clear_bit(idx, domain->group); return 0; } for (idx = 0; idx < TOMOYO_MAX_DOMAIN_INFO_FLAGS; idx++) { const char *cp = tomoyo_dif[idx]; if (strncmp(data, cp, strlen(cp) - 1)) continue; domain->flags[idx] = !is_delete; return 0; } return tomoyo_write_domain2(ns, &domain->acl_info_list, data, is_delete); } /** * tomoyo_print_condition - Print condition part. * * @head: Pointer to "struct tomoyo_io_buffer". * @cond: Pointer to "struct tomoyo_condition". * * Returns true on success, false otherwise. 
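 *
 * Example of a rendered condition part (illustrative):
 *	 exec.argv[0]="/bin/sh" task.uid!=0 grant_log=yes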
*/ static bool tomoyo_print_condition(struct tomoyo_io_buffer *head, const struct tomoyo_condition *cond) { switch (head->r.cond_step) { case 0: head->r.cond_index = 0; head->r.cond_step++; if (cond->transit) { tomoyo_set_space(head); tomoyo_set_string(head, cond->transit->name); } fallthrough; case 1: { const u16 condc = cond->condc; const struct tomoyo_condition_element *condp = (typeof(condp)) (cond + 1); const struct tomoyo_number_union *numbers_p = (typeof(numbers_p)) (condp + condc); const struct tomoyo_name_union *names_p = (typeof(names_p)) (numbers_p + cond->numbers_count); const struct tomoyo_argv *argv = (typeof(argv)) (names_p + cond->names_count); const struct tomoyo_envp *envp = (typeof(envp)) (argv + cond->argc); u16 skip; for (skip = 0; skip < head->r.cond_index; skip++) { const u8 left = condp->left; const u8 right = condp->right; condp++; switch (left) { case TOMOYO_ARGV_ENTRY: argv++; continue; case TOMOYO_ENVP_ENTRY: envp++; continue; case TOMOYO_NUMBER_UNION: numbers_p++; break; } switch (right) { case TOMOYO_NAME_UNION: names_p++; break; case TOMOYO_NUMBER_UNION: numbers_p++; break; } } while (head->r.cond_index < condc) { const u8 match = condp->equals; const u8 left = condp->left; const u8 right = condp->right; if (!tomoyo_flush(head)) return false; condp++; head->r.cond_index++; tomoyo_set_space(head); switch (left) { case TOMOYO_ARGV_ENTRY: tomoyo_io_printf(head, "exec.argv[%lu]%s=\"", argv->index, argv->is_not ? "!" : ""); tomoyo_set_string(head, argv->value->name); tomoyo_set_string(head, "\""); argv++; continue; case TOMOYO_ENVP_ENTRY: tomoyo_set_string(head, "exec.envp[\""); tomoyo_set_string(head, envp->name->name); tomoyo_io_printf(head, "\"]%s=", envp->is_not ? "!" : ""); if (envp->value) { tomoyo_set_string(head, "\""); tomoyo_set_string(head, envp->value->name); tomoyo_set_string(head, "\""); } else { tomoyo_set_string(head, "NULL"); } envp++; continue; case TOMOYO_NUMBER_UNION: tomoyo_print_number_union_nospace (head, numbers_p++); break; default: tomoyo_set_string(head, tomoyo_condition_keyword[left]); break; } tomoyo_set_string(head, match ? "=" : "!="); switch (right) { case TOMOYO_NAME_UNION: tomoyo_print_name_union_quoted (head, names_p++); break; case TOMOYO_NUMBER_UNION: tomoyo_print_number_union_nospace (head, numbers_p++); break; default: tomoyo_set_string(head, tomoyo_condition_keyword[right]); break; } } } head->r.cond_step++; fallthrough; case 2: if (!tomoyo_flush(head)) break; head->r.cond_step++; fallthrough; case 3: if (cond->grant_log != TOMOYO_GRANTLOG_AUTO) tomoyo_io_printf(head, " grant_log=%s", str_yes_no(cond->grant_log == TOMOYO_GRANTLOG_YES)); tomoyo_set_lf(head); return true; } return false; } /** * tomoyo_set_group - Print "acl_group " header keyword and category name. * * @head: Pointer to "struct tomoyo_io_buffer". * @category: Category name. * * Returns nothing. */ static void tomoyo_set_group(struct tomoyo_io_buffer *head, const char *category) { if (head->type == TOMOYO_EXCEPTIONPOLICY) { tomoyo_print_namespace(head); tomoyo_io_printf(head, "acl_group %u ", head->r.acl_group_index); } tomoyo_set_string(head, category); } /** * tomoyo_print_entry - Print an ACL entry. * * @head: Pointer to "struct tomoyo_io_buffer". * @acl: Pointer to an ACL entry. * * Returns true on success, false otherwise. 
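 *
 * Example lines produced here (illustrative):
 *	file read/write /var/log/messages
 *	network inet stream connect 192.168.0.1 80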
*/ static bool tomoyo_print_entry(struct tomoyo_io_buffer *head, struct tomoyo_acl_info *acl) { const u8 acl_type = acl->type; bool first = true; u8 bit; if (head->r.print_cond_part) goto print_cond_part; if (acl->is_deleted) return true; if (!tomoyo_flush(head)) return false; else if (acl_type == TOMOYO_TYPE_PATH_ACL) { struct tomoyo_path_acl *ptr = container_of(acl, typeof(*ptr), head); const u16 perm = ptr->perm; for (bit = 0; bit < TOMOYO_MAX_PATH_OPERATION; bit++) { if (!(perm & (1 << bit))) continue; if (head->r.print_transition_related_only && bit != TOMOYO_TYPE_EXECUTE) continue; if (first) { tomoyo_set_group(head, "file "); first = false; } else { tomoyo_set_slash(head); } tomoyo_set_string(head, tomoyo_path_keyword[bit]); } if (first) return true; tomoyo_print_name_union(head, &ptr->name); } else if (acl_type == TOMOYO_TYPE_MANUAL_TASK_ACL) { struct tomoyo_task_acl *ptr = container_of(acl, typeof(*ptr), head); tomoyo_set_group(head, "task "); tomoyo_set_string(head, "manual_domain_transition "); tomoyo_set_string(head, ptr->domainname->name); } else if (head->r.print_transition_related_only) { return true; } else if (acl_type == TOMOYO_TYPE_PATH2_ACL) { struct tomoyo_path2_acl *ptr = container_of(acl, typeof(*ptr), head); const u8 perm = ptr->perm; for (bit = 0; bit < TOMOYO_MAX_PATH2_OPERATION; bit++) { if (!(perm & (1 << bit))) continue; if (first) { tomoyo_set_group(head, "file "); first = false; } else { tomoyo_set_slash(head); } tomoyo_set_string(head, tomoyo_mac_keywords [tomoyo_pp2mac[bit]]); } if (first) return true; tomoyo_print_name_union(head, &ptr->name1); tomoyo_print_name_union(head, &ptr->name2); } else if (acl_type == TOMOYO_TYPE_PATH_NUMBER_ACL) { struct tomoyo_path_number_acl *ptr = container_of(acl, typeof(*ptr), head); const u8 perm = ptr->perm; for (bit = 0; bit < TOMOYO_MAX_PATH_NUMBER_OPERATION; bit++) { if (!(perm & (1 << bit))) continue; if (first) { tomoyo_set_group(head, "file "); first = false; } else { tomoyo_set_slash(head); } tomoyo_set_string(head, tomoyo_mac_keywords [tomoyo_pn2mac[bit]]); } if (first) return true; tomoyo_print_name_union(head, &ptr->name); tomoyo_print_number_union(head, &ptr->number); } else if (acl_type == TOMOYO_TYPE_MKDEV_ACL) { struct tomoyo_mkdev_acl *ptr = container_of(acl, typeof(*ptr), head); const u8 perm = ptr->perm; for (bit = 0; bit < TOMOYO_MAX_MKDEV_OPERATION; bit++) { if (!(perm & (1 << bit))) continue; if (first) { tomoyo_set_group(head, "file "); first = false; } else { tomoyo_set_slash(head); } tomoyo_set_string(head, tomoyo_mac_keywords [tomoyo_pnnn2mac[bit]]); } if (first) return true; tomoyo_print_name_union(head, &ptr->name); tomoyo_print_number_union(head, &ptr->mode); tomoyo_print_number_union(head, &ptr->major); tomoyo_print_number_union(head, &ptr->minor); } else if (acl_type == TOMOYO_TYPE_INET_ACL) { struct tomoyo_inet_acl *ptr = container_of(acl, typeof(*ptr), head); const u8 perm = ptr->perm; for (bit = 0; bit < TOMOYO_MAX_NETWORK_OPERATION; bit++) { if (!(perm & (1 << bit))) continue; if (first) { tomoyo_set_group(head, "network inet "); tomoyo_set_string(head, tomoyo_proto_keyword [ptr->protocol]); tomoyo_set_space(head); first = false; } else { tomoyo_set_slash(head); } tomoyo_set_string(head, tomoyo_socket_keyword[bit]); } if (first) return true; tomoyo_set_space(head); if (ptr->address.group) { tomoyo_set_string(head, "@"); tomoyo_set_string(head, ptr->address.group->group_name ->name); } else { char buf[128]; tomoyo_print_ip(buf, sizeof(buf), &ptr->address); tomoyo_io_printf(head, "%s", buf); 
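			/*
			 * e.g. "10.0.0.1" or "::1", or a "min-max" range;
			 * forms illustrative -- the exact rendering comes
			 * from tomoyo_print_ip().
			 */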
} tomoyo_print_number_union(head, &ptr->port); } else if (acl_type == TOMOYO_TYPE_UNIX_ACL) { struct tomoyo_unix_acl *ptr = container_of(acl, typeof(*ptr), head); const u8 perm = ptr->perm; for (bit = 0; bit < TOMOYO_MAX_NETWORK_OPERATION; bit++) { if (!(perm & (1 << bit))) continue; if (first) { tomoyo_set_group(head, "network unix "); tomoyo_set_string(head, tomoyo_proto_keyword [ptr->protocol]); tomoyo_set_space(head); first = false; } else { tomoyo_set_slash(head); } tomoyo_set_string(head, tomoyo_socket_keyword[bit]); } if (first) return true; tomoyo_print_name_union(head, &ptr->name); } else if (acl_type == TOMOYO_TYPE_MOUNT_ACL) { struct tomoyo_mount_acl *ptr = container_of(acl, typeof(*ptr), head); tomoyo_set_group(head, "file mount"); tomoyo_print_name_union(head, &ptr->dev_name); tomoyo_print_name_union(head, &ptr->dir_name); tomoyo_print_name_union(head, &ptr->fs_type); tomoyo_print_number_union(head, &ptr->flags); } else if (acl_type == TOMOYO_TYPE_ENV_ACL) { struct tomoyo_env_acl *ptr = container_of(acl, typeof(*ptr), head); tomoyo_set_group(head, "misc env "); tomoyo_set_string(head, ptr->env->name); } if (acl->cond) { head->r.print_cond_part = true; head->r.cond_step = 0; if (!tomoyo_flush(head)) return false; print_cond_part: if (!tomoyo_print_condition(head, acl->cond)) return false; head->r.print_cond_part = false; } else { tomoyo_set_lf(head); } return true; } /** * tomoyo_read_domain2 - Read domain policy. * * @head: Pointer to "struct tomoyo_io_buffer". * @list: Pointer to "struct list_head". * * Caller holds tomoyo_read_lock(). * * Returns true on success, false otherwise. */ static bool tomoyo_read_domain2(struct tomoyo_io_buffer *head, struct list_head *list) { list_for_each_cookie(head->r.acl, list) { struct tomoyo_acl_info *ptr = list_entry(head->r.acl, typeof(*ptr), list); if (!tomoyo_print_entry(head, ptr)) return false; } head->r.acl = NULL; return true; } /** * tomoyo_read_domain - Read domain policy. * * @head: Pointer to "struct tomoyo_io_buffer". * * Caller holds tomoyo_read_lock(). */ static void tomoyo_read_domain(struct tomoyo_io_buffer *head) { if (head->r.eof) return; list_for_each_cookie(head->r.domain, &tomoyo_domain_list) { struct tomoyo_domain_info *domain = list_entry(head->r.domain, typeof(*domain), list); u8 i; switch (head->r.step) { case 0: if (domain->is_deleted && !head->r.print_this_domain_only) continue; /* Print domainname and flags. */ tomoyo_set_string(head, domain->domainname->name); tomoyo_set_lf(head); tomoyo_io_printf(head, "use_profile %u\n", domain->profile); for (i = 0; i < TOMOYO_MAX_DOMAIN_INFO_FLAGS; i++) if (domain->flags[i]) tomoyo_set_string(head, tomoyo_dif[i]); head->r.index = 0; head->r.step++; fallthrough; case 1: while (head->r.index < TOMOYO_MAX_ACL_GROUPS) { i = head->r.index++; if (!test_bit(i, domain->group)) continue; tomoyo_io_printf(head, "use_group %u\n", i); if (!tomoyo_flush(head)) return; } head->r.index = 0; head->r.step++; tomoyo_set_lf(head); fallthrough; case 2: if (!tomoyo_read_domain2(head, &domain->acl_info_list)) return; head->r.step++; if (!tomoyo_set_lf(head)) return; fallthrough; case 3: head->r.step = 0; if (head->r.print_this_domain_only) goto done; } } done: head->r.eof = true; } /** * tomoyo_write_pid: Specify PID to obtain domainname. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns 0. */ static int tomoyo_write_pid(struct tomoyo_io_buffer *head) { head->r.eof = false; return 0; } /** * tomoyo_read_pid - Get domainname of the specified PID. 
* * @head: Pointer to "struct tomoyo_io_buffer". * * Returns the domainname which the specified PID is in on success, * empty string otherwise. * The PID is specified by tomoyo_write_pid() so that the user can obtain * using read()/write() interface rather than sysctl() interface. */ static void tomoyo_read_pid(struct tomoyo_io_buffer *head) { char *buf = head->write_buf; bool global_pid = false; unsigned int pid; struct task_struct *p; struct tomoyo_domain_info *domain = NULL; /* Accessing write_buf is safe because head->io_sem is held. */ if (!buf) { head->r.eof = true; return; /* Do nothing if open(O_RDONLY). */ } if (head->r.w_pos || head->r.eof) return; head->r.eof = true; if (tomoyo_str_starts(&buf, "global-pid ")) global_pid = true; if (kstrtouint(buf, 10, &pid)) return; rcu_read_lock(); if (global_pid) p = find_task_by_pid_ns(pid, &init_pid_ns); else p = find_task_by_vpid(pid); if (p) domain = tomoyo_task(p)->domain_info; rcu_read_unlock(); if (!domain) return; tomoyo_io_printf(head, "%u %u ", pid, domain->profile); tomoyo_set_string(head, domain->domainname->name); } /* String table for domain transition control keywords. */ static const char *tomoyo_transition_type[TOMOYO_MAX_TRANSITION_TYPE] = { [TOMOYO_TRANSITION_CONTROL_NO_RESET] = "no_reset_domain ", [TOMOYO_TRANSITION_CONTROL_RESET] = "reset_domain ", [TOMOYO_TRANSITION_CONTROL_NO_INITIALIZE] = "no_initialize_domain ", [TOMOYO_TRANSITION_CONTROL_INITIALIZE] = "initialize_domain ", [TOMOYO_TRANSITION_CONTROL_NO_KEEP] = "no_keep_domain ", [TOMOYO_TRANSITION_CONTROL_KEEP] = "keep_domain ", }; /* String table for grouping keywords. */ static const char *tomoyo_group_name[TOMOYO_MAX_GROUP] = { [TOMOYO_PATH_GROUP] = "path_group ", [TOMOYO_NUMBER_GROUP] = "number_group ", [TOMOYO_ADDRESS_GROUP] = "address_group ", }; /** * tomoyo_write_exception - Write exception policy. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_write_exception(struct tomoyo_io_buffer *head) { const bool is_delete = head->w.is_delete; struct tomoyo_acl_param param = { .ns = head->w.ns, .is_delete = is_delete, .data = head->write_buf, }; u8 i; if (tomoyo_str_starts(&param.data, "aggregator ")) return tomoyo_write_aggregator(&param); for (i = 0; i < TOMOYO_MAX_TRANSITION_TYPE; i++) if (tomoyo_str_starts(&param.data, tomoyo_transition_type[i])) return tomoyo_write_transition_control(&param, i); for (i = 0; i < TOMOYO_MAX_GROUP; i++) if (tomoyo_str_starts(&param.data, tomoyo_group_name[i])) return tomoyo_write_group(&param, i); if (tomoyo_str_starts(&param.data, "acl_group ")) { unsigned int group; char *data; group = simple_strtoul(param.data, &data, 10); if (group < TOMOYO_MAX_ACL_GROUPS && *data++ == ' ') return tomoyo_write_domain2 (head->w.ns, &head->w.ns->acl_group[group], data, is_delete); } return -EINVAL; } /** * tomoyo_read_group - Read "struct tomoyo_path_group"/"struct tomoyo_number_group"/"struct tomoyo_address_group" list. * * @head: Pointer to "struct tomoyo_io_buffer". * @idx: Index number. * * Returns true on success, false otherwise. * * Caller holds tomoyo_read_lock(). 
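 *
 * Example output lines (illustrative):
 *	path_group FONT-DIR /usr/share/fonts/
 *	address_group LOCAL 127.0.0.1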
*/ static bool tomoyo_read_group(struct tomoyo_io_buffer *head, const int idx) { struct tomoyo_policy_namespace *ns = container_of(head->r.ns, typeof(*ns), namespace_list); struct list_head *list = &ns->group_list[idx]; list_for_each_cookie(head->r.group, list) { struct tomoyo_group *group = list_entry(head->r.group, typeof(*group), head.list); list_for_each_cookie(head->r.acl, &group->member_list) { struct tomoyo_acl_head *ptr = list_entry(head->r.acl, typeof(*ptr), list); if (ptr->is_deleted) continue; if (!tomoyo_flush(head)) return false; tomoyo_print_namespace(head); tomoyo_set_string(head, tomoyo_group_name[idx]); tomoyo_set_string(head, group->group_name->name); if (idx == TOMOYO_PATH_GROUP) { tomoyo_set_space(head); tomoyo_set_string(head, container_of (ptr, struct tomoyo_path_group, head)->member_name->name); } else if (idx == TOMOYO_NUMBER_GROUP) { tomoyo_print_number_union(head, &container_of (ptr, struct tomoyo_number_group, head)->number); } else if (idx == TOMOYO_ADDRESS_GROUP) { char buffer[128]; struct tomoyo_address_group *member = container_of(ptr, typeof(*member), head); tomoyo_print_ip(buffer, sizeof(buffer), &member->address); tomoyo_io_printf(head, " %s", buffer); } tomoyo_set_lf(head); } head->r.acl = NULL; } head->r.group = NULL; return true; } /** * tomoyo_read_policy - Read "struct tomoyo_..._entry" list. * * @head: Pointer to "struct tomoyo_io_buffer". * @idx: Index number. * * Returns true on success, false otherwise. * * Caller holds tomoyo_read_lock(). */ static bool tomoyo_read_policy(struct tomoyo_io_buffer *head, const int idx) { struct tomoyo_policy_namespace *ns = container_of(head->r.ns, typeof(*ns), namespace_list); struct list_head *list = &ns->policy_list[idx]; list_for_each_cookie(head->r.acl, list) { struct tomoyo_acl_head *acl = container_of(head->r.acl, typeof(*acl), list); if (acl->is_deleted) continue; if (!tomoyo_flush(head)) return false; switch (idx) { case TOMOYO_ID_TRANSITION_CONTROL: { struct tomoyo_transition_control *ptr = container_of(acl, typeof(*ptr), head); tomoyo_print_namespace(head); tomoyo_set_string(head, tomoyo_transition_type [ptr->type]); tomoyo_set_string(head, ptr->program ? ptr->program->name : "any"); tomoyo_set_string(head, " from "); tomoyo_set_string(head, ptr->domainname ? ptr->domainname->name : "any"); } break; case TOMOYO_ID_AGGREGATOR: { struct tomoyo_aggregator *ptr = container_of(acl, typeof(*ptr), head); tomoyo_print_namespace(head); tomoyo_set_string(head, "aggregator "); tomoyo_set_string(head, ptr->original_name->name); tomoyo_set_space(head); tomoyo_set_string(head, ptr->aggregated_name->name); } break; default: continue; } tomoyo_set_lf(head); } head->r.acl = NULL; return true; } /** * tomoyo_read_exception - Read exception policy. * * @head: Pointer to "struct tomoyo_io_buffer". * * Caller holds tomoyo_read_lock(). 
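 *
 * Emits policies first (e.g. "initialize_domain /usr/sbin/sshd from any",
 * example illustrative), then the three group kinds, then the
 * "acl_group N ..." blocks of each namespace.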
*/ static void tomoyo_read_exception(struct tomoyo_io_buffer *head) { struct tomoyo_policy_namespace *ns = container_of(head->r.ns, typeof(*ns), namespace_list); if (head->r.eof) return; while (head->r.step < TOMOYO_MAX_POLICY && tomoyo_read_policy(head, head->r.step)) head->r.step++; if (head->r.step < TOMOYO_MAX_POLICY) return; while (head->r.step < TOMOYO_MAX_POLICY + TOMOYO_MAX_GROUP && tomoyo_read_group(head, head->r.step - TOMOYO_MAX_POLICY)) head->r.step++; if (head->r.step < TOMOYO_MAX_POLICY + TOMOYO_MAX_GROUP) return; while (head->r.step < TOMOYO_MAX_POLICY + TOMOYO_MAX_GROUP + TOMOYO_MAX_ACL_GROUPS) { head->r.acl_group_index = head->r.step - TOMOYO_MAX_POLICY - TOMOYO_MAX_GROUP; if (!tomoyo_read_domain2(head, &ns->acl_group [head->r.acl_group_index])) return; head->r.step++; } head->r.eof = true; } /* Wait queue for kernel -> userspace notification. */ static DECLARE_WAIT_QUEUE_HEAD(tomoyo_query_wait); /* Wait queue for userspace -> kernel notification. */ static DECLARE_WAIT_QUEUE_HEAD(tomoyo_answer_wait); /* Structure for query. */ struct tomoyo_query { struct list_head list; struct tomoyo_domain_info *domain; char *query; size_t query_len; unsigned int serial; u8 timer; u8 answer; u8 retry; }; /* The list for "struct tomoyo_query". */ static LIST_HEAD(tomoyo_query_list); /* Lock for manipulating tomoyo_query_list. */ static DEFINE_SPINLOCK(tomoyo_query_list_lock); /* * Number of "struct file" referring /sys/kernel/security/tomoyo/query * interface. */ static atomic_t tomoyo_query_observers = ATOMIC_INIT(0); /** * tomoyo_truncate - Truncate a line. * * @str: String to truncate. * * Returns length of truncated @str. */ static int tomoyo_truncate(char *str) { char *start = str; while (*(unsigned char *) str > (unsigned char) ' ') str++; *str = '\0'; return strlen(start) + 1; } /** * tomoyo_numscan - sscanf() which stores the length of a decimal integer value. * * @str: String to scan. * @head: Leading string that must start with. * @width: Pointer to "int" for storing length of a decimal integer value after @head. * @tail: Optional character that must match after a decimal integer value. * * Returns whether @str starts with @head and a decimal value follows @head. */ static bool tomoyo_numscan(const char *str, const char *head, int *width, const char tail) { const char *cp; const int n = strlen(head); if (!strncmp(str, head, n)) { cp = str + n; while (*cp && *cp >= '0' && *cp <= '9') cp++; if (*cp == tail || !tail) { *width = cp - (str + n); return *width != 0; } } *width = 0; return 0; } /** * tomoyo_patternize_path - Make patterns for file path. Used by learning mode. * * @buffer: Destination buffer. * @len: Size of @buffer. * @entry: Original line. * * Returns nothing. */ static void tomoyo_patternize_path(char *buffer, const int len, char *entry) { int width; char *cp = entry; /* Nothing to do if this line is not for "file" related entry. */ if (strncmp(entry, "file ", 5)) goto flush; /* * Nothing to do if there is no colon in this line, for this rewriting * applies to only filesystems where numeric values in the path are volatile. */ cp = strchr(entry + 5, ':'); if (!cp) { cp = entry; goto flush; } /* Flush e.g. "file ioctl" part. */ while (*cp != ' ') cp--; *cp++ = '\0'; tomoyo_addprintf(buffer, len, "%s ", entry); /* e.g. file ioctl pipe:[$INO] $CMD */ if (tomoyo_numscan(cp, "pipe:[", &width, ']')) { cp += width + 7; tomoyo_addprintf(buffer, len, "pipe:[\\$]"); goto flush; } /* e.g. 
file ioctl socket:[$INO] $CMD */ if (tomoyo_numscan(cp, "socket:[", &width, ']')) { cp += width + 9; tomoyo_addprintf(buffer, len, "socket:[\\$]"); goto flush; } if (!strncmp(cp, "proc:/self", 10)) { /* e.g. file read proc:/self/task/$TID/fdinfo/$FD */ cp += 10; tomoyo_addprintf(buffer, len, "proc:/self"); } else if (tomoyo_numscan(cp, "proc:/", &width, 0)) { /* e.g. file read proc:/$PID/task/$TID/fdinfo/$FD */ /* * Don't patternize $PID part if $PID == 1, for several * programs access only files in /proc/1/ directory. */ cp += width + 6; if (width == 1 && *(cp - 1) == '1') tomoyo_addprintf(buffer, len, "proc:/1"); else tomoyo_addprintf(buffer, len, "proc:/\\$"); } else { goto flush; } /* Patternize $TID part if "/task/" follows. */ if (tomoyo_numscan(cp, "/task/", &width, 0)) { cp += width + 6; tomoyo_addprintf(buffer, len, "/task/\\$"); } /* Patternize $FD part if "/fd/" or "/fdinfo/" follows. */ if (tomoyo_numscan(cp, "/fd/", &width, 0)) { cp += width + 4; tomoyo_addprintf(buffer, len, "/fd/\\$"); } else if (tomoyo_numscan(cp, "/fdinfo/", &width, 0)) { cp += width + 8; tomoyo_addprintf(buffer, len, "/fdinfo/\\$"); } flush: /* Flush remaining part if any. */ if (*cp) tomoyo_addprintf(buffer, len, "%s", cp); } /** * tomoyo_add_entry - Add an ACL to current thread's domain. Used by learning mode. * * @domain: Pointer to "struct tomoyo_domain_info". * @header: Lines containing ACL. * * Returns nothing. */ static void tomoyo_add_entry(struct tomoyo_domain_info *domain, char *header) { char *buffer; char *realpath = NULL; char *argv0 = NULL; char *symlink = NULL; char *cp = strchr(header, '\n'); int len; if (!cp) return; cp = strchr(cp + 1, '\n'); if (!cp) return; *cp++ = '\0'; /* Reserve some space for potentially using patterns. */ len = strlen(cp) + 16; /* strstr() will return NULL if ordering is wrong. */ if (*cp == 'f') { argv0 = strstr(header, " argv[]={ \""); if (argv0) { argv0 += 10; len += tomoyo_truncate(argv0) + 14; } realpath = strstr(header, " exec={ realpath=\""); if (realpath) { realpath += 8; len += tomoyo_truncate(realpath) + 6; } symlink = strstr(header, " symlink.target=\""); if (symlink) len += tomoyo_truncate(symlink + 1) + 1; } buffer = kmalloc(len, GFP_NOFS | __GFP_ZERO); if (!buffer) return; tomoyo_patternize_path(buffer, len, cp); if (realpath) tomoyo_addprintf(buffer, len, " exec.%s", realpath); if (argv0) tomoyo_addprintf(buffer, len, " exec.argv[0]=%s", argv0); if (symlink) tomoyo_addprintf(buffer, len, "%s", symlink); tomoyo_normalize_line(buffer); if (!tomoyo_write_domain2(domain->ns, &domain->acl_info_list, buffer, false)) tomoyo_update_stat(TOMOYO_STAT_POLICY_UPDATES); kfree(buffer); } /** * tomoyo_supervisor - Ask for the supervisor's decision. * * @r: Pointer to "struct tomoyo_request_info". * @fmt: The printf()'s format string, followed by parameters. * * Returns 0 if the supervisor decided to permit the access request which * violated the policy in enforcing mode, TOMOYO_RETRY_REQUEST if the * supervisor decided to retry the access request which violated the policy in * enforcing mode, 0 if it is not in enforcing mode, -EPERM otherwise. */ int tomoyo_supervisor(struct tomoyo_request_info *r, const char *fmt, ...) { va_list args; int error; int len; static unsigned int tomoyo_serial; struct tomoyo_query entry = { }; bool quota_exceeded = false; va_start(args, fmt); len = vsnprintf(NULL, 0, fmt, args) + 1; va_end(args); /* Write /sys/kernel/security/tomoyo/audit. 
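 * The same fmt/args are rendered a second time by tomoyo_init_log() below
 * when a query message for the supervisor is needed.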
*/ va_start(args, fmt); tomoyo_write_log2(r, len, fmt, args); va_end(args); /* Nothing more to do if granted. */ if (r->granted) return 0; if (r->mode) tomoyo_update_stat(r->mode); switch (r->mode) { case TOMOYO_CONFIG_ENFORCING: error = -EPERM; if (atomic_read(&tomoyo_query_observers)) break; goto out; case TOMOYO_CONFIG_LEARNING: error = 0; /* Check max_learning_entry parameter. */ if (tomoyo_domain_quota_is_ok(r)) break; fallthrough; default: return 0; } /* Get message. */ va_start(args, fmt); entry.query = tomoyo_init_log(r, len, fmt, args); va_end(args); if (!entry.query) goto out; entry.query_len = strlen(entry.query) + 1; if (!error) { tomoyo_add_entry(r->domain, entry.query); goto out; } len = kmalloc_size_roundup(entry.query_len); entry.domain = r->domain; spin_lock(&tomoyo_query_list_lock); if (tomoyo_memory_quota[TOMOYO_MEMORY_QUERY] && tomoyo_memory_used[TOMOYO_MEMORY_QUERY] + len >= tomoyo_memory_quota[TOMOYO_MEMORY_QUERY]) { quota_exceeded = true; } else { entry.serial = tomoyo_serial++; entry.retry = r->retry; tomoyo_memory_used[TOMOYO_MEMORY_QUERY] += len; list_add_tail(&entry.list, &tomoyo_query_list); } spin_unlock(&tomoyo_query_list_lock); if (quota_exceeded) goto out; /* Give 10 seconds for supervisor's opinion. */ while (entry.timer < 10) { wake_up_all(&tomoyo_query_wait); if (wait_event_interruptible_timeout (tomoyo_answer_wait, entry.answer || !atomic_read(&tomoyo_query_observers), HZ)) break; entry.timer++; } spin_lock(&tomoyo_query_list_lock); list_del(&entry.list); tomoyo_memory_used[TOMOYO_MEMORY_QUERY] -= len; spin_unlock(&tomoyo_query_list_lock); switch (entry.answer) { case 3: /* Asked to retry by administrator. */ error = TOMOYO_RETRY_REQUEST; r->retry++; break; case 1: /* Granted by administrator. */ error = 0; break; default: /* Timed out or rejected by administrator. */ break; } out: kfree(entry.query); return error; } /** * tomoyo_find_domain_by_qid - Get domain by query id. * * @serial: Query ID assigned by tomoyo_supervisor(). * * Returns pointer to "struct tomoyo_domain_info" if found, NULL otherwise. */ static struct tomoyo_domain_info *tomoyo_find_domain_by_qid (unsigned int serial) { struct tomoyo_query *ptr; struct tomoyo_domain_info *domain = NULL; spin_lock(&tomoyo_query_list_lock); list_for_each_entry(ptr, &tomoyo_query_list, list) { if (ptr->serial != serial) continue; domain = ptr->domain; break; } spin_unlock(&tomoyo_query_list_lock); return domain; } /** * tomoyo_poll_query - poll() for /sys/kernel/security/tomoyo/query. * * @file: Pointer to "struct file". * @wait: Pointer to "poll_table". * * Returns EPOLLIN | EPOLLRDNORM when ready to read, 0 otherwise. * * Waits for access requests which violated policy in enforcing mode. */ static __poll_t tomoyo_poll_query(struct file *file, poll_table *wait) { if (!list_empty(&tomoyo_query_list)) return EPOLLIN | EPOLLRDNORM; poll_wait(file, &tomoyo_query_wait, wait); if (!list_empty(&tomoyo_query_list)) return EPOLLIN | EPOLLRDNORM; return 0; } /** * tomoyo_read_query - Read access requests which violated policy in enforcing mode. * * @head: Pointer to "struct tomoyo_io_buffer". 
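 *
 * Each record is emitted as "Q<serial>-<retry>\n<query text>"; see the
 * snprintf() below.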
*/ static void tomoyo_read_query(struct tomoyo_io_buffer *head) { struct list_head *tmp; unsigned int pos = 0; size_t len = 0; char *buf; if (head->r.w_pos) return; kfree(head->read_buf); head->read_buf = NULL; spin_lock(&tomoyo_query_list_lock); list_for_each(tmp, &tomoyo_query_list) { struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list); if (pos++ != head->r.query_index) continue; len = ptr->query_len; break; } spin_unlock(&tomoyo_query_list_lock); if (!len) { head->r.query_index = 0; return; } buf = kzalloc(len + 32, GFP_NOFS); if (!buf) return; pos = 0; spin_lock(&tomoyo_query_list_lock); list_for_each(tmp, &tomoyo_query_list) { struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list); if (pos++ != head->r.query_index) continue; /* * Some query can be skipped because tomoyo_query_list * can change, but I don't care. */ if (len == ptr->query_len) snprintf(buf, len + 31, "Q%u-%hu\n%s", ptr->serial, ptr->retry, ptr->query); break; } spin_unlock(&tomoyo_query_list_lock); if (buf[0]) { head->read_buf = buf; head->r.w[head->r.w_pos++] = buf; head->r.query_index++; } else { kfree(buf); } } /** * tomoyo_write_answer - Write the supervisor's decision. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns 0 on success, -EINVAL otherwise. */ static int tomoyo_write_answer(struct tomoyo_io_buffer *head) { char *data = head->write_buf; struct list_head *tmp; unsigned int serial; unsigned int answer; spin_lock(&tomoyo_query_list_lock); list_for_each(tmp, &tomoyo_query_list) { struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list); ptr->timer = 0; } spin_unlock(&tomoyo_query_list_lock); if (sscanf(data, "A%u=%u", &serial, &answer) != 2) return -EINVAL; spin_lock(&tomoyo_query_list_lock); list_for_each(tmp, &tomoyo_query_list) { struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list); if (ptr->serial != serial) continue; ptr->answer = answer; /* Remove from tomoyo_query_list. */ if (ptr->answer) list_del_init(&ptr->list); break; } spin_unlock(&tomoyo_query_list_lock); return 0; } /** * tomoyo_read_version: Get version. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns version information. */ static void tomoyo_read_version(struct tomoyo_io_buffer *head) { if (!head->r.eof) { tomoyo_io_printf(head, "2.6.0"); head->r.eof = true; } } /* String table for /sys/kernel/security/tomoyo/stat interface. */ static const char * const tomoyo_policy_headers[TOMOYO_MAX_POLICY_STAT] = { [TOMOYO_STAT_POLICY_UPDATES] = "update:", [TOMOYO_STAT_POLICY_LEARNING] = "violation in learning mode:", [TOMOYO_STAT_POLICY_PERMISSIVE] = "violation in permissive mode:", [TOMOYO_STAT_POLICY_ENFORCING] = "violation in enforcing mode:", }; /* String table for /sys/kernel/security/tomoyo/stat interface. */ static const char * const tomoyo_memory_headers[TOMOYO_MAX_MEMORY_STAT] = { [TOMOYO_MEMORY_POLICY] = "policy:", [TOMOYO_MEMORY_AUDIT] = "audit log:", [TOMOYO_MEMORY_QUERY] = "query message:", }; /* Counter for number of updates. */ static atomic_t tomoyo_stat_updated[TOMOYO_MAX_POLICY_STAT]; /* Timestamp counter for last updated. */ static time64_t tomoyo_stat_modified[TOMOYO_MAX_POLICY_STAT]; /** * tomoyo_update_stat - Update statistic counters. * * @index: Index for policy type. * * Returns nothing. */ void tomoyo_update_stat(const u8 index) { atomic_inc(&tomoyo_stat_updated[index]); tomoyo_stat_modified[index] = ktime_get_real_seconds(); } /** * tomoyo_read_stat - Read statistic data. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns nothing. 
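 *
 * Example output lines (illustrative):
 *	Policy update: 42 (Last: 2025/01/01 00:00:00)
 *	Memory used by policy: 123456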
*/ static void tomoyo_read_stat(struct tomoyo_io_buffer *head) { u8 i; unsigned int total = 0; if (head->r.eof) return; for (i = 0; i < TOMOYO_MAX_POLICY_STAT; i++) { tomoyo_io_printf(head, "Policy %-30s %10u", tomoyo_policy_headers[i], atomic_read(&tomoyo_stat_updated[i])); if (tomoyo_stat_modified[i]) { struct tomoyo_time stamp; tomoyo_convert_time(tomoyo_stat_modified[i], &stamp); tomoyo_io_printf(head, " (Last: %04u/%02u/%02u %02u:%02u:%02u)", stamp.year, stamp.month, stamp.day, stamp.hour, stamp.min, stamp.sec); } tomoyo_set_lf(head); } for (i = 0; i < TOMOYO_MAX_MEMORY_STAT; i++) { unsigned int used = tomoyo_memory_used[i]; total += used; tomoyo_io_printf(head, "Memory used by %-22s %10u", tomoyo_memory_headers[i], used); used = tomoyo_memory_quota[i]; if (used) tomoyo_io_printf(head, " (Quota: %10u)", used); tomoyo_set_lf(head); } tomoyo_io_printf(head, "Total memory used: %10u\n", total); head->r.eof = true; } /** * tomoyo_write_stat - Set memory quota. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns 0. */ static int tomoyo_write_stat(struct tomoyo_io_buffer *head) { char *data = head->write_buf; u8 i; if (tomoyo_str_starts(&data, "Memory used by ")) for (i = 0; i < TOMOYO_MAX_MEMORY_STAT; i++) if (tomoyo_str_starts(&data, tomoyo_memory_headers[i])) sscanf(data, "%u", &tomoyo_memory_quota[i]); return 0; } /** * tomoyo_open_control - open() for /sys/kernel/security/tomoyo/ interface. * * @type: Type of interface. * @file: Pointer to "struct file". * * Returns 0 on success, negative value otherwise. */ int tomoyo_open_control(const u8 type, struct file *file) { struct tomoyo_io_buffer *head = kzalloc(sizeof(*head), GFP_NOFS); if (!head) return -ENOMEM; mutex_init(&head->io_sem); head->type = type; switch (type) { case TOMOYO_DOMAINPOLICY: /* /sys/kernel/security/tomoyo/domain_policy */ head->write = tomoyo_write_domain; head->read = tomoyo_read_domain; break; case TOMOYO_EXCEPTIONPOLICY: /* /sys/kernel/security/tomoyo/exception_policy */ head->write = tomoyo_write_exception; head->read = tomoyo_read_exception; break; case TOMOYO_AUDIT: /* /sys/kernel/security/tomoyo/audit */ head->poll = tomoyo_poll_log; head->read = tomoyo_read_log; break; case TOMOYO_PROCESS_STATUS: /* /sys/kernel/security/tomoyo/.process_status */ head->write = tomoyo_write_pid; head->read = tomoyo_read_pid; break; case TOMOYO_VERSION: /* /sys/kernel/security/tomoyo/version */ head->read = tomoyo_read_version; head->readbuf_size = 128; break; case TOMOYO_STAT: /* /sys/kernel/security/tomoyo/stat */ head->write = tomoyo_write_stat; head->read = tomoyo_read_stat; head->readbuf_size = 1024; break; case TOMOYO_PROFILE: /* /sys/kernel/security/tomoyo/profile */ head->write = tomoyo_write_profile; head->read = tomoyo_read_profile; break; case TOMOYO_QUERY: /* /sys/kernel/security/tomoyo/query */ head->poll = tomoyo_poll_query; head->write = tomoyo_write_answer; head->read = tomoyo_read_query; break; case TOMOYO_MANAGER: /* /sys/kernel/security/tomoyo/manager */ head->write = tomoyo_write_manager; head->read = tomoyo_read_manager; break; } if (!(file->f_mode & FMODE_READ)) { /* * No need to allocate read_buf since it is not opened * for reading. */ head->read = NULL; head->poll = NULL; } else if (!head->poll) { /* Don't allocate read_buf for poll() access. 
 */
		if (!head->readbuf_size)
			head->readbuf_size = 4096 * 2;
		head->read_buf = kzalloc(head->readbuf_size, GFP_NOFS);
		if (!head->read_buf) {
			kfree(head);
			return -ENOMEM;
		}
	}
	if (!(file->f_mode & FMODE_WRITE)) {
		/*
		 * No need to allocate write_buf since it is not opened
		 * for writing.
		 */
		head->write = NULL;
	} else if (head->write) {
		head->writebuf_size = 4096 * 2;
		head->write_buf = kzalloc(head->writebuf_size, GFP_NOFS);
		if (!head->write_buf) {
			kfree(head->read_buf);
			kfree(head);
			return -ENOMEM;
		}
	}
	/*
	 * If the file is /sys/kernel/security/tomoyo/query, increment the
	 * observer counter.
	 * The observer counter is used by tomoyo_supervisor() to see if
	 * there is some process monitoring /sys/kernel/security/tomoyo/query.
	 */
	if (type == TOMOYO_QUERY)
		atomic_inc(&tomoyo_query_observers);
	file->private_data = head;
	tomoyo_notify_gc(head, true);
	return 0;
}

/**
 * tomoyo_poll_control - poll() for /sys/kernel/security/tomoyo/ interface.
 *
 * @file: Pointer to "struct file".
 * @wait: Pointer to "poll_table". Maybe NULL.
 *
 * Returns EPOLLIN | EPOLLRDNORM | EPOLLOUT | EPOLLWRNORM if ready to read/write,
 * EPOLLOUT | EPOLLWRNORM otherwise.
 */
__poll_t tomoyo_poll_control(struct file *file, poll_table *wait)
{
	struct tomoyo_io_buffer *head = file->private_data;

	if (head->poll)
		return head->poll(file, wait) | EPOLLOUT | EPOLLWRNORM;
	return EPOLLIN | EPOLLRDNORM | EPOLLOUT | EPOLLWRNORM;
}

/**
 * tomoyo_set_namespace_cursor - Set namespace to read.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns nothing.
 */
static inline void tomoyo_set_namespace_cursor(struct tomoyo_io_buffer *head)
{
	struct list_head *ns;

	if (head->type != TOMOYO_EXCEPTIONPOLICY &&
	    head->type != TOMOYO_PROFILE)
		return;
	/*
	 * If this is the first read, or reading previous namespace finished
	 * and has more namespaces to read, update the namespace cursor.
	 */
	ns = head->r.ns;
	if (!ns || (head->r.eof && ns->next != &tomoyo_namespace_list)) {
		/* Clearing is OK because tomoyo_flush() returned true. */
		memset(&head->r, 0, sizeof(head->r));
		head->r.ns = ns ? ns->next : tomoyo_namespace_list.next;
	}
}

/**
 * tomoyo_has_more_namespace - Check for unread namespaces.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns true if we have more entries to print, false otherwise.
 */
static inline bool tomoyo_has_more_namespace(struct tomoyo_io_buffer *head)
{
	return (head->type == TOMOYO_EXCEPTIONPOLICY ||
		head->type == TOMOYO_PROFILE) && head->r.eof &&
		head->r.ns->next != &tomoyo_namespace_list;
}

/**
 * tomoyo_read_control - read() for /sys/kernel/security/tomoyo/ interface.
 *
 * @head:       Pointer to "struct tomoyo_io_buffer".
 * @buffer:     Pointer to buffer to write to.
 * @buffer_len: Size of @buffer.
 *
 * Returns bytes read on success, negative value otherwise.
 */
ssize_t tomoyo_read_control(struct tomoyo_io_buffer *head, char __user *buffer,
			    const int buffer_len)
{
	int len;
	int idx;

	if (!head->read)
		return -EINVAL;
	if (mutex_lock_interruptible(&head->io_sem))
		return -EINTR;
	head->read_user_buf = buffer;
	head->read_user_buf_avail = buffer_len;
	idx = tomoyo_read_lock();
	if (tomoyo_flush(head))
		/* Call the policy handler. */
		do {
			tomoyo_set_namespace_cursor(head);
			head->read(head);
		} while (tomoyo_flush(head) && tomoyo_has_more_namespace(head));
	tomoyo_read_unlock(idx);
	len = head->read_user_buf - buffer;
	mutex_unlock(&head->io_sem);
	return len;
}

/**
 * tomoyo_parse_policy - Parse a policy line.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @line: Line to parse.
 *
 * Returns 0 on success, negative value otherwise.
* * Caller holds tomoyo_read_lock(). */ static int tomoyo_parse_policy(struct tomoyo_io_buffer *head, char *line) { /* Delete request? */ head->w.is_delete = !strncmp(line, "delete ", 7); if (head->w.is_delete) memmove(line, line + 7, strlen(line + 7) + 1); /* Selecting namespace to update. */ if (head->type == TOMOYO_EXCEPTIONPOLICY || head->type == TOMOYO_PROFILE) { if (*line == '<') { char *cp = strchr(line, ' '); if (cp) { *cp++ = '\0'; head->w.ns = tomoyo_assign_namespace(line); memmove(line, cp, strlen(cp) + 1); } else head->w.ns = NULL; } else head->w.ns = &tomoyo_kernel_namespace; /* Don't allow updating if namespace is invalid. */ if (!head->w.ns) return -ENOENT; } /* Do the update. */ return head->write(head); } /** * tomoyo_write_control - write() for /sys/kernel/security/tomoyo/ interface. * * @head: Pointer to "struct tomoyo_io_buffer". * @buffer: Pointer to buffer to read from. * @buffer_len: Size of @buffer. * * Returns @buffer_len on success, negative value otherwise. */ ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head, const char __user *buffer, const int buffer_len) { int error = buffer_len; size_t avail_len = buffer_len; char *cp0; int idx; if (!head->write) return -EINVAL; if (mutex_lock_interruptible(&head->io_sem)) return -EINTR; cp0 = head->write_buf; head->read_user_buf_avail = 0; idx = tomoyo_read_lock(); /* Read a line and dispatch it to the policy handler. */ while (avail_len > 0) { char c; if (head->w.avail >= head->writebuf_size - 1) { const int len = head->writebuf_size * 2; char *cp = kzalloc(len, GFP_NOFS | __GFP_NOWARN); if (!cp) { error = -ENOMEM; break; } memmove(cp, cp0, head->w.avail); kfree(cp0); head->write_buf = cp; cp0 = cp; head->writebuf_size = len; } if (get_user(c, buffer)) { error = -EFAULT; break; } buffer++; avail_len--; cp0[head->w.avail++] = c; if (c != '\n') continue; cp0[head->w.avail - 1] = '\0'; head->w.avail = 0; tomoyo_normalize_line(cp0); if (!strcmp(cp0, "reset")) { head->w.ns = &tomoyo_kernel_namespace; head->w.domain = NULL; memset(&head->r, 0, sizeof(head->r)); continue; } /* Don't allow updating policies by non manager programs. */ switch (head->type) { case TOMOYO_PROCESS_STATUS: /* This does not write anything. */ break; case TOMOYO_DOMAINPOLICY: if (tomoyo_select_domain(head, cp0)) continue; fallthrough; case TOMOYO_EXCEPTIONPOLICY: if (!strcmp(cp0, "select transition_only")) { head->r.print_transition_related_only = true; continue; } fallthrough; default: if (!tomoyo_manager()) { error = -EPERM; goto out; } } switch (tomoyo_parse_policy(head, cp0)) { case -EPERM: error = -EPERM; goto out; case 0: switch (head->type) { case TOMOYO_DOMAINPOLICY: case TOMOYO_EXCEPTIONPOLICY: case TOMOYO_STAT: case TOMOYO_PROFILE: case TOMOYO_MANAGER: tomoyo_update_stat(TOMOYO_STAT_POLICY_UPDATES); break; default: break; } break; } } out: tomoyo_read_unlock(idx); mutex_unlock(&head->io_sem); return error; } /** * tomoyo_close_control - close() for /sys/kernel/security/tomoyo/ interface. * * @head: Pointer to "struct tomoyo_io_buffer". */ void tomoyo_close_control(struct tomoyo_io_buffer *head) { /* * If the file is /sys/kernel/security/tomoyo/query , decrement the * observer counter. */ if (head->type == TOMOYO_QUERY && atomic_dec_and_test(&tomoyo_query_observers)) wake_up_all(&tomoyo_answer_wait); tomoyo_notify_gc(head, false); } /** * tomoyo_check_profile - Check all profiles currently assigned to domains are defined. 
*/ void tomoyo_check_profile(void) { struct tomoyo_domain_info *domain; const int idx = tomoyo_read_lock(); tomoyo_policy_loaded = true; pr_info("TOMOYO: 2.6.0\n"); list_for_each_entry_rcu(domain, &tomoyo_domain_list, list, srcu_read_lock_held(&tomoyo_ss)) { const u8 profile = domain->profile; struct tomoyo_policy_namespace *ns = domain->ns; if (ns->profile_version == 20110903) { pr_info_once("Converting profile version from %u to %u.\n", 20110903, 20150505); ns->profile_version = 20150505; } if (ns->profile_version != 20150505) pr_err("Profile version %u is not supported.\n", ns->profile_version); else if (!ns->profile_ptr[profile]) pr_err("Profile %u (used by '%s') is not defined.\n", profile, domain->domainname->name); else continue; pr_err("Userland tools for TOMOYO 2.6 must be installed and policy must be initialized.\n"); pr_err("Please see https://tomoyo.sourceforge.net/2.6/ for more information.\n"); panic("STOP!"); } tomoyo_read_unlock(idx); pr_info("Mandatory Access Control activated.\n"); } /** * tomoyo_load_builtin_policy - Load built-in policy. * * Returns nothing. */ void __init tomoyo_load_builtin_policy(void) { #ifdef CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING static char tomoyo_builtin_profile[] __initdata = "PROFILE_VERSION=20150505\n" "0-CONFIG={ mode=learning grant_log=no reject_log=yes }\n"; static char tomoyo_builtin_exception_policy[] __initdata = "aggregator proc:/self/exe /proc/self/exe\n"; static char tomoyo_builtin_domain_policy[] __initdata = ""; static char tomoyo_builtin_manager[] __initdata = ""; static char tomoyo_builtin_stat[] __initdata = ""; #else /* * This include file is manually created and contains built-in policy * named "tomoyo_builtin_profile", "tomoyo_builtin_exception_policy", * "tomoyo_builtin_domain_policy", "tomoyo_builtin_manager", * "tomoyo_builtin_stat" in the form of "static char [] __initdata". */ #include "builtin-policy.h" #endif u8 i; const int idx = tomoyo_read_lock(); for (i = 0; i < 5; i++) { struct tomoyo_io_buffer head = { }; char *start = ""; switch (i) { case 0: start = tomoyo_builtin_profile; head.type = TOMOYO_PROFILE; head.write = tomoyo_write_profile; break; case 1: start = tomoyo_builtin_exception_policy; head.type = TOMOYO_EXCEPTIONPOLICY; head.write = tomoyo_write_exception; break; case 2: start = tomoyo_builtin_domain_policy; head.type = TOMOYO_DOMAINPOLICY; head.write = tomoyo_write_domain; break; case 3: start = tomoyo_builtin_manager; head.type = TOMOYO_MANAGER; head.write = tomoyo_write_manager; break; case 4: start = tomoyo_builtin_stat; head.type = TOMOYO_STAT; head.write = tomoyo_write_stat; break; } while (1) { char *end = strchr(start, '\n'); if (!end) break; *end = '\0'; tomoyo_normalize_line(start); head.write_buf = start; tomoyo_parse_policy(&head, start); start = end + 1; } } tomoyo_read_unlock(idx); #ifdef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER tomoyo_check_profile(); #endif }
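The query interface above amounts to a small userspace protocol: each read() of /sys/kernel/security/tomoyo/query returns one pending request, formatted by tomoyo_read_query() as "Q<serial>-<retry>\n<request>", and tomoyo_write_answer() accepts replies of the form "A<serial>=<answer>", where (per the switch in tomoyo_supervisor()) answer 1 grants the request, 3 asks the caller to retry, and anything else rejects it; the supervisor must reply within roughly ten seconds or the request times out. The following is a minimal supervisor sketch, not a production tool: it assumes the process is registered through the manager interface (tomoyo_write_control() rejects answers from non-managers) and it blanket-grants every request purely for illustration.

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/security/tomoyo/query", O_RDWR);
	static char buf[8192];

	if (fd < 0) {
		perror("open");
		return 1;
	}
	for (;;) {
		struct pollfd pfd = { .fd = fd, .events = POLLIN };
		unsigned int serial;
		unsigned short retry;
		char answer[32];
		char *body;

		/* tomoyo_poll_query() reports EPOLLIN while the list is non-empty */
		if (poll(&pfd, 1, -1) < 0)
			break;
		memset(buf, 0, sizeof(buf));
		if (read(fd, buf, sizeof(buf) - 1) <= 0)
			continue;
		/* One entry per read: "Q<serial>-<retry>\n<request>" */
		if (sscanf(buf, "Q%u-%hu", &serial, &retry) != 2)
			continue;
		body = strchr(buf, '\n');
		printf("request %u (retry %hu):\n%s\n",
		       serial, retry, body ? body + 1 : "");
		/* 1 = grant; 3 = retry; any other value rejects */
		snprintf(answer, sizeof(answer), "A%u=%u\n", serial, 1);
		if (write(fd, answer, strlen(answer)) < 0)
			perror("write");
	}
	close(fd);
	return 0;
}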
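The stat interface works the same way in the other direction: tomoyo_read_stat() prints lines such as "Memory used by query message: <bytes>", and tomoyo_write_stat() accepts the same "Memory used by " prefix plus a tomoyo_memory_headers[] string and a byte count to set the matching tomoyo_memory_quota[] slot. A one-shot sketch, again assuming manager privileges; the 1048576 value is an arbitrary example:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Caps memory for queued queries at 1 MiB; the text before the
	 * number must match a tomoyo_memory_headers[] entry exactly. */
	static const char line[] = "Memory used by query message: 1048576\n";
	int fd = open("/sys/kernel/security/tomoyo/stat", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, line, strlen(line)) < 0)
		perror("write");
	close(fd);
	return 0;
}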
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/sched/signal.h>
#include <linux/atomic.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp_states.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>
#include <net/hotdata.h>
#include <net/xfrm.h>
#include <asm/ioctls.h>
#include "protocol.h"
#include "mib.h"

#define CREATE_TRACE_POINTS
#include <trace/events/mptcp.h>

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
struct mptcp6_sock {
	struct mptcp_sock msk;
	struct ipv6_pinfo np;
};
#endif

enum {
	MPTCP_CMSG_TS = BIT(0),
	MPTCP_CMSG_INQ = BIT(1),
};

static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp;

static void __mptcp_destroy_sock(struct sock *sk);
static void mptcp_check_send_data_fin(struct sock *sk);

DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};
static struct net_device *mptcp_napi_dev;

/* Returns end sequence number of the receiver's advertised window */
static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
{
	return READ_ONCE(msk->wnd_end);
}

static const struct proto_ops *mptcp_fallback_tcp_ops(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (sk->sk_prot == &tcpv6_prot)
		return &inet6_stream_ops;
#endif
	WARN_ON_ONCE(sk->sk_prot != &tcp_prot);
	return &inet_stream_ops;
}

static int __mptcp_socket_create(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	struct socket *ssock;
	int err;

	err = mptcp_subflow_create_socket(sk, sk->sk_family, &ssock);
	if (err)
		return err;

	msk->scaling_ratio = tcp_sk(ssock->sk)->scaling_ratio;
	WRITE_ONCE(msk->first, ssock->sk);
	subflow = mptcp_subflow_ctx(ssock->sk);
	list_add(&subflow->node, &msk->conn_list);
	sock_hold(ssock->sk);
	subflow->request_mptcp = 1;
	subflow->subflow_id = msk->subflow_id++;

	/* This is the first subflow, always with id 0 */
	WRITE_ONCE(subflow->local_id, 0);
	mptcp_sock_graft(msk->first, sk->sk_socket);
	iput(SOCK_INODE(ssock));

	return 0;
}
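/* [Editorial example, not part of protocol.c] Userspace counterpart of the
 * path above: creating a socket with IPPROTO_MPTCP is what eventually makes
 * the kernel allocate the msk and, lazily, its first subflow via
 * __mptcp_socket_create(). A minimal sketch with error handling trimmed;
 * IPPROTO_MPTCP is 262 on Linux but may be absent from older libc headers,
 * and the port/address below are arbitrary.
 */
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262
#endif

int main(void)
{
	struct sockaddr_in addr;
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);

	if (fd < 0) {
		perror("socket");	/* e.g. kernel built without CONFIG_MPTCP */
		return 1;
	}
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(8080);
	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	/* connect() starts the MPC handshake on that first subflow */
	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		perror("connect");
	close(fd);
	return 0;
}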
/* If the MPC handshake is not started, returns the first subflow,
 * allocating it if needed.
 */
struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk)
{
	struct sock *sk = (struct sock *)msk;
	int ret;

	if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
		return ERR_PTR(-EINVAL);

	if (!msk->first) {
		ret = __mptcp_socket_create(msk);
		if (ret)
			return ERR_PTR(ret);
	}

	return msk->first;
}

static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
{
	sk_drops_add(sk, skb);
	__kfree_skb(skb);
}

static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
			       struct sk_buff *from)
{
	bool fragstolen;
	int delta;

	if (unlikely(MPTCP_SKB_CB(to)->cant_coalesce) ||
	    MPTCP_SKB_CB(from)->offset ||
	    ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) ||
	    !skb_try_coalesce(to, from, &fragstolen, &delta))
		return false;

	pr_debug("coalesced seq %llx into %llx new len %d new end seq %llx\n",
		 MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
		 to->len, MPTCP_SKB_CB(from)->end_seq);
	MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;

	/* note the fwd memory can reach a negative value after accounting
	 * for the delta, but the later skb free will restore a non
	 * negative one
	 */
	atomic_add(delta, &sk->sk_rmem_alloc);
	sk_mem_charge(sk, delta);
	kfree_skb_partial(from, fragstolen);

	return true;
}

static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
				   struct sk_buff *from)
{
	if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq)
		return false;

	return mptcp_try_coalesce((struct sock *)msk, to, from);
}

/* "inspired" by tcp_data_queue_ofo(), main differences:
 * - use mptcp seqs
 * - don't cope with sacks
 */
static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
{
	struct sock *sk = (struct sock *)msk;
	struct rb_node **p, *parent;
	u64 seq, end_seq, max_seq;
	struct sk_buff *skb1;

	seq = MPTCP_SKB_CB(skb)->map_seq;
	end_seq = MPTCP_SKB_CB(skb)->end_seq;
	max_seq = atomic64_read(&msk->rcv_wnd_sent);

	pr_debug("msk=%p seq=%llx limit=%llx empty=%d\n", msk, seq, max_seq,
		 RB_EMPTY_ROOT(&msk->out_of_order_queue));
	if (after64(end_seq, max_seq)) {
		/* out of window */
		mptcp_drop(sk, skb);
		pr_debug("oow by %lld, rcv_wnd_sent %llu\n",
			 (unsigned long long)end_seq - (unsigned long long)max_seq,
			 (unsigned long long)atomic64_read(&msk->rcv_wnd_sent));
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
		return;
	}

	p = &msk->out_of_order_queue.rb_node;
	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE);
	if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) {
		rb_link_node(&skb->rbnode, NULL, p);
		rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
		msk->ooo_last_skb = skb;
		goto end;
	}

	/* with 2 subflows, adding at end of ooo queue is quite likely
	 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
	 */
	if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) {
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
		return;
	}

	/* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
	if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) {
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
		parent = &msk->ooo_last_skb->rbnode;
		p = &parent->rb_right;
		goto insert;
	}

	/* Find place to insert this segment. Handle overlaps on the way. */
	parent = NULL;
	while (*p) {
		parent = *p;
		skb1 = rb_to_skb(parent);
		if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
			p = &parent->rb_left;
			continue;
		}
		if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) {
			if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) {
				/* All the bits are present. Drop.
*/ mptcp_drop(sk, skb); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); return; } if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) { /* partial overlap: * | skb | * | skb1 | * continue traversing */ } else { /* skb's seq == skb1's seq and skb covers skb1. * Replace skb1 with skb. */ rb_replace_node(&skb1->rbnode, &skb->rbnode, &msk->out_of_order_queue); mptcp_drop(sk, skb1); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); goto merge_right; } } else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE); return; } p = &parent->rb_right; } insert: /* Insert segment into RB tree. */ rb_link_node(&skb->rbnode, parent, p); rb_insert_color(&skb->rbnode, &msk->out_of_order_queue); merge_right: /* Remove other segments covered by skb. */ while ((skb1 = skb_rb_next(skb)) != NULL) { if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) break; rb_erase(&skb1->rbnode, &msk->out_of_order_queue); mptcp_drop(sk, skb1); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); } /* If there is no skb after us, we are the last_skb ! */ if (!skb1) msk->ooo_last_skb = skb; end: skb_condense(skb); skb_set_owner_r(skb, sk); } static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, struct sk_buff *skb, unsigned int offset, size_t copy_len) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; struct sk_buff *tail; bool has_rxtstamp; __skb_unlink(skb, &ssk->sk_receive_queue); skb_ext_reset(skb); skb_orphan(skb); /* try to fetch required memory from subflow */ if (!sk_rmem_schedule(sk, skb, skb->truesize)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); goto drop; } has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; /* the skb map_seq accounts for the skb offset: * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq * value */ MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow); MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len; MPTCP_SKB_CB(skb)->offset = offset; MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; MPTCP_SKB_CB(skb)->cant_coalesce = 0; if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { /* in sequence */ msk->bytes_received += copy_len; WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len); tail = skb_peek_tail(&sk->sk_receive_queue); if (tail && mptcp_try_coalesce(sk, tail, skb)) return true; skb_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); return true; } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) { mptcp_data_queue_ofo(msk, skb); return false; } /* old data, keep it simple and drop the whole pkt, sender * will retransmit as needed, if needed. 
*/ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); drop: mptcp_drop(sk, skb); return false; } static void mptcp_stop_rtx_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); sk_stop_timer(sk, &icsk->icsk_retransmit_timer); mptcp_sk(sk)->timer_ival = 0; } static void mptcp_close_wake_up(struct sock *sk) { if (sock_flag(sk, SOCK_DEAD)) return; sk->sk_state_change(sk); if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); else sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); } /* called under the msk socket lock */ static bool mptcp_pending_data_fin_ack(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); return ((1 << sk->sk_state) & (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && msk->write_seq == READ_ONCE(msk->snd_una); } static void mptcp_check_data_fin_ack(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); /* Look for an acknowledged DATA_FIN */ if (mptcp_pending_data_fin_ack(sk)) { WRITE_ONCE(msk->snd_data_fin_enable, 0); switch (sk->sk_state) { case TCP_FIN_WAIT1: mptcp_set_state(sk, TCP_FIN_WAIT2); break; case TCP_CLOSING: case TCP_LAST_ACK: mptcp_set_state(sk, TCP_CLOSE); break; } mptcp_close_wake_up(sk); } } /* can be called with no lock acquired */ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) { struct mptcp_sock *msk = mptcp_sk(sk); if (READ_ONCE(msk->rcv_data_fin) && ((1 << inet_sk_state_load(sk)) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) { u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq); if (READ_ONCE(msk->ack_seq) == rcv_data_fin_seq) { if (seq) *seq = rcv_data_fin_seq; return true; } } return false; } static void mptcp_set_datafin_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); u32 retransmits; retransmits = min_t(u32, icsk->icsk_retransmits, ilog2(TCP_RTO_MAX / TCP_RTO_MIN)); mptcp_sk(sk)->timer_ival = TCP_RTO_MIN << retransmits; } static void __mptcp_set_timeout(struct sock *sk, long tout) { mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; } static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subflow) { const struct sock *ssk = mptcp_subflow_tcp_sock(subflow); return inet_csk(ssk)->icsk_pending && !subflow->stale_count ? 
icsk_timeout(inet_csk(ssk)) - jiffies : 0; } static void mptcp_set_timeout(struct sock *sk) { struct mptcp_subflow_context *subflow; long tout = 0; mptcp_for_each_subflow(mptcp_sk(sk), subflow) tout = max(tout, mptcp_timeout_from_subflow(subflow)); __mptcp_set_timeout(sk, tout); } static inline bool tcp_can_send_ack(const struct sock *ssk) { return !((1 << inet_sk_state_load(ssk)) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN)); } void __mptcp_subflow_send_ack(struct sock *ssk) { if (tcp_can_send_ack(ssk)) tcp_send_ack(ssk); } static void mptcp_subflow_send_ack(struct sock *ssk) { bool slow; slow = lock_sock_fast(ssk); __mptcp_subflow_send_ack(ssk); unlock_sock_fast(ssk, slow); } static void mptcp_send_ack(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; mptcp_for_each_subflow(msk, subflow) mptcp_subflow_send_ack(mptcp_subflow_tcp_sock(subflow)); } static void mptcp_subflow_cleanup_rbuf(struct sock *ssk, int copied) { bool slow; slow = lock_sock_fast(ssk); if (tcp_can_send_ack(ssk)) tcp_cleanup_rbuf(ssk, copied); unlock_sock_fast(ssk, slow); } static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty) { const struct inet_connection_sock *icsk = inet_csk(ssk); u8 ack_pending = READ_ONCE(icsk->icsk_ack.pending); const struct tcp_sock *tp = tcp_sk(ssk); return (ack_pending & ICSK_ACK_SCHED) && ((READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->rcv_wup) > READ_ONCE(icsk->icsk_ack.rcv_mss)) || (rx_empty && ack_pending & (ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED))); } static void mptcp_cleanup_rbuf(struct mptcp_sock *msk, int copied) { int old_space = READ_ONCE(msk->old_wspace); struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; int space = __mptcp_space(sk); bool cleanup, rx_empty; cleanup = (space > 0) && (space >= (old_space << 1)) && copied; rx_empty = !sk_rmem_alloc_get(sk) && copied; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (cleanup || mptcp_subflow_could_cleanup(ssk, rx_empty)) mptcp_subflow_cleanup_rbuf(ssk, copied); } } static bool mptcp_check_data_fin(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); u64 rcv_data_fin_seq; bool ret = false; /* Need to ack a DATA_FIN received from a peer while this side * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. * msk->rcv_data_fin was set when parsing the incoming options * at the subflow level and the msk lock was not held, so this * is the first opportunity to act on the DATA_FIN and change * the msk state. * * If we are caught up to the sequence number of the incoming * DATA_FIN, send the DATA_ACK now and do state transition. If * not caught up, do nothing and let the recv code send DATA_ACK * when catching up. 
*/ if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) { WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1); WRITE_ONCE(msk->rcv_data_fin, 0); WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN); smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ switch (sk->sk_state) { case TCP_ESTABLISHED: mptcp_set_state(sk, TCP_CLOSE_WAIT); break; case TCP_FIN_WAIT1: mptcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: mptcp_set_state(sk, TCP_CLOSE); break; default: /* Other states not expected */ WARN_ON_ONCE(1); break; } ret = true; if (!__mptcp_check_fallback(msk)) mptcp_send_ack(msk); mptcp_close_wake_up(sk); } return ret; } static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk) { if (READ_ONCE(msk->allow_infinite_fallback)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSCORRUPTIONFALLBACK); mptcp_do_fallback(ssk); } else { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSCORRUPTIONRESET); mptcp_subflow_reset(ssk); } } static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; bool more_data_avail; struct tcp_sock *tp; bool ret = false; pr_debug("msk=%p ssk=%p\n", msk, ssk); tp = tcp_sk(ssk); do { u32 map_remaining, offset; u32 seq = tp->copied_seq; struct sk_buff *skb; bool fin; if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) break; /* try to move as much data as available */ map_remaining = subflow->map_data_len - mptcp_subflow_get_map_offset(subflow); skb = skb_peek(&ssk->sk_receive_queue); if (unlikely(!skb)) break; if (__mptcp_check_fallback(msk)) { /* Under fallback skbs have no MPTCP extension and TCP could * collapse them between the dummy map creation and the * current dequeue. Be sure to adjust the map size. 
*/ map_remaining = skb->len; subflow->map_data_len = skb->len; } offset = seq - TCP_SKB_CB(skb)->seq; fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (fin) seq++; if (offset < skb->len) { size_t len = skb->len - offset; ret = __mptcp_move_skb(msk, ssk, skb, offset, len) || ret; seq += len; if (unlikely(map_remaining < len)) { DEBUG_NET_WARN_ON_ONCE(1); mptcp_dss_corruption(msk, ssk); } } else { if (unlikely(!fin)) { DEBUG_NET_WARN_ON_ONCE(1); mptcp_dss_corruption(msk, ssk); } sk_eat_skb(ssk, skb); } WRITE_ONCE(tp->copied_seq, seq); more_data_avail = mptcp_subflow_data_available(ssk); } while (more_data_avail); if (ret) msk->last_data_recv = tcp_jiffies32; return ret; } static bool __mptcp_ofo_queue(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; struct sk_buff *skb, *tail; bool moved = false; struct rb_node *p; u64 end_seq; p = rb_first(&msk->out_of_order_queue); pr_debug("msk=%p empty=%d\n", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue)); while (p) { skb = rb_to_skb(p); if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) break; p = rb_next(p); rb_erase(&skb->rbnode, &msk->out_of_order_queue); if (unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq, msk->ack_seq))) { mptcp_drop(sk, skb); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); continue; } end_seq = MPTCP_SKB_CB(skb)->end_seq; tail = skb_peek_tail(&sk->sk_receive_queue); if (!tail || !mptcp_ooo_try_coalesce(msk, tail, skb)) { int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq; /* skip overlapping data, if any */ pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d\n", MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq, delta); MPTCP_SKB_CB(skb)->offset += delta; MPTCP_SKB_CB(skb)->map_seq += delta; __skb_queue_tail(&sk->sk_receive_queue, skb); } msk->bytes_received += end_seq - msk->ack_seq; WRITE_ONCE(msk->ack_seq, end_seq); moved = true; } return moved; } static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk) { int err = sock_error(ssk); int ssk_state; if (!err) return false; /* only propagate errors on fallen-back sockets or * on MPC connect */ if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk))) return false; /* We need to propagate only transition to CLOSE state. * Orphaned socket will see such state change via * subflow_sched_work_if_closed() and that path will properly * destroy the msk as needed. */ ssk_state = inet_sk_state_load(ssk); if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) mptcp_set_state(sk, ssk_state); WRITE_ONCE(sk->sk_err, -err); /* This barrier is coupled with smp_rmb() in mptcp_poll() */ smp_wmb(); sk_error_report(sk); return true; } void __mptcp_error_report(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); mptcp_for_each_subflow(msk, subflow) if (__mptcp_subflow_error_report(sk, mptcp_subflow_tcp_sock(subflow))) break; } /* In most cases we will be able to lock the mptcp socket. If its already * owned, we need to defer to the work queue to avoid ABBA deadlock. */ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; bool moved; moved = __mptcp_move_skbs_from_subflow(msk, ssk); __mptcp_ofo_queue(msk); if (unlikely(ssk->sk_err)) { if (!sock_owned_by_user(sk)) __mptcp_error_report(sk); else __set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags); } /* If the moves have caught up with the DATA_FIN sequence number * it's time to ack the DATA_FIN and change socket state, but * this is not a good place to change state. Let the workqueue * do it. 
*/ if (mptcp_pending_data_fin(sk, NULL)) mptcp_schedule_work(sk); return moved; } static void __mptcp_rcvbuf_update(struct sock *sk, struct sock *ssk) { if (unlikely(ssk->sk_rcvbuf > sk->sk_rcvbuf)) WRITE_ONCE(sk->sk_rcvbuf, ssk->sk_rcvbuf); } static void __mptcp_data_ready(struct sock *sk, struct sock *ssk) { struct mptcp_sock *msk = mptcp_sk(sk); __mptcp_rcvbuf_update(sk, ssk); /* Wake-up the reader only for in-sequence data */ if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) sk->sk_data_ready(sk); } void mptcp_data_ready(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); /* The peer can send data while we are shutting down this * subflow at msk destruction time, but we must avoid enqueuing * more data to the msk receive queue */ if (unlikely(subflow->disposable)) return; mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_data_ready(sk, ssk); else __set_bit(MPTCP_DEQUEUE, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } static void mptcp_subflow_joined(struct mptcp_sock *msk, struct sock *ssk) { mptcp_subflow_ctx(ssk)->map_seq = READ_ONCE(msk->ack_seq); WRITE_ONCE(msk->allow_infinite_fallback, false); mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); } static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; if (sk->sk_state != TCP_ESTABLISHED) return false; /* attach to msk socket only after we are sure we will deal with it * at close time */ if (sk->sk_socket && !ssk->sk_socket) mptcp_sock_graft(ssk, sk->sk_socket); mptcp_subflow_ctx(ssk)->subflow_id = msk->subflow_id++; mptcp_sockopt_sync_locked(msk, ssk); mptcp_subflow_joined(msk, ssk); mptcp_stop_tout_timer(sk); __mptcp_propagate_sndbuf(sk, ssk); return true; } static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list) { struct mptcp_subflow_context *tmp, *subflow; struct mptcp_sock *msk = mptcp_sk(sk); list_for_each_entry_safe(subflow, tmp, join_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast(ssk); list_move_tail(&subflow->node, &msk->conn_list); if (!__mptcp_finish_join(msk, ssk)) mptcp_subflow_reset(ssk); unlock_sock_fast(ssk, slow); } } static bool mptcp_rtx_timer_pending(struct sock *sk) { return timer_pending(&inet_csk(sk)->icsk_retransmit_timer); } static void mptcp_reset_rtx_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); unsigned long tout; /* prevent rescheduling on close */ if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE)) return; tout = mptcp_sk(sk)->timer_ival; sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout); } bool mptcp_schedule_work(struct sock *sk) { if (inet_sk_state_load(sk) != TCP_CLOSE && schedule_work(&mptcp_sk(sk)->work)) { /* each subflow already holds a reference to the sk, and the * workqueue is invoked by a subflow, so sk can't go away here. 
*/ sock_hold(sk); return true; } return false; } static bool mptcp_skb_can_collapse_to(u64 write_seq, const struct sk_buff *skb, const struct mptcp_ext *mpext) { if (!tcp_skb_can_collapse_to(skb)) return false; /* can collapse only if MPTCP level sequence is in order and this * mapping has not been xmitted yet */ return mpext && mpext->data_seq + mpext->data_len == write_seq && !mpext->frozen; } /* we can append data to the given data frag if: * - there is space available in the backing page_frag * - the data frag tail matches the current page_frag free offset * - the data frag end sequence number matches the current write seq */ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, const struct page_frag *pfrag, const struct mptcp_data_frag *df) { return df && pfrag->page == df->page && pfrag->size - pfrag->offset > 0 && pfrag->offset == (df->offset + df->data_len) && df->data_seq + df->data_len == msk->write_seq; } static void dfrag_uncharge(struct sock *sk, int len) { sk_mem_uncharge(sk, len); sk_wmem_queued_add(sk, -len); } static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) { int len = dfrag->data_len + dfrag->overhead; list_del(&dfrag->list); dfrag_uncharge(sk, len); put_page(dfrag->page); } /* called under both the msk socket lock and the data lock */ static void __mptcp_clean_una(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; u64 snd_una; snd_una = msk->snd_una; list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) break; if (unlikely(dfrag == msk->first_pending)) { /* in recovery mode can see ack after the current snd head */ if (WARN_ON_ONCE(!msk->recovery)) break; WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); } dfrag_clear(sk, dfrag); } dfrag = mptcp_rtx_head(sk); if (dfrag && after64(snd_una, dfrag->data_seq)) { u64 delta = snd_una - dfrag->data_seq; /* prevent wrap around in recovery mode */ if (unlikely(delta > dfrag->already_sent)) { if (WARN_ON_ONCE(!msk->recovery)) goto out; if (WARN_ON_ONCE(delta > dfrag->data_len)) goto out; dfrag->already_sent += delta - dfrag->already_sent; } dfrag->data_seq += delta; dfrag->offset += delta; dfrag->data_len -= delta; dfrag->already_sent -= delta; dfrag_uncharge(sk, delta); } /* all retransmitted data acked, recovery completed */ if (unlikely(msk->recovery) && after64(msk->snd_una, msk->recovery_snd_nxt)) msk->recovery = false; out: if (snd_una == msk->snd_nxt && snd_una == msk->write_seq) { if (mptcp_rtx_timer_pending(sk) && !mptcp_data_fin_enabled(msk)) mptcp_stop_rtx_timer(sk); } else { mptcp_reset_rtx_timer(sk); } if (mptcp_pending_data_fin_ack(sk)) mptcp_schedule_work(sk); } static void __mptcp_clean_una_wakeup(struct sock *sk) { lockdep_assert_held_once(&sk->sk_lock.slock); __mptcp_clean_una(sk); mptcp_write_space(sk); } static void mptcp_clean_una_wakeup(struct sock *sk) { mptcp_data_lock(sk); __mptcp_clean_una_wakeup(sk); mptcp_data_unlock(sk); } static void mptcp_enter_memory_pressure(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); bool first = true; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (first) tcp_enter_memory_pressure(ssk); sk_stream_moderate_sndbuf(ssk); first = false; } __mptcp_sync_sndbuf(sk); } /* ensure we get enough memory for the frag hdr, beyond some minimal amount of * data */ static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag) { if 
(likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag), pfrag, sk->sk_allocation))) return true; mptcp_enter_memory_pressure(sk); return false; } static struct mptcp_data_frag * mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag, int orig_offset) { int offset = ALIGN(orig_offset, sizeof(long)); struct mptcp_data_frag *dfrag; dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset); dfrag->data_len = 0; dfrag->data_seq = msk->write_seq; dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag); dfrag->offset = offset + sizeof(struct mptcp_data_frag); dfrag->already_sent = 0; dfrag->page = pfrag->page; return dfrag; } struct mptcp_sendmsg_info { int mss_now; int size_goal; u16 limit; u16 sent; unsigned int flags; bool data_lock_held; }; static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk, u64 data_seq, int avail_size) { u64 window_end = mptcp_wnd_end(msk); u64 mptcp_snd_wnd; if (__mptcp_check_fallback(msk)) return avail_size; mptcp_snd_wnd = window_end - data_seq; avail_size = min_t(unsigned int, mptcp_snd_wnd, avail_size); if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) { tcp_sk(ssk)->snd_wnd = min_t(u64, U32_MAX, mptcp_snd_wnd); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_SNDWNDSHARED); } return avail_size; } static bool __mptcp_add_ext(struct sk_buff *skb, gfp_t gfp) { struct skb_ext *mpext = __skb_ext_alloc(gfp); if (!mpext) return false; __skb_ext_set(skb, SKB_EXT_MPTCP, mpext); return true; } static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp) { struct sk_buff *skb; skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp); if (likely(skb)) { if (likely(__mptcp_add_ext(skb, gfp))) { skb_reserve(skb, MAX_TCP_HEADER); skb->ip_summed = CHECKSUM_PARTIAL; INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); } else { mptcp_enter_memory_pressure(sk); } return NULL; } static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) { struct sk_buff *skb; skb = __mptcp_do_alloc_tx_skb(sk, gfp); if (!skb) return NULL; if (likely(sk_wmem_schedule(ssk, skb->truesize))) { tcp_skb_entail(ssk, skb); return skb; } tcp_skb_tsorted_anchor_cleanup(skb); kfree_skb(skb); return NULL; } static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held) { gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation; return __mptcp_alloc_tx_skb(sk, ssk, gfp); } /* note: this always recomputes the csum on the whole skb, even * if we just appended a single frag. 
More status info needed */ static void mptcp_update_data_checksum(struct sk_buff *skb, int added) { struct mptcp_ext *mpext = mptcp_get_ext(skb); __wsum csum = ~csum_unfold(mpext->csum); int offset = skb->len - added; mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset)); } static void mptcp_update_infinite_map(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_ext *mpext) { if (!mpext) return; mpext->infinite_map = 1; mpext->data_len = 0; MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX); mptcp_subflow_ctx(ssk)->send_infinite_map = 0; pr_fallback(msk); mptcp_do_fallback(ssk); } #define MPTCP_MAX_GSO_SIZE (GSO_LEGACY_MAX_SIZE - (MAX_TCP_HEADER + 1)) static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_data_frag *dfrag, struct mptcp_sendmsg_info *info) { u64 data_seq = dfrag->data_seq + info->sent; int offset = dfrag->offset + info->sent; struct mptcp_sock *msk = mptcp_sk(sk); bool zero_window_probe = false; struct mptcp_ext *mpext = NULL; bool can_coalesce = false; bool reuse_skb = true; struct sk_buff *skb; size_t copy; int i; pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u\n", msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent); if (WARN_ON_ONCE(info->sent > info->limit || info->limit > dfrag->data_len)) return 0; if (unlikely(!__tcp_can_send(ssk))) return -EAGAIN; /* compute send limit */ if (unlikely(ssk->sk_gso_max_size > MPTCP_MAX_GSO_SIZE)) ssk->sk_gso_max_size = MPTCP_MAX_GSO_SIZE; info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); copy = info->size_goal; skb = tcp_write_queue_tail(ssk); if (skb && copy > skb->len) { /* Limit the write to the size available in the * current skb, if any, so that we create at most a new skb. * Explicitly tells TCP internals to avoid collapsing on later * queue management operation, to avoid breaking the ext <-> * SSN association set here */ mpext = mptcp_get_ext(skb); if (!mptcp_skb_can_collapse_to(data_seq, skb, mpext)) { TCP_SKB_CB(skb)->eor = 1; tcp_mark_push(tcp_sk(ssk), skb); goto alloc_skb; } i = skb_shinfo(skb)->nr_frags; can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset); if (!can_coalesce && i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) { tcp_mark_push(tcp_sk(ssk), skb); goto alloc_skb; } copy -= skb->len; } else { alloc_skb: skb = mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held); if (!skb) return -ENOMEM; i = skb_shinfo(skb)->nr_frags; reuse_skb = false; mpext = mptcp_get_ext(skb); } /* Zero window and all data acked? Probe. 
*/ copy = mptcp_check_allowed_size(msk, ssk, data_seq, copy); if (copy == 0) { u64 snd_una = READ_ONCE(msk->snd_una); if (snd_una != msk->snd_nxt || tcp_write_queue_tail(ssk)) { tcp_remove_empty_skb(ssk); return 0; } zero_window_probe = true; data_seq = snd_una - 1; copy = 1; } copy = min_t(size_t, copy, info->limit - info->sent); if (!sk_wmem_schedule(ssk, copy)) { tcp_remove_empty_skb(ssk); return -ENOMEM; } if (can_coalesce) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); } else { get_page(dfrag->page); skb_fill_page_desc(skb, i, dfrag->page, offset, copy); } skb->len += copy; skb->data_len += copy; skb->truesize += copy; sk_wmem_queued_add(ssk, copy); sk_mem_charge(ssk, copy); WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); /* on skb reuse we just need to update the DSS len */ if (reuse_skb) { TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; mpext->data_len += copy; goto out; } memset(mpext, 0, sizeof(*mpext)); mpext->data_seq = data_seq; mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; mpext->data_len = copy; mpext->use_map = 1; mpext->dsn64 = 1; pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d\n", mpext->data_seq, mpext->subflow_seq, mpext->data_len, mpext->dsn64); if (zero_window_probe) { mptcp_subflow_ctx(ssk)->rel_write_seq += copy; mpext->frozen = 1; if (READ_ONCE(msk->csum_enabled)) mptcp_update_data_checksum(skb, copy); tcp_push_pending_frames(ssk); return 0; } out: if (READ_ONCE(msk->csum_enabled)) mptcp_update_data_checksum(skb, copy); if (mptcp_subflow_ctx(ssk)->send_infinite_map) mptcp_update_infinite_map(msk, ssk, mpext); trace_mptcp_sendmsg_frag(mpext); mptcp_subflow_ctx(ssk)->rel_write_seq += copy; return copy; } #define MPTCP_SEND_BURST_SIZE ((1 << 16) - \ sizeof(struct tcphdr) - \ MAX_TCP_OPTION_SPACE - \ sizeof(struct ipv6hdr) - \ sizeof(struct frag_hdr)) struct subflow_send_info { struct sock *ssk; u64 linger_time; }; void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow) { if (!subflow->stale) return; subflow->stale = 0; MPTCP_INC_STATS(sock_net(mptcp_subflow_tcp_sock(subflow)), MPTCP_MIB_SUBFLOWRECOVER); } bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) { if (unlikely(subflow->stale)) { u32 rcv_tstamp = READ_ONCE(tcp_sk(mptcp_subflow_tcp_sock(subflow))->rcv_tstamp); if (subflow->stale_rcv_tstamp == rcv_tstamp) return false; mptcp_subflow_set_active(subflow); } return __mptcp_subflow_active(subflow); } #define SSK_MODE_ACTIVE 0 #define SSK_MODE_BACKUP 1 #define SSK_MODE_MAX 2 /* implement the mptcp packet scheduler; * returns the subflow that will transmit the next DSS * additionally updates the rtx timeout */ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) { struct subflow_send_info send_info[SSK_MODE_MAX]; struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; u32 pace, burst, wmem; int i, nr_active = 0; struct sock *ssk; u64 linger_time; long tout = 0; /* pick the subflow with the lower wmem/wspace ratio */ for (i = 0; i < SSK_MODE_MAX; ++i) { send_info[i].ssk = NULL; send_info[i].linger_time = -1; } mptcp_for_each_subflow(msk, subflow) { bool backup = subflow->backup || subflow->request_bkup; trace_mptcp_subflow_get_send(subflow); ssk = mptcp_subflow_tcp_sock(subflow); if (!mptcp_subflow_active(subflow)) continue; tout = max(tout, mptcp_timeout_from_subflow(subflow)); nr_active += !backup; pace = subflow->avg_pacing_rate; if (unlikely(!pace)) { /* init pacing rate from socket */ 
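/* Worked example (annotation, not part of the original source): linger_time estimates how long this subflow needs to drain its queued data, i.e. wmem / pacing rate in <<32 fixed point. With sk_wmem_queued = 64 KiB and a pacing rate of 1 MiB/s, (65536ULL << 32) / 1048576 encodes roughly 62.5 ms; the scheduler then prefers, per class (active vs backup), the subflow with the smallest such estimate. */ 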
subflow->avg_pacing_rate = READ_ONCE(ssk->sk_pacing_rate); pace = subflow->avg_pacing_rate; if (!pace) continue; } linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace); if (linger_time < send_info[backup].linger_time) { send_info[backup].ssk = ssk; send_info[backup].linger_time = linger_time; } } __mptcp_set_timeout(sk, tout); /* pick the best backup if no other subflow is active */ if (!nr_active) send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk; /* According to the BLEST algorithm, to avoid HoL blocking for the * faster flow, we need to: * - estimate the faster flow linger time * - use the above to estimate the amount of bytes transferred * by the faster flow * - check that the amount of queued data is greater than the above, * otherwise do not use the picked, slower, subflow * We select the subflow with the shorter estimated time to flush * the queued mem, which basically ensures the above. We just need * to check that the subflow has a non-empty cwin. */ ssk = send_info[SSK_MODE_ACTIVE].ssk; if (!ssk || !sk_stream_memory_free(ssk)) return NULL; burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt); wmem = READ_ONCE(ssk->sk_wmem_queued); if (!burst) return ssk; subflow = mptcp_subflow_ctx(ssk); subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem + READ_ONCE(ssk->sk_pacing_rate) * burst, burst + wmem); msk->snd_burst = burst; return ssk; } static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info) { tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal); release_sock(ssk); } static void mptcp_update_post_push(struct mptcp_sock *msk, struct mptcp_data_frag *dfrag, u32 sent) { u64 snd_nxt_new = dfrag->data_seq; dfrag->already_sent += sent; msk->snd_burst -= sent; snd_nxt_new += dfrag->already_sent; /* snd_nxt_new can be smaller than snd_nxt in case mptcp * is recovering after a failover. In that event, this re-sends * old segments. * * Thus compute snd_nxt_new candidate based on * the dfrag->data_seq that was sent and the data * that has been handed to the subflow for transmission * and skip the update in case it was an old dfrag. */ if (likely(after64(snd_nxt_new, msk->snd_nxt))) { msk->bytes_sent += snd_nxt_new - msk->snd_nxt; WRITE_ONCE(msk->snd_nxt, snd_nxt_new); } } void mptcp_check_and_set_pending(struct sock *sk) { if (mptcp_send_head(sk)) { mptcp_data_lock(sk); mptcp_sk(sk)->cb_flags |= BIT(MPTCP_PUSH_PENDING); mptcp_data_unlock(sk); } } static int __subflow_push_pending(struct sock *sk, struct sock *ssk, struct mptcp_sendmsg_info *info) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dfrag; int len, copied = 0, err = 0; while ((dfrag = mptcp_send_head(sk))) { info->sent = dfrag->already_sent; info->limit = dfrag->data_len; len = dfrag->data_len - dfrag->already_sent; while (len > 0) { int ret = 0; ret = mptcp_sendmsg_frag(sk, ssk, dfrag, info); if (ret <= 0) { 
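/* annotation: "copied ? : ret" below is the GNU C conditional with the middle operand omitted: report the bytes already pushed, if any, otherwise propagate the error from mptcp_sendmsg_frag() */ 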
err = copied ? : ret; goto out; } info->sent += ret; copied += ret; len -= ret; mptcp_update_post_push(msk, dfrag, ret); } WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); if (msk->snd_burst <= 0 || !sk_stream_memory_free(ssk) || !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) { err = copied; goto out; } mptcp_set_timeout(sk); } err = copied; out: if (err > 0) msk->last_data_sent = tcp_jiffies32; return err; } void __mptcp_push_pending(struct sock *sk, unsigned int flags) { struct sock *prev_ssk = NULL, *ssk = NULL; struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sendmsg_info info = { .flags = flags, }; bool do_check_data_fin = false; int push_count = 1; while (mptcp_send_head(sk) && (push_count > 0)) { struct mptcp_subflow_context *subflow; int ret = 0; if (mptcp_sched_get_send(msk)) break; push_count = 0; mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) { mptcp_subflow_set_scheduled(subflow, false); prev_ssk = ssk; ssk = mptcp_subflow_tcp_sock(subflow); if (ssk != prev_ssk) { /* First check. If the ssk has changed since * the last round, release prev_ssk */ if (prev_ssk) mptcp_push_release(prev_ssk, &info); /* Need to lock the new subflow only if different * from the previous one, otherwise we are still * holding the relevant lock */ lock_sock(ssk); } push_count++; ret = __subflow_push_pending(sk, ssk, &info); if (ret <= 0) { if (ret != -EAGAIN || (1 << ssk->sk_state) & (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSE)) push_count--; continue; } do_check_data_fin = true; } } } /* at this point we still hold the socket lock for the last subflow we used */ if (ssk) mptcp_push_release(ssk, &info); /* ensure the rtx timer is running */ if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); if (do_check_data_fin) mptcp_check_send_data_fin(sk); } static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool first) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sendmsg_info info = { .data_lock_held = true, }; bool keep_pushing = true; struct sock *xmit_ssk; int copied = 0; info.flags = 0; while (mptcp_send_head(sk) && keep_pushing) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); int ret = 0; /* check for a different subflow usage only after * spooling the first chunk of data */ if (first) { mptcp_subflow_set_scheduled(subflow, false); ret = __subflow_push_pending(sk, ssk, &info); first = false; if (ret <= 0) break; copied += ret; continue; } if (mptcp_sched_get_send(msk)) goto out; if (READ_ONCE(subflow->scheduled)) { mptcp_subflow_set_scheduled(subflow, false); ret = __subflow_push_pending(sk, ssk, &info); if (ret <= 0) keep_pushing = false; copied += ret; } mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) { xmit_ssk = mptcp_subflow_tcp_sock(subflow); if (xmit_ssk != ssk) { mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_SEND); keep_pushing = false; } } } } out: /* __mptcp_alloc_tx_skb could have released some wmem and we are * not going to flush it via release_sock() */ if (copied) { tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); if (msk->snd_data_fin_enable && msk->snd_nxt + 1 == msk->write_seq) mptcp_schedule_work(sk); } } static int mptcp_disconnect(struct sock *sk, int flags); static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, size_t len, int *copied_syn) { unsigned int saved_flags = msg->msg_flags; struct mptcp_sock *msk = mptcp_sk(sk); struct sock *ssk; int ret; 
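/* usage note (assumption, not from the source): userspace typically reaches this path either with send(fd, buf, len, MSG_FASTOPEN) on an MPTCP socket, or by setting the TCP_FASTOPEN_CONNECT sockopt and then calling connect() + send(), which takes the DEFER_CONNECT branch handled below */ 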
/* on flags-based fastopen, MPTCP is supposed to create the * first subflow right now. Otherwise we are in the defer_connect * path, and the first subflow must already be present. * Since the defer_connect flag is cleared after the first successful * fastopen attempt, no need to check for additional subflow status. */ if (msg->msg_flags & MSG_FASTOPEN) { ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) return PTR_ERR(ssk); } if (!msk->first) return -EINVAL; ssk = msk->first; lock_sock(ssk); msg->msg_flags |= MSG_DONTWAIT; msk->fastopening = 1; ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL); msk->fastopening = 0; msg->msg_flags = saved_flags; release_sock(ssk); /* do the blocking bits of inet_stream_connect outside the ssk socket lock */ if (ret == -EINPROGRESS && !(msg->msg_flags & MSG_DONTWAIT)) { ret = __inet_stream_connect(sk->sk_socket, msg->msg_name, msg->msg_namelen, msg->msg_flags, 1); /* Keep the same behaviour of plain TCP: zero the copied bytes in * case of any error, except timeout or signal */ if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR) *copied_syn = 0; } else if (ret && ret != -EINPROGRESS) { /* The disconnect() op called by tcp_sendmsg_fastopen()/ * __inet_stream_connect() can fail, due to the locking check, * see mptcp_disconnect(). * Attempt it again outside the problematic scope. */ if (!mptcp_disconnect(sk, 0)) { sk->sk_disconnects++; sk->sk_socket->state = SS_UNCONNECTED; } } inet_clear_bit(DEFER_CONNECT, sk); return ret; } static int do_copy_data_nocache(struct sock *sk, int copy, struct iov_iter *from, char *to) { if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) { if (!copy_from_iter_full_nocache(to, copy, from)) return -EFAULT; } else if (!copy_from_iter_full(to, copy, from)) { return -EFAULT; } return 0; } /* open-code sk_stream_memory_free() plus sent limit computation to * avoid indirect calls in fast-path. * Called under the msk socket lock, so we can avoid a bunch of ONCE * annotations. 
*/ static u32 mptcp_send_limit(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); u32 limit, not_sent; if (sk->sk_wmem_queued >= READ_ONCE(sk->sk_sndbuf)) return 0; limit = mptcp_notsent_lowat(sk); if (limit == UINT_MAX) return UINT_MAX; not_sent = msk->write_seq - msk->snd_nxt; if (not_sent >= limit) return 0; return limit - not_sent; } static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); struct page_frag *pfrag; size_t copied = 0; int ret = 0; long timeo; /* silently ignore everything else */ msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_FASTOPEN; lock_sock(sk); if (unlikely(inet_test_bit(DEFER_CONNECT, sk) || msg->msg_flags & MSG_FASTOPEN)) { int copied_syn = 0; ret = mptcp_sendmsg_fastopen(sk, msg, len, &copied_syn); copied += copied_syn; if (ret == -EINPROGRESS && copied_syn > 0) goto out; else if (ret) goto do_error; } timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { ret = sk_stream_wait_connect(sk, &timeo); if (ret) goto do_error; } ret = -EPIPE; if (unlikely(sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))) goto do_error; pfrag = sk_page_frag(sk); while (msg_data_left(msg)) { int total_ts, frag_truesize = 0; struct mptcp_data_frag *dfrag; bool dfrag_collapsed; size_t psize, offset; u32 copy_limit; /* ensure fitting the notsent_lowat() constraint */ copy_limit = mptcp_send_limit(sk); if (!copy_limit) goto wait_for_memory; /* reuse tail pfrag, if possible, or carve a new one from the * page allocator */ dfrag = mptcp_pending_tail(sk); dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); if (!dfrag_collapsed) { if (!mptcp_page_frag_refill(sk, pfrag)) goto wait_for_memory; dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset); frag_truesize = dfrag->overhead; } /* we do not bound vs wspace, to allow a single packet. 
* memory accounting will prevent excessive memory usage * anyway */ offset = dfrag->offset + dfrag->data_len; psize = pfrag->size - offset; psize = min_t(size_t, psize, msg_data_left(msg)); psize = min_t(size_t, psize, copy_limit); total_ts = psize + frag_truesize; if (!sk_wmem_schedule(sk, total_ts)) goto wait_for_memory; ret = do_copy_data_nocache(sk, psize, &msg->msg_iter, page_address(dfrag->page) + offset); if (ret) goto do_error; /* data successfully copied into the write queue */ sk_forward_alloc_add(sk, -total_ts); copied += psize; dfrag->data_len += psize; frag_truesize += psize; pfrag->offset += frag_truesize; WRITE_ONCE(msk->write_seq, msk->write_seq + psize); /* charge data on mptcp pending queue to the msk socket * Note: we charge such data both to sk and ssk */ sk_wmem_queued_add(sk, frag_truesize); if (!dfrag_collapsed) { get_page(dfrag->page); list_add_tail(&dfrag->list, &msk->rtx_queue); if (!msk->first_pending) WRITE_ONCE(msk->first_pending, dfrag); } pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d\n", msk, dfrag->data_seq, dfrag->data_len, dfrag->already_sent, !dfrag_collapsed); continue; wait_for_memory: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); __mptcp_push_pending(sk, msg->msg_flags); ret = sk_stream_wait_memory(sk, &timeo); if (ret) goto do_error; } if (copied) __mptcp_push_pending(sk, msg->msg_flags); out: release_sock(sk); return copied; do_error: if (copied) goto out; copied = sk_stream_error(sk, msg->msg_flags, ret); goto out; } static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied); static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg, size_t len, int flags, struct scm_timestamping_internal *tss, int *cmsg_flags) { struct mptcp_sock *msk = mptcp_sk(sk); struct sk_buff *skb, *tmp; int copied = 0; skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) { u32 offset = MPTCP_SKB_CB(skb)->offset; u32 data_len = skb->len - offset; u32 count = min_t(size_t, len - copied, data_len); int err; if (!(flags & MSG_TRUNC)) { err = skb_copy_datagram_msg(skb, offset, msg, count); if (unlikely(err < 0)) { if (!copied) return err; break; } } if (MPTCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); *cmsg_flags |= MPTCP_CMSG_TS; } copied += count; if (count < data_len) { if (!(flags & MSG_PEEK)) { MPTCP_SKB_CB(skb)->offset += count; MPTCP_SKB_CB(skb)->map_seq += count; msk->bytes_consumed += count; } break; } if (!(flags & MSG_PEEK)) { /* avoid the indirect call, we know the destructor is sock_wfree */ skb->destructor = NULL; atomic_sub(skb->truesize, &sk->sk_rmem_alloc); sk_mem_uncharge(sk, skb->truesize); __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); msk->bytes_consumed += count; } if (copied >= len) break; } mptcp_rcv_space_adjust(msk, copied); return copied; } /* receive buffer autotuning. See tcp_rcv_space_adjust for more information. * * Only difference: Use highest rtt estimate of the subflows in use. 
*/ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; u8 scaling_ratio = U8_MAX; u32 time, advmss = 1; u64 rtt_us, mstamp; msk_owned_by_me(msk); if (copied <= 0) return; if (!msk->rcvspace_init) mptcp_rcv_space_init(msk, msk->first); msk->rcvq_space.copied += copied; mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC); time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time); rtt_us = msk->rcvq_space.rtt_us; if (rtt_us && time < (rtt_us >> 3)) return; rtt_us = 0; mptcp_for_each_subflow(msk, subflow) { const struct tcp_sock *tp; u64 sf_rtt_us; u32 sf_advmss; tp = tcp_sk(mptcp_subflow_tcp_sock(subflow)); sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us); sf_advmss = READ_ONCE(tp->advmss); rtt_us = max(sf_rtt_us, rtt_us); advmss = max(sf_advmss, advmss); scaling_ratio = min(tp->scaling_ratio, scaling_ratio); } msk->rcvq_space.rtt_us = rtt_us; msk->scaling_ratio = scaling_ratio; if (time < (rtt_us >> 3) || rtt_us == 0) return; if (msk->rcvq_space.copied <= msk->rcvq_space.space) goto new_measure; if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { u64 rcvwin, grow; int rcvbuf; rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space); do_div(grow, msk->rcvq_space.space); rcvwin += (grow << 1); rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { u32 window_clamp; window_clamp = mptcp_win_from_space(sk, rcvbuf); WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); /* Make subflows follow along. If we do not do this, we * get drops at subflow level if skbs can't be moved to * the mptcp rx queue fast enough (announced rcv_win can * exceed ssk->sk_rcvbuf). */ mptcp_for_each_subflow(msk, subflow) { struct sock *ssk; bool slow; ssk = mptcp_subflow_tcp_sock(subflow); slow = lock_sock_fast(ssk); WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp); if (tcp_can_send_ack(ssk)) tcp_cleanup_rbuf(ssk, 1); unlock_sock_fast(ssk, slow); } } } msk->rcvq_space.space = msk->rcvq_space.copied; new_measure: msk->rcvq_space.copied = 0; msk->rcvq_space.time = mstamp; } static struct mptcp_subflow_context * __mptcp_first_ready_from(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow) { struct mptcp_subflow_context *start_subflow = subflow; while (!READ_ONCE(subflow->data_avail)) { subflow = mptcp_next_subflow(msk, subflow); if (subflow == start_subflow) return NULL; } return subflow; } static bool __mptcp_move_skbs(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); bool ret = false; if (list_empty(&msk->conn_list)) return false; /* verify we can move any data from the subflow, eventually updating */ if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) mptcp_for_each_subflow(msk, subflow) __mptcp_rcvbuf_update(sk, subflow->tcp_sock); subflow = list_first_entry(&msk->conn_list, struct mptcp_subflow_context, node); for (;;) { struct sock *ssk; bool slowpath; /* * As an optimization avoid traversing the subflows list * and ev. 
acquiring the subflow socket lock before bailing out */ if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) break; subflow = __mptcp_first_ready_from(msk, subflow); if (!subflow) break; ssk = mptcp_subflow_tcp_sock(subflow); slowpath = lock_sock_fast(ssk); ret = __mptcp_move_skbs_from_subflow(msk, ssk) || ret; if (unlikely(ssk->sk_err)) __mptcp_error_report(sk); unlock_sock_fast(ssk, slowpath); subflow = mptcp_next_subflow(msk, subflow); } __mptcp_ofo_queue(msk); if (ret) mptcp_check_data_fin((struct sock *)msk); return ret; } static unsigned int mptcp_inq_hint(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); const struct sk_buff *skb; skb = skb_peek(&sk->sk_receive_queue); if (skb) { u64 hint_val = READ_ONCE(msk->ack_seq) - MPTCP_SKB_CB(skb)->map_seq; if (hint_val >= INT_MAX) return INT_MAX; return (unsigned int)hint_val; } if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) return 1; return 0; } static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct mptcp_sock *msk = mptcp_sk(sk); struct scm_timestamping_internal tss; int copied = 0, cmsg_flags = 0; int target; long timeo; /* MSG_ERRQUEUE is really a no-op till we support IP_RECVERR */ if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); lock_sock(sk); if (unlikely(sk->sk_state == TCP_LISTEN)) { copied = -ENOTCONN; goto out_err; } timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); len = min_t(size_t, len, INT_MAX); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); if (unlikely(msk->recvmsg_inq)) cmsg_flags = MPTCP_CMSG_INQ; while (copied < len) { int err, bytes_read; bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags, &tss, &cmsg_flags); if (unlikely(bytes_read < 0)) { if (!copied) copied = bytes_read; goto out_err; } copied += bytes_read; if (skb_queue_empty(&sk->sk_receive_queue) && __mptcp_move_skbs(sk)) continue; /* only the MPTCP socket status is relevant here. The exit * conditions mirror closely tcp_recvmsg() */ if (copied >= target) break; if (copied) { if (sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || signal_pending(current)) break; } else { if (sk->sk_err) { copied = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) { /* race breaker: the shutdown could be after the * previous receive queue check */ if (__mptcp_move_skbs(sk)) continue; break; } if (sk->sk_state == TCP_CLOSE) { copied = -ENOTCONN; break; } if (!timeo) { copied = -EAGAIN; break; } if (signal_pending(current)) { copied = sock_intr_errno(timeo); break; } } pr_debug("block timeout %ld\n", timeo); mptcp_cleanup_rbuf(msk, copied); 
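/* annotation: sk_wait_data() releases the msk socket lock while sleeping, and returns once more data is queued, the timeout expires or a signal is pending; the loop above then re-evaluates its exit conditions */ 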
err = sk_wait_data(sk, &timeo, NULL); if (err < 0) { err = copied ? : err; goto out_err; } } mptcp_cleanup_rbuf(msk, copied); out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS) tcp_recv_timestamp(msg, sk, &tss); if (cmsg_flags & MPTCP_CMSG_INQ) { unsigned int inq = mptcp_inq_hint(sk); put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); } } pr_debug("msk=%p rx queue empty=%d copied=%d\n", msk, skb_queue_empty(&sk->sk_receive_queue), copied); release_sock(sk); return copied; } static void mptcp_retransmit_timer(struct timer_list *t) { struct inet_connection_sock *icsk = from_timer(icsk, t, icsk_retransmit_timer); struct sock *sk = &icsk->icsk_inet.sk; struct mptcp_sock *msk = mptcp_sk(sk); bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { /* we need a process context to retransmit */ if (!test_and_set_bit(MPTCP_WORK_RTX, &msk->flags)) mptcp_schedule_work(sk); } else { /* delegate our work to tcp_release_cb() */ __set_bit(MPTCP_RETRANSMIT, &msk->cb_flags); } bh_unlock_sock(sk); sock_put(sk); } static void mptcp_tout_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); mptcp_schedule_work(sk); sock_put(sk); } /* Find an idle subflow. Return NULL if there is unacked data at tcp * level. * * A backup subflow is returned only if that is the only kind available. */ struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) { struct sock *backup = NULL, *pick = NULL; struct mptcp_subflow_context *subflow; int min_stale_count = INT_MAX; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (!__mptcp_subflow_active(subflow)) continue; /* still data outstanding at TCP level? skip this */ if (!tcp_rtx_and_write_queues_empty(ssk)) { mptcp_pm_subflow_chk_stale(msk, ssk); min_stale_count = min_t(int, min_stale_count, subflow->stale_count); continue; } if (subflow->backup || subflow->request_bkup) { if (!backup) backup = ssk; continue; } if (!pick) pick = ssk; } if (pick) return pick; /* use backup only if there is no progress anywhere */ return min_stale_count > 1 ? backup : NULL; } 
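/* recap (annotation, not from the source): the selection above prefers the first active non-backup subflow with empty TCP queues; the backup is returned only when min_stale_count > 1, i.e. the busy subflows have repeatedly been seen as stale and nothing is making progress, otherwise NULL defers the retransmission */ 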
bool __mptcp_retransmit_pending_data(struct sock *sk) { struct mptcp_data_frag *cur, *rtx_head; struct mptcp_sock *msk = mptcp_sk(sk); if (__mptcp_check_fallback(msk)) return false; /* the closing socket has some data untransmitted and/or unacked: * some data in the mptcp rtx queue has not really been xmitted yet. * keep it simple and re-inject the whole mptcp level rtx queue */ mptcp_data_lock(sk); __mptcp_clean_una_wakeup(sk); rtx_head = mptcp_rtx_head(sk); if (!rtx_head) { mptcp_data_unlock(sk); return false; } msk->recovery_snd_nxt = msk->snd_nxt; msk->recovery = true; mptcp_data_unlock(sk); msk->first_pending = rtx_head; msk->snd_burst = 0; /* be sure to clear the "sent status" on all re-injected fragments */ list_for_each_entry(cur, &msk->rtx_queue, list) { if (!cur->already_sent) break; cur->already_sent = 0; } return true; } /* flags for __mptcp_close_ssk() */ #define MPTCP_CF_PUSH BIT(1) #define MPTCP_CF_FASTCLOSE BIT(2) /* be sure to send a reset only if the caller asked for it, also * completely clean the subflow status when the subflow reaches * TCP_CLOSE state */ static void __mptcp_subflow_disconnect(struct sock *ssk, struct mptcp_subflow_context *subflow, unsigned int flags) { if (((1 << ssk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || (flags & MPTCP_CF_FASTCLOSE)) { /* The MPTCP code never waits on the subflow sockets, TCP-level * disconnect should never fail */ WARN_ON_ONCE(tcp_disconnect(ssk, 0)); mptcp_subflow_ctx_reset(subflow); } else { tcp_shutdown(ssk, SEND_SHUTDOWN); } } /* subflow sockets can be either outgoing (connect) or incoming * (accept). * * Outgoing subflows use in-kernel sockets. * Incoming subflows do not have their own 'struct socket' allocated, * so we need to use tcp_close() after detaching them from the mptcp * parent socket. */ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow, unsigned int flags) { struct mptcp_sock *msk = mptcp_sk(sk); bool dispose_it, need_push = false; /* If the first subflow moved to a close state before accept, e.g. due * to an incoming reset or listener shutdown, the subflow socket is * already deleted by inet_child_forget() and the mptcp socket can't * survive either. 
*/ if (msk->in_accept_queue && msk->first == ssk && (sock_flag(sk, SOCK_DEAD) || sock_flag(ssk, SOCK_DEAD))) { /* ensure later check in mptcp_worker() will dispose the msk */ sock_set_flag(sk, SOCK_DEAD); mptcp_set_close_tout(sk, tcp_jiffies32 - (mptcp_close_timeout(sk) + 1)); lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); mptcp_subflow_drop_ctx(ssk); goto out_release; } dispose_it = msk->free_first || ssk != msk->first; if (dispose_it) list_del(&subflow->node); lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); if ((flags & MPTCP_CF_FASTCLOSE) && !__mptcp_check_fallback(msk)) { /* be sure to force the tcp_close path * to generate the egress reset */ ssk->sk_lingertime = 0; sock_set_flag(ssk, SOCK_LINGER); subflow->send_fastclose = 1; } need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); if (!dispose_it) { __mptcp_subflow_disconnect(ssk, subflow, flags); release_sock(ssk); goto out; } subflow->disposable = 1; /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops * the ssk has been already destroyed, we just need to release the * reference owned by msk; */ if (!inet_csk(ssk)->icsk_ulp_ops) { WARN_ON_ONCE(!sock_flag(ssk, SOCK_DEAD)); kfree_rcu(subflow, rcu); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ __tcp_close(ssk, 0); /* close acquired an extra ref */ __sock_put(ssk); } out_release: __mptcp_subflow_error_report(sk, ssk); release_sock(ssk); sock_put(ssk); if (ssk == msk->first) WRITE_ONCE(msk->first, NULL); out: __mptcp_sync_sndbuf(sk); if (need_push) __mptcp_push_pending(sk, 0); /* Catch every 'all subflows closed' scenario, including peers silently * closing them, e.g. due to timeout. * For established sockets, allow an additional timeout before closing, * as the protocol can still create more subflows. 
*/ if (list_is_singular(&msk->conn_list) && msk->first && inet_sk_state_load(msk->first) == TCP_CLOSE) { if (sk->sk_state != TCP_ESTABLISHED || msk->in_accept_queue || sock_flag(sk, SOCK_DEAD)) { mptcp_set_state(sk, TCP_CLOSE); mptcp_close_wake_up(sk); } else { mptcp_start_tout_timer(sk); } } } void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow) { /* The first subflow can already be closed and still in the list */ if (subflow->close_event_done) return; subflow->close_event_done = true; if (sk->sk_state == TCP_ESTABLISHED) mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL); /* subflow aborted before reaching the fully_established status * attempt the creation of the next subflow */ mptcp_pm_subflow_check_next(mptcp_sk(sk), subflow); __mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_PUSH); } static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) { return 0; } static void __mptcp_close_subflow(struct sock *sk) { struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); might_sleep(); mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int ssk_state = inet_sk_state_load(ssk); if (ssk_state != TCP_CLOSE && (ssk_state != TCP_CLOSE_WAIT || inet_sk_state_load(sk) != TCP_ESTABLISHED)) continue; /* 'subflow_data_ready' will re-sched once rx queue is empty */ if (!skb_queue_empty_lockless(&ssk->sk_receive_queue)) continue; mptcp_close_ssk(sk, ssk, subflow); } } static bool mptcp_close_tout_expired(const struct sock *sk) { if (!inet_csk(sk)->icsk_mtup.probe_timestamp || sk->sk_state == TCP_CLOSE) return false; return time_after32(tcp_jiffies32, inet_csk(sk)->icsk_mtup.probe_timestamp + mptcp_close_timeout(sk)); } static void mptcp_check_fastclose(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; if (likely(!READ_ONCE(msk->rcv_fastclose))) return; mptcp_token_destroy(msk); mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); bool slow; slow = lock_sock_fast(tcp_sk); if (tcp_sk->sk_state != TCP_CLOSE) { mptcp_send_active_reset_reason(tcp_sk); tcp_set_state(tcp_sk, TCP_CLOSE); } unlock_sock_fast(tcp_sk, slow); } /* Mirror the tcp_reset() error propagation */ switch (sk->sk_state) { case TCP_SYN_SENT: WRITE_ONCE(sk->sk_err, ECONNREFUSED); break; case TCP_CLOSE_WAIT: WRITE_ONCE(sk->sk_err, EPIPE); break; case TCP_CLOSE: return; default: WRITE_ONCE(sk->sk_err, ECONNRESET); } mptcp_set_state(sk, TCP_CLOSE); WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); /* the calling mptcp_worker will properly destroy the socket */ if (sock_flag(sk, SOCK_DEAD)) return; sk->sk_state_change(sk); sk_error_report(sk); } static void __mptcp_retrans(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_subflow_context *subflow; struct mptcp_sendmsg_info info = {}; struct mptcp_data_frag *dfrag; struct sock *ssk; int ret, err; u16 len = 0; mptcp_clean_una_wakeup(sk); /* first check ssk: need to kick "stale" logic */ err = mptcp_sched_get_retrans(msk); dfrag = mptcp_rtx_head(sk); if (!dfrag) { if (mptcp_data_fin_enabled(msk)) { struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_retransmits++; mptcp_set_datafin_timeout(sk); mptcp_send_ack(msk); goto reset_timer; } if (!mptcp_send_head(sk)) return; goto reset_timer; } if (err) goto reset_timer; 
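/* annotation: with DSS checksums enabled the whole dfrag is re-injected below (info.limit = dfrag->data_len), as the checksum covers the full mapping; otherwise only the bytes already handed to some subflow (dfrag->already_sent) are retransmitted */ 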
mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) { u16 copied = 0; mptcp_subflow_set_scheduled(subflow, false); ssk = mptcp_subflow_tcp_sock(subflow); lock_sock(ssk); /* limit retransmission to the bytes already sent on some subflows */ info.sent = 0; info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent; while (info.sent < info.limit) { ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); if (ret <= 0) break; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); copied += ret; info.sent += ret; } if (copied) { len = max(copied, len); tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); WRITE_ONCE(msk->allow_infinite_fallback, false); } release_sock(ssk); } } msk->bytes_retrans += len; dfrag->already_sent = max(dfrag->already_sent, len); reset_timer: mptcp_check_and_set_pending(sk); if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); } /* schedule the timeout timer for the relevant event: either close timeout * or mp_fail timeout. The close timeout takes precedence on the mp_fail one */ void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout) { struct sock *sk = (struct sock *)msk; unsigned long timeout, close_timeout; if (!fail_tout && !inet_csk(sk)->icsk_mtup.probe_timestamp) return; close_timeout = (unsigned long)inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + mptcp_close_timeout(sk); /* the close timeout takes precedence on the fail one, and here at least one of * them is active */ timeout = inet_csk(sk)->icsk_mtup.probe_timestamp ? close_timeout : fail_tout; sk_reset_timer(sk, &sk->sk_timer, timeout); } static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) { struct sock *ssk = msk->first; bool slow; if (!ssk) return; pr_debug("MP_FAIL doesn't respond, reset the subflow\n"); slow = lock_sock_fast(ssk); mptcp_subflow_reset(ssk); WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0); unlock_sock_fast(ssk, slow); } static void mptcp_do_fastclose(struct sock *sk) { struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); mptcp_set_state(sk, TCP_CLOSE); mptcp_for_each_subflow_safe(msk, subflow, tmp) __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, MPTCP_CF_FASTCLOSE); } static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); struct sock *sk = (struct sock *)msk; unsigned long fail_tout; int state; lock_sock(sk); state = sk->sk_state; if (unlikely((1 << state) & (TCPF_CLOSE | TCPF_LISTEN))) goto unlock; mptcp_check_fastclose(msk); mptcp_pm_worker(msk); mptcp_check_send_data_fin(sk); mptcp_check_data_fin_ack(sk); mptcp_check_data_fin(sk); if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) __mptcp_close_subflow(sk); if (mptcp_close_tout_expired(sk)) { mptcp_do_fastclose(sk); mptcp_close_wake_up(sk); } if (sock_flag(sk, SOCK_DEAD) && sk->sk_state == TCP_CLOSE) { __mptcp_destroy_sock(sk); goto unlock; } if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) __mptcp_retrans(sk); fail_tout = msk->first ? 
READ_ONCE(mptcp_subflow_ctx(msk->first)->fail_tout) : 0; if (fail_tout && time_after(jiffies, fail_tout)) mptcp_mp_fail_no_response(msk); unlock: release_sock(sk); sock_put(sk); } static void __mptcp_init_sock(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); INIT_LIST_HEAD(&msk->conn_list); INIT_LIST_HEAD(&msk->join_list); INIT_LIST_HEAD(&msk->rtx_queue); INIT_WORK(&msk->work, mptcp_worker); msk->out_of_order_queue = RB_ROOT; msk->first_pending = NULL; msk->timer_ival = TCP_RTO_MIN; msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; WRITE_ONCE(msk->first, NULL); inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); WRITE_ONCE(msk->allow_infinite_fallback, true); msk->recovery = false; msk->subflow_id = 1; msk->last_data_sent = tcp_jiffies32; msk->last_data_recv = tcp_jiffies32; msk->last_ack_recv = tcp_jiffies32; mptcp_pm_data_init(msk); /* re-use the csk retrans timer for MPTCP-level retrans */ timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); timer_setup(&sk->sk_timer, mptcp_tout_timer, 0); } static void mptcp_ca_reset(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_assign_congestion_control(sk); strscpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name, sizeof(mptcp_sk(sk)->ca_name)); /* no need to keep a reference to the ops, the name will suffice */ tcp_cleanup_congestion_control(sk); icsk->icsk_ca_ops = NULL; } static int mptcp_init_sock(struct sock *sk) { struct net *net = sock_net(sk); int ret; __mptcp_init_sock(sk); if (!mptcp_is_enabled(net)) return -ENOPROTOOPT; if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net)) return -ENOMEM; rcu_read_lock(); ret = mptcp_init_sched(mptcp_sk(sk), mptcp_sched_find(mptcp_get_scheduler(net))); rcu_read_unlock(); if (ret) return ret; set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); /* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will * propagate the correct value */ mptcp_ca_reset(sk); sk_sockets_allocated_inc(sk); sk->sk_rcvbuf = READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]); sk->sk_sndbuf = READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]); return 0; } static void __mptcp_clear_xmit(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; WRITE_ONCE(msk->first_pending, NULL); list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) dfrag_clear(sk, dfrag); } void mptcp_cancel_work(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); if (cancel_work_sync(&msk->work)) __sock_put(sk); } void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) { lock_sock(ssk); switch (ssk->sk_state) { case TCP_LISTEN: if (!(how & RCV_SHUTDOWN)) break; fallthrough; case TCP_SYN_SENT: WARN_ON_ONCE(tcp_disconnect(ssk, O_NONBLOCK)); break; default: if (__mptcp_check_fallback(mptcp_sk(sk))) { pr_debug("Fallback\n"); ssk->sk_shutdown |= how; tcp_shutdown(ssk, how); /* simulate the data_fin ack reception to let the state * machine move forward */ WRITE_ONCE(mptcp_sk(sk)->snd_una, mptcp_sk(sk)->snd_nxt); mptcp_schedule_work(sk); } else { pr_debug("Sending DATA_FIN on subflow %p\n", ssk); tcp_send_ack(ssk); if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); } break; } release_sock(ssk); } void mptcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB); break; case TCP_CLOSE_WAIT: /* Unlike TCP, MPTCP sk would not have the 
TCP_SYN_RECV state: * MPTCP "accepted" sockets will be created later on. So no * transition from TCP_SYN_RECV to TCP_CLOSE_WAIT. */ break; default: if (oldstate == TCP_ESTABLISHED || oldstate == TCP_CLOSE_WAIT) MPTCP_DEC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB); } inet_sk_state_store(sk, state); } static const unsigned char new_state[16] = { /* current state: new state: action: */ [0 /* (Invalid) */] = TCP_CLOSE, [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, [TCP_SYN_SENT] = TCP_CLOSE, [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, [TCP_TIME_WAIT] = TCP_CLOSE, /* should not happen ! */ [TCP_CLOSE] = TCP_CLOSE, [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, [TCP_LAST_ACK] = TCP_LAST_ACK, [TCP_LISTEN] = TCP_CLOSE, [TCP_CLOSING] = TCP_CLOSING, [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ }; static int mptcp_close_state(struct sock *sk) { int next = (int)new_state[sk->sk_state]; int ns = next & TCP_STATE_MASK; mptcp_set_state(sk, ns); return next & TCP_ACTION_FIN; } static void mptcp_check_send_data_fin(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu\n", msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk), msk->snd_nxt, msk->write_seq); /* we still need to enqueue subflows or not really shutting down, * skip this */ if (!msk->snd_data_fin_enable || msk->snd_nxt + 1 != msk->write_seq || mptcp_send_head(sk)) return; WRITE_ONCE(msk->snd_nxt, msk->write_seq); mptcp_for_each_subflow(msk, subflow) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); mptcp_subflow_shutdown(sk, tcp_sk, SEND_SHUTDOWN); } } static void __mptcp_wr_shutdown(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d\n", msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state, !!mptcp_send_head(sk)); /* will be ignored by fallback sockets */ WRITE_ONCE(msk->write_seq, msk->write_seq + 1); WRITE_ONCE(msk->snd_data_fin_enable, 1); mptcp_check_send_data_fin(sk); } static void __mptcp_destroy_sock(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); pr_debug("msk=%p\n", msk); might_sleep(); mptcp_stop_rtx_timer(sk); sk_stop_timer(sk, &sk->sk_timer); msk->pm.status = 0; mptcp_release_sched(msk); sk->sk_prot->destroy(sk); sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); sock_put(sk); } void __mptcp_unaccepted_force_close(struct sock *sk) { sock_set_flag(sk, SOCK_DEAD); mptcp_do_fastclose(sk); __mptcp_destroy_sock(sk); } static __poll_t mptcp_check_readable(struct sock *sk) { return mptcp_epollin_ready(sk) ? 
EPOLLIN | EPOLLRDNORM : 0; } static void mptcp_check_listen_stop(struct sock *sk) { struct sock *ssk; if (inet_sk_state_load(sk) != TCP_LISTEN) return; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); ssk = mptcp_sk(sk)->first; if (WARN_ON_ONCE(!ssk || inet_sk_state_load(ssk) != TCP_LISTEN)) return; lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); tcp_set_state(ssk, TCP_CLOSE); mptcp_subflow_queue_clean(sk, ssk); inet_csk_listen_stop(ssk); mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); release_sock(ssk); } bool __mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); bool do_cancel_work = false; int subflows_alive = 0; WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { mptcp_check_listen_stop(sk); mptcp_set_state(sk, TCP_CLOSE); goto cleanup; } if (mptcp_data_avail(msk) || timeout < 0) { /* If the msk has read data, or the caller explicitly ask it, * do the MPTCP equivalent of TCP reset, aka MPTCP fastclose */ mptcp_do_fastclose(sk); timeout = 0; } else if (mptcp_close_state(sk)) { __mptcp_wr_shutdown(sk); } sk_stream_wait_close(sk, timeout); cleanup: /* orphan all the subflows */ mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast_nested(ssk); subflows_alive += ssk->sk_state != TCP_CLOSE; /* since the close timeout takes precedence on the fail one, * cancel the latter */ if (ssk == msk->first) subflow->fail_tout = 0; /* detach from the parent socket, but allow data_ready to * push incoming data into the mptcp stack, to properly ack it */ ssk->sk_socket = NULL; ssk->sk_wq = NULL; unlock_sock_fast(ssk, slow); } sock_orphan(sk); /* all the subflows are closed, only timeout can change the msk * state, let's not keep resources busy for no reasons */ if (subflows_alive == 0) mptcp_set_state(sk, TCP_CLOSE); sock_hold(sk); pr_debug("msk=%p state=%d\n", sk, sk->sk_state); mptcp_pm_connection_closed(msk); if (sk->sk_state == TCP_CLOSE) { __mptcp_destroy_sock(sk); do_cancel_work = true; } else { mptcp_start_tout_timer(sk); } return do_cancel_work; } static void mptcp_close(struct sock *sk, long timeout) { bool do_cancel_work; lock_sock(sk); do_cancel_work = __mptcp_close(sk, timeout); release_sock(sk); if (do_cancel_work) mptcp_cancel_work(sk); sock_put(sk); } static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) const struct ipv6_pinfo *ssk6 = inet6_sk(ssk); struct ipv6_pinfo *msk6 = inet6_sk(msk); msk->sk_v6_daddr = ssk->sk_v6_daddr; msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr; if (msk6 && ssk6) { msk6->saddr = ssk6->saddr; msk6->flow_label = ssk6->flow_label; } #endif inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num; inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport; inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport; inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr; inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr; inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr; } static int mptcp_disconnect(struct sock *sk, int flags) { struct mptcp_sock *msk = mptcp_sk(sk); /* We are on the fastopen error path. We can't call straight into the * subflows cleanup code due to lock nesting (we are already under * msk->firstsocket lock). 
*/ if (msk->fastopening) return -EBUSY; mptcp_check_listen_stop(sk); mptcp_set_state(sk, TCP_CLOSE); mptcp_stop_rtx_timer(sk); mptcp_stop_tout_timer(sk); mptcp_pm_connection_closed(msk); /* msk->subflow is still intact, the following will not free the first * subflow */ mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE); WRITE_ONCE(msk->flags, 0); msk->cb_flags = 0; msk->recovery = false; WRITE_ONCE(msk->can_ack, false); WRITE_ONCE(msk->fully_established, false); WRITE_ONCE(msk->rcv_data_fin, false); WRITE_ONCE(msk->snd_data_fin_enable, false); WRITE_ONCE(msk->rcv_fastclose, false); WRITE_ONCE(msk->use_64bit_ack, false); WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); mptcp_pm_data_reset(msk); mptcp_ca_reset(sk); msk->bytes_consumed = 0; msk->bytes_acked = 0; msk->bytes_received = 0; msk->bytes_sent = 0; msk->bytes_retrans = 0; msk->rcvspace_init = 0; WRITE_ONCE(sk->sk_shutdown, 0); sk_error_report(sk); return 0; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) { struct mptcp6_sock *msk6 = container_of(mptcp_sk(sk), struct mptcp6_sock, msk); return &msk6->np; } static void mptcp_copy_ip6_options(struct sock *newsk, const struct sock *sk) { const struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_txoptions *opt; struct ipv6_pinfo *newnp; newnp = inet6_sk(newsk); rcu_read_lock(); opt = rcu_dereference(np->opt); if (opt) { opt = ipv6_dup_options(newsk, opt); if (!opt) net_warn_ratelimited("%s: Failed to copy ip6 options\n", __func__); } RCU_INIT_POINTER(newnp->opt, opt); rcu_read_unlock(); } #endif static void mptcp_copy_ip_options(struct sock *newsk, const struct sock *sk) { struct ip_options_rcu *inet_opt, *newopt = NULL; const struct inet_sock *inet = inet_sk(sk); struct inet_sock *newinet; newinet = inet_sk(newsk); rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { newopt = sock_kmemdup(newsk, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen, GFP_ATOMIC); if (!newopt) net_warn_ratelimited("%s: Failed to copy ip options\n", __func__); } RCU_INIT_POINTER(newinet->inet_opt, newopt); rcu_read_unlock(); } struct sock *mptcp_sk_clone_init(const struct sock *sk, const struct mptcp_options_received *mp_opt, struct sock *ssk, struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC); struct mptcp_subflow_context *subflow; struct mptcp_sock *msk; if (!nsk) return NULL; #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (nsk->sk_family == AF_INET6) inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk); #endif __mptcp_init_sock(nsk); #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (nsk->sk_family == AF_INET6) mptcp_copy_ip6_options(nsk, sk); else #endif mptcp_copy_ip_options(nsk, sk); msk = mptcp_sk(nsk); WRITE_ONCE(msk->local_key, subflow_req->local_key); WRITE_ONCE(msk->token, subflow_req->token); msk->in_accept_queue = 1; WRITE_ONCE(msk->fully_established, false); if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD) WRITE_ONCE(msk->csum_enabled, true); WRITE_ONCE(msk->write_seq, subflow_req->idsn + 1); WRITE_ONCE(msk->snd_nxt, msk->write_seq); WRITE_ONCE(msk->snd_una, msk->write_seq); WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd); msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; mptcp_init_sched(msk, mptcp_sk(sk)->sched); /* passive msk is created after the first/MPC subflow */ msk->subflow_id = 2; sock_reset_flag(nsk, SOCK_RCU_FREE); security_inet_csk_clone(nsk, req); /* this can't race with mptcp_close(), as the msk is * not yet 
exposed to user-space */ mptcp_set_state(nsk, TCP_ESTABLISHED); /* The msk maintains a ref to each subflow in the connections list */ WRITE_ONCE(msk->first, ssk); subflow = mptcp_subflow_ctx(ssk); list_add(&subflow->node, &msk->conn_list); sock_hold(ssk); /* new mpc subflow takes ownership of the newly * created mptcp socket */ mptcp_token_accept(subflow_req, msk); /* set msk addresses early to ensure mptcp_pm_get_local_id() * uses the correct data */ mptcp_copy_inaddrs(nsk, ssk); __mptcp_propagate_sndbuf(nsk, ssk); mptcp_rcv_space_init(msk, ssk); if (mp_opt->suboptions & OPTION_MPTCP_MPC_ACK) __mptcp_subflow_fully_established(msk, subflow, mp_opt); bh_unlock_sock(nsk); /* note: the newly allocated socket refcount is 2 now */ return nsk; } void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) { const struct tcp_sock *tp = tcp_sk(ssk); msk->rcvspace_init = 1; msk->rcvq_space.copied = 0; msk->rcvq_space.rtt_us = 0; msk->rcvq_space.time = tp->tcp_mstamp; /* initial rcv_space offering made to peer */ msk->rcvq_space.space = min_t(u32, tp->rcv_wnd, TCP_INIT_CWND * tp->advmss); if (msk->rcvq_space.space == 0) msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; } void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) { struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; __mptcp_clear_xmit(sk); /* join list will be eventually flushed (with rst) at sock lock release time */ mptcp_for_each_subflow_safe(msk, subflow, tmp) __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags); __skb_queue_purge(&sk->sk_receive_queue); skb_rbtree_purge(&msk->out_of_order_queue); /* the remaining rx fwd alloc is left in place: sk_mem_reclaim_final() in * inet_sock_destruct() will dispose of it */ mptcp_token_destroy(msk); mptcp_pm_destroy(msk); } static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); /* allow the following to close even the initial subflow */ msk->free_first = 1; mptcp_destroy_common(msk, 0); sk_sockets_allocated_dec(sk); } void __mptcp_data_acked(struct sock *sk) { if (!sock_owned_by_user(sk)) __mptcp_clean_una(sk); else __set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->cb_flags); } void __mptcp_check_push(struct sock *sk, struct sock *ssk) { if (!mptcp_send_head(sk)) return; if (!sock_owned_by_user(sk)) __mptcp_subflow_push_pending(sk, ssk, false); else __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); } #define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \ BIT(MPTCP_RETRANSMIT) | \ BIT(MPTCP_FLUSH_JOIN_LIST) | \ BIT(MPTCP_DEQUEUE)) /* processes deferred events and flushes wmem */ static void mptcp_release_cb(struct sock *sk) __must_hold(&sk->sk_lock.slock) { struct mptcp_sock *msk = mptcp_sk(sk); for (;;) { unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED); struct list_head join_list; if (!flags) break; INIT_LIST_HEAD(&join_list); list_splice_init(&msk->join_list, &join_list); /* the following actions acquire the subflow socket lock * * 1) can't be invoked in atomic scope * 2) must avoid ABBA deadlock with msk socket spinlock: the RX * datapath acquires the msk socket spinlock while holding * the subflow socket lock */ msk->cb_flags &= ~flags; spin_unlock_bh(&sk->sk_lock.slock); if (flags & BIT(MPTCP_FLUSH_JOIN_LIST)) __mptcp_flush_join_list(sk, &join_list); if (flags & BIT(MPTCP_PUSH_PENDING)) __mptcp_push_pending(sk, 0); if (flags & BIT(MPTCP_RETRANSMIT)) __mptcp_retrans(sk); if ((flags & BIT(MPTCP_DEQUEUE)) && __mptcp_move_skbs(sk)) { /* notify ack seq update */ 
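/* annotation: mptcp_cleanup_rbuf() below may emit an ack advertising the receive space just freed by __mptcp_move_skbs() before the reader is woken up */ 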
mptcp_cleanup_rbuf(msk, 0); sk->sk_data_ready(sk); } cond_resched(); spin_lock_bh(&sk->sk_lock.slock); } if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags)) __mptcp_clean_una_wakeup(sk); if (unlikely(msk->cb_flags)) { /* be sure to sync the msk state before taking actions * depending on sk_state (MPTCP_ERROR_REPORT) * On sk release avoid actions depending on the first subflow */ if (__test_and_clear_bit(MPTCP_SYNC_STATE, &msk->cb_flags) && msk->first) __mptcp_sync_state(sk, msk->pending_state); if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags)) __mptcp_error_report(sk); if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags)) __mptcp_sync_sndbuf(sk); } } /* MP_JOIN client subflow must wait for 4th ack before sending any data: * TCP can't schedule delack timer before the subflow is fully established. * MPTCP uses the delack timer to do 3rd ack retransmissions */ static void schedule_3rdack_retransmission(struct sock *ssk) { struct inet_connection_sock *icsk = inet_csk(ssk); struct tcp_sock *tp = tcp_sk(ssk); unsigned long timeout; if (READ_ONCE(mptcp_subflow_ctx(ssk)->fully_established)) return; /* reschedule with a timeout above RTT, as we must look only for drop */ if (tp->srtt_us) timeout = usecs_to_jiffies(tp->srtt_us >> (3 - 1)); else timeout = TCP_TIMEOUT_INIT; timeout += jiffies; WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER); smp_store_release(&icsk->icsk_ack.pending, icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER); sk_reset_timer(ssk, &icsk->icsk_delack_timer, timeout); } void mptcp_subflow_process_delegated(struct sock *ssk, long status) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; if (status & BIT(MPTCP_DELEGATE_SEND)) { mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_subflow_push_pending(sk, ssk, true); else __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } if (status & BIT(MPTCP_DELEGATE_SNDBUF)) { mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_sync_sndbuf(sk); else __set_bit(MPTCP_SYNC_SNDBUF, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } if (status & BIT(MPTCP_DELEGATE_ACK)) schedule_3rdack_retransmission(ssk); } static int mptcp_hash(struct sock *sk) { /* should never be called, * we hash the TCP subflows not the MPTCP socket */ WARN_ON_ONCE(1); return 0; } static void mptcp_unhash(struct sock *sk) { /* called from sk_common_release(), but nothing to do here */ } static int mptcp_get_port(struct sock *sk, unsigned short snum) { struct mptcp_sock *msk = mptcp_sk(sk); pr_debug("msk=%p, ssk=%p\n", msk, msk->first); if (WARN_ON_ONCE(!msk->first)) return -EINVAL; return inet_csk_get_port(msk->first, snum); } void mptcp_finish_connect(struct sock *ssk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk; struct sock *sk; subflow = mptcp_subflow_ctx(ssk); sk = subflow->conn; msk = mptcp_sk(sk); pr_debug("msk=%p, token=%u\n", sk, subflow->token); subflow->map_seq = subflow->iasn; subflow->map_subflow_seq = 1; /* the socket is not connected yet, no msk/subflow ops can access/race * accessing the field below */ WRITE_ONCE(msk->local_key, subflow->local_key); mptcp_pm_new_connection(msk, ssk, 0); } void mptcp_sock_graft(struct sock *sk, struct socket *parent) { write_lock_bh(&sk->sk_callback_lock); rcu_assign_pointer(sk->sk_wq, &parent->wq); sk_set_socket(sk, parent); sk->sk_uid = SOCK_INODE(parent)->i_uid; write_unlock_bh(&sk->sk_callback_lock); } bool mptcp_finish_join(struct sock *ssk) { struct mptcp_subflow_context 
*subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct sock *parent = (void *)msk; bool ret = true; pr_debug("msk=%p, subflow=%p\n", msk, subflow); /* mptcp socket already closing? */ if (!mptcp_is_fully_established(parent)) { subflow->reset_reason = MPTCP_RST_EMPTCP; return false; } /* active subflow, already present inside the conn_list */ if (!list_empty(&subflow->node)) { mptcp_subflow_joined(msk, ssk); mptcp_propagate_sndbuf(parent, ssk); return true; } if (!mptcp_pm_allow_new_subflow(msk)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_JOINREJECTED); goto err_prohibited; } /* If we can't acquire msk socket lock here, let the release callback * handle it */ mptcp_data_lock(parent); if (!sock_owned_by_user(parent)) { ret = __mptcp_finish_join(msk, ssk); if (ret) { sock_hold(ssk); list_add_tail(&subflow->node, &msk->conn_list); } } else { sock_hold(ssk); list_add_tail(&subflow->node, &msk->join_list); __set_bit(MPTCP_FLUSH_JOIN_LIST, &msk->cb_flags); } mptcp_data_unlock(parent); if (!ret) { err_prohibited: subflow->reset_reason = MPTCP_RST_EPROHIBIT; return false; } return true; } static void mptcp_shutdown(struct sock *sk, int how) { pr_debug("sk=%p, how=%d\n", sk, how); if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk)) __mptcp_wr_shutdown(sk); } static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) { const struct sock *sk = (void *)msk; u64 delta; if (sk->sk_state == TCP_LISTEN) return -EINVAL; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) return 0; delta = msk->write_seq - v; if (__mptcp_check_fallback(msk) && msk->first) { struct tcp_sock *tp = tcp_sk(msk->first); /* the first subflow is disconnected after close - see * __mptcp_close_ssk(). tcp_disconnect() moves the write_seq * so ignore that status, too. */ if (!((1 << msk->first->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))) delta += READ_ONCE(tp->write_seq) - tp->snd_una; } if (delta > INT_MAX) delta = INT_MAX; return (int)delta; } static int mptcp_ioctl(struct sock *sk, int cmd, int *karg) { struct mptcp_sock *msk = mptcp_sk(sk); bool slow; switch (cmd) { case SIOCINQ: if (sk->sk_state == TCP_LISTEN) return -EINVAL; lock_sock(sk); if (__mptcp_move_skbs(sk)) mptcp_cleanup_rbuf(msk, 0); *karg = mptcp_inq_hint(sk); release_sock(sk); break; case SIOCOUTQ: slow = lock_sock_fast(sk); *karg = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una)); unlock_sock_fast(sk, slow); break; case SIOCOUTQNSD: slow = lock_sock_fast(sk); *karg = mptcp_ioctl_outq(msk, msk->snd_nxt); unlock_sock_fast(sk, slow); break; default: return -ENOIOCTLCMD; } return 0; } static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); int err = -EINVAL; struct sock *ssk; ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) return PTR_ERR(ssk); mptcp_set_state(sk, TCP_SYN_SENT); subflow = mptcp_subflow_ctx(ssk); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of * TCP option space. 
*/ if (rcu_access_pointer(tcp_sk(ssk)->md5sig_info)) mptcp_subflow_early_fallback(msk, subflow); #endif if (subflow->request_mptcp) { if (mptcp_active_should_disable(sk)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEACTIVEDISABLED); mptcp_subflow_early_fallback(msk, subflow); } else if (mptcp_token_new_connect(ssk) < 0) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_TOKENFALLBACKINIT); mptcp_subflow_early_fallback(msk, subflow); } } WRITE_ONCE(msk->write_seq, subflow->idsn); WRITE_ONCE(msk->snd_nxt, subflow->idsn); WRITE_ONCE(msk->snd_una, subflow->idsn); if (likely(!__mptcp_check_fallback(msk))) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE); /* if reaching here via the fastopen/sendmsg path, the caller already * acquired the subflow socket lock, too. */ if (!msk->fastopening) lock_sock(ssk); /* the following mirrors closely a very small chunk of code from * __inet_stream_connect() */ if (ssk->sk_state != TCP_CLOSE) goto out; if (BPF_CGROUP_PRE_CONNECT_ENABLED(ssk)) { err = ssk->sk_prot->pre_connect(ssk, uaddr, addr_len); if (err) goto out; } err = ssk->sk_prot->connect(ssk, uaddr, addr_len); if (err < 0) goto out; inet_assign_bit(DEFER_CONNECT, sk, inet_test_bit(DEFER_CONNECT, ssk)); out: if (!msk->fastopening) release_sock(ssk); /* on successful connect, the msk state will be moved to established by * subflow_finish_connect() */ if (unlikely(err)) { /* avoid leaving a dangling token in an unconnected socket */ mptcp_token_destroy(msk); mptcp_set_state(sk, TCP_CLOSE); return err; } mptcp_copy_inaddrs(sk, ssk); return 0; } static struct proto mptcp_prot = { .name = "MPTCP", .owner = THIS_MODULE, .init = mptcp_init_sock, .connect = mptcp_connect, .disconnect = mptcp_disconnect, .close = mptcp_close, .setsockopt = mptcp_setsockopt, .getsockopt = mptcp_getsockopt, .shutdown = mptcp_shutdown, .destroy = mptcp_destroy, .sendmsg = mptcp_sendmsg, .ioctl = mptcp_ioctl, .recvmsg = mptcp_recvmsg, .release_cb = mptcp_release_cb, .hash = mptcp_hash, .unhash = mptcp_unhash, .get_port = mptcp_get_port, .stream_memory_free = mptcp_stream_memory_free, .sockets_allocated = &mptcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .sysctl_mem = sysctl_tcp_mem, .obj_size = sizeof(struct mptcp_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, .no_autobind = true, }; static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *ssk, *sk = sock->sk; int err = -EINVAL; lock_sock(sk); ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) { err = PTR_ERR(ssk); goto unlock; } if (sk->sk_family == AF_INET) err = inet_bind_sk(ssk, uaddr, addr_len); #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (sk->sk_family == AF_INET6) err = inet6_bind_sk(ssk, uaddr, addr_len); #endif if (!err) mptcp_copy_inaddrs(sk, ssk); unlock: release_sock(sk); return err; } static int mptcp_listen(struct socket *sock, int backlog) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *sk = sock->sk; struct sock *ssk; int err; pr_debug("msk=%p\n", msk); lock_sock(sk); err = -EINVAL; if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) goto unlock; ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) { err = PTR_ERR(ssk); goto unlock; } mptcp_set_state(sk, TCP_LISTEN); sock_set_flag(sk, SOCK_RCU_FREE); lock_sock(ssk); err = 
__inet_listen_sk(ssk, backlog); release_sock(ssk); mptcp_set_state(sk, inet_sk_state_load(ssk)); if (!err) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); mptcp_copy_inaddrs(sk, ssk); mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); } unlock: release_sock(sk); return err; } static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *ssk, *newsk; pr_debug("msk=%p\n", msk); /* Buggy applications can call accept on socket states other than LISTEN, * but no need to allocate the first subflow just to error out. */ ssk = READ_ONCE(msk->first); if (!ssk) return -EINVAL; pr_debug("ssk=%p, listener=%p\n", ssk, mptcp_subflow_ctx(ssk)); newsk = inet_csk_accept(ssk, arg); if (!newsk) return arg->err; pr_debug("newsk=%p, subflow is mptcp=%d\n", newsk, sk_is_mptcp(newsk)); if (sk_is_mptcp(newsk)) { struct mptcp_subflow_context *subflow; struct sock *new_mptcp_sock; subflow = mptcp_subflow_ctx(newsk); new_mptcp_sock = subflow->conn; /* is_mptcp should be false if subflow->conn is missing, see * subflow_syn_recv_sock() */ if (WARN_ON_ONCE(!new_mptcp_sock)) { tcp_sk(newsk)->is_mptcp = 0; goto tcpfallback; } newsk = new_mptcp_sock; MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK); newsk->sk_kern_sock = arg->kern; lock_sock(newsk); __inet_accept(sock, newsock, newsk); set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags); msk = mptcp_sk(newsk); msk->in_accept_queue = 0; /* set ssk->sk_socket of accept()ed flows to mptcp socket. * This is needed so the NOSPACE flag can be set from the tcp stack. */ mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (!ssk->sk_socket) mptcp_sock_graft(ssk, newsock); } /* Do late cleanup for the first subflow as necessary. Also * deal with bad peers not doing a complete shutdown. */ if (unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) { __mptcp_close_ssk(newsk, msk->first, mptcp_subflow_ctx(msk->first), 0); if (unlikely(list_is_singular(&msk->conn_list))) mptcp_set_state(newsk, TCP_CLOSE); } } else { tcpfallback: newsk->sk_kern_sock = arg->kern; lock_sock(newsk); __inet_accept(sock, newsock, newsk); /* we are being invoked after accepting a non-mp-capable * flow: sk is a tcp_sk, not an mptcp one. * * Hand the socket over to tcp so all further socket ops * bypass mptcp.
*/ WRITE_ONCE(newsock->sk->sk_socket->ops, mptcp_fallback_tcp_ops(newsock->sk)); } release_sock(newsk); return 0; } static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; if (__mptcp_stream_is_writeable(sk, 1)) return EPOLLOUT | EPOLLWRNORM; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); smp_mb__after_atomic(); /* NOSPACE is changed by mptcp_write_space() */ if (__mptcp_stream_is_writeable(sk, 1)) return EPOLLOUT | EPOLLWRNORM; return 0; } static __poll_t mptcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { struct sock *sk = sock->sk; struct mptcp_sock *msk; __poll_t mask = 0; u8 shutdown; int state; msk = mptcp_sk(sk); sock_poll_wait(file, sock, wait); state = inet_sk_state_load(sk); pr_debug("msk=%p state=%d flags=%lx\n", msk, state, msk->flags); if (state == TCP_LISTEN) { struct sock *ssk = READ_ONCE(msk->first); if (WARN_ON_ONCE(!ssk)) return 0; return inet_csk_listen_poll(ssk); } shutdown = READ_ONCE(sk->sk_shutdown); if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= EPOLLHUP; if (shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(sk); if (shutdown & SEND_SHUTDOWN) mask |= EPOLLOUT | EPOLLWRNORM; else mask |= mptcp_check_writeable(msk); } else if (state == TCP_SYN_SENT && inet_test_bit(DEFER_CONNECT, sk)) { /* cf tcp_poll() note about TFO */ mask |= EPOLLOUT | EPOLLWRNORM; } /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */ smp_rmb(); if (READ_ONCE(sk->sk_err)) mask |= EPOLLERR; return mask; } static const struct proto_ops mptcp_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = mptcp_bind, .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, .getname = inet_getname, .poll = mptcp_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = mptcp_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .set_rcvlowat = mptcp_set_rcvlowat, }; static struct inet_protosw mptcp_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_MPTCP, .prot = &mptcp_prot, .ops = &mptcp_stream_ops, .flags = INET_PROTOSW_ICSK, }; static int mptcp_napi_poll(struct napi_struct *napi, int budget) { struct mptcp_delegated_action *delegated; struct mptcp_subflow_context *subflow; int work_done = 0; delegated = container_of(napi, struct mptcp_delegated_action, napi); while ((subflow = mptcp_subflow_delegated_next(delegated)) != NULL) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bh_lock_sock_nested(ssk); if (!sock_owned_by_user(ssk)) { mptcp_subflow_process_delegated(ssk, xchg(&subflow->delegated_status, 0)); } else { /* tcp_release_cb_override already processed * the action or will do so at the next release_sock(). * In both cases we must dequeue the subflow here - on the same * CPU that scheduled it.
*/ smp_wmb(); clear_bit(MPTCP_DELEGATE_SCHEDULED, &subflow->delegated_status); } bh_unlock_sock(ssk); sock_put(ssk); if (++work_done == budget) return budget; } /* always provide a 0 'work_done' argument, so that napi_complete_done * will not try accessing the NULL napi->dev ptr */ napi_complete_done(napi, 0); return work_done; } void __init mptcp_proto_init(void) { struct mptcp_delegated_action *delegated; int cpu; mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL)) panic("Failed to allocate MPTCP pcpu counter\n"); mptcp_napi_dev = alloc_netdev_dummy(0); if (!mptcp_napi_dev) panic("Failed to allocate MPTCP dummy netdev\n"); for_each_possible_cpu(cpu) { delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu); INIT_LIST_HEAD(&delegated->head); netif_napi_add_tx(mptcp_napi_dev, &delegated->napi, mptcp_napi_poll); napi_enable(&delegated->napi); } mptcp_subflow_init(); mptcp_pm_init(); mptcp_sched_init(); mptcp_token_init(); if (proto_register(&mptcp_prot, 1) != 0) panic("Failed to register MPTCP proto.\n"); inet_register_protosw(&mptcp_protosw); BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb)); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static const struct proto_ops mptcp_v6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, .bind = mptcp_bind, .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, .getname = inet6_getname, .poll = mptcp_poll, .ioctl = inet6_ioctl, .gettstamp = sock_gettstamp, .listen = mptcp_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet6_sendmsg, .recvmsg = inet6_recvmsg, .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif .set_rcvlowat = mptcp_set_rcvlowat, }; static struct proto mptcp_v6_prot; static struct inet_protosw mptcp_v6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_MPTCP, .prot = &mptcp_v6_prot, .ops = &mptcp_v6_stream_ops, .flags = INET_PROTOSW_ICSK, }; int __init mptcp_proto_v6_init(void) { int err; mptcp_v6_prot = mptcp_prot; strscpy(mptcp_v6_prot.name, "MPTCPv6", sizeof(mptcp_v6_prot.name)); mptcp_v6_prot.slab = NULL; mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock); mptcp_v6_prot.ipv6_pinfo_offset = offsetof(struct mptcp6_sock, np); err = proto_register(&mptcp_v6_prot, 1); if (err) return err; err = inet6_register_protosw(&mptcp_v6_protosw); if (err) proto_unregister(&mptcp_v6_prot); return err; } #endif
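/*
 * Editorial sketch (not part of the kernel sources above): a minimal,
 * self-contained user-space illustration of the "defer to release_cb"
 * pattern used by __mptcp_data_acked() and mptcp_release_cb(). When the
 * socket is owned by a user context, the work is recorded as a flag bit
 * and replayed at release time; otherwise it runs inline. All names here
 * (fake_sock, do_clean_una, ...) are invented for the example.
 */
#include <stdbool.h>
#include <stdio.h>

#define FLAG_CLEAN_UNA (1UL << 0)

struct fake_sock {
	bool owned_by_user;	/* stands in for sock_owned_by_user() */
	unsigned long cb_flags;	/* stands in for msk->cb_flags */
};

static void do_clean_una(struct fake_sock *sk)
{
	printf("cleaning acked data for sock %p\n", (void *)sk);
}

/* mirrors the shape of __mptcp_data_acked() */
static void data_acked(struct fake_sock *sk)
{
	if (!sk->owned_by_user)
		do_clean_una(sk);		/* fast path: run inline */
	else
		sk->cb_flags |= FLAG_CLEAN_UNA;	/* defer to release time */
}

/* mirrors the tail of mptcp_release_cb(): replay the deferred flags */
static void release_cb(struct fake_sock *sk)
{
	if (sk->cb_flags & FLAG_CLEAN_UNA) {
		sk->cb_flags &= ~FLAG_CLEAN_UNA;
		do_clean_una(sk);
	}
}

int main(void)
{
	struct fake_sock sk = { .owned_by_user = true };

	data_acked(&sk);	/* deferred: the socket is owned */
	release_cb(&sk);	/* the deferred work runs here */
	return 0;
}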
// SPDX-License-Identifier: GPL-2.0 /* * buffered writeback throttling. Loosely based on CoDel. We can't drop * packets for IO scheduling, so the logic is something like this: * * - Monitor latencies in a defined window of time. * - If the minimum latency in the above window exceeds some target, increment * scaling step and scale down queue depth by a factor of 2x. The monitoring * window is then shrunk to 100 / sqrt(scaling step + 1). * - For any window where we don't have solid data on what the latencies * look like, retain status quo. * - If latencies look good, decrement scaling step. * - If we're only doing writes, allow the scaling step to go negative. This * will temporarily boost write performance, snapping back to a stable * scaling step of 0 if reads show up or the heavy writers finish. Unlike * positive scaling steps where we shrink the monitoring window, a negative * scaling step retains the default step==0 window size. * * Copyright (C) 2016 Jens Axboe * */ #include <linux/kernel.h> #include <linux/blk_types.h> #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/swap.h> #include "blk-stat.h" #include "blk-wbt.h" #include "blk-rq-qos.h" #include "elevator.h" #include "blk.h" #define CREATE_TRACE_POINTS #include <trace/events/wbt.h> enum wbt_flags { WBT_TRACKED = 1, /* write, tracked for throttling */ WBT_READ = 2, /* read */ WBT_SWAP = 4, /* write, from swap_writepage() */ WBT_DISCARD = 8, /* discard */ WBT_NR_BITS = 4, /* number of bits */ }; enum { WBT_RWQ_BG = 0, WBT_RWQ_SWAP, WBT_RWQ_DISCARD, WBT_NUM_RWQ, }; /* * If the current state is WBT_STATE_ON/OFF_DEFAULT, it can be switched to any * other state; if the current state is WBT_STATE_ON/OFF_MANUAL, it can only be * switched to WBT_STATE_OFF/ON_MANUAL. */ enum { WBT_STATE_ON_DEFAULT = 1, /* on by default */ WBT_STATE_ON_MANUAL = 2, /* on manually by sysfs */ WBT_STATE_OFF_DEFAULT = 3, /* off by default */ WBT_STATE_OFF_MANUAL = 4, /* off manually by sysfs */ }; struct rq_wb { /* * Settings that govern how we throttle */ unsigned int wb_background; /* background writeback */ unsigned int wb_normal; /* normal writeback */ short enable_state; /* WBT_STATE_* */ /* * Number of consecutive periods where we don't have enough * information to make a firm scale up/down decision.
*/ unsigned int unknown_cnt; u64 win_nsec; /* default window size */ u64 cur_win_nsec; /* current window size */ struct blk_stat_callback *cb; u64 sync_issue; void *sync_cookie; unsigned long last_issue; /* last non-throttled issue */ unsigned long last_comp; /* last non-throttled comp */ unsigned long min_lat_nsec; struct rq_qos rqos; struct rq_wait rq_wait[WBT_NUM_RWQ]; struct rq_depth rq_depth; }; static inline struct rq_wb *RQWB(struct rq_qos *rqos) { return container_of(rqos, struct rq_wb, rqos); } static inline void wbt_clear_state(struct request *rq) { rq->wbt_flags = 0; } static inline enum wbt_flags wbt_flags(struct request *rq) { return rq->wbt_flags; } static inline bool wbt_is_tracked(struct request *rq) { return rq->wbt_flags & WBT_TRACKED; } static inline bool wbt_is_read(struct request *rq) { return rq->wbt_flags & WBT_READ; } enum { /* * Default setting, we'll scale up (to 75% of QD max) or down (min 1) * from here depending on device stats */ RWB_DEF_DEPTH = 16, /* * 100msec window */ RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL, /* * Disregard stats, if we don't meet this minimum */ RWB_MIN_WRITE_SAMPLES = 3, /* * If we have this number of consecutive windows without enough * information to scale up or down, slowly return to center state * (step == 0). */ RWB_UNKNOWN_BUMP = 5, }; static inline bool rwb_enabled(struct rq_wb *rwb) { return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT && rwb->enable_state != WBT_STATE_OFF_MANUAL; } static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) { if (rwb_enabled(rwb)) { const unsigned long cur = jiffies; if (cur != *var) *var = cur; } } /* * If a task was rate throttled in balance_dirty_pages() within the last * second or so, use that to indicate a higher cleaning rate. */ static bool wb_recent_wait(struct rq_wb *rwb) { struct backing_dev_info *bdi = rwb->rqos.disk->bdi; return time_before(jiffies, bdi->last_bdp_sleep + HZ); } static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, enum wbt_flags wb_acct) { if (wb_acct & WBT_SWAP) return &rwb->rq_wait[WBT_RWQ_SWAP]; else if (wb_acct & WBT_DISCARD) return &rwb->rq_wait[WBT_RWQ_DISCARD]; return &rwb->rq_wait[WBT_RWQ_BG]; } static void rwb_wake_all(struct rq_wb *rwb) { int i; for (i = 0; i < WBT_NUM_RWQ; i++) { struct rq_wait *rqw = &rwb->rq_wait[i]; if (wq_has_sleeper(&rqw->wait)) wake_up_all(&rqw->wait); } } static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw, enum wbt_flags wb_acct) { int inflight, limit; inflight = atomic_dec_return(&rqw->inflight); /* * For discards, our limit is always the background. For writes, if * the device does write back caching, drop further down before we * wake people up. */ if (wb_acct & WBT_DISCARD) limit = rwb->wb_background; else if (blk_queue_write_cache(rwb->rqos.disk->queue) && !wb_recent_wait(rwb)) limit = 0; else limit = rwb->wb_normal; /* * Don't wake anyone up if we are above the normal limit. */ if (inflight && inflight >= limit) return; if (wq_has_sleeper(&rqw->wait)) { int diff = limit - inflight; if (!inflight || diff >= rwb->wb_background / 2) wake_up_all(&rqw->wait); } } static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct) { struct rq_wb *rwb = RQWB(rqos); struct rq_wait *rqw; if (!(wb_acct & WBT_TRACKED)) return; rqw = get_rq_wait(rwb, wb_acct); wbt_rqw_done(rwb, rqw, wb_acct); } /* * Called on completion of a request. Note that it's also called when * a request is merged, when the request gets freed. 
*/ static void wbt_done(struct rq_qos *rqos, struct request *rq) { struct rq_wb *rwb = RQWB(rqos); if (!wbt_is_tracked(rq)) { if (rwb->sync_cookie == rq) { rwb->sync_issue = 0; rwb->sync_cookie = NULL; } if (wbt_is_read(rq)) wb_timestamp(rwb, &rwb->last_comp); } else { WARN_ON_ONCE(rq == rwb->sync_cookie); __wbt_done(rqos, wbt_flags(rq)); } wbt_clear_state(rq); } static inline bool stat_sample_valid(struct blk_rq_stat *stat) { /* * We need at least one read sample, and a minimum of * RWB_MIN_WRITE_SAMPLES. We require some write samples to know * that it's writes impacting us, and not just some sole read on * a device that is in a lower power state. */ return (stat[READ].nr_samples >= 1 && stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES); } static u64 rwb_sync_issue_lat(struct rq_wb *rwb) { u64 issue = READ_ONCE(rwb->sync_issue); if (!issue || !rwb->sync_cookie) return 0; return blk_time_get_ns() - issue; } static inline unsigned int wbt_inflight(struct rq_wb *rwb) { unsigned int i, ret = 0; for (i = 0; i < WBT_NUM_RWQ; i++) ret += atomic_read(&rwb->rq_wait[i].inflight); return ret; } enum { LAT_OK = 1, LAT_UNKNOWN, LAT_UNKNOWN_WRITES, LAT_EXCEEDED, }; static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { struct backing_dev_info *bdi = rwb->rqos.disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; u64 thislat; /* * If our stored sync issue exceeds the window size, or it * exceeds our min target AND we haven't logged any entries, * flag the latency as exceeded. wbt works off completion latencies, * but for a flooded device, a single sync IO can take a long time * to complete after being issued. If this time exceeds our * monitoring window AND we didn't see any other completions in that * window, then count that sync IO as a violation of the latency. */ thislat = rwb_sync_issue_lat(rwb); if (thislat > rwb->cur_win_nsec || (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) { trace_wbt_lat(bdi, thislat); return LAT_EXCEEDED; } /* * No read/write mix, if stat isn't valid */ if (!stat_sample_valid(stat)) { /* * If we had writes in this stat window and the window is * current, we're only doing writes. If a task recently * waited or still has writes in flight, consider us doing * just writes as well. */ if (stat[WRITE].nr_samples || wb_recent_wait(rwb) || wbt_inflight(rwb)) return LAT_UNKNOWN_WRITES; return LAT_UNKNOWN; } /* * If the 'min' latency exceeds our target, step down.
*/ if (stat[READ].min > rwb->min_lat_nsec) { trace_wbt_lat(bdi, stat[READ].min); trace_wbt_stat(bdi, stat); return LAT_EXCEEDED; } if (rqd->scale_step) trace_wbt_stat(bdi, stat); return LAT_OK; } static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { struct backing_dev_info *bdi = rwb->rqos.disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec, rwb->wb_background, rwb->wb_normal, rqd->max_depth); } static void calc_wb_limits(struct rq_wb *rwb) { if (rwb->min_lat_nsec == 0) { rwb->wb_normal = rwb->wb_background = 0; } else if (rwb->rq_depth.max_depth <= 2) { rwb->wb_normal = rwb->rq_depth.max_depth; rwb->wb_background = 1; } else { rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2; rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4; } } static void scale_up(struct rq_wb *rwb) { if (!rq_depth_scale_up(&rwb->rq_depth)) return; calc_wb_limits(rwb); rwb->unknown_cnt = 0; rwb_wake_all(rwb); rwb_trace_step(rwb, tracepoint_string("scale up")); } static void scale_down(struct rq_wb *rwb, bool hard_throttle) { if (!rq_depth_scale_down(&rwb->rq_depth, hard_throttle)) return; calc_wb_limits(rwb); rwb->unknown_cnt = 0; rwb_trace_step(rwb, tracepoint_string("scale down")); } static void rwb_arm_timer(struct rq_wb *rwb) { struct rq_depth *rqd = &rwb->rq_depth; if (rqd->scale_step > 0) { /* * We should speed this up, using some variant of a fast * integer inverse square root calculation. Since we only do * this for every window expiration, it's not a huge deal, * though. */ rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, int_sqrt((rqd->scale_step + 1) << 8)); } else { /* * For step < 0, we don't want to increase/decrease the * window size. */ rwb->cur_win_nsec = rwb->win_nsec; } blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec); } static void wb_timer_fn(struct blk_stat_callback *cb) { struct rq_wb *rwb = cb->data; struct rq_depth *rqd = &rwb->rq_depth; unsigned int inflight = wbt_inflight(rwb); int status; if (!rwb->rqos.disk) return; status = latency_exceeded(rwb, cb->stat); trace_wbt_timer(rwb->rqos.disk->bdi, status, rqd->scale_step, inflight); /* * If we exceeded the latency target, step down. If we did not, * step one level up. If we don't know enough to say either exceeded * or ok, then don't do anything. */ switch (status) { case LAT_EXCEEDED: scale_down(rwb, true); break; case LAT_OK: scale_up(rwb); break; case LAT_UNKNOWN_WRITES: /* * We don't have a valid read/write sample, but we do have * writes going on. Allow step to go negative, to increase * write performance. */ scale_up(rwb); break; case LAT_UNKNOWN: if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP) break; /* * We get here when we previously scaled down the depth, and we * currently don't have a valid read/write sample. For that * case, slowly return to center state (step == 0).
*/ if (rqd->scale_step > 0) scale_up(rwb); else if (rqd->scale_step < 0) scale_down(rwb, false); break; default: break; } /* * Re-arm timer, if we have IO in flight */ if (rqd->scale_step || inflight) rwb_arm_timer(rwb); } static void wbt_update_limits(struct rq_wb *rwb) { struct rq_depth *rqd = &rwb->rq_depth; rqd->scale_step = 0; rqd->scaled_max = false; rq_depth_calc_max_depth(rqd); calc_wb_limits(rwb); rwb_wake_all(rwb); } bool wbt_disabled(struct request_queue *q) { struct rq_qos *rqos = wbt_rq_qos(q); return !rqos || !rwb_enabled(RQWB(rqos)); } u64 wbt_get_min_lat(struct request_queue *q) { struct rq_qos *rqos = wbt_rq_qos(q); if (!rqos) return 0; return RQWB(rqos)->min_lat_nsec; } void wbt_set_min_lat(struct request_queue *q, u64 val) { struct rq_qos *rqos = wbt_rq_qos(q); if (!rqos) return; RQWB(rqos)->min_lat_nsec = val; if (val) RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL; else RQWB(rqos)->enable_state = WBT_STATE_OFF_MANUAL; wbt_update_limits(RQWB(rqos)); } static bool close_io(struct rq_wb *rwb) { const unsigned long now = jiffies; return time_before(now, rwb->last_issue + HZ / 10) || time_before(now, rwb->last_comp + HZ / 10); } #define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO | REQ_SWAP) static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf) { unsigned int limit; if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD) return rwb->wb_background; /* * At this point we know it's a buffered write. If this is * swap trying to free memory, or REQ_SYNC is set, then * it's WB_SYNC_ALL writeback, and we'll use the max limit for * that. If the write is marked as a background write, then use * the idle limit, or go to normal if we haven't had competing * IO for a bit. */ if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb)) limit = rwb->rq_depth.max_depth; else if ((opf & REQ_BACKGROUND) || close_io(rwb)) { /* * If less than 100ms since we completed unrelated IO, * limit us to half the depth for background writeback. */ limit = rwb->wb_background; } else limit = rwb->wb_normal; return limit; } struct wbt_wait_data { struct rq_wb *rwb; enum wbt_flags wb_acct; blk_opf_t opf; }; static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data) { struct wbt_wait_data *data = private_data; return rq_wait_inc_below(rqw, get_limit(data->rwb, data->opf)); } static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data) { struct wbt_wait_data *data = private_data; wbt_rqw_done(data->rwb, rqw, data->wb_acct); } /* * Block if we will exceed our limit, or if we are currently waiting for * the timer to kick off queuing again. 
*/ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, blk_opf_t opf) { struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); struct wbt_wait_data data = { .rwb = rwb, .wb_acct = wb_acct, .opf = opf, }; rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb); } static inline bool wbt_should_throttle(struct bio *bio) { switch (bio_op(bio)) { case REQ_OP_WRITE: /* * Don't throttle WRITE_ODIRECT */ if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == (REQ_SYNC | REQ_IDLE)) return false; fallthrough; case REQ_OP_DISCARD: return true; default: return false; } } static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio) { enum wbt_flags flags = 0; if (!rwb_enabled(rwb)) return 0; if (bio_op(bio) == REQ_OP_READ) { flags = WBT_READ; } else if (wbt_should_throttle(bio)) { if (bio->bi_opf & REQ_SWAP) flags |= WBT_SWAP; if (bio_op(bio) == REQ_OP_DISCARD) flags |= WBT_DISCARD; flags |= WBT_TRACKED; } return flags; } static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio) { struct rq_wb *rwb = RQWB(rqos); enum wbt_flags flags = bio_to_wbt_flags(rwb, bio); __wbt_done(rqos, flags); } /* May sleep, if we have exceeded the writeback limits. */ static void wbt_wait(struct rq_qos *rqos, struct bio *bio) { struct rq_wb *rwb = RQWB(rqos); enum wbt_flags flags; flags = bio_to_wbt_flags(rwb, bio); if (!(flags & WBT_TRACKED)) { if (flags & WBT_READ) wb_timestamp(rwb, &rwb->last_issue); return; } __wbt_wait(rwb, flags, bio->bi_opf); if (!blk_stat_is_active(rwb->cb)) rwb_arm_timer(rwb); } static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio) { struct rq_wb *rwb = RQWB(rqos); rq->wbt_flags |= bio_to_wbt_flags(rwb, bio); } static void wbt_issue(struct rq_qos *rqos, struct request *rq) { struct rq_wb *rwb = RQWB(rqos); if (!rwb_enabled(rwb)) return; /* * Track sync issue, in case it takes a long time to complete. Allows us * to react quicker, if a sync IO takes a long time to complete. Note * that this is just a hint. The request can go away when it completes, * so it's important we never dereference it. We only use the address to * compare with, which is why we store the sync_issue time locally. */ if (wbt_is_read(rq) && !rwb->sync_issue) { rwb->sync_cookie = rq; rwb->sync_issue = rq->io_start_time_ns; } } static void wbt_requeue(struct rq_qos *rqos, struct request *rq) { struct rq_wb *rwb = RQWB(rqos); if (!rwb_enabled(rwb)) return; if (rq == rwb->sync_cookie) { rwb->sync_issue = 0; rwb->sync_cookie = NULL; } } /* * Enable wbt if defaults are configured that way */ void wbt_enable_default(struct gendisk *disk) { struct request_queue *q = disk->queue; struct rq_qos *rqos; bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ); if (q->elevator && test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags)) enable = false; /* Throttling already enabled? */ rqos = wbt_rq_qos(q); if (rqos) { if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT) RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT; return; } /* Queue not registered? Maybe shutting down... */ if (!blk_queue_registered(q)) return; if (queue_is_mq(q) && enable) wbt_init(disk); } EXPORT_SYMBOL_GPL(wbt_enable_default); u64 wbt_default_latency_nsec(struct request_queue *q) { /* * We default to 2msec for non-rotational storage, and 75msec * for rotational storage. 
*/ if (blk_queue_nonrot(q)) return 2000000ULL; else return 75000000ULL; } static int wbt_data_dir(const struct request *rq) { const enum req_op op = req_op(rq); if (op == REQ_OP_READ) return READ; else if (op_is_write(op)) return WRITE; /* don't account */ return -1; } static void wbt_queue_depth_changed(struct rq_qos *rqos) { RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->disk->queue); wbt_update_limits(RQWB(rqos)); } static void wbt_exit(struct rq_qos *rqos) { struct rq_wb *rwb = RQWB(rqos); blk_stat_remove_callback(rqos->disk->queue, rwb->cb); blk_stat_free_callback(rwb->cb); kfree(rwb); } /* * Disable wbt, if enabled by default. */ void wbt_disable_default(struct gendisk *disk) { struct rq_qos *rqos = wbt_rq_qos(disk->queue); struct rq_wb *rwb; if (!rqos) return; rwb = RQWB(rqos); if (rwb->enable_state == WBT_STATE_ON_DEFAULT) { blk_stat_deactivate(rwb->cb); rwb->enable_state = WBT_STATE_OFF_DEFAULT; } } EXPORT_SYMBOL_GPL(wbt_disable_default); #ifdef CONFIG_BLK_DEBUG_FS static int wbt_curr_win_nsec_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%llu\n", rwb->cur_win_nsec); return 0; } static int wbt_enabled_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%d\n", rwb->enable_state); return 0; } static int wbt_id_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; seq_printf(m, "%u\n", rqos->id); return 0; } static int wbt_inflight_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); int i; for (i = 0; i < WBT_NUM_RWQ; i++) seq_printf(m, "%d: inflight %d\n", i, atomic_read(&rwb->rq_wait[i].inflight)); return 0; } static int wbt_min_lat_nsec_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%lu\n", rwb->min_lat_nsec); return 0; } static int wbt_unknown_cnt_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%u\n", rwb->unknown_cnt); return 0; } static int wbt_normal_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%u\n", rwb->wb_normal); return 0; } static int wbt_background_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%u\n", rwb->wb_background); return 0; } static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = { {"curr_win_nsec", 0400, wbt_curr_win_nsec_show}, {"enabled", 0400, wbt_enabled_show}, {"id", 0400, wbt_id_show}, {"inflight", 0400, wbt_inflight_show}, {"min_lat_nsec", 0400, wbt_min_lat_nsec_show}, {"unknown_cnt", 0400, wbt_unknown_cnt_show}, {"wb_normal", 0400, wbt_normal_show}, {"wb_background", 0400, wbt_background_show}, {}, }; #endif static const struct rq_qos_ops wbt_rqos_ops = { .throttle = wbt_wait, .issue = wbt_issue, .track = wbt_track, .requeue = wbt_requeue, .done = wbt_done, .cleanup = wbt_cleanup, .queue_depth_changed = wbt_queue_depth_changed, .exit = wbt_exit, #ifdef CONFIG_BLK_DEBUG_FS .debugfs_attrs = wbt_debugfs_attrs, #endif }; int wbt_init(struct gendisk *disk) { struct request_queue *q = disk->queue; struct rq_wb *rwb; int i; int ret; rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); if (!rwb) return -ENOMEM; rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb); if (!rwb->cb) { kfree(rwb); return -ENOMEM; } for (i = 0; i < WBT_NUM_RWQ; i++) rq_wait_init(&rwb->rq_wait[i]); rwb->last_comp = 
rwb->last_issue = jiffies; rwb->win_nsec = RWB_WINDOW_NSEC; rwb->enable_state = WBT_STATE_ON_DEFAULT; rwb->rq_depth.default_depth = RWB_DEF_DEPTH; rwb->min_lat_nsec = wbt_default_latency_nsec(q); rwb->rq_depth.queue_depth = blk_queue_depth(q); wbt_update_limits(rwb); /* * Assign rwb and add the stats callback. */ mutex_lock(&q->rq_qos_mutex); ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops); mutex_unlock(&q->rq_qos_mutex); if (ret) goto err_free; blk_stat_add_callback(q, rwb->cb); return 0; err_free: blk_stat_free_callback(rwb->cb); kfree(rwb); return ret; }
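/*
 * Editorial sketch (not kernel code): the monitoring-window shrink from
 * rwb_arm_timer() above, win_nsec / sqrt(scale_step + 1), computed with
 * the same fixed-point trick: (win << 4) / int_sqrt((step + 1) << 8),
 * which works because sqrt(v << 8) == sqrt(v) << 4. isqrt() is a naive
 * stand-in for the kernel's int_sqrt().
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t isqrt(uint64_t x)
{
	uint64_t r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

static uint64_t shrunken_window(uint64_t win_nsec, int scale_step)
{
	if (scale_step <= 0)
		return win_nsec;	/* step <= 0 keeps the default window */
	return (win_nsec << 4) / isqrt(((uint64_t)scale_step + 1) << 8);
}

int main(void)
{
	const uint64_t win = 100ULL * 1000 * 1000;	/* RWB_WINDOW_NSEC */
	int step;

	/* step 0 -> 100ms, step 1 -> ~70.7ms, step 3 -> 50ms, ... */
	for (step = 0; step <= 4; step++)
		printf("scale_step %d -> window %llu nsec\n", step,
		       (unsigned long long)shrunken_window(win, step));
	return 0;
}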
/* * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/in.h> #include <linux/if.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/if_arp.h> #include <linux/delay.h> #include <linux/slab.h> #include <linux/module.h> #include <net/addrconf.h> #include "rds_single_path.h" #include "rds.h" #include "ib.h" #include "ib_mr.h" static unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE; static unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE; unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; static atomic_t rds_ib_unloading; module_param(rds_ib_mr_1m_pool_size, int, 0444); MODULE_PARM_DESC(rds_ib_mr_1m_pool_size, " Max number of 1M mr per HCA"); module_param(rds_ib_mr_8k_pool_size, int, 0444); MODULE_PARM_DESC(rds_ib_mr_8k_pool_size, " Max number of 8K mr per HCA"); module_param(rds_ib_retry_count, int, 0444); MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); /* * we have a clumsy combination of RCU and a rwsem protecting this list * because it is used both in the get_mr fast path and while blocking in * the FMR flushing path. */ DECLARE_RWSEM(rds_ib_devices_lock); struct list_head rds_ib_devices; /* NOTE: if also grabbing ibdev lock, grab this first */ DEFINE_SPINLOCK(ib_nodev_conns_lock); LIST_HEAD(ib_nodev_conns); static void rds_ib_nodev_connect(void) { struct rds_ib_connection *ic; spin_lock(&ib_nodev_conns_lock); list_for_each_entry(ic, &ib_nodev_conns, ib_node) rds_conn_connect_if_down(ic->conn); spin_unlock(&ib_nodev_conns_lock); } static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev) { struct rds_ib_connection *ic; unsigned long flags; spin_lock_irqsave(&rds_ibdev->spinlock, flags); list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node) rds_conn_path_drop(&ic->conn->c_path[0], true); spin_unlock_irqrestore(&rds_ibdev->spinlock, flags); } /* * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references * from interrupt context so we push freeing off into a work struct in krdsd.
*/ static void rds_ib_dev_free(struct work_struct *work) { struct rds_ib_ipaddr *i_ipaddr, *i_next; struct rds_ib_device *rds_ibdev = container_of(work, struct rds_ib_device, free_work); if (rds_ibdev->mr_8k_pool) rds_ib_destroy_mr_pool(rds_ibdev->mr_8k_pool); if (rds_ibdev->mr_1m_pool) rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool); if (rds_ibdev->pd) ib_dealloc_pd(rds_ibdev->pd); list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { list_del(&i_ipaddr->list); kfree(i_ipaddr); } kfree(rds_ibdev->vector_load); kfree(rds_ibdev); } void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) { BUG_ON(refcount_read(&rds_ibdev->refcount) == 0); if (refcount_dec_and_test(&rds_ibdev->refcount)) queue_work(rds_wq, &rds_ibdev->free_work); } static int rds_ib_add_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; int ret; /* Only handle IB (no iWARP) devices */ if (device->node_type != RDMA_NODE_IB_CA) return -EOPNOTSUPP; /* Device must support FRWR */ if (!(device->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) return -EOPNOTSUPP; rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, ibdev_to_node(device)); if (!rds_ibdev) return -ENOMEM; spin_lock_init(&rds_ibdev->spinlock); refcount_set(&rds_ibdev->refcount, 1); INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); INIT_LIST_HEAD(&rds_ibdev->conn_list); rds_ibdev->max_wrs = device->attrs.max_qp_wr; rds_ibdev->max_sge = min(device->attrs.max_send_sge, RDS_IB_MAX_SGE); rds_ibdev->odp_capable = !!(device->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING) && !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps & IB_ODP_SUPPORT_WRITE) && !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps & IB_ODP_SUPPORT_READ); rds_ibdev->max_1m_mrs = device->attrs.max_mr ? min_t(unsigned int, (device->attrs.max_mr / 2), rds_ib_mr_1m_pool_size) : rds_ib_mr_1m_pool_size; rds_ibdev->max_8k_mrs = device->attrs.max_mr ? 
min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE), rds_ib_mr_8k_pool_size) : rds_ib_mr_8k_pool_size; rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom; rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom; rds_ibdev->vector_load = kcalloc(device->num_comp_vectors, sizeof(int), GFP_KERNEL); if (!rds_ibdev->vector_load) { pr_err("RDS/IB: %s failed to allocate vector memory\n", __func__); ret = -ENOMEM; goto put_dev; } rds_ibdev->dev = device; rds_ibdev->pd = ib_alloc_pd(device, 0); if (IS_ERR(rds_ibdev->pd)) { ret = PTR_ERR(rds_ibdev->pd); rds_ibdev->pd = NULL; goto put_dev; } rds_ibdev->mr_1m_pool = rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL); if (IS_ERR(rds_ibdev->mr_1m_pool)) { ret = PTR_ERR(rds_ibdev->mr_1m_pool); rds_ibdev->mr_1m_pool = NULL; goto put_dev; } rds_ibdev->mr_8k_pool = rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_8K_POOL); if (IS_ERR(rds_ibdev->mr_8k_pool)) { ret = PTR_ERR(rds_ibdev->mr_8k_pool); rds_ibdev->mr_8k_pool = NULL; goto put_dev; } rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, max_1m_mrs = %d, max_8k_mrs = %d\n", device->attrs.max_mr, rds_ibdev->max_wrs, rds_ibdev->max_sge, rds_ibdev->max_1m_mrs, rds_ibdev->max_8k_mrs); pr_info("RDS/IB: %s: added\n", device->name); down_write(&rds_ib_devices_lock); list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices); up_write(&rds_ib_devices_lock); refcount_inc(&rds_ibdev->refcount); ib_set_client_data(device, &rds_ib_client, rds_ibdev); rds_ib_nodev_connect(); return 0; put_dev: rds_ib_dev_put(rds_ibdev); return ret; } /* * New connections use this to find the device to associate with the * connection. It's not in the fast path so we're not concerned about the * performance of the IB call. (As of this writing, it uses an interrupt-blocking * spinlock to serialize walking a per-device list of all registered * clients.) * * RCU is used to handle incoming connections racing with device teardown. * Rather than use a lock to serialize removal from the client_data and * getting a new reference, we use an RCU grace period. The destruction * path removes the device from client_data and then waits for all RCU * readers to finish. * * A new connection can get NULL from this if it's arriving on a * device that is in the process of being removed. */ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device) { struct rds_ib_device *rds_ibdev; rcu_read_lock(); rds_ibdev = ib_get_client_data(device, &rds_ib_client); if (rds_ibdev) refcount_inc(&rds_ibdev->refcount); rcu_read_unlock(); return rds_ibdev; } /* * The IB stack is letting us know that a device is going away. This can * happen if the underlying HCA driver is removed or if PCI hotplug is removing * the pci function, for example. * * This can be called at any time and can be racing with any other RDS path. */ static void rds_ib_remove_one(struct ib_device *device, void *client_data) { struct rds_ib_device *rds_ibdev = client_data; rds_ib_dev_shutdown(rds_ibdev); /* stop connection attempts from getting a reference to this device. */ ib_set_client_data(device, &rds_ib_client, NULL); down_write(&rds_ib_devices_lock); list_del_rcu(&rds_ibdev->list); up_write(&rds_ib_devices_lock); /* * This synchronize_rcu() waits for readers of both the ib * client data and the devices list to finish before we drop * both of those references.
*/ synchronize_rcu(); rds_ib_dev_put(rds_ibdev); rds_ib_dev_put(rds_ibdev); } struct ib_client rds_ib_client = { .name = "rds_ib", .add = rds_ib_add_one, .remove = rds_ib_remove_one }; static int rds_ib_conn_info_visitor(struct rds_connection *conn, void *buffer) { struct rds_info_rdma_connection *iinfo = buffer; struct rds_ib_connection *ic = conn->c_transport_data; /* We will only ever look at IB transports */ if (conn->c_trans != &rds_ib_transport) return 0; if (conn->c_isv6) return 0; iinfo->src_addr = conn->c_laddr.s6_addr32[3]; iinfo->dst_addr = conn->c_faddr.s6_addr32[3]; if (ic) { iinfo->tos = conn->c_tos; iinfo->sl = ic->i_sl; } memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); if (rds_conn_state(conn) == RDS_CONN_UP) { struct rds_ib_device *rds_ibdev; rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo->src_gid, (union ib_gid *)&iinfo->dst_gid); rds_ibdev = ic->rds_ibdev; iinfo->max_send_wr = ic->i_send_ring.w_nr; iinfo->max_recv_wr = ic->i_recv_ring.w_nr; iinfo->max_send_sge = rds_ibdev->max_sge; rds_ib_get_mr_info(rds_ibdev, iinfo); iinfo->cache_allocs = atomic_read(&ic->i_cache_allocs); } return 1; } #if IS_ENABLED(CONFIG_IPV6) /* IPv6 version of rds_ib_conn_info_visitor(). */ static int rds6_ib_conn_info_visitor(struct rds_connection *conn, void *buffer) { struct rds6_info_rdma_connection *iinfo6 = buffer; struct rds_ib_connection *ic = conn->c_transport_data; /* We will only ever look at IB transports */ if (conn->c_trans != &rds_ib_transport) return 0; iinfo6->src_addr = conn->c_laddr; iinfo6->dst_addr = conn->c_faddr; if (ic) { iinfo6->tos = conn->c_tos; iinfo6->sl = ic->i_sl; } memset(&iinfo6->src_gid, 0, sizeof(iinfo6->src_gid)); memset(&iinfo6->dst_gid, 0, sizeof(iinfo6->dst_gid)); if (rds_conn_state(conn) == RDS_CONN_UP) { struct rds_ib_device *rds_ibdev; rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo6->src_gid, (union ib_gid *)&iinfo6->dst_gid); rds_ibdev = ic->rds_ibdev; iinfo6->max_send_wr = ic->i_send_ring.w_nr; iinfo6->max_recv_wr = ic->i_recv_ring.w_nr; iinfo6->max_send_sge = rds_ibdev->max_sge; rds6_ib_get_mr_info(rds_ibdev, iinfo6); iinfo6->cache_allocs = atomic_read(&ic->i_cache_allocs); } return 1; } #endif static void rds_ib_ic_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { u64 buffer[(sizeof(struct rds_info_rdma_connection) + 7) / 8]; rds_for_each_conn_info(sock, len, iter, lens, rds_ib_conn_info_visitor, buffer, sizeof(struct rds_info_rdma_connection)); } #if IS_ENABLED(CONFIG_IPV6) /* IPv6 version of rds_ib_ic_info(). */ static void rds6_ib_ic_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { u64 buffer[(sizeof(struct rds6_info_rdma_connection) + 7) / 8]; rds_for_each_conn_info(sock, len, iter, lens, rds6_ib_conn_info_visitor, buffer, sizeof(struct rds6_info_rdma_connection)); } #endif /* * Early RDS/IB was built to only bind to an address if there is an IPoIB * device with that address set. * * If it were me, I'd advocate for something more flexible. Sending and * receiving should be device-agnostic. Transports would try and maintain * connections between peers who have messages queued. Userspace would be * allowed to influence which paths have priority. We could call userspace * asserting this policy "routing". 
/*
 * Early RDS/IB was built to only bind to an address if there is an IPoIB
 * device with that address set.
 *
 * If it were me, I'd advocate for something more flexible. Sending and
 * receiving should be device-agnostic. Transports would try and maintain
 * connections between peers who have messages queued. Userspace would be
 * allowed to influence which paths have priority. We could call userspace
 * asserting this policy "routing".
 */
static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr,
			      __u32 scope_id)
{
	int ret;
	struct rdma_cm_id *cm_id;
#if IS_ENABLED(CONFIG_IPV6)
	struct sockaddr_in6 sin6;
#endif
	struct sockaddr_in sin;
	struct sockaddr *sa;
	bool isv4;

	isv4 = ipv6_addr_v4mapped(addr);
	/* Create a CMA ID and try to bind it. This catches both
	 * IB and iWARP capable NICs.
	 */
	cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler,
			       NULL, RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(cm_id))
		return PTR_ERR(cm_id);

	if (isv4) {
		memset(&sin, 0, sizeof(sin));
		sin.sin_family = AF_INET;
		sin.sin_addr.s_addr = addr->s6_addr32[3];
		sa = (struct sockaddr *)&sin;
	} else {
#if IS_ENABLED(CONFIG_IPV6)
		memset(&sin6, 0, sizeof(sin6));
		sin6.sin6_family = AF_INET6;
		sin6.sin6_addr = *addr;
		sin6.sin6_scope_id = scope_id;
		sa = (struct sockaddr *)&sin6;

		/* XXX Do a special IPv6 link local address check here. The
		 * reason is that rdma_bind_addr() always succeeds with an
		 * IPv6 link local address regardless of whether it is
		 * actually configured in the system.
		 */
		if (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL) {
			struct net_device *dev;

			if (scope_id == 0) {
				ret = -EADDRNOTAVAIL;
				goto out;
			}

			/* Use init_net for now as RDS is not network
			 * namespace aware.
			 */
			dev = dev_get_by_index(&init_net, scope_id);
			if (!dev) {
				ret = -EADDRNOTAVAIL;
				goto out;
			}
			if (!ipv6_chk_addr(&init_net, addr, dev, 1)) {
				dev_put(dev);
				ret = -EADDRNOTAVAIL;
				goto out;
			}
			dev_put(dev);
		}
#else
		ret = -EADDRNOTAVAIL;
		goto out;
#endif
	}

	/* rdma_bind_addr will only succeed for IB & iWARP devices */
	ret = rdma_bind_addr(cm_id, sa);
	/* due to this, we will claim to support iWARP devices unless we
	 * check node_type.
	 */
	if (ret || !cm_id->device ||
	    cm_id->device->node_type != RDMA_NODE_IB_CA)
		ret = -EADDRNOTAVAIL;

	rdsdebug("addr %pI6c%%%u ret %d node type %d\n",
		 addr, scope_id, ret,
		 cm_id->device ? cm_id->device->node_type : -1);

out:
	rdma_destroy_id(cm_id);

	return ret;
}

static void rds_ib_unregister_client(void)
{
	ib_unregister_client(&rds_ib_client);
	/* wait for rds_ib_dev_free() to complete */
	flush_workqueue(rds_wq);
}

static void rds_ib_set_unloading(void)
{
	atomic_set(&rds_ib_unloading, 1);
}

static bool rds_ib_is_unloading(struct rds_connection *conn)
{
	struct rds_conn_path *cp = &conn->c_path[0];

	return (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags) ||
		atomic_read(&rds_ib_unloading) != 0);
}

void rds_ib_exit(void)
{
	rds_ib_set_unloading();
	synchronize_rcu();
	rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
#if IS_ENABLED(CONFIG_IPV6)
	rds_info_deregister_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
#endif
	rds_ib_unregister_client();
	rds_ib_destroy_nodev_conns();
	rds_ib_sysctl_exit();
	rds_ib_recv_exit();
	rds_trans_unregister(&rds_ib_transport);
	rds_ib_mr_exit();
}
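/*
 * A standalone userspace analogue (not part of this file) of the probe that
 * rds_ib_laddr_check() performs: bind a throwaway CM ID to the address and
 * see whether an RDMA device answers. This is a sketch against librdmacm as
 * I understand its API; the helper name and flow are assumptions, and a real
 * caller would also want the IPv6 and node_type checks done above.
 * Build with: cc probe.c -lrdmacm
 */
#include <arpa/inet.h>
#include <rdma/rdma_cma.h>
#include <stdio.h>

/* Returns 0 if the given IPv4 address is backed by an RDMA device. */
static int rdma_laddr_check(const char *ip)
{
	struct rdma_event_channel *ch;
	struct rdma_cm_id *id;
	struct sockaddr_in sin = { .sin_family = AF_INET };
	int ret;

	if (inet_pton(AF_INET, ip, &sin.sin_addr) != 1)
		return -1;

	ch = rdma_create_event_channel();
	if (!ch)
		return -1;
	if (rdma_create_id(ch, &id, NULL, RDMA_PS_TCP)) {
		rdma_destroy_event_channel(ch);
		return -1;
	}

	/* Binding succeeds only if an RDMA-capable device owns the address. */
	ret = rdma_bind_addr(id, (struct sockaddr *)&sin);
	if (!ret && !id->verbs)	/* bound, but no verbs device behind it */
		ret = -1;

	rdma_destroy_id(id);
	rdma_destroy_event_channel(ch);
	return ret;
}

int main(int argc, char **argv)
{
	const char *ip = argc > 1 ? argv[1] : "127.0.0.1";

	printf("%s: %s\n", ip,
	       rdma_laddr_check(ip) ? "not RDMA-capable" : "RDMA-capable");
	return 0;
}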
static u8 rds_ib_get_tos_map(u8 tos)
{
	/* 1:1 user to transport map for RDMA transport.
	 * In the future, if a custom map is desired, this hook can export
	 * a user-configurable map.
	 */
	return tos;
}

struct rds_transport rds_ib_transport = {
	.laddr_check		= rds_ib_laddr_check,
	.xmit_path_complete	= rds_ib_xmit_path_complete,
	.xmit			= rds_ib_xmit,
	.xmit_rdma		= rds_ib_xmit_rdma,
	.xmit_atomic		= rds_ib_xmit_atomic,
	.recv_path		= rds_ib_recv_path,
	.conn_alloc		= rds_ib_conn_alloc,
	.conn_free		= rds_ib_conn_free,
	.conn_path_connect	= rds_ib_conn_path_connect,
	.conn_path_shutdown	= rds_ib_conn_path_shutdown,
	.inc_copy_to_user	= rds_ib_inc_copy_to_user,
	.inc_free		= rds_ib_inc_free,
	.cm_initiate_connect	= rds_ib_cm_initiate_connect,
	.cm_handle_connect	= rds_ib_cm_handle_connect,
	.cm_connect_complete	= rds_ib_cm_connect_complete,
	.stats_info_copy	= rds_ib_stats_info_copy,
	.exit			= rds_ib_exit,
	.get_mr			= rds_ib_get_mr,
	.sync_mr		= rds_ib_sync_mr,
	.free_mr		= rds_ib_free_mr,
	.flush_mrs		= rds_ib_flush_mrs,
	.get_tos_map		= rds_ib_get_tos_map,
	.t_owner		= THIS_MODULE,
	.t_name			= "infiniband",
	.t_unloading		= rds_ib_is_unloading,
	.t_type			= RDS_TRANS_IB
};

int rds_ib_init(void)
{
	int ret;

	INIT_LIST_HEAD(&rds_ib_devices);

	ret = rds_ib_mr_init();
	if (ret)
		goto out;

	ret = ib_register_client(&rds_ib_client);
	if (ret)
		goto out_mr_exit;

	ret = rds_ib_sysctl_init();
	if (ret)
		goto out_ibreg;

	ret = rds_ib_recv_init();
	if (ret)
		goto out_sysctl;

	rds_trans_register(&rds_ib_transport);

	rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
#if IS_ENABLED(CONFIG_IPV6)
	rds_info_register_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
#endif

	goto out;

out_sysctl:
	rds_ib_sysctl_exit();
out_ibreg:
	rds_ib_unregister_client();
out_mr_exit:
	rds_ib_mr_exit();
out:
	return ret;
}

MODULE_LICENSE("GPL");
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1993 Linus Torvalds
 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
 * Numa awareness, Christoph Lameter, SGI, June 2005
 * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
 */

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/set_memory.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
#include <linux/xarray.h>
#include <linux/io.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/memcontrol.h>
#include <linux/llist.h>
#include <linux/uio.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>
#include <linux/pgtable.h>
#include <linux/hugetlb.h>
#include <linux/sched/mm.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
#include <linux/page_owner.h>

#define CREATE_TRACE_POINTS
#include <trace/events/vmalloc.h>

#include "internal.h"
#include "pgalloc-track.h"

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1;

static int __init set_nohugeiomap(char *str)
{
	ioremap_max_page_shift = PAGE_SHIFT;
	return 0;
}
early_param("nohugeiomap", set_nohugeiomap);
#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
static const unsigned int ioremap_max_page_shift = PAGE_SHIFT;
#endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */

#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
static bool __ro_after_init vmap_allow_huge = true;

static int __init set_nohugevmalloc(char *str)
{
	vmap_allow_huge = false;
	return 0;
}
early_param("nohugevmalloc", set_nohugevmalloc);
#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
static const bool vmap_allow_huge = false;
#endif	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */

bool is_vmalloc_addr(const void *x)
{
	unsigned long addr = (unsigned long)kasan_reset_tag(x);

	return addr >= VMALLOC_START && addr < VMALLOC_END;
}
EXPORT_SYMBOL(is_vmalloc_addr);

struct vfree_deferred {
	struct llist_head list;
	struct work_struct wq;
};
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
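/*
 * A standalone sketch (not part of this file): is_vmalloc_addr() strips the
 * KASAN tag before the range compare, because on tagged-pointer arches the
 * top byte carries a tag that would break ordinary address arithmetic. This
 * is a simplified userspace model of that idea; the tag layout, the reset
 * rule (restore the top byte of a kernel address to all-ones), and the
 * VMALLOC_START/VMALLOC_END values are all assumptions for illustration.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t reset_tag(uint64_t tagged)
{
	return tagged | (0xffULL << 56);	/* untag a kernel address */
}

int main(void)
{
	uint64_t start = 0xffff800000000000ULL;	/* assumed VMALLOC_START */
	uint64_t end   = 0xfffffdffffffffffULL;	/* assumed VMALLOC_END */
	uint64_t ptr   = 0x2bff800000001000ULL;	/* tag 0x2b in the top byte */
	uint64_t addr  = reset_tag(ptr);

	printf("tagged:   %#llx\n", (unsigned long long)ptr);
	printf("untagged: %#llx in range: %d\n", (unsigned long long)addr,
	       addr >= start && addr < end);	/* prints 1 */
	return 0;
}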
/*** Page table manipulation functions ***/
static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  phys_addr_t phys_addr, pgprot_t prot,
			  unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	pte_t *pte;
	u64 pfn;
	struct page *page;
	unsigned long size = PAGE_SIZE;

	pfn = phys_addr >> PAGE_SHIFT;
	pte = pte_alloc_kernel_track(pmd, addr, mask);
	if (!pte)
		return -ENOMEM;
	do {
		if (unlikely(!pte_none(ptep_get(pte)))) {
			if (pfn_valid(pfn)) {
				page = pfn_to_page(pfn);
				dump_page(page, "remapping already mapped page");
			}
			BUG();
		}

#ifdef CONFIG_HUGETLB_PAGE
		size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
		if (size != PAGE_SIZE) {
			pte_t entry = pfn_pte(pfn, prot);

			entry = arch_make_huge_pte(entry, ilog2(size), 0);
			set_huge_pte_at(&init_mm, addr, pte, entry, size);
			pfn += PFN_DOWN(size);
			continue;
		}
#endif
		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
		pfn++;
	} while (pte += PFN_DOWN(size), addr += size, addr != end);
	*mask |= PGTBL_PTE_MODIFIED;
	return 0;
}

static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
			     phys_addr_t phys_addr, pgprot_t prot,
			     unsigned int max_page_shift)
{
	if (max_page_shift < PMD_SHIFT)
		return 0;

	if (!arch_vmap_pmd_supported(prot))
		return 0;

	if ((end - addr) != PMD_SIZE)
		return 0;

	if (!IS_ALIGNED(addr, PMD_SIZE))
		return 0;

	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
		return 0;

	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
		return 0;

	return pmd_set_huge(pmd, phys_addr, prot);
}

static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  phys_addr_t phys_addr, pgprot_t prot,
			  unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);

		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
					max_page_shift)) {
			*mask |= PGTBL_PMD_MODIFIED;
			continue;
		}

		if (vmap_pte_range(pmd, addr, next, phys_addr, prot,
					max_page_shift, mask))
			return -ENOMEM;
	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
	return 0;
}

static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
			     phys_addr_t phys_addr, pgprot_t prot,
			     unsigned int max_page_shift)
{
	if (max_page_shift < PUD_SHIFT)
		return 0;

	if (!arch_vmap_pud_supported(prot))
		return 0;

	if ((end - addr) != PUD_SIZE)
		return 0;

	if (!IS_ALIGNED(addr, PUD_SIZE))
		return 0;

	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
		return 0;

	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
		return 0;

	return pud_set_huge(pud, phys_addr, prot);
}

static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  phys_addr_t phys_addr, pgprot_t prot,
			  unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);

		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
					max_page_shift)) {
			*mask |= PGTBL_PUD_MODIFIED;
			continue;
		}

		if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
					max_page_shift, mask))
			return -ENOMEM;
	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
	return 0;
}
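/*
 * A standalone sketch (not part of this file): the vmap_try_huge_pmd()/
 * vmap_try_huge_pud() helpers above only install a huge leaf when the range
 * covers exactly one huge page and both the virtual and physical addresses
 * are aligned to it. A runnable userspace version of that eligibility test,
 * under an assumed 2M PMD span (x86-64); the helper name is hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

#define PMD_SIZE (2UL * 1024 * 1024)
#define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)

static int can_map_huge_pmd(uint64_t addr, uint64_t end, uint64_t phys)
{
	if ((end - addr) != PMD_SIZE)
		return 0;	/* must cover exactly one huge page */
	if (!IS_ALIGNED(addr, PMD_SIZE) || !IS_ALIGNED(phys, PMD_SIZE))
		return 0;	/* both VA and PA must be PMD-aligned */
	return 1;
}

int main(void)
{
	printf("%d\n", can_map_huge_pmd(0x200000, 0x400000, 0x40000000)); /* 1 */
	printf("%d\n", can_map_huge_pmd(0x201000, 0x401000, 0x40000000)); /* 0: VA unaligned */
	printf("%d\n", can_map_huge_pmd(0x200000, 0x300000, 0x40000000)); /* 0: wrong span */
	return 0;
}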
static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
			     phys_addr_t phys_addr, pgprot_t prot,
			     unsigned int max_page_shift)
{
	if (max_page_shift < P4D_SHIFT)
		return 0;

	if (!arch_vmap_p4d_supported(prot))
		return 0;

	if ((end - addr) != P4D_SIZE)
		return 0;

	if (!IS_ALIGNED(addr, P4D_SIZE))
		return 0;

	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
		return 0;

	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
		return 0;

	return p4d_set_huge(p4d, phys_addr, prot);
}

static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  phys_addr_t phys_addr, pgprot_t prot,
			  unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);

		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
					max_page_shift)) {
			*mask |= PGTBL_P4D_MODIFIED;
			continue;
		}

		if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
					max_page_shift, mask))
			return -ENOMEM;
	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
	return 0;
}

static int vmap_range_noflush(unsigned long addr, unsigned long end,
			      phys_addr_t phys_addr, pgprot_t prot,
			      unsigned int max_page_shift)
{
	pgd_t *pgd;
	unsigned long start;
	unsigned long next;
	int err;
	pgtbl_mod_mask mask = 0;

	might_sleep();
	BUG_ON(addr >= end);

	start = addr;
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
					max_page_shift, &mask);
		if (err)
			break;
	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);

	return err;
}

int vmap_page_range(unsigned long addr, unsigned long end,
		    phys_addr_t phys_addr, pgprot_t prot)
{
	int err;

	err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
				 ioremap_max_page_shift);
	flush_cache_vmap(addr, end);
	if (!err)
		err = kmsan_ioremap_page_range(addr, end, phys_addr, prot,
					       ioremap_max_page_shift);
	return err;
}

int ioremap_page_range(unsigned long addr, unsigned long end,
		       phys_addr_t phys_addr, pgprot_t prot)
{
	struct vm_struct *area;

	area = find_vm_area((void *)addr);
	if (!area || !(area->flags & VM_IOREMAP)) {
		WARN_ONCE(1, "vm_area at addr %lx is not marked as VM_IOREMAP\n",
			  addr);
		return -EINVAL;
	}
	if (addr != (unsigned long)area->addr ||
	    (void *)end != area->addr + get_vm_area_size(area)) {
		WARN_ONCE(1, "ioremap request [%lx,%lx) doesn't match vm_area [%lx, %lx)\n",
			  addr, end, (long)area->addr,
			  (long)area->addr + get_vm_area_size(area));
		return -ERANGE;
	}
	return vmap_page_range(addr, end, phys_addr, prot);
}

static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);

		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
	} while (pte++, addr += PAGE_SIZE, addr != end);
	*mask |= PGTBL_PTE_MODIFIED;
}

static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;
	int cleared;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);

		cleared = pmd_clear_huge(pmd);
		if (cleared || pmd_bad(*pmd))
			*mask |= PGTBL_PMD_MODIFIED;

		if (cleared)
			continue;
		if (pmd_none_or_clear_bad(pmd))
			continue;
		vunmap_pte_range(pmd, addr, next, mask);

		cond_resched();
	} while (pmd++, addr = next, addr != end);
}

static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;
	int cleared;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);

		cleared = pud_clear_huge(pud);
		if (cleared || pud_bad(*pud))
			*mask |= PGTBL_PUD_MODIFIED;

		if (cleared)
			continue;
		if (pud_none_or_clear_bad(pud))
			continue;
		vunmap_pmd_range(pud, addr, next, mask);
	} while (pud++, addr = next, addr != end);
}
static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);

		p4d_clear_huge(p4d);
		if (p4d_bad(*p4d))
			*mask |= PGTBL_P4D_MODIFIED;

		if (p4d_none_or_clear_bad(p4d))
			continue;
		vunmap_pud_range(p4d, addr, next, mask);
	} while (p4d++, addr = next, addr != end);
}

/*
 * vunmap_range_noflush is similar to vunmap_range, but does not
 * flush caches or TLBs.
 *
 * The caller is responsible for calling flush_cache_vunmap() before calling
 * this function, and flush_tlb_kernel_range after it has returned
 * successfully (and before the addresses are expected to cause a page fault
 * or be re-mapped for something else, if TLB flushes are being delayed or
 * coalesced).
 *
 * This is an internal function only. Do not use outside mm/.
 */
void __vunmap_range_noflush(unsigned long start, unsigned long end)
{
	unsigned long next;
	pgd_t *pgd;
	unsigned long addr = start;
	pgtbl_mod_mask mask = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_bad(*pgd))
			mask |= PGTBL_PGD_MODIFIED;
		if (pgd_none_or_clear_bad(pgd))
			continue;
		vunmap_p4d_range(pgd, addr, next, &mask);
	} while (pgd++, addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);
}

void vunmap_range_noflush(unsigned long start, unsigned long end)
{
	kmsan_vunmap_range_noflush(start, end);
	__vunmap_range_noflush(start, end);
}

/**
 * vunmap_range - unmap kernel virtual addresses
 * @addr: start of the VM area to unmap
 * @end: end of the VM area to unmap (non-inclusive)
 *
 * Clears any present PTEs in the virtual address range, flushes TLBs and
 * caches. Any subsequent access to the address before it has been re-mapped
 * is a kernel bug.
 */
void vunmap_range(unsigned long addr, unsigned long end)
{
	flush_cache_vunmap(addr, end);
	vunmap_range_noflush(addr, end);
	flush_tlb_kernel_range(addr, end);
}
static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pte_t *pte;

	/*
	 * nr is a running index into the array which helps higher level
	 * callers keep track of where we're up to.
	 */
	pte = pte_alloc_kernel_track(pmd, addr, mask);
	if (!pte)
		return -ENOMEM;
	do {
		struct page *page = pages[*nr];

		if (WARN_ON(!pte_none(ptep_get(pte))))
			return -EBUSY;
		if (WARN_ON(!page))
			return -ENOMEM;
		if (WARN_ON(!pfn_valid(page_to_pfn(page))))
			return -EINVAL;

		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
		(*nr)++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	*mask |= PGTBL_PTE_MODIFIED;
	return 0;
}

static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (p4d++, addr = next, addr != end);
	return 0;
}

static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages)
{
	unsigned long start = addr;
	pgd_t *pgd;
	unsigned long next;
	int err = 0;
	int nr = 0;
	pgtbl_mod_mask mask = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_bad(*pgd))
			mask |= PGTBL_PGD_MODIFIED;
		err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);

	return err;
}
/*
 * vmap_pages_range_noflush is similar to vmap_pages_range, but does not
 * flush caches.
 *
 * The caller is responsible for calling flush_cache_vmap() after this
 * function returns successfully and before the addresses are accessed.
 *
 * This is an internal function only. Do not use outside mm/.
 */
int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages, unsigned int page_shift)
{
	unsigned int i, nr = (end - addr) >> PAGE_SHIFT;

	WARN_ON(page_shift < PAGE_SHIFT);

	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
			page_shift == PAGE_SHIFT)
		return vmap_small_pages_range_noflush(addr, end, prot, pages);

	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
		int err;

		err = vmap_range_noflush(addr, addr + (1UL << page_shift),
					page_to_phys(pages[i]), prot,
					page_shift);
		if (err)
			return err;

		addr += 1UL << page_shift;
	}

	return 0;
}

int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages, unsigned int page_shift)
{
	int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
						 page_shift);

	if (ret)
		return ret;
	return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
}

/**
 * vmap_pages_range - map pages to a kernel virtual address
 * @addr: start of the VM area to map
 * @end: end of the VM area to map (non-inclusive)
 * @prot: page protection flags to use
 * @pages: pages to map (always PAGE_SIZE pages)
 * @page_shift: maximum shift that the pages may be mapped with, @pages must
 * be aligned and contiguous up to at least this shift.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int vmap_pages_range(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages, unsigned int page_shift)
{
	int err;

	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
	flush_cache_vmap(addr, end);
	return err;
}

static int check_sparse_vm_area(struct vm_struct *area, unsigned long start,
				unsigned long end)
{
	might_sleep();
	if (WARN_ON_ONCE(area->flags & VM_FLUSH_RESET_PERMS))
		return -EINVAL;
	if (WARN_ON_ONCE(area->flags & VM_NO_GUARD))
		return -EINVAL;
	if (WARN_ON_ONCE(!(area->flags & VM_SPARSE)))
		return -EINVAL;
	if ((end - start) >> PAGE_SHIFT > totalram_pages())
		return -E2BIG;
	if (start < (unsigned long)area->addr ||
	    (void *)end > area->addr + get_vm_area_size(area))
		return -ERANGE;
	return 0;
}

/**
 * vm_area_map_pages - map pages inside given sparse vm_area
 * @area: vm_area
 * @start: start address inside vm_area
 * @end: end address inside vm_area
 * @pages: pages to map (always PAGE_SIZE pages)
 */
int vm_area_map_pages(struct vm_struct *area, unsigned long start,
		      unsigned long end, struct page **pages)
{
	int err;

	err = check_sparse_vm_area(area, start, end);
	if (err)
		return err;

	return vmap_pages_range(start, end, PAGE_KERNEL, pages, PAGE_SHIFT);
}

/**
 * vm_area_unmap_pages - unmap pages inside given sparse vm_area
 * @area: vm_area
 * @start: start address inside vm_area
 * @end: end address inside vm_area
 */
void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
			 unsigned long end)
{
	if (check_sparse_vm_area(area, start, end))
		return;

	vunmap_range(start, end);
}

int is_vmalloc_or_module_addr(const void *x)
{
	/*
	 * ARM, x86-64 and sparc64 put modules in a special place,
	 * and fall back on vmalloc() if that fails. Others
	 * just put them in the vmalloc space.
	 */
#if defined(CONFIG_EXECMEM) && defined(MODULES_VADDR)
	unsigned long addr = (unsigned long)kasan_reset_tag(x);

	if (addr >= MODULES_VADDR && addr < MODULES_END)
		return 1;
#endif
	return is_vmalloc_addr(x);
}
EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr);
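/*
 * A standalone sketch (not part of this file): __vmap_pages_range_noflush()
 * above strides through the flat base-page array in huge-page steps, mapping
 * 1UL << page_shift bytes per call. The arithmetic, with assumed shifts
 * (4K base pages, 2M huge mappings):
 */
#include <stdio.h>

int main(void)
{
	unsigned int PAGE_SHIFT = 12;	/* assumed 4K base pages */
	unsigned int page_shift = 21;	/* assumed 2M huge mappings */
	unsigned long size = 8UL << 20;	/* an 8M range */

	unsigned int nr = size >> PAGE_SHIFT;		     /* base pages in range */
	unsigned int step = 1U << (page_shift - PAGE_SHIFT); /* pages per huge map */

	printf("%u base pages, %u per huge mapping -> %u vmap_range_noflush() calls\n",
	       nr, step, nr / step);	/* 2048, 512 -> 4 */
	return 0;
}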
/*
 * Walk a vmap address to the struct page it maps. Huge vmap mappings will
 * return the tail page that corresponds to the base page address, which
 * matches small vmap mappings.
 */
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
	unsigned long addr = (unsigned long) vmalloc_addr;
	struct page *page = NULL;
	pgd_t *pgd = pgd_offset_k(addr);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;

	/*
	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
	 * architectures that do not vmalloc module space
	 */
	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));

	if (pgd_none(*pgd))
		return NULL;
	if (WARN_ON_ONCE(pgd_leaf(*pgd)))
		return NULL; /* XXX: no allowance for huge pgd */
	if (WARN_ON_ONCE(pgd_bad(*pgd)))
		return NULL;

	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d))
		return NULL;
	if (p4d_leaf(*p4d))
		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
	if (WARN_ON_ONCE(p4d_bad(*p4d)))
		return NULL;

	pud = pud_offset(p4d, addr);
	if (pud_none(*pud))
		return NULL;
	if (pud_leaf(*pud))
		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
	if (WARN_ON_ONCE(pud_bad(*pud)))
		return NULL;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return NULL;
	if (pmd_leaf(*pmd))
		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	if (WARN_ON_ONCE(pmd_bad(*pmd)))
		return NULL;

	ptep = pte_offset_kernel(pmd, addr);
	pte = ptep_get(ptep);
	if (pte_present(pte))
		page = pte_page(pte);

	return page;
}
EXPORT_SYMBOL(vmalloc_to_page);

/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);


/*** Global kva allocator ***/

#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0

static DEFINE_SPINLOCK(free_vmap_area_lock);
static bool vmap_initialized __read_mostly;

/*
 * This kmem_cache is used for vmap_area objects. Instead of
 * allocating from slab we reuse an object from this cache to
 * make things faster. Especially in "no edge" splitting of
 * free block.
 */
static struct kmem_cache *vmap_area_cachep;

/*
 * This linked list is used in pair with free_vmap_area_root.
 * It gives O(1) access to prev/next to perform fast coalescing.
 */
static LIST_HEAD(free_vmap_area_list);

/*
 * This augment red-black tree represents the free vmap space.
 * All vmap_area objects in this tree are sorted by va->va_start
 * address. It is used for allocation and merging when a vmap
 * object is released.
 *
 * Each vmap_area node contains a maximum available free block
 * of its sub-tree, right or left. Therefore it is possible to
 * find a lowest match of free area.
 */
static struct rb_root free_vmap_area_root = RB_ROOT;

/*
 * Preload a CPU with one object for "no edge" split case. The
 * aim is to get rid of allocations from the atomic context, thus
 * to use more permissive allocation masks.
 */
static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);

/*
 * This structure defines a single, solid model where a list and
 * rb-tree are part of one entity protected by the lock. Nodes are
 * sorted in ascending order, thus for O(1) access to left/right
 * neighbors a list is used as well as for sequential traversal.
 */
struct rb_list {
	struct rb_root root;
	struct list_head head;
	spinlock_t lock;
};

/*
 * A fast size storage contains VAs up to 1M size. A pool consists
 * of ready-to-go VAs of certain sizes, linked to each other. An
 * index in the pool-array corresponds to number of pages + 1.
 */
#define MAX_VA_SIZE_PAGES	256

struct vmap_pool {
	struct list_head head;
	unsigned long len;
};
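/*
 * A standalone sketch (not part of this file): the size-segregated pools
 * above only hold VAs up to MAX_VA_SIZE_PAGES base pages (1M with 4K pages),
 * keyed by the request's page count. A runnable illustration of that
 * size-to-pages classification under an assumed 4K page size; the helper
 * name and the exact slot convention are hypothetical.
 */
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define MAX_VA_SIZE_PAGES 256	/* pools cover VAs up to 1M (256 * 4K) */

static void classify_size(unsigned long size)
{
	unsigned long pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;

	printf("%8lu bytes -> %3lu page(s), pooled: %s\n", size, pages,
	       pages <= MAX_VA_SIZE_PAGES ? "yes" : "no");
}

int main(void)
{
	classify_size(4096);		/* 1 page   -> pooled */
	classify_size(3 * 4096 + 1);	/* 4 pages  -> pooled */
	classify_size(2UL << 20);	/* 512 pages (2M) -> too big */
	return 0;
}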
/*
 * An effective vmap-node logic. Users make use of nodes instead
 * of a global heap. It allows balancing access and mitigating
 * contention.
 */
static struct vmap_node {
	/* Simple size segregated storage. */
	struct vmap_pool pool[MAX_VA_SIZE_PAGES];
	spinlock_t pool_lock;
	bool skip_populate;

	/* Bookkeeping data of this node. */
	struct rb_list busy;
	struct rb_list lazy;

	/*
	 * Ready-to-free areas.
	 */
	struct list_head purge_list;
	struct work_struct purge_work;
	unsigned long nr_purged;
} single;

/*
 * Initial setup consists of one single node, i.e. balancing
 * is fully disabled. Later on, after vmap is initialized, these
 * parameters are updated based on the system capacity.
 */
static struct vmap_node *vmap_nodes = &single;
static __read_mostly unsigned int nr_vmap_nodes = 1;
static __read_mostly unsigned int vmap_zone_size = 1;

static inline unsigned int
addr_to_node_id(unsigned long addr)
{
	return (addr / vmap_zone_size) % nr_vmap_nodes;
}

static inline struct vmap_node *
addr_to_node(unsigned long addr)
{
	return &vmap_nodes[addr_to_node_id(addr)];
}

static inline struct vmap_node *
id_to_node(unsigned int id)
{
	return &vmap_nodes[id % nr_vmap_nodes];
}

/*
 * We use the value 0 to represent "no node", that is why
 * an encoded value will be the node-id incremented by 1.
 * It is always greater than 0. A valid node_id which can
 * be encoded is [0:nr_vmap_nodes - 1]. If a passed node_id
 * is not valid, 0 is returned.
 */
static unsigned int
encode_vn_id(unsigned int node_id)
{
	/* Can store U8_MAX [0:254] nodes. */
	if (node_id < nr_vmap_nodes)
		return (node_id + 1) << BITS_PER_BYTE;

	/* Warn and no node encoded. */
	WARN_ONCE(1, "Encode wrong node id (%u)\n", node_id);
	return 0;
}

/*
 * Returns the node-id extracted from an encoded value; the valid
 * range is [0:nr_vmap_nodes-1]. Otherwise nr_vmap_nodes is
 * returned if the extracted data is wrong.
 */
static unsigned int
decode_vn_id(unsigned int val)
{
	unsigned int node_id = (val >> BITS_PER_BYTE) - 1;

	/* Can store U8_MAX [0:254] nodes. */
	if (node_id < nr_vmap_nodes)
		return node_id;

	/* If it was _not_ zero, warn. */
	WARN_ONCE(node_id != UINT_MAX,
		"Decode wrong node id (%d)\n", node_id);

	return nr_vmap_nodes;
}

static bool
is_vn_id_valid(unsigned int node_id)
{
	if (node_id < nr_vmap_nodes)
		return true;

	return false;
}

static __always_inline unsigned long
va_size(struct vmap_area *va)
{
	return (va->va_end - va->va_start);
}

static __always_inline unsigned long
get_subtree_max_size(struct rb_node *node)
{
	struct vmap_area *va;

	va = rb_entry_safe(node, struct vmap_area, rb_node);
	return va ? va->subtree_max_size : 0;
}

RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
	struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)

static void reclaim_and_purge_vmap_areas(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static void drain_vmap_area_work(struct work_struct *work);
static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);

static atomic_long_t nr_vmalloc_pages;

unsigned long vmalloc_nr_pages(void)
{
	return atomic_long_read(&nr_vmalloc_pages);
}

static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
{
	struct rb_node *n = root->rb_node;

	addr = (unsigned long)kasan_reset_tag((void *)addr);

	while (n) {
		struct vmap_area *va;

		va = rb_entry(n, struct vmap_area, rb_node);
		if (addr < va->va_start)
			n = n->rb_left;
		else if (addr >= va->va_end)
			n = n->rb_right;
		else
			return va;
	}

	return NULL;
}
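/*
 * A standalone sketch (not part of this file) of the node-id encoding and
 * zone hashing above: 0 means "no node", so the stored value is (id + 1)
 * shifted into its own byte, and decode(0) wraps to the "no node" sentinel.
 * The node count, zone size and sample address below are assumed values.
 */
#include <stdio.h>

#define BITS_PER_BYTE 8
static unsigned int nr_vmap_nodes = 4;		 /* assumed node count */
static unsigned long vmap_zone_size = 1UL << 22; /* assumed 4M zones */

static unsigned int encode_vn_id(unsigned int id)
{
	return id < nr_vmap_nodes ? (id + 1) << BITS_PER_BYTE : 0;
}

static unsigned int decode_vn_id(unsigned int val)
{
	unsigned int id = (val >> BITS_PER_BYTE) - 1;

	return id < nr_vmap_nodes ? id : nr_vmap_nodes;
}

int main(void)
{
	unsigned long addr = 0xffffc90001400000UL;	/* an arbitrary kva */

	for (unsigned int id = 0; id < nr_vmap_nodes; id++)
		printf("id %u -> 0x%x -> %u\n", id,
		       encode_vn_id(id), decode_vn_id(encode_vn_id(id)));

	/* decode(0): (0 >> 8) - 1 wraps to UINT_MAX -> "no node" sentinel */
	printf("decode(0) = %u (nr_vmap_nodes)\n", decode_vn_id(0));

	/* Zone hashing, as in addr_to_node_id(). */
	printf("node for addr: %lu\n", (addr / vmap_zone_size) % nr_vmap_nodes);
	return 0;
}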
/*
 * Look up the first VA which satisfies addr < va_end, NULL if none.
 */
static struct vmap_area *
__find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root)
{
	struct vmap_area *va = NULL;
	struct rb_node *n = root->rb_node;

	addr = (unsigned long)kasan_reset_tag((void *)addr);

	while (n) {
		struct vmap_area *tmp;

		tmp = rb_entry(n, struct vmap_area, rb_node);
		if (tmp->va_end > addr) {
			va = tmp;
			if (tmp->va_start <= addr)
				break;

			n = n->rb_left;
		} else
			n = n->rb_right;
	}

	return va;
}

/*
 * Returns a node where a first VA, that satisfies addr < va_end, resides.
 * On success, the node is locked. The caller is responsible for unlocking
 * it once the VA no longer needs to be accessed.
 *
 * Returns NULL if nothing found.
 */
static struct vmap_node *
find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
{
	unsigned long va_start_lowest;
	struct vmap_node *vn;
	int i;

repeat:
	for (i = 0, va_start_lowest = 0; i < nr_vmap_nodes; i++) {
		vn = &vmap_nodes[i];

		spin_lock(&vn->busy.lock);
		*va = __find_vmap_area_exceed_addr(addr, &vn->busy.root);

		if (*va)
			if (!va_start_lowest || (*va)->va_start < va_start_lowest)
				va_start_lowest = (*va)->va_start;
		spin_unlock(&vn->busy.lock);
	}

	/*
	 * Check if found VA exists, it might have gone away. In this case we
	 * repeat the search because a VA has been removed concurrently and we
	 * need to proceed to the next one, which is a rare case.
	 */
	if (va_start_lowest) {
		vn = addr_to_node(va_start_lowest);

		spin_lock(&vn->busy.lock);
		*va = __find_vmap_area(va_start_lowest, &vn->busy.root);

		if (*va)
			return vn;

		spin_unlock(&vn->busy.lock);
		goto repeat;
	}

	return NULL;
}

/*
 * This function returns the address of the parent node and of its
 * left or right link for further processing.
 *
 * Otherwise NULL is returned. In that case all further steps
 * regarding inserting of a conflicting, overlapping range have
 * to be declined and actually considered as a bug.
 */
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
	struct rb_root *root, struct rb_node *from,
	struct rb_node **parent)
{
	struct vmap_area *tmp_va;
	struct rb_node **link;

	if (root) {
		link = &root->rb_node;
		if (unlikely(!*link)) {
			*parent = NULL;
			return link;
		}
	} else {
		link = &from;
	}

	/*
	 * Go to the bottom of the tree. When we hit the last point
	 * we end up with the parent rb_node and the correct direction;
	 * we call it "link", where the new va->rb_node will be attached to.
	 */
	do {
		tmp_va = rb_entry(*link, struct vmap_area, rb_node);

		/*
		 * During the traversal we also do some sanity check.
		 * Trigger the WARN() if there are partial (left/right)
		 * or full overlaps.
		 */
		if (va->va_end <= tmp_va->va_start)
			link = &(*link)->rb_left;
		else if (va->va_start >= tmp_va->va_end)
			link = &(*link)->rb_right;
		else {
			WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
				va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);

			return NULL;
		}
	} while (*link);

	*parent = &tmp_va->rb_node;
	return link;
}
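/*
 * A standalone sketch (not part of this file): the find_va_links() descent
 * above classifies the new range against each node as entirely-left,
 * entirely-right, or overlapping (the bug case). A tiny runnable version of
 * that comparison on half-open [start, end) intervals; the names are
 * hypothetical.
 */
#include <stdio.h>

struct range { unsigned long start, end; };

/* -1: go left, 1: go right, 0: the ranges overlap. */
static int classify(struct range new, struct range node)
{
	if (new.end <= node.start)
		return -1;
	if (new.start >= node.end)
		return 1;
	return 0;
}

int main(void)
{
	struct range node = { 0x1000, 0x3000 };

	printf("%d\n", classify((struct range){0x0000, 0x1000}, node)); /* -1 */
	printf("%d\n", classify((struct range){0x3000, 0x4000}, node)); /*  1 */
	printf("%d\n", classify((struct range){0x2000, 0x4000}, node)); /*  0: overlap */
	return 0;
}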
		list->next : list);
}

static __always_inline void
__link_va(struct vmap_area *va, struct rb_root *root,
	struct rb_node *parent, struct rb_node **link,
	struct list_head *head, bool augment)
{
	/*
	 * VA is still not in the list, but we can
	 * identify its future previous list_head node.
	 */
	if (likely(parent)) {
		head = &rb_entry(parent, struct vmap_area, rb_node)->list;
		if (&parent->rb_right != link)
			head = head->prev;
	}

	/* Insert to the rb-tree */
	rb_link_node(&va->rb_node, parent, link);
	if (augment) {
		/*
		 * Some explanation here. Just perform simple insertion
		 * to the tree. We do not set va->subtree_max_size to
		 * its current size before calling rb_insert_augmented().
		 * It is because we populate the tree from the bottom
		 * to parent levels when the node _is_ in the tree.
		 *
		 * Therefore we set subtree_max_size to zero after insertion,
		 * to let __augment_tree_propagate_from() put everything in
		 * the correct order later on.
		 */
		rb_insert_augmented(&va->rb_node,
			root, &free_vmap_area_rb_augment_cb);
		va->subtree_max_size = 0;
	} else {
		rb_insert_color(&va->rb_node, root);
	}

	/* Address-sort this list */
	list_add(&va->list, head);
}

static __always_inline void
link_va(struct vmap_area *va, struct rb_root *root,
	struct rb_node *parent, struct rb_node **link,
	struct list_head *head)
{
	__link_va(va, root, parent, link, head, false);
}

static __always_inline void
link_va_augment(struct vmap_area *va, struct rb_root *root,
	struct rb_node *parent, struct rb_node **link,
	struct list_head *head)
{
	__link_va(va, root, parent, link, head, true);
}

static __always_inline void
__unlink_va(struct vmap_area *va, struct rb_root *root, bool augment)
{
	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
		return;

	if (augment)
		rb_erase_augmented(&va->rb_node,
			root, &free_vmap_area_rb_augment_cb);
	else
		rb_erase(&va->rb_node, root);

	list_del_init(&va->list);
	RB_CLEAR_NODE(&va->rb_node);
}

static __always_inline void
unlink_va(struct vmap_area *va, struct rb_root *root)
{
	__unlink_va(va, root, false);
}

static __always_inline void
unlink_va_augment(struct vmap_area *va, struct rb_root *root)
{
	__unlink_va(va, root, true);
}

#if DEBUG_AUGMENT_PROPAGATE_CHECK
/*
 * Gets called when a node is removed or rotated.
 */
static __always_inline unsigned long
compute_subtree_max_size(struct vmap_area *va)
{
	return max3(va_size(va),
		get_subtree_max_size(va->rb_node.rb_left),
		get_subtree_max_size(va->rb_node.rb_right));
}

static void
augment_tree_propagate_check(void)
{
	struct vmap_area *va;
	unsigned long computed_size;

	list_for_each_entry(va, &free_vmap_area_list, list) {
		computed_size = compute_subtree_max_size(va);
		if (computed_size != va->subtree_max_size)
			pr_emerg("tree is corrupted: %lu, %lu\n",
				va_size(va), va->subtree_max_size);
	}
}
#endif
/*
 * This function populates subtree_max_size from bottom to upper
 * levels starting from the VA point. The propagation must be done
 * when the VA size is modified by changing its va_start/va_end, or
 * in case of newly inserting a VA into the tree.
 *
 * It means that __augment_tree_propagate_from() must be called:
 * - After VA has been inserted to the tree(free path);
 * - After VA has been shrunk(allocation path);
 * - After VA has been increased(merging path).
 *
 * Please note that, it does not mean that upper parent nodes
 * and their subtree_max_size are recalculated all the time up
 * to the root node.
 *
 *       4--8
 *        /\
 *       /  \
 *      /    \
 *    2--2  8--8
 *
 * For example if we modify the node 4, shrinking it to 2, then
 * no modification is required. If we shrink the node 2 to 1,
 * only its subtree_max_size is updated, and set to 1. If we shrink
 * the node 8 to 6, then its subtree_max_size is set to 6 and the
 * parent node becomes 4--6.
 */
static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{
	/*
	 * Populate the tree from bottom towards the root until
	 * the calculated maximum available size of checked node
	 * is equal to its current one.
	 */
	free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);

#if DEBUG_AUGMENT_PROPAGATE_CHECK
	augment_tree_propagate_check();
#endif
}

static void
insert_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	struct rb_node **link;
	struct rb_node *parent;

	link = find_va_links(va, root, NULL, &parent);
	if (link)
		link_va(va, root, parent, link, head);
}

static void
insert_vmap_area_augment(struct vmap_area *va,
	struct rb_node *from, struct rb_root *root,
	struct list_head *head)
{
	struct rb_node **link;
	struct rb_node *parent;

	if (from)
		link = find_va_links(va, NULL, from, &parent);
	else
		link = find_va_links(va, root, NULL, &parent);

	if (link) {
		link_va_augment(va, root, parent, link, head);
		augment_tree_propagate_from(va);
	}
}

/*
 * Merge a de-allocated chunk of VA memory with the previous
 * and next free blocks. If no coalescing is done, a new free
 * area is inserted. If the VA has been merged, it is freed.
 *
 * Please note, it can return NULL in case of overlapping
 * ranges, followed by a WARN() report. Although this is buggy
 * behaviour, the system can stay alive and keep going.
 */
static __always_inline struct vmap_area *
__merge_or_add_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head, bool augment)
{
	struct vmap_area *sibling;
	struct list_head *next;
	struct rb_node **link;
	struct rb_node *parent;
	bool merged = false;

	/*
	 * Find a place in the tree where VA potentially will be
	 * inserted, unless it is merged with its sibling/siblings.
	 */
	link = find_va_links(va, root, NULL, &parent);
	if (!link)
		return NULL;

	/*
	 * Get next node of VA to check if merging can be done.
	 */
	next = get_va_next_sibling(parent, link);
	if (unlikely(next == NULL))
		goto insert;

	/*
	 * start            end
	 * |                |
	 * |<------VA------>|<-----Next----->|
	 *                  |                |
	 *                  start            end
	 */
	if (next != head) {
		sibling = list_entry(next, struct vmap_area, list);
		if (sibling->va_start == va->va_end) {
			sibling->va_start = va->va_start;

			/* Free vmap_area object. */
			kmem_cache_free(vmap_area_cachep, va);

			/* Point to the new merged area. */
			va = sibling;
			merged = true;
		}
	}

	/*
	 * start            end
	 * |                |
	 * |<-----Prev----->|<------VA------>|
	 *                  |                |
	 *                  start            end
	 */
	if (next->prev != head) {
		sibling = list_entry(next->prev, struct vmap_area, list);
		if (sibling->va_end == va->va_start) {
			/*
			 * If both neighbors are coalesced, it is important
			 * to unlink the "next" node first, followed by merging
			 * with "previous" one. Otherwise the tree might not be
			 * fully populated if a sibling's augmented value is
			 * "normalized" because of rotation operations.
			 */
			if (merged)
				__unlink_va(va, root, augment);

			sibling->va_end = va->va_end;

			/* Free vmap_area object. */
			kmem_cache_free(vmap_area_cachep, va);

			/* Point to the new merged area. */
			va = sibling;
			merged = true;
		}
	}

insert:
	if (!merged)
		__link_va(va, root, parent, link, head, augment);

	return va;
}

static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	return __merge_or_add_vmap_area(va, root, head, false);
}

static __always_inline struct vmap_area *
merge_or_add_vmap_area_augment(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	va = __merge_or_add_vmap_area(va, root, head, true);
	if (va)
		augment_tree_propagate_from(va);

	return va;
}

static __always_inline bool
is_within_this_va(struct vmap_area *va, unsigned long size,
	unsigned long align, unsigned long vstart)
{
	unsigned long nva_start_addr;

	if (va->va_start > vstart)
		nva_start_addr = ALIGN(va->va_start, align);
	else
		nva_start_addr = ALIGN(vstart, align);

	/* Can be overflowed due to big size or alignment. */
	if (nva_start_addr + size < nva_start_addr ||
			nva_start_addr < vstart)
		return false;

	return (nva_start_addr + size <= va->va_end);
}

/*
 * Find the first free block (lowest start address) in the tree
 * that will satisfy the request corresponding to the passed
 * parameters. Please note, with an alignment bigger than PAGE_SIZE,
 * the search length is adjusted to account for the worst case
 * alignment overhead.
 */
static __always_inline struct vmap_area *
find_vmap_lowest_match(struct rb_root *root, unsigned long size,
	unsigned long align, unsigned long vstart, bool adjust_search_size)
{
	struct vmap_area *va;
	struct rb_node *node;
	unsigned long length;

	/* Start from the root. */
	node = root->rb_node;

	/* Adjust the search size for alignment overhead. */
	length = adjust_search_size ? size + align - 1 : size;

	while (node) {
		va = rb_entry(node, struct vmap_area, rb_node);

		if (get_subtree_max_size(node->rb_left) >= length &&
				vstart < va->va_start) {
			node = node->rb_left;
		} else {
			if (is_within_this_va(va, size, align, vstart))
				return va;

			/*
			 * Does not make sense to go deeper towards the right
			 * sub-tree if it does not have a free block that is
			 * equal to or bigger than the requested search length.
			 */
			if (get_subtree_max_size(node->rb_right) >= length) {
				node = node->rb_right;
				continue;
			}

			/*
			 * OK. We roll back and find the first right sub-tree
			 * that will satisfy the search criteria. It can happen
			 * due to the "vstart" restriction or an alignment
			 * overhead that is bigger than PAGE_SIZE.
			 */
			while ((node = rb_parent(node))) {
				va = rb_entry(node, struct vmap_area, rb_node);
				if (is_within_this_va(va, size, align, vstart))
					return va;

				if (get_subtree_max_size(node->rb_right) >= length &&
						vstart <= va->va_start) {
					/*
					 * Shift the vstart forward. Please note, we update it with
					 * parent's start address adding "1" because we do not want
					 * to enter same sub-tree after it has already been checked
					 * and no suitable free block found there.
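					 *
					 * E.g. (hypothetical): if this parent VA starts at
					 * 0x2000, the search continues with vstart = 0x2001,
					 * so the already-checked left sub-tree can never be
					 * descended into again.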
*/ vstart = va->va_start + 1; node = node->rb_right; break; } } } } return NULL; } #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK #include <linux/random.h> static struct vmap_area * find_vmap_lowest_linear_match(struct list_head *head, unsigned long size, unsigned long align, unsigned long vstart) { struct vmap_area *va; list_for_each_entry(va, head, list) { if (!is_within_this_va(va, size, align, vstart)) continue; return va; } return NULL; } static void find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head, unsigned long size, unsigned long align) { struct vmap_area *va_1, *va_2; unsigned long vstart; unsigned int rnd; get_random_bytes(&rnd, sizeof(rnd)); vstart = VMALLOC_START + rnd; va_1 = find_vmap_lowest_match(root, size, align, vstart, false); va_2 = find_vmap_lowest_linear_match(head, size, align, vstart); if (va_1 != va_2) pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", va_1, va_2, vstart); } #endif enum fit_type { NOTHING_FIT = 0, FL_FIT_TYPE = 1, /* full fit */ LE_FIT_TYPE = 2, /* left edge fit */ RE_FIT_TYPE = 3, /* right edge fit */ NE_FIT_TYPE = 4 /* no edge fit */ }; static __always_inline enum fit_type classify_va_fit_type(struct vmap_area *va, unsigned long nva_start_addr, unsigned long size) { enum fit_type type; /* Check if it is within VA. */ if (nva_start_addr < va->va_start || nva_start_addr + size > va->va_end) return NOTHING_FIT; /* Now classify. */ if (va->va_start == nva_start_addr) { if (va->va_end == nva_start_addr + size) type = FL_FIT_TYPE; else type = LE_FIT_TYPE; } else if (va->va_end == nva_start_addr + size) { type = RE_FIT_TYPE; } else { type = NE_FIT_TYPE; } return type; } static __always_inline int va_clip(struct rb_root *root, struct list_head *head, struct vmap_area *va, unsigned long nva_start_addr, unsigned long size) { struct vmap_area *lva = NULL; enum fit_type type = classify_va_fit_type(va, nva_start_addr, size); if (type == FL_FIT_TYPE) { /* * No need to split VA, it fully fits. * * | | * V NVA V * |---------------| */ unlink_va_augment(va, root); kmem_cache_free(vmap_area_cachep, va); } else if (type == LE_FIT_TYPE) { /* * Split left edge of fit VA. * * | | * V NVA V R * |-------|-------| */ va->va_start += size; } else if (type == RE_FIT_TYPE) { /* * Split right edge of fit VA. * * | | * L V NVA V * |-------|-------| */ va->va_end = nva_start_addr; } else if (type == NE_FIT_TYPE) { /* * Split no edge of fit VA. * * | | * L V NVA V R * |---|-------|---| */ lva = __this_cpu_xchg(ne_fit_preload_node, NULL); if (unlikely(!lva)) { /* * For percpu allocator we do not do any pre-allocation * and leave it as it is. The reason is it most likely * never ends up with NE_FIT_TYPE splitting. In case of * percpu allocations offsets and sizes are aligned to * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE * are its main fitting cases. * * There are a few exceptions though, as an example it is * a first allocation (early boot up) when we have "one" * big free space that has to be split. * * Also we can hit this path in case of regular "vmap" * allocations, if "this" current CPU was not preloaded. * See the comment in alloc_vmap_area() why. If so, then * GFP_NOWAIT is used instead to get an extra object for * split purpose. That is rare and most time does not * occur. * * What happens if an allocation gets failed. Basically, * an "overflow" path is triggered to purge lazily freed * areas to free some memory, then, the "retry" path is * triggered to repeat one more time. See more details * in alloc_vmap_area() function. 
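 *
 * A small worked case (hypothetical addresses): clipping NVA
 * [0x4000:0x8000) out of a free VA [0x0:0x10000) is an
 * NE_FIT_TYPE split; "lva" becomes the left remainder
 * [0x0:0x4000) and the existing VA is shrunk to the right
 * remainder [0x8000:0x10000).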
*/ lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); if (!lva) return -1; } /* * Build the remainder. */ lva->va_start = va->va_start; lva->va_end = nva_start_addr; /* * Shrink this VA to remaining size. */ va->va_start = nva_start_addr + size; } else { return -1; } if (type != FL_FIT_TYPE) { augment_tree_propagate_from(va); if (lva) /* type == NE_FIT_TYPE */ insert_vmap_area_augment(lva, &va->rb_node, root, head); } return 0; } static unsigned long va_alloc(struct vmap_area *va, struct rb_root *root, struct list_head *head, unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend) { unsigned long nva_start_addr; int ret; if (va->va_start > vstart) nva_start_addr = ALIGN(va->va_start, align); else nva_start_addr = ALIGN(vstart, align); /* Check the "vend" restriction. */ if (nva_start_addr + size > vend) return vend; /* Update the free vmap_area. */ ret = va_clip(root, head, va, nva_start_addr, size); if (WARN_ON_ONCE(ret)) return vend; return nva_start_addr; } /* * Returns a start address of the newly allocated area, if success. * Otherwise a vend is returned that indicates failure. */ static __always_inline unsigned long __alloc_vmap_area(struct rb_root *root, struct list_head *head, unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend) { bool adjust_search_size = true; unsigned long nva_start_addr; struct vmap_area *va; /* * Do not adjust when: * a) align <= PAGE_SIZE, because it does not make any sense. * All blocks(their start addresses) are at least PAGE_SIZE * aligned anyway; * b) a short range where a requested size corresponds to exactly * specified [vstart:vend] interval and an alignment > PAGE_SIZE. * With adjusted search length an allocation would not succeed. */ if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size)) adjust_search_size = false; va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size); if (unlikely(!va)) return vend; nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend); if (nva_start_addr == vend) return vend; #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK find_vmap_lowest_match_check(root, head, size, align); #endif return nva_start_addr; } /* * Free a region of KVA allocated by alloc_vmap_area */ static void free_vmap_area(struct vmap_area *va) { struct vmap_node *vn = addr_to_node(va->va_start); /* * Remove from the busy tree/list. */ spin_lock(&vn->busy.lock); unlink_va(va, &vn->busy.root); spin_unlock(&vn->busy.lock); /* * Insert/Merge it back to the free tree/list. */ spin_lock(&free_vmap_area_lock); merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list); spin_unlock(&free_vmap_area_lock); } static inline void preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node) { struct vmap_area *va = NULL, *tmp; /* * Preload this CPU with one extra vmap_area object. It is used * when fit type of free area is NE_FIT_TYPE. It guarantees that * a CPU that does an allocation is preloaded. * * We do it in non-atomic context, thus it allows us to use more * permissive allocation masks to be more stable under low memory * condition and high memory pressure. 
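 *
 * If another task has preloaded this CPU in the meantime, the
 * cmpxchg below fails and the spare object is simply returned
 * to the slab cache after the lock has been taken.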
*/ if (!this_cpu_read(ne_fit_preload_node)) va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); spin_lock(lock); tmp = NULL; if (va && !__this_cpu_try_cmpxchg(ne_fit_preload_node, &tmp, va)) kmem_cache_free(vmap_area_cachep, va); } static struct vmap_pool * size_to_va_pool(struct vmap_node *vn, unsigned long size) { unsigned int idx = (size - 1) / PAGE_SIZE; if (idx < MAX_VA_SIZE_PAGES) return &vn->pool[idx]; return NULL; } static bool node_pool_add_va(struct vmap_node *n, struct vmap_area *va) { struct vmap_pool *vp; vp = size_to_va_pool(n, va_size(va)); if (!vp) return false; spin_lock(&n->pool_lock); list_add(&va->list, &vp->head); WRITE_ONCE(vp->len, vp->len + 1); spin_unlock(&n->pool_lock); return true; } static struct vmap_area * node_pool_del_va(struct vmap_node *vn, unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend) { struct vmap_area *va = NULL; struct vmap_pool *vp; int err = 0; vp = size_to_va_pool(vn, size); if (!vp || list_empty(&vp->head)) return NULL; spin_lock(&vn->pool_lock); if (!list_empty(&vp->head)) { va = list_first_entry(&vp->head, struct vmap_area, list); if (IS_ALIGNED(va->va_start, align)) { /* * Do some sanity check and emit a warning * if one of below checks detects an error. */ err |= (va_size(va) != size); err |= (va->va_start < vstart); err |= (va->va_end > vend); if (!WARN_ON_ONCE(err)) { list_del_init(&va->list); WRITE_ONCE(vp->len, vp->len - 1); } else { va = NULL; } } else { list_move_tail(&va->list, &vp->head); va = NULL; } } spin_unlock(&vn->pool_lock); return va; } static struct vmap_area * node_alloc(unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend, unsigned long *addr, unsigned int *vn_id) { struct vmap_area *va; *vn_id = 0; *addr = vend; /* * Fallback to a global heap if not vmalloc or there * is only one node. */ if (vstart != VMALLOC_START || vend != VMALLOC_END || nr_vmap_nodes == 1) return NULL; *vn_id = raw_smp_processor_id() % nr_vmap_nodes; va = node_pool_del_va(id_to_node(*vn_id), size, align, vstart, vend); *vn_id = encode_vn_id(*vn_id); if (va) *addr = va->va_start; return va; } static inline void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { vm->flags = flags; vm->addr = (void *)va->va_start; vm->size = vm->requested_size = va_size(va); vm->caller = caller; va->vm = vm; } /* * Allocate a region of KVA of the specified size and alignment, within the * vstart and vend. If vm is passed in, the two will also be bound. */ static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend, int node, gfp_t gfp_mask, unsigned long va_flags, struct vm_struct *vm) { struct vmap_node *vn; struct vmap_area *va; unsigned long freed; unsigned long addr; unsigned int vn_id; int purged = 0; int ret; if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align))) return ERR_PTR(-EINVAL); if (unlikely(!vmap_initialized)) return ERR_PTR(-EBUSY); might_sleep(); /* * If a VA is obtained from a global heap(if it fails here) * it is anyway marked with this "vn_id" so it is returned * to this pool's node later. Such way gives a possibility * to populate pools based on users demand. * * On success a ready to go VA is returned. 
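 *
 * For instance, when multiple nodes are enabled, a one-page
 * request made from CPU 3 first tries the size-segregated pool
 * of vmap_nodes[3 % nr_vmap_nodes] before falling back to the
 * global free tree below.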
*/ va = node_alloc(size, align, vstart, vend, &addr, &vn_id); if (!va) { gfp_mask = gfp_mask & GFP_RECLAIM_MASK; va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); /* * Only scan the relevant parts containing pointers to other objects * to avoid false negatives. */ kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); } retry: if (addr == vend) { preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, size, align, vstart, vend); spin_unlock(&free_vmap_area_lock); } trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend); /* * If an allocation fails, the "vend" address is * returned. Therefore trigger the overflow path. */ if (unlikely(addr == vend)) goto overflow; va->va_start = addr; va->va_end = addr + size; va->vm = NULL; va->flags = (va_flags | vn_id); if (vm) { vm->addr = (void *)va->va_start; vm->size = va_size(va); va->vm = vm; } vn = addr_to_node(va->va_start); spin_lock(&vn->busy.lock); insert_vmap_area(va, &vn->busy.root, &vn->busy.head); spin_unlock(&vn->busy.lock); BUG_ON(!IS_ALIGNED(va->va_start, align)); BUG_ON(va->va_start < vstart); BUG_ON(va->va_end > vend); ret = kasan_populate_vmalloc(addr, size); if (ret) { free_vmap_area(va); return ERR_PTR(ret); } return va; overflow: if (!purged) { reclaim_and_purge_vmap_areas(); purged = 1; goto retry; } freed = 0; blocking_notifier_call_chain(&vmap_notify_list, 0, &freed); if (freed > 0) { purged = 0; goto retry; } if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) pr_warn("vmalloc_node_range for size %lu failed: Address range restricted to %#lx - %#lx\n", size, vstart, vend); kmem_cache_free(vmap_area_cachep, va); return ERR_PTR(-EBUSY); } int register_vmap_purge_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&vmap_notify_list, nb); } EXPORT_SYMBOL_GPL(register_vmap_purge_notifier); int unregister_vmap_purge_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&vmap_notify_list, nb); } EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); /* * lazy_max_pages is the maximum amount of virtual address space we gather up * before attempting to purge with a TLB flush. * * There is a tradeoff here: a larger number will cover more kernel page tables * and take slightly longer to purge, but it will linearly reduce the number of * global TLB flushes that must be performed. It would seem natural to scale * this number up linearly with the number of CPUs (because vmapping activity * could also scale linearly with the number of CPUs), however it is likely * that in practice, workloads might be constrained in other ways that mean * vmap activity will not scale linearly with CPUs. Also, I want to be * conservative and not introduce a big latency on huge systems, so go with * a less aggressive log scale. It will still be an improvement over the old * code, and it will be simple to change the scale factor if we find that it * becomes a problem on bigger systems. */ static unsigned long lazy_max_pages(void) { unsigned int log; log = fls(num_online_cpus()); return log * (32UL * 1024 * 1024 / PAGE_SIZE); } static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0); /* * Serialize vmap purging. There is no actual critical section protected * by this lock, but we want to avoid concurrent calls for performance * reasons and to make the pcpu_get_vm_areas more deterministic. 
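 *
 * Callers follow the pattern seen in reclaim_and_purge_vmap_areas()
 * and drain_vmap_area_work(): take the mutex, call
 * __purge_vmap_area_lazy(), release the mutex.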
*/ static DEFINE_MUTEX(vmap_purge_lock); /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); static cpumask_t purge_nodes; static void reclaim_list_global(struct list_head *head) { struct vmap_area *va, *n; if (list_empty(head)) return; spin_lock(&free_vmap_area_lock); list_for_each_entry_safe(va, n, head, list) merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list); spin_unlock(&free_vmap_area_lock); } static void decay_va_pool_node(struct vmap_node *vn, bool full_decay) { LIST_HEAD(decay_list); struct rb_root decay_root = RB_ROOT; struct vmap_area *va, *nva; unsigned long n_decay; int i; for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { LIST_HEAD(tmp_list); if (list_empty(&vn->pool[i].head)) continue; /* Detach the pool, so no-one can access it. */ spin_lock(&vn->pool_lock); list_replace_init(&vn->pool[i].head, &tmp_list); spin_unlock(&vn->pool_lock); if (full_decay) WRITE_ONCE(vn->pool[i].len, 0); /* Decay a pool by ~25% out of left objects. */ n_decay = vn->pool[i].len >> 2; list_for_each_entry_safe(va, nva, &tmp_list, list) { list_del_init(&va->list); merge_or_add_vmap_area(va, &decay_root, &decay_list); if (!full_decay) { WRITE_ONCE(vn->pool[i].len, vn->pool[i].len - 1); if (!--n_decay) break; } } /* * Attach the pool back if it has been partly decayed. * Please note, it is supposed that nobody(other contexts) * can populate the pool therefore a simple list replace * operation takes place here. */ if (!full_decay && !list_empty(&tmp_list)) { spin_lock(&vn->pool_lock); list_replace_init(&tmp_list, &vn->pool[i].head); spin_unlock(&vn->pool_lock); } } reclaim_list_global(&decay_list); } static void kasan_release_vmalloc_node(struct vmap_node *vn) { struct vmap_area *va; unsigned long start, end; start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start; end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end; list_for_each_entry(va, &vn->purge_list, list) { if (is_vmalloc_or_module_addr((void *) va->va_start)) kasan_release_vmalloc(va->va_start, va->va_end, va->va_start, va->va_end, KASAN_VMALLOC_PAGE_RANGE); } kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH); } static void purge_vmap_node(struct work_struct *work) { struct vmap_node *vn = container_of(work, struct vmap_node, purge_work); unsigned long nr_purged_pages = 0; struct vmap_area *va, *n_va; LIST_HEAD(local_list); if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) kasan_release_vmalloc_node(vn); vn->nr_purged = 0; list_for_each_entry_safe(va, n_va, &vn->purge_list, list) { unsigned long nr = va_size(va) >> PAGE_SHIFT; unsigned int vn_id = decode_vn_id(va->flags); list_del_init(&va->list); nr_purged_pages += nr; vn->nr_purged++; if (is_vn_id_valid(vn_id) && !vn->skip_populate) if (node_pool_add_va(vn, va)) continue; /* Go back to global. */ list_add(&va->list, &local_list); } atomic_long_sub(nr_purged_pages, &vmap_lazy_nr); reclaim_list_global(&local_list); } /* * Purges all lazily-freed vmap areas. */ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end, bool full_pool_decay) { unsigned long nr_purged_areas = 0; unsigned int nr_purge_helpers; unsigned int nr_purge_nodes; struct vmap_node *vn; int i; lockdep_assert_held(&vmap_purge_lock); /* * Use cpumask to mark which node has to be processed. 
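 * Each marked node is then drained either inline or by its
 * per-node purge_work helper, depending on how many helpers
 * the lazy_max_pages() based calculation below allows.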
*/ purge_nodes = CPU_MASK_NONE; for (i = 0; i < nr_vmap_nodes; i++) { vn = &vmap_nodes[i]; INIT_LIST_HEAD(&vn->purge_list); vn->skip_populate = full_pool_decay; decay_va_pool_node(vn, full_pool_decay); if (RB_EMPTY_ROOT(&vn->lazy.root)) continue; spin_lock(&vn->lazy.lock); WRITE_ONCE(vn->lazy.root.rb_node, NULL); list_replace_init(&vn->lazy.head, &vn->purge_list); spin_unlock(&vn->lazy.lock); start = min(start, list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start); end = max(end, list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end); cpumask_set_cpu(i, &purge_nodes); } nr_purge_nodes = cpumask_weight(&purge_nodes); if (nr_purge_nodes > 0) { flush_tlb_kernel_range(start, end); /* One extra worker is per a lazy_max_pages() full set minus one. */ nr_purge_helpers = atomic_long_read(&vmap_lazy_nr) / lazy_max_pages(); nr_purge_helpers = clamp(nr_purge_helpers, 1U, nr_purge_nodes) - 1; for_each_cpu(i, &purge_nodes) { vn = &vmap_nodes[i]; if (nr_purge_helpers > 0) { INIT_WORK(&vn->purge_work, purge_vmap_node); if (cpumask_test_cpu(i, cpu_online_mask)) schedule_work_on(i, &vn->purge_work); else schedule_work(&vn->purge_work); nr_purge_helpers--; } else { vn->purge_work.func = NULL; purge_vmap_node(&vn->purge_work); nr_purged_areas += vn->nr_purged; } } for_each_cpu(i, &purge_nodes) { vn = &vmap_nodes[i]; if (vn->purge_work.func) { flush_work(&vn->purge_work); nr_purged_areas += vn->nr_purged; } } } trace_purge_vmap_area_lazy(start, end, nr_purged_areas); return nr_purged_areas > 0; } /* * Reclaim vmap areas by purging fragmented blocks and purge_vmap_area_list. */ static void reclaim_and_purge_vmap_areas(void) { mutex_lock(&vmap_purge_lock); purge_fragmented_blocks_allcpus(); __purge_vmap_area_lazy(ULONG_MAX, 0, true); mutex_unlock(&vmap_purge_lock); } static void drain_vmap_area_work(struct work_struct *work) { mutex_lock(&vmap_purge_lock); __purge_vmap_area_lazy(ULONG_MAX, 0, false); mutex_unlock(&vmap_purge_lock); } /* * Free a vmap area, caller ensuring that the area has been unmapped, * unlinked and flush_cache_vunmap had been called for the correct * range previously. */ static void free_vmap_area_noflush(struct vmap_area *va) { unsigned long nr_lazy_max = lazy_max_pages(); unsigned long va_start = va->va_start; unsigned int vn_id = decode_vn_id(va->flags); struct vmap_node *vn; unsigned long nr_lazy; if (WARN_ON_ONCE(!list_empty(&va->list))) return; nr_lazy = atomic_long_add_return(va_size(va) >> PAGE_SHIFT, &vmap_lazy_nr); /* * If it was request by a certain node we would like to * return it to that node, i.e. its pool for later reuse. */ vn = is_vn_id_valid(vn_id) ? id_to_node(vn_id):addr_to_node(va->va_start); spin_lock(&vn->lazy.lock); insert_vmap_area(va, &vn->lazy.root, &vn->lazy.head); spin_unlock(&vn->lazy.lock); trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max); /* After this point, we may free va at any time */ if (unlikely(nr_lazy > nr_lazy_max)) schedule_work(&drain_vmap_work); } /* * Free and unmap a vmap area */ static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); vunmap_range_noflush(va->va_start, va->va_end); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(va->va_start, va->va_end); free_vmap_area_noflush(va); } struct vmap_area *find_vmap_area(unsigned long addr) { struct vmap_node *vn; struct vmap_area *va; int i, j; if (unlikely(!vmap_initialized)) return NULL; /* * An addr_to_node_id(addr) converts an address to a node index * where a VA is located. 
If a VA spans several zones and the passed
 * addr is not the same as va->va_start, which is not common, we
 * may need to scan extra nodes. See an example:
 *
 *      <----va---->
 * -|-----|-----|-----|-----|-
 *     1     2     0     1
 *
 * VA resides in node 1 whereas it spans 1, 2 and 0. If the passed
 * addr is within 2 or 0 nodes we should do extra work.
 */
	i = j = addr_to_node_id(addr);
	do {
		vn = &vmap_nodes[i];

		spin_lock(&vn->busy.lock);
		va = __find_vmap_area(addr, &vn->busy.root);
		spin_unlock(&vn->busy.lock);

		if (va)
			return va;
	} while ((i = (i + 1) % nr_vmap_nodes) != j);

	return NULL;
}

static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
{
	struct vmap_node *vn;
	struct vmap_area *va;
	int i, j;

	/*
	 * Check the comment in find_vmap_area() about the loop.
	 */
	i = j = addr_to_node_id(addr);
	do {
		vn = &vmap_nodes[i];

		spin_lock(&vn->busy.lock);
		va = __find_vmap_area(addr, &vn->busy.root);
		if (va)
			unlink_va(va, &vn->busy.root);
		spin_unlock(&vn->busy.lock);

		if (va)
			return va;
	} while ((i = (i + 1) % nr_vmap_nodes) != j);

	return NULL;
}

/*** Per cpu kva allocator ***/

/*
 * vmap space is limited especially on 32 bit architectures. Ensure there is
 * room for at least 16 percpu vmap blocks per CPU.
 */
/*
 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
 * to #define VMALLOC_SPACE	(VMALLOC_END-VMALLOC_START). Guess
 * instead (we just need a rough idea)
 */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE		(128UL*1024*1024)
#else
#define VMALLOC_SPACE		(128UL*1024*1024*1024)
#endif

#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
#define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 256K with 4K pages */
#define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
#define VMAP_BBMAP_BITS		\
		VMAP_MIN(VMAP_BBMAP_BITS_MAX,	\
		VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
			VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))

#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)

/*
 * Purge threshold to prevent overeager purging of fragmented blocks for
 * regular operations: Purge if vb->free is less than 1/4 of the capacity.
 */
#define VMAP_PURGE_THRESHOLD	(VMAP_BBMAP_BITS / 4)

#define VMAP_RAM		0x1 /* indicates vm_map_ram area*/
#define VMAP_BLOCK		0x2 /* mark out the vmap_block sub-type*/
#define VMAP_FLAGS_MASK		0x3

struct vmap_block_queue {
	spinlock_t lock;
	struct list_head free;

	/*
	 * An xarray requires extra memory to be allocated
	 * dynamically. If it is an issue, we can use an
	 * rb-tree instead.
	 */
	struct xarray vmap_blocks;
};

struct vmap_block {
	spinlock_t lock;
	struct vmap_area *va;
	unsigned long free, dirty;
	DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS);
	unsigned long dirty_min, dirty_max; /*< dirty range */
	struct list_head free_list;
	struct rcu_head rcu_head;
	struct list_head purge;
	unsigned int cpu;
};

/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
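/*
 * A worked sizing example (illustrative, assuming a 64-bit build
 * with 4 KiB pages and NR_CPUS = 64): VMALLOC_PAGES = 128 GiB /
 * 4 KiB = 32M; 32M / 64 / 16 = 32768; VMAP_MAX picks 32768 over
 * VMAP_BBMAP_BITS_MIN (128), and VMAP_MIN clamps it to
 * VMAP_BBMAP_BITS_MAX = 1024, so VMAP_BLOCK_SIZE is
 * 1024 * 4 KiB = 4 MiB per block.
 */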
/*
 * For fast access to any "vmap_block" associated with a
 * specific address, we use a hash.
 *
 * A per-cpu vmap_block_queue is used in both ways, to serialize
 * an access to free block chains among CPUs (alloc path) and it
 * also acts as a vmap_block hash (alloc/free paths). It means we
 * overload it, since we already have the per-cpu array which is
 * used as a hash table. When used as a hash, a 'cpu' passed to
 * per_cpu() is not actually a CPU but rather a hash index.
 *
 * The hash function is addr_to_vb_xa(), which hashes any address
 * to the specific index (in the hash) it belongs to. This then uses
 * the per_cpu() macro to access the array with the generated index.
 *
 * An example:
 *
 *  CPU_1  CPU_2  CPU_0
 *    |      |      |
 *    V      V      V
 * 0     10     20     30     40     50     60
 * |------|------|------|------|------|------|...<vmap address space>
 *   CPU0   CPU1   CPU2   CPU0   CPU1   CPU2
 *
 * - CPU_1 invokes vm_unmap_ram(6), 6 belongs to CPU0 zone, thus
 *   it accesses: CPU0/INDEX0 -> vmap_blocks -> xa_lock;
 *
 * - CPU_2 invokes vm_unmap_ram(11), 11 belongs to CPU1 zone, thus
 *   it accesses: CPU1/INDEX1 -> vmap_blocks -> xa_lock;
 *
 * - CPU_0 invokes vm_unmap_ram(20), 20 belongs to CPU2 zone, thus
 *   it accesses: CPU2/INDEX2 -> vmap_blocks -> xa_lock.
 *
 * This technique almost always avoids lock contention on insert/remove,
 * however the xarray spinlocks protect against any contention that remains.
 */
static struct xarray *
addr_to_vb_xa(unsigned long addr)
{
	int index = (addr / VMAP_BLOCK_SIZE) % nr_cpu_ids;

	/*
	 * Please note, nr_cpu_ids is the highest set bit of
	 * cpu_possible_mask plus one, i.e. we never invoke
	 * cpumask_next() if an index points at it, which is
	 * nr_cpu_ids - 1.
	 */
	if (!cpu_possible(index))
		index = cpumask_next(index, cpu_possible_mask);

	return &per_cpu(vmap_block_queue, index).vmap_blocks;
}

/*
 * We should probably have a fallback mechanism to allocate virtual memory
 * out of partially filled vmap blocks. However vmap block sizing should be
 * fairly reasonable according to the vmalloc size, so it shouldn't be a
 * big problem.
 */
static unsigned long addr_to_vb_idx(unsigned long addr)
{
	addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
	addr /= VMAP_BLOCK_SIZE;
	return addr;
}

static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
{
	unsigned long addr;

	addr = va_start + (pages_off << PAGE_SHIFT);
	BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
	return (void *)addr;
}
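/*
 * Illustrative check (hypothetical numbers, assuming 4 KiB pages
 * and a 4 MiB VMAP_BLOCK_SIZE): with va_start = VMALLOC_START and
 * pages_off = 3, the computed address is VMALLOC_START + 12 KiB,
 * which still maps to the same block index; the BUG_ON() above
 * enforces exactly this invariant.
 */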
/**
 * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
 *                  block. Of course the number of pages can't exceed
 *                  VMAP_BBMAP_BITS.
 * @order: how many 2^order pages should be occupied in newly allocated block
 * @gfp_mask: flags for the page level allocator
 *
 * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
 */
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
	struct vmap_block_queue *vbq;
	struct vmap_block *vb;
	struct vmap_area *va;
	struct xarray *xa;
	unsigned long vb_idx;
	int node, err;
	void *vaddr;

	node = numa_node_id();

	vb = kmalloc_node(sizeof(struct vmap_block),
			gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!vb))
		return ERR_PTR(-ENOMEM);

	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
					VMALLOC_START, VMALLOC_END,
					node, gfp_mask,
					VMAP_RAM|VMAP_BLOCK, NULL);
	if (IS_ERR(va)) {
		kfree(vb);
		return ERR_CAST(va);
	}

	vaddr = vmap_block_vaddr(va->va_start, 0);
	spin_lock_init(&vb->lock);
	vb->va = va;
	/* At least something should be left free */
	BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
	bitmap_zero(vb->used_map, VMAP_BBMAP_BITS);
	vb->free = VMAP_BBMAP_BITS - (1UL << order);
	vb->dirty = 0;
	vb->dirty_min = VMAP_BBMAP_BITS;
	vb->dirty_max = 0;
	bitmap_set(vb->used_map, 0, (1UL << order));
	INIT_LIST_HEAD(&vb->free_list);
	vb->cpu = raw_smp_processor_id();

	xa = addr_to_vb_xa(va->va_start);
	vb_idx = addr_to_vb_idx(va->va_start);
	err = xa_insert(xa, vb_idx, vb, gfp_mask);
	if (err) {
		kfree(vb);
		free_vmap_area(va);
		return ERR_PTR(err);
	}

	/*
	 * list_add_tail_rcu() could happen on another core
	 * rather than vb->cpu due to task migration, which
	 * is safe as list_add_tail_rcu() will ensure the list's
	 * integrity together with list_for_each_entry_rcu() from
	 * the read side.
	 */
	vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu);
	spin_lock(&vbq->lock);
	list_add_tail_rcu(&vb->free_list, &vbq->free);
	spin_unlock(&vbq->lock);

	return vaddr;
}

static void free_vmap_block(struct vmap_block *vb)
{
	struct vmap_node *vn;
	struct vmap_block *tmp;
	struct xarray *xa;

	xa = addr_to_vb_xa(vb->va->va_start);
	tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start));
	BUG_ON(tmp != vb);

	vn = addr_to_node(vb->va->va_start);
	spin_lock(&vn->busy.lock);
	unlink_va(vb->va, &vn->busy.root);
	spin_unlock(&vn->busy.lock);

	free_vmap_area_noflush(vb->va);
	kfree_rcu(vb, rcu_head);
}

static bool purge_fragmented_block(struct vmap_block *vb,
		struct list_head *purge_list, bool force_purge)
{
	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, vb->cpu);

	if (vb->free + vb->dirty != VMAP_BBMAP_BITS ||
	    vb->dirty == VMAP_BBMAP_BITS)
		return false;

	/* Don't overeagerly purge usable blocks unless requested */
	if (!(force_purge || vb->free < VMAP_PURGE_THRESHOLD))
		return false;

	/* prevent further allocs after releasing lock */
	WRITE_ONCE(vb->free, 0);
	/* prevent purging it again */
	WRITE_ONCE(vb->dirty, VMAP_BBMAP_BITS);
	vb->dirty_min = 0;
	vb->dirty_max = VMAP_BBMAP_BITS;
	spin_lock(&vbq->lock);
	list_del_rcu(&vb->free_list);
	spin_unlock(&vbq->lock);
	list_add_tail(&vb->purge, purge_list);
	return true;
}

static void free_purged_blocks(struct list_head *purge_list)
{
	struct vmap_block *vb, *n_vb;

	list_for_each_entry_safe(vb, n_vb, purge_list, purge) {
		list_del(&vb->purge);
		free_vmap_block(vb);
	}
}

static void purge_fragmented_blocks(int cpu)
{
	LIST_HEAD(purge);
	struct vmap_block *vb;
	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);

	rcu_read_lock();
	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
		unsigned long free = READ_ONCE(vb->free);
		unsigned long dirty = READ_ONCE(vb->dirty);

		if (free + dirty != VMAP_BBMAP_BITS ||
				dirty == VMAP_BBMAP_BITS)
			continue;

		spin_lock(&vb->lock);
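		/*
		 * Under vb->lock, unconditionally (force_purge == true)
		 * move fully-recyclable fragmented blocks onto the local
		 * purge list; they are freed after the RCU walk below.
		 */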
purge_fragmented_block(vb, &purge, true); spin_unlock(&vb->lock); } rcu_read_unlock(); free_purged_blocks(&purge); } static void purge_fragmented_blocks_allcpus(void) { int cpu; for_each_possible_cpu(cpu) purge_fragmented_blocks(cpu); } static void *vb_alloc(unsigned long size, gfp_t gfp_mask) { struct vmap_block_queue *vbq; struct vmap_block *vb; void *vaddr = NULL; unsigned int order; BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); if (WARN_ON(size == 0)) { /* * Allocating 0 bytes isn't what caller wants since * get_order(0) returns funny result. Just warn and terminate * early. */ return ERR_PTR(-EINVAL); } order = get_order(size); rcu_read_lock(); vbq = raw_cpu_ptr(&vmap_block_queue); list_for_each_entry_rcu(vb, &vbq->free, free_list) { unsigned long pages_off; if (READ_ONCE(vb->free) < (1UL << order)) continue; spin_lock(&vb->lock); if (vb->free < (1UL << order)) { spin_unlock(&vb->lock); continue; } pages_off = VMAP_BBMAP_BITS - vb->free; vaddr = vmap_block_vaddr(vb->va->va_start, pages_off); WRITE_ONCE(vb->free, vb->free - (1UL << order)); bitmap_set(vb->used_map, pages_off, (1UL << order)); if (vb->free == 0) { spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); spin_unlock(&vbq->lock); } spin_unlock(&vb->lock); break; } rcu_read_unlock(); /* Allocate new block if nothing was found */ if (!vaddr) vaddr = new_vmap_block(order, gfp_mask); return vaddr; } static void vb_free(unsigned long addr, unsigned long size) { unsigned long offset; unsigned int order; struct vmap_block *vb; struct xarray *xa; BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); flush_cache_vunmap(addr, addr + size); order = get_order(size); offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; xa = addr_to_vb_xa(addr); vb = xa_load(xa, addr_to_vb_idx(addr)); spin_lock(&vb->lock); bitmap_clear(vb->used_map, offset, (1UL << order)); spin_unlock(&vb->lock); vunmap_range_noflush(addr, addr + size); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(addr, addr + size); spin_lock(&vb->lock); /* Expand the not yet TLB flushed dirty range */ vb->dirty_min = min(vb->dirty_min, offset); vb->dirty_max = max(vb->dirty_max, offset + (1UL << order)); WRITE_ONCE(vb->dirty, vb->dirty + (1UL << order)); if (vb->dirty == VMAP_BBMAP_BITS) { BUG_ON(vb->free); spin_unlock(&vb->lock); free_vmap_block(vb); } else spin_unlock(&vb->lock); } static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) { LIST_HEAD(purge_list); int cpu; if (unlikely(!vmap_initialized)) return; mutex_lock(&vmap_purge_lock); for_each_possible_cpu(cpu) { struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); struct vmap_block *vb; unsigned long idx; rcu_read_lock(); xa_for_each(&vbq->vmap_blocks, idx, vb) { spin_lock(&vb->lock); /* * Try to purge a fragmented block first. If it's * not purgeable, check whether there is dirty * space to be flushed. 
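 * Dirty ranges of all blocks are accumulated into [start:end], so
 * a single flush_tlb_kernel_range() call below can cover them all.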
*/ if (!purge_fragmented_block(vb, &purge_list, false) && vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) { unsigned long va_start = vb->va->va_start; unsigned long s, e; s = va_start + (vb->dirty_min << PAGE_SHIFT); e = va_start + (vb->dirty_max << PAGE_SHIFT); start = min(s, start); end = max(e, end); /* Prevent that this is flushed again */ vb->dirty_min = VMAP_BBMAP_BITS; vb->dirty_max = 0; flush = 1; } spin_unlock(&vb->lock); } rcu_read_unlock(); } free_purged_blocks(&purge_list); if (!__purge_vmap_area_lazy(start, end, false) && flush) flush_tlb_kernel_range(start, end); mutex_unlock(&vmap_purge_lock); } /** * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer * * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily * to amortize TLB flushing overheads. What this means is that any page you * have now, may, in a former life, have been mapped into kernel virtual * address by the vmap layer and so there might be some CPUs with TLB entries * still referencing that page (additional to the regular 1:1 kernel mapping). * * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can * be sure that none of the pages we have control over will have any aliases * from the vmap layer. */ void vm_unmap_aliases(void) { unsigned long start = ULONG_MAX, end = 0; int flush = 0; _vm_unmap_aliases(start, end, flush); } EXPORT_SYMBOL_GPL(vm_unmap_aliases); /** * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram * @mem: the pointer returned by vm_map_ram * @count: the count passed to that vm_map_ram call (cannot unmap partial) */ void vm_unmap_ram(const void *mem, unsigned int count) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr = (unsigned long)kasan_reset_tag(mem); struct vmap_area *va; might_sleep(); BUG_ON(!addr); BUG_ON(addr < VMALLOC_START); BUG_ON(addr > VMALLOC_END); BUG_ON(!PAGE_ALIGNED(addr)); kasan_poison_vmalloc(mem, size); if (likely(count <= VMAP_MAX_ALLOC)) { debug_check_no_locks_freed(mem, size); vb_free(addr, size); return; } va = find_unlink_vmap_area(addr); if (WARN_ON_ONCE(!va)) return; debug_check_no_locks_freed((void *)va->va_start, va_size(va)); free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); /** * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) * @pages: an array of pointers to the pages to be mapped * @count: number of pages * @node: prefer to allocate data structures on this node * * If you use this function for less than VMAP_MAX_ALLOC pages, it could be * faster than vmap so it's good. But if you mix long-life and short-life * objects with vm_map_ram(), it could consume lots of address space through * fragmentation (especially on a 32bit machine). You could see failures in * the end. Please use this function for short-lived objects. 
* * Returns: a pointer to the address that has been mapped, or %NULL on failure */ void *vm_map_ram(struct page **pages, unsigned int count, int node) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr; void *mem; if (likely(count <= VMAP_MAX_ALLOC)) { mem = vb_alloc(size, GFP_KERNEL); if (IS_ERR(mem)) return NULL; addr = (unsigned long)mem; } else { struct vmap_area *va; va = alloc_vmap_area(size, PAGE_SIZE, VMALLOC_START, VMALLOC_END, node, GFP_KERNEL, VMAP_RAM, NULL); if (IS_ERR(va)) return NULL; addr = va->va_start; mem = (void *)addr; } if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, PAGE_SHIFT) < 0) { vm_unmap_ram(mem, count); return NULL; } /* * Mark the pages as accessible, now that they are mapped. * With hardware tag-based KASAN, marking is skipped for * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL); return mem; } EXPORT_SYMBOL(vm_map_ram); static struct vm_struct *vmlist __initdata; static inline unsigned int vm_area_page_order(struct vm_struct *vm) { #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC return vm->page_order; #else return 0; #endif } unsigned int get_vm_area_page_order(struct vm_struct *vm) { return vm_area_page_order(vm); } static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order) { #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC vm->page_order = order; #else BUG_ON(order != 0); #endif } /** * vm_area_add_early - add vmap area early during boot * @vm: vm_struct to add * * This function is used to add fixed kernel vm area to vmlist before * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags * should contain proper values and the other fields should be zero. * * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. */ void __init vm_area_add_early(struct vm_struct *vm) { struct vm_struct *tmp, **p; BUG_ON(vmap_initialized); for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { if (tmp->addr >= vm->addr) { BUG_ON(tmp->addr < vm->addr + vm->size); break; } else BUG_ON(tmp->addr + tmp->size > vm->addr); } vm->next = *p; *p = vm; } /** * vm_area_register_early - register vmap area early during boot * @vm: vm_struct to register * @align: requested alignment * * This function is used to register kernel vm area before * vmalloc_init() is called. @vm->size and @vm->flags should contain * proper values on entry and other fields should be zero. On return, * vm->addr contains the allocated address. * * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. */ void __init vm_area_register_early(struct vm_struct *vm, size_t align) { unsigned long addr = ALIGN(VMALLOC_START, align); struct vm_struct *cur, **p; BUG_ON(vmap_initialized); for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) { if ((unsigned long)cur->addr - addr >= vm->size) break; addr = ALIGN((unsigned long)cur->addr + cur->size, align); } BUG_ON(addr > VMALLOC_END - vm->size); vm->addr = (void *)addr; vm->next = *p; *p = vm; kasan_populate_early_vm_area_shadow(vm->addr, vm->size); } static void clear_vm_uninitialized_flag(struct vm_struct *vm) { /* * Before removing VM_UNINITIALIZED, * we should make sure that vm has proper values. * Pair with smp_rmb() in show_numa_info(). 
 */
	smp_wmb();
	vm->flags &= ~VM_UNINITIALIZED;
}

struct vm_struct *__get_vm_area_node(unsigned long size,
		unsigned long align, unsigned long shift, unsigned long flags,
		unsigned long start, unsigned long end, int node,
		gfp_t gfp_mask, const void *caller)
{
	struct vmap_area *va;
	struct vm_struct *area;
	unsigned long requested_size = size;

	BUG_ON(in_interrupt());
	size = ALIGN(size, 1ul << shift);
	if (unlikely(!size))
		return NULL;

	if (flags & VM_IOREMAP)
		align = 1ul << clamp_t(int, get_count_order_long(size),
				       PAGE_SHIFT, IOREMAP_MAX_ORDER);

	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!area))
		return NULL;

	if (!(flags & VM_NO_GUARD))
		size += PAGE_SIZE;

	area->flags = flags;
	area->caller = caller;
	area->requested_size = requested_size;

	va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
	if (IS_ERR(va)) {
		kfree(area);
		return NULL;
	}

	/*
	 * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
	 * best-effort approach, as they can be mapped outside of vmalloc code.
	 * For VM_ALLOC mappings, the pages are marked as accessible after
	 * getting mapped in __vmalloc_node_range().
	 * With hardware tag-based KASAN, marking is skipped for
	 * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
	 */
	if (!(flags & VM_ALLOC))
		area->addr = kasan_unpoison_vmalloc(area->addr, requested_size,
						    KASAN_VMALLOC_PROT_NORMAL);

	return area;
}

struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
				       unsigned long start, unsigned long end,
				       const void *caller)
{
	return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end,
				  NUMA_NO_NODE, GFP_KERNEL, caller);
}

/**
 * get_vm_area - reserve a contiguous kernel virtual area
 * @size:	 size of the area
 * @flags:	 %VM_IOREMAP for I/O mappings or VM_ALLOC
 *
 * Search an area of @size in the kernel virtual mapping area,
 * and reserve it for our purposes. Returns the area descriptor
 * on success or %NULL on failure.
 *
 * Return: the area descriptor on success or %NULL on failure.
 */
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
	return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
				  VMALLOC_START, VMALLOC_END,
				  NUMA_NO_NODE, GFP_KERNEL,
				  __builtin_return_address(0));
}

struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
				const void *caller)
{
	return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
				  VMALLOC_START, VMALLOC_END,
				  NUMA_NO_NODE, GFP_KERNEL, caller);
}

/**
 * find_vm_area - find a continuous kernel virtual area
 * @addr:	  base address
 *
 * Search for the kernel VM area starting at @addr, and return it.
 * It is up to the caller to do all required locking to keep the returned
 * pointer valid.
 *
 * Return: the area descriptor on success or %NULL on failure.
 */
struct vm_struct *find_vm_area(const void *addr)
{
	struct vmap_area *va;

	va = find_vmap_area((unsigned long)addr);
	if (!va)
		return NULL;

	return va->vm;
}

/**
 * remove_vm_area - find and remove a continuous kernel virtual area
 * @addr:	    base address
 *
 * Search for the kernel VM area starting at @addr, and remove it.
 * This function returns the found VM area, but using it is NOT safe
 * on SMP machines, except for its size or flags.
 *
 * Return: the area descriptor on success or %NULL on failure.
*/ struct vm_struct *remove_vm_area(const void *addr) { struct vmap_area *va; struct vm_struct *vm; might_sleep(); if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", addr)) return NULL; va = find_unlink_vmap_area((unsigned long)addr); if (!va || !va->vm) return NULL; vm = va->vm; debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm)); debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm)); kasan_free_module_shadow(vm); kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm)); free_unmap_vmap_area(va); return vm; } static inline void set_area_direct_map(const struct vm_struct *area, int (*set_direct_map)(struct page *page)) { int i; /* HUGE_VMALLOC passes small pages to set_direct_map */ for (i = 0; i < area->nr_pages; i++) if (page_address(area->pages[i])) set_direct_map(area->pages[i]); } /* * Flush the vm mapping and reset the direct map. */ static void vm_reset_perms(struct vm_struct *area) { unsigned long start = ULONG_MAX, end = 0; unsigned int page_order = vm_area_page_order(area); int flush_dmap = 0; int i; /* * Find the start and end range of the direct mappings to make sure that * the vm_unmap_aliases() flush includes the direct map. */ for (i = 0; i < area->nr_pages; i += 1U << page_order) { unsigned long addr = (unsigned long)page_address(area->pages[i]); if (addr) { unsigned long page_size; page_size = PAGE_SIZE << page_order; start = min(addr, start); end = max(addr + page_size, end); flush_dmap = 1; } } /* * Set direct map to something invalid so that it won't be cached if * there are any accesses after the TLB flush, then flush the TLB and * reset the direct map permissions to the default. */ set_area_direct_map(area, set_direct_map_invalid_noflush); _vm_unmap_aliases(start, end, flush_dmap); set_area_direct_map(area, set_direct_map_default_noflush); } static void delayed_vfree_work(struct work_struct *w) { struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); struct llist_node *t, *llnode; llist_for_each_safe(llnode, t, llist_del_all(&p->list)) vfree(llnode); } /** * vfree_atomic - release memory allocated by vmalloc() * @addr: memory base address * * This one is just like vfree() but can be called in any atomic context * except NMIs. */ void vfree_atomic(const void *addr) { struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); BUG_ON(in_nmi()); kmemleak_free(addr); /* * Use raw_cpu_ptr() because this can be called from preemptible * context. Preemption is absolutely fine here, because the llist_add() * implementation is lockless, so it works even if we are adding to * another cpu's list. schedule_work() should be fine with this too. */ if (addr && llist_add((struct llist_node *)addr, &p->list)) schedule_work(&p->wq); } /** * vfree - Release memory allocated by vmalloc() * @addr: Memory base address * * Free the virtually continuous memory area starting at @addr, as obtained * from one of the vmalloc() family of APIs. This will usually also free the * physical memory underlying the virtual allocation, but that memory is * reference counted, so it will not be freed until the last user goes away. * * If @addr is NULL, no operation is performed. * * Context: * May sleep if called *not* from interrupt context. * Must not be called in NMI context (strictly speaking, it could be * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling * conventions for vfree() arch-dependent would be a really bad idea). 
*/ void vfree(const void *addr) { struct vm_struct *vm; int i; if (unlikely(in_interrupt())) { vfree_atomic(addr); return; } BUG_ON(in_nmi()); kmemleak_free(addr); might_sleep(); if (!addr) return; vm = remove_vm_area(addr); if (unlikely(!vm)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); return; } if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) vm_reset_perms(vm); for (i = 0; i < vm->nr_pages; i++) { struct page *page = vm->pages[i]; BUG_ON(!page); if (!(vm->flags & VM_MAP_PUT_PAGES)) mod_memcg_page_state(page, MEMCG_VMALLOC, -1); /* * High-order allocs for huge vmallocs are split, so * can be freed as an array of order-0 allocations */ __free_page(page); cond_resched(); } if (!(vm->flags & VM_MAP_PUT_PAGES)) atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages); kvfree(vm->pages); kfree(vm); } EXPORT_SYMBOL(vfree); /** * vunmap - release virtual mapping obtained by vmap() * @addr: memory base address * * Free the virtually contiguous memory area starting at @addr, * which was created from the page array passed to vmap(). * * Must not be called in interrupt context. */ void vunmap(const void *addr) { struct vm_struct *vm; BUG_ON(in_interrupt()); might_sleep(); if (!addr) return; vm = remove_vm_area(addr); if (unlikely(!vm)) { WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n", addr); return; } kfree(vm); } EXPORT_SYMBOL(vunmap); /** * vmap - map an array of pages into virtually contiguous space * @pages: array of page pointers * @count: number of pages to map * @flags: vm_area->flags * @prot: page protection for the mapping * * Maps @count pages from @pages into contiguous kernel virtual space. * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself * (which must be kmalloc or vmalloc memory) and one reference per pages in it * are transferred from the caller to vmap(), and will be freed / dropped when * vfree() is called on the return value. * * Return: the address of the area or %NULL on failure */ void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) { struct vm_struct *area; unsigned long addr; unsigned long size; /* In bytes */ might_sleep(); if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS)) return NULL; /* * Your top guard is someone else's bottom guard. Not having a top * guard compromises someone else's mappings too. 
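 * That is why VM_NO_GUARD is cleared below: the mapping is
 * created with a trailing guard page regardless of the flag.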
*/ if (WARN_ON_ONCE(flags & VM_NO_GUARD)) flags &= ~VM_NO_GUARD; if (count > totalram_pages()) return NULL; size = (unsigned long)count << PAGE_SHIFT; area = get_vm_area_caller(size, flags, __builtin_return_address(0)); if (!area) return NULL; addr = (unsigned long)area->addr; if (vmap_pages_range(addr, addr + size, pgprot_nx(prot), pages, PAGE_SHIFT) < 0) { vunmap(area->addr); return NULL; } if (flags & VM_MAP_PUT_PAGES) { area->pages = pages; area->nr_pages = count; } return area->addr; } EXPORT_SYMBOL(vmap); #ifdef CONFIG_VMAP_PFN struct vmap_pfn_data { unsigned long *pfns; pgprot_t prot; unsigned int idx; }; static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private) { struct vmap_pfn_data *data = private; unsigned long pfn = data->pfns[data->idx]; pte_t ptent; if (WARN_ON_ONCE(pfn_valid(pfn))) return -EINVAL; ptent = pte_mkspecial(pfn_pte(pfn, data->prot)); set_pte_at(&init_mm, addr, pte, ptent); data->idx++; return 0; } /** * vmap_pfn - map an array of PFNs into virtually contiguous space * @pfns: array of PFNs * @count: number of pages to map * @prot: page protection for the mapping * * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns * the start address of the mapping. */ void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) { struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) }; struct vm_struct *area; area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP, __builtin_return_address(0)); if (!area) return NULL; if (apply_to_page_range(&init_mm, (unsigned long)area->addr, count * PAGE_SIZE, vmap_pfn_apply, &data)) { free_vm_area(area); return NULL; } flush_cache_vmap((unsigned long)area->addr, (unsigned long)area->addr + count * PAGE_SIZE); return area->addr; } EXPORT_SYMBOL_GPL(vmap_pfn); #endif /* CONFIG_VMAP_PFN */ static inline unsigned int vm_area_alloc_pages(gfp_t gfp, int nid, unsigned int order, unsigned int nr_pages, struct page **pages) { unsigned int nr_allocated = 0; struct page *page; int i; /* * For order-0 pages we make use of bulk allocator, if * the page array is partly or not at all populated due * to fails, fallback to a single page allocator that is * more permissive. */ if (!order) { while (nr_allocated < nr_pages) { unsigned int nr, nr_pages_request; /* * A maximum allowed request is hard-coded and is 100 * pages per call. That is done in order to prevent a * long preemption off scenario in the bulk-allocator * so the range is [1:100]. */ nr_pages_request = min(100U, nr_pages - nr_allocated); /* memory allocation should consider mempolicy, we can't * wrongly use nearest node when nid == NUMA_NO_NODE, * otherwise memory may be allocated in only one node, * but mempolicy wants to alloc memory by interleaving. */ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) nr = alloc_pages_bulk_mempolicy_noprof(gfp, nr_pages_request, pages + nr_allocated); else nr = alloc_pages_bulk_node_noprof(gfp, nid, nr_pages_request, pages + nr_allocated); nr_allocated += nr; cond_resched(); /* * If zero or pages were obtained partly, * fallback to a single page allocator. */ if (nr != nr_pages_request) break; } } /* High-order pages or fallback path if "bulk" fails. 
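 * Pages allocated at order > 0 on this path are split below, so
 * the rest of vmalloc can keep treating area->pages as an array
 * of order-0 pages.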
*/ while (nr_allocated < nr_pages) { if (!(gfp & __GFP_NOFAIL) && fatal_signal_pending(current)) break; if (nid == NUMA_NO_NODE) page = alloc_pages_noprof(gfp, order); else page = alloc_pages_node_noprof(nid, gfp, order); if (unlikely(!page)) break; /* * High-order allocations must be able to be treated as * independent small pages by callers (as they can with * small-page vmallocs). Some drivers do their own refcounting * on vmalloc_to_page() pages, some use page->mapping, * page->lru, etc. */ if (order) split_page(page, order); /* * Careful, we allocate and map page-order pages, but * tracking is done per PAGE_SIZE page so as to keep the * vm_struct APIs independent of the physical/mapped size. */ for (i = 0; i < (1U << order); i++) pages[nr_allocated + i] = page + i; cond_resched(); nr_allocated += 1U << order; } return nr_allocated; } static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, unsigned int page_shift, int node) { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; bool nofail = gfp_mask & __GFP_NOFAIL; unsigned long addr = (unsigned long)area->addr; unsigned long size = get_vm_area_size(area); unsigned long array_size; unsigned int nr_small_pages = size >> PAGE_SHIFT; unsigned int page_order; unsigned int flags; int ret; array_size = (unsigned long)nr_small_pages * sizeof(struct page *); if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { area->pages = __vmalloc_node_noprof(array_size, 1, nested_gfp, node, area->caller); } else { area->pages = kmalloc_node_noprof(array_size, nested_gfp, node); } if (!area->pages) { warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocated page array size %lu", nr_small_pages * PAGE_SIZE, array_size); free_vm_area(area); return NULL; } set_vm_area_page_order(area, page_shift - PAGE_SHIFT); page_order = vm_area_page_order(area); /* * High-order nofail allocations are really expensive and * potentially dangerous (pre-mature OOM, disruptive reclaim * and compaction etc. * * Please note, the __vmalloc_node_range_noprof() falls-back * to order-0 pages if high-order attempt is unsuccessful. */ area->nr_pages = vm_area_alloc_pages((page_order ? gfp_mask & ~__GFP_NOFAIL : gfp_mask) | __GFP_NOWARN, node, page_order, nr_small_pages, area->pages); atomic_long_add(area->nr_pages, &nr_vmalloc_pages); if (gfp_mask & __GFP_ACCOUNT) { int i; for (i = 0; i < area->nr_pages; i++) mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1); } /* * If not enough pages were obtained to accomplish an * allocation request, free them via vfree() if any. */ if (area->nr_pages != nr_small_pages) { /* * vm_area_alloc_pages() can fail due to insufficient memory but * also:- * * - a pending fatal signal * - insufficient huge page-order pages * * Since we always retry allocations at order-0 in the huge page * case a warning for either is spurious. 
*/ if (!fatal_signal_pending(current) && page_order == 0) warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocate pages", area->nr_pages * PAGE_SIZE); goto fail; } /* * page tables allocations ignore external gfp mask, enforce it * by the scope API */ if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) flags = memalloc_nofs_save(); else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) flags = memalloc_noio_save(); do { ret = vmap_pages_range(addr, addr + size, prot, area->pages, page_shift); if (nofail && (ret < 0)) schedule_timeout_uninterruptible(1); } while (nofail && (ret < 0)); if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) memalloc_nofs_restore(flags); else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) memalloc_noio_restore(flags); if (ret < 0) { warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to map pages", area->nr_pages * PAGE_SIZE); goto fail; } return area->addr; fail: vfree(area->addr); return NULL; } /** * __vmalloc_node_range - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment * @start: vm area range start * @end: vm area range end * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * * Allocate enough pages to cover @size from the page level * allocator with @gfp_mask flags. Please note that the full set of gfp * flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all * supported. * Zone modifiers are not supported. From the reclaim modifiers * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported) * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and * __GFP_RETRY_MAYFAIL are not supported). * * __GFP_NOWARN can be used to suppress failures messages. * * Map them into contiguous kernel virtual space, using a pagetable * protection of @prot. * * Return: the address of the area or %NULL on failure */ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) { struct vm_struct *area; void *ret; kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE; unsigned long original_align = align; unsigned int shift = PAGE_SHIFT; if (WARN_ON_ONCE(!size)) return NULL; if ((size >> PAGE_SHIFT) > totalram_pages()) { warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, exceeds total pages", size); return NULL; } if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) { /* * Try huge pages. Only try for PAGE_KERNEL allocations, * others like modules don't yet expect huge pages in * their allocations due to apply_to_page_range not * supporting them. */ if (arch_vmap_pmd_supported(prot) && size >= PMD_SIZE) shift = PMD_SHIFT; else shift = arch_vmap_pte_supported_shift(size); align = max(original_align, 1UL << shift); } again: area = __get_vm_area_node(size, align, shift, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); if (!area) { bool nofail = gfp_mask & __GFP_NOFAIL; warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, vm_struct allocation failed%s", size, (nofail) ? ". Retrying." : ""); if (nofail) { schedule_timeout_uninterruptible(1); goto again; } goto fail; } /* * Prepare arguments for __vmalloc_area_node() and * kasan_unpoison_vmalloc(). 
*/ if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { if (kasan_hw_tags_enabled()) { /* * Modify protection bits to allow tagging. * This must be done before mapping. */ prot = arch_vmap_pgprot_tagged(prot); /* * Skip page_alloc poisoning and zeroing for physical * pages backing VM_ALLOC mapping. Memory is instead * poisoned and zeroed by kasan_unpoison_vmalloc(). */ gfp_mask |= __GFP_SKIP_KASAN | __GFP_SKIP_ZERO; } /* Take note that the mapping is PAGE_KERNEL. */ kasan_flags |= KASAN_VMALLOC_PROT_NORMAL; } /* Allocate physical pages and map them into vmalloc space. */ ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!ret) goto fail; /* * Mark the pages as accessible, now that they are mapped. * The condition for setting KASAN_VMALLOC_INIT should complement the * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check * to make sure that memory is initialized under the same conditions. * Tag-based KASAN modes only assign tags to normal non-executable * allocations, see __kasan_unpoison_vmalloc(). */ kasan_flags |= KASAN_VMALLOC_VM_ALLOC; if (!want_init_on_free() && want_init_on_alloc(gfp_mask) && (gfp_mask & __GFP_SKIP_ZERO)) kasan_flags |= KASAN_VMALLOC_INIT; /* KASAN_VMALLOC_PROT_NORMAL already set if required. */ area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags); /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED * flag. It means that vm_struct is not fully initialized. * Now, it is fully initialized, so remove this flag here. */ clear_vm_uninitialized_flag(area); if (!(vm_flags & VM_DEFER_KMEMLEAK)) kmemleak_vmalloc(area, PAGE_ALIGN(size), gfp_mask); return area->addr; fail: if (shift > PAGE_SHIFT) { shift = PAGE_SHIFT; align = original_align; goto again; } return NULL; } /** * __vmalloc_node - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment * @gfp_mask: flags for the page level allocator * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * * Allocate enough pages to cover @size from the page level allocator with * @gfp_mask flags. Map them into contiguous kernel virtual space. * * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL * and __GFP_NOFAIL are not supported * * Any use of gfp flags outside of GFP_KERNEL should be consulted * with mm people. * * Return: pointer to the allocated memory or %NULL on error */ void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) { return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, 0, node, caller); } /* * This is only for performance analysis of vmalloc and stress purpose. * It is required by vmalloc test module, therefore do not use it other * than that. */ #ifdef CONFIG_TEST_VMALLOC_MODULE EXPORT_SYMBOL_GPL(__vmalloc_node_noprof); #endif void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) { return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc_noprof); /** * vmalloc - allocate virtually contiguous memory * @size: allocation size * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * * For tight control over page level allocator and protection flags * use __vmalloc() instead. 
* * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_noprof(unsigned long size) { return __vmalloc_node_noprof(size, 1, GFP_KERNEL, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_noprof); /** * vmalloc_huge - allocate virtually contiguous memory, allow huge pages * @size: allocation size * @gfp_mask: flags for the page level allocator * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * If @size is greater than or equal to PMD_SIZE, allow using * huge pages for the memory * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) { return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL_GPL(vmalloc_huge_noprof); /** * vzalloc - allocate virtually contiguous memory with zero fill * @size: allocation size * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * The memory allocated is set to zero. * * For tight control over page level allocator and protection flags * use __vmalloc() instead. * * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc_noprof(unsigned long size) { return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc_noprof); /** * vmalloc_user - allocate zeroed virtually contiguous memory for userspace * @size: allocation size * * The resulting memory area is zeroed so it can be mapped to userspace * without leaking data. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_user_noprof(unsigned long size) { return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_user_noprof); /** * vmalloc_node - allocate memory on a specific node * @size: allocation size * @node: numa node * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * * For tight control over page level allocator and protection flags * use __vmalloc() instead. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_node_noprof(unsigned long size, int node) { return __vmalloc_node_noprof(size, 1, GFP_KERNEL, node, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node_noprof); /** * vzalloc_node - allocate memory on a specific node with zero fill * @size: allocation size * @node: numa node * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * The memory allocated is set to zero. * * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc_node_noprof(unsigned long size, int node) { return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, node, __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc_node_noprof); /** * vrealloc - reallocate virtually contiguous memory; contents remain unchanged * @p: object to reallocate memory for * @size: the size to reallocate * @flags: the flags for the page level allocator * * If @p is %NULL, vrealloc() behaves exactly like vmalloc(). If @size is 0 and * @p is not a %NULL pointer, the object pointed to is freed. 
* * If __GFP_ZERO logic is requested, callers must ensure that, starting with the * initial memory allocation, every subsequent call to this API for the same * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that * __GFP_ZERO is not fully honored by this API. * * In any case, the contents of the object pointed to are preserved up to the * lesser of the new and old sizes. * * This function must not be called concurrently with itself or vfree() for the * same memory allocation. * * Return: pointer to the allocated memory; %NULL if @size is zero or in case of * failure */ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) { struct vm_struct *vm = NULL; size_t alloced_size = 0; size_t old_size = 0; void *n; if (!size) { vfree(p); return NULL; } if (p) { vm = find_vm_area(p); if (unlikely(!vm)) { WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p); return NULL; } alloced_size = get_vm_area_size(vm); old_size = vm->requested_size; if (WARN(alloced_size < old_size, "vrealloc() has mismatched area vs requested sizes (%p)\n", p)) return NULL; } /* * TODO: Shrink the vm_area, i.e. unmap and free unused pages. What * would be a good heuristic for when to shrink the vm_area? */ if (size <= old_size) { /* Zero out "freed" memory. */ if (want_init_on_free()) memset((void *)p + size, 0, old_size - size); vm->requested_size = size; kasan_poison_vmalloc(p + size, old_size - size); return (void *)p; } /* * We already have the bytes available in the allocation; use them. */ if (size <= alloced_size) { kasan_unpoison_vmalloc(p + old_size, size - old_size, KASAN_VMALLOC_PROT_NORMAL); /* Zero out "alloced" memory. */ if (want_init_on_alloc(flags)) memset((void *)p + old_size, 0, size - old_size); vm->requested_size = size; } /* TODO: Grow the vm_area, i.e. allocate and map additional pages. */ n = __vmalloc_noprof(size, flags); if (!n) return NULL; if (p) { memcpy(n, p, old_size); vfree(p); } return n; } #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL) #else /* * 64b systems should always have either DMA or DMA32 zones. For others * GFP_DMA32 should do the right thing and use the normal zone. */ #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #endif /** * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * @size: allocation size * * Allocate enough 32bit PA addressable pages to cover @size from the * page level allocator and map them into contiguous kernel virtual space. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_32_noprof(unsigned long size) { return __vmalloc_node_noprof(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32_noprof); /** * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory * @size: allocation size * * The resulting memory area is 32bit addressable and zeroed so it can be * mapped to userspace without leaking data. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_32_user_noprof(unsigned long size) { return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32_user_noprof); /* * Atomically zero bytes in the iterator. * * Returns the number of zeroed bytes. 
*/ static size_t zero_iter(struct iov_iter *iter, size_t count) { size_t remains = count; while (remains > 0) { size_t num, copied; num = min_t(size_t, remains, PAGE_SIZE); copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter); remains -= copied; if (copied < num) break; } return count - remains; } /* * small helper routine, copy contents to iter from addr. * If the page is not present, fill zero. * * Returns the number of copied bytes. */ static size_t aligned_vread_iter(struct iov_iter *iter, const char *addr, size_t count) { size_t remains = count; struct page *page; while (remains > 0) { unsigned long offset, length; size_t copied = 0; offset = offset_in_page(addr); length = PAGE_SIZE - offset; if (length > remains) length = remains; page = vmalloc_to_page(addr); /* * To do safe access to this _mapped_ area, we need lock. But * adding lock here means that we need to add overhead of * vmalloc()/vfree() calls for this _debug_ interface, rarely * used. Instead of that, we'll use an local mapping via * copy_page_to_iter_nofault() and accept a small overhead in * this access function. */ if (page) copied = copy_page_to_iter_nofault(page, offset, length, iter); else copied = zero_iter(iter, length); addr += copied; remains -= copied; if (copied != length) break; } return count - remains; } /* * Read from a vm_map_ram region of memory. * * Returns the number of copied bytes. */ static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr, size_t count, unsigned long flags) { char *start; struct vmap_block *vb; struct xarray *xa; unsigned long offset; unsigned int rs, re; size_t remains, n; /* * If it's area created by vm_map_ram() interface directly, but * not further subdividing and delegating management to vmap_block, * handle it here. */ if (!(flags & VMAP_BLOCK)) return aligned_vread_iter(iter, addr, count); remains = count; /* * Area is split into regions and tracked with vmap_block, read out * each region and zero fill the hole between regions. */ xa = addr_to_vb_xa((unsigned long) addr); vb = xa_load(xa, addr_to_vb_idx((unsigned long)addr)); if (!vb) goto finished_zero; spin_lock(&vb->lock); if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) { spin_unlock(&vb->lock); goto finished_zero; } for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) { size_t copied; if (remains == 0) goto finished; start = vmap_block_vaddr(vb->va->va_start, rs); if (addr < start) { size_t to_zero = min_t(size_t, start - addr, remains); size_t zeroed = zero_iter(iter, to_zero); addr += zeroed; remains -= zeroed; if (remains == 0 || zeroed != to_zero) goto finished; } /*it could start reading from the middle of used region*/ offset = offset_in_page(addr); n = ((re - rs + 1) << PAGE_SHIFT) - offset; if (n > remains) n = remains; copied = aligned_vread_iter(iter, start + offset, n); addr += copied; remains -= copied; if (copied != n) goto finished; } spin_unlock(&vb->lock); finished_zero: /* zero-fill the left dirty or free regions */ return count - remains + zero_iter(iter, remains); finished: /* We couldn't copy/zero everything */ spin_unlock(&vb->lock); return count - remains; } /** * vread_iter() - read vmalloc area in a safe way to an iterator. * @iter: the iterator to which data should be written. * @addr: vm address. * @count: number of bytes to be read. * * This function checks that addr is a valid vmalloc'ed area, and * copy data from that area to a given buffer. 
If the given memory range * of [addr...addr+count) includes some valid address, data is copied to * proper area of @buf. If there are memory holes, they'll be zero-filled. * IOREMAP area is treated as memory hole and no copy is done. * * If [addr...addr+count) doesn't includes any intersects with alive * vm_struct area, returns 0. @buf should be kernel's buffer. * * Note: In usual ops, vread() is never necessary because the caller * should know vmalloc() area is valid and can use memcpy(). * This is for routines which have to access vmalloc area without * any information, as /proc/kcore. * * Return: number of bytes for which addr and buf should be increased * (same number as @count) or %0 if [addr...addr+count) doesn't * include any intersection with valid vmalloc area */ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) { struct vmap_node *vn; struct vmap_area *va; struct vm_struct *vm; char *vaddr; size_t n, size, flags, remains; unsigned long next; addr = kasan_reset_tag(addr); /* Don't allow overflow */ if ((unsigned long) addr + count < count) count = -(unsigned long) addr; remains = count; vn = find_vmap_area_exceed_addr_lock((unsigned long) addr, &va); if (!vn) goto finished_zero; /* no intersects with alive vmap_area */ if ((unsigned long)addr + remains <= va->va_start) goto finished_zero; do { size_t copied; if (remains == 0) goto finished; vm = va->vm; flags = va->flags & VMAP_FLAGS_MASK; /* * VMAP_BLOCK indicates a sub-type of vm_map_ram area, need * be set together with VMAP_RAM. */ WARN_ON(flags == VMAP_BLOCK); if (!vm && !flags) goto next_va; if (vm && (vm->flags & VM_UNINITIALIZED)) goto next_va; /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ smp_rmb(); vaddr = (char *) va->va_start; size = vm ? get_vm_area_size(vm) : va_size(va); if (addr >= vaddr + size) goto next_va; if (addr < vaddr) { size_t to_zero = min_t(size_t, vaddr - addr, remains); size_t zeroed = zero_iter(iter, to_zero); addr += zeroed; remains -= zeroed; if (remains == 0 || zeroed != to_zero) goto finished; } n = vaddr + size - addr; if (n > remains) n = remains; if (flags & VMAP_RAM) copied = vmap_ram_vread_iter(iter, addr, n, flags); else if (!(vm && (vm->flags & (VM_IOREMAP | VM_SPARSE)))) copied = aligned_vread_iter(iter, addr, n); else /* IOREMAP | SPARSE area is treated as memory hole */ copied = zero_iter(iter, n); addr += copied; remains -= copied; if (copied != n) goto finished; next_va: next = va->va_end; spin_unlock(&vn->busy.lock); } while ((vn = find_vmap_area_exceed_addr_lock(next, &va))); finished_zero: if (vn) spin_unlock(&vn->busy.lock); /* zero-fill memory holes */ return count - remains + zero_iter(iter, remains); finished: /* Nothing remains, or We couldn't copy/zero everything. */ if (vn) spin_unlock(&vn->busy.lock); return count - remains; } /** * remap_vmalloc_range_partial - map vmalloc pages to userspace * @vma: vma to cover * @uaddr: target user address to start at * @kaddr: virtual address of vmalloc kernel memory * @pgoff: offset from @kaddr to start at * @size: size of map area * * Returns: 0 for success, -Exxx on failure * * This function checks that @kaddr is a valid vmalloc'ed area, * and that it is big enough to cover the range starting at * @uaddr in @vma. Will return failure if that criteria isn't * met. 
* * Similar to remap_pfn_range() (see mm/memory.c) */ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, void *kaddr, unsigned long pgoff, unsigned long size) { struct vm_struct *area; unsigned long off; unsigned long end_index; if (check_shl_overflow(pgoff, PAGE_SHIFT, &off)) return -EINVAL; size = PAGE_ALIGN(size); if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr)) return -EINVAL; area = find_vm_area(kaddr); if (!area) return -EINVAL; if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT))) return -EINVAL; if (check_add_overflow(size, off, &end_index) || end_index > get_vm_area_size(area)) return -EINVAL; kaddr += off; do { struct page *page = vmalloc_to_page(kaddr); int ret; ret = vm_insert_page(vma, uaddr, page); if (ret) return ret; uaddr += PAGE_SIZE; kaddr += PAGE_SIZE; size -= PAGE_SIZE; } while (size > 0); vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); return 0; } /** * remap_vmalloc_range - map vmalloc pages to userspace * @vma: vma to cover (map full range of vma) * @addr: vmalloc memory * @pgoff: number of pages into addr before first page to map * * Returns: 0 for success, -Exxx on failure * * This function checks that addr is a valid vmalloc'ed area, and * that it is big enough to cover the vma. Will return failure if * that criteria isn't met. * * Similar to remap_pfn_range() (see mm/memory.c) */ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff) { return remap_vmalloc_range_partial(vma, vma->vm_start, addr, pgoff, vma->vm_end - vma->vm_start); } EXPORT_SYMBOL(remap_vmalloc_range); void free_vm_area(struct vm_struct *area) { struct vm_struct *ret; ret = remove_vm_area(area->addr); BUG_ON(ret != area); kfree(area); } EXPORT_SYMBOL_GPL(free_vm_area); #ifdef CONFIG_SMP static struct vmap_area *node_to_va(struct rb_node *n) { return rb_entry_safe(n, struct vmap_area, rb_node); } /** * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to * @addr: target address * * Returns: vmap_area if it is found. If there is no such area * the first highest(reverse order) vmap_area is returned * i.e. va->va_start < addr && va->va_end < addr or NULL * if there are no any areas before @addr. */ static struct vmap_area * pvm_find_va_enclose_addr(unsigned long addr) { struct vmap_area *va, *tmp; struct rb_node *n; n = free_vmap_area_root.rb_node; va = NULL; while (n) { tmp = rb_entry(n, struct vmap_area, rb_node); if (tmp->va_start <= addr) { va = tmp; if (tmp->va_end >= addr) break; n = n->rb_right; } else { n = n->rb_left; } } return va; } /** * pvm_determine_end_from_reverse - find the highest aligned address * of free block below VMALLOC_END * @va: * in - the VA we start the search(reverse order); * out - the VA with the highest aligned end address. 
* @align: alignment for required highest address * * Returns: determined end address within vmap_area */ static unsigned long pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align) { unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); unsigned long addr; if (likely(*va)) { list_for_each_entry_from_reverse((*va), &free_vmap_area_list, list) { addr = min((*va)->va_end & ~(align - 1), vmalloc_end); if ((*va)->va_start < addr) return addr; } } return 0; } /** * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator * @offsets: array containing offset of each area * @sizes: array containing size of each area * @nr_vms: the number of areas to allocate * @align: alignment, all entries in @offsets and @sizes must be aligned to this * * Returns: kmalloc'd vm_struct pointer array pointing to allocated * vm_structs on success, %NULL on failure * * Percpu allocator wants to use congruent vm areas so that it can * maintain the offsets among percpu areas. This function allocates * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to * be scattered pretty far, distance between two areas easily going up * to gigabytes. To avoid interacting with regular vmallocs, these * areas are allocated from top. * * Despite its complicated look, this allocator is rather simple. It * does everything top-down and scans free blocks from the end looking * for matching base. While scanning, if any of the areas do not fit the * base address is pulled down to fit the area. Scanning is repeated till * all the areas fit and then all necessary data structures are inserted * and the result is returned. */ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, size_t align) { const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); struct vmap_area **vas, *va; struct vm_struct **vms; int area, area2, last_area, term_area; unsigned long base, start, size, end, last_end, orig_start, orig_end; bool purged = false; /* verify parameters and allocate data structures */ BUG_ON(offset_in_page(align) || !is_power_of_2(align)); for (last_area = 0, area = 0; area < nr_vms; area++) { start = offsets[area]; end = start + sizes[area]; /* is everything aligned properly? */ BUG_ON(!IS_ALIGNED(offsets[area], align)); BUG_ON(!IS_ALIGNED(sizes[area], align)); /* detect the area with the highest address */ if (start > offsets[last_area]) last_area = area; for (area2 = area + 1; area2 < nr_vms; area2++) { unsigned long start2 = offsets[area2]; unsigned long end2 = start2 + sizes[area2]; BUG_ON(start2 < end && start < end2); } } last_end = offsets[last_area] + sizes[last_area]; if (vmalloc_end - vmalloc_start < last_end) { WARN_ON(true); return NULL; } vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL); vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL); if (!vas || !vms) goto err_free2; for (area = 0; area < nr_vms; area++) { vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL); vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); if (!vas[area] || !vms[area]) goto err_free; } retry: spin_lock(&free_vmap_area_lock); /* start scanning - we scan from the top, begin with the last area */ area = term_area = last_area; start = offsets[area]; end = start + sizes[area]; va = pvm_find_va_enclose_addr(vmalloc_end); base = pvm_determine_end_from_reverse(&va, align) - end; while (true) { /* * base might have underflowed, add last_end before * comparing. 
*/ if (base + last_end < vmalloc_start + last_end) goto overflow; /* * Fitting base has not been found. */ if (va == NULL) goto overflow; /* * If required width exceeds current VA block, move * base downwards and then recheck. */ if (base + end > va->va_end) { base = pvm_determine_end_from_reverse(&va, align) - end; term_area = area; continue; } /* * If this VA does not fit, move base downwards and recheck. */ if (base + start < va->va_start) { va = node_to_va(rb_prev(&va->rb_node)); base = pvm_determine_end_from_reverse(&va, align) - end; term_area = area; continue; } /* * This area fits, move on to the previous one. If * the previous one is the terminal one, we're done. */ area = (area + nr_vms - 1) % nr_vms; if (area == term_area) break; start = offsets[area]; end = start + sizes[area]; va = pvm_find_va_enclose_addr(base + end); } /* we've found a fitting base, insert all va's */ for (area = 0; area < nr_vms; area++) { int ret; start = base + offsets[area]; size = sizes[area]; va = pvm_find_va_enclose_addr(start); if (WARN_ON_ONCE(va == NULL)) /* It is a BUG(), but trigger recovery instead. */ goto recovery; ret = va_clip(&free_vmap_area_root, &free_vmap_area_list, va, start, size); if (WARN_ON_ONCE(unlikely(ret))) /* It is a BUG(), but trigger recovery instead. */ goto recovery; /* Allocated area. */ va = vas[area]; va->va_start = start; va->va_end = start + size; } spin_unlock(&free_vmap_area_lock); /* populate the kasan shadow space */ for (area = 0; area < nr_vms; area++) { if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) goto err_free_shadow; } /* insert all vm's */ for (area = 0; area < nr_vms; area++) { struct vmap_node *vn = addr_to_node(vas[area]->va_start); spin_lock(&vn->busy.lock); insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head); setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC, pcpu_get_vm_areas); spin_unlock(&vn->busy.lock); } /* * Mark allocated areas as accessible. Do it now as a best-effort * approach, as they can be mapped outside of vmalloc code. * With hardware tag-based KASAN, marking is skipped for * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ for (area = 0; area < nr_vms; area++) vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, vms[area]->size, KASAN_VMALLOC_PROT_NORMAL); kfree(vas); return vms; recovery: /* * Remove previously allocated areas. There is no * need in removing these areas from the busy tree, * because they are inserted only on the final step * and when pcpu_get_vm_areas() is success. */ while (area--) { orig_start = vas[area]->va_start; orig_end = vas[area]->va_end; va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, &free_vmap_area_list); if (va) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end, KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH); vas[area] = NULL; } overflow: spin_unlock(&free_vmap_area_lock); if (!purged) { reclaim_and_purge_vmap_areas(); purged = true; /* Before "retry", check if we recover. */ for (area = 0; area < nr_vms; area++) { if (vas[area]) continue; vas[area] = kmem_cache_zalloc( vmap_area_cachep, GFP_KERNEL); if (!vas[area]) goto err_free; } goto retry; } err_free: for (area = 0; area < nr_vms; area++) { if (vas[area]) kmem_cache_free(vmap_area_cachep, vas[area]); kfree(vms[area]); } err_free2: kfree(vas); kfree(vms); return NULL; err_free_shadow: spin_lock(&free_vmap_area_lock); /* * We release all the vmalloc shadows, even the ones for regions that * hadn't been successfully added. 
This relies on kasan_release_vmalloc * being able to tolerate this case. */ for (area = 0; area < nr_vms; area++) { orig_start = vas[area]->va_start; orig_end = vas[area]->va_end; va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, &free_vmap_area_list); if (va) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end, KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH); vas[area] = NULL; kfree(vms[area]); } spin_unlock(&free_vmap_area_lock); kfree(vas); kfree(vms); return NULL; } /** * pcpu_free_vm_areas - free vmalloc areas for percpu allocator * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() * @nr_vms: the number of allocated areas * * Free vm_structs and the array allocated by pcpu_get_vm_areas(). */ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) { int i; for (i = 0; i < nr_vms; i++) free_vm_area(vms[i]); kfree(vms); } #endif /* CONFIG_SMP */ #ifdef CONFIG_PRINTK bool vmalloc_dump_obj(void *object) { const void *caller; struct vm_struct *vm; struct vmap_area *va; struct vmap_node *vn; unsigned long addr; unsigned int nr_pages; addr = PAGE_ALIGN((unsigned long) object); vn = addr_to_node(addr); if (!spin_trylock(&vn->busy.lock)) return false; va = __find_vmap_area(addr, &vn->busy.root); if (!va || !va->vm) { spin_unlock(&vn->busy.lock); return false; } vm = va->vm; addr = (unsigned long) vm->addr; caller = vm->caller; nr_pages = vm->nr_pages; spin_unlock(&vn->busy.lock); pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", nr_pages, addr, caller); return true; } #endif #ifdef CONFIG_PROC_FS static void show_numa_info(struct seq_file *m, struct vm_struct *v) { if (IS_ENABLED(CONFIG_NUMA)) { unsigned int nr, *counters = m->private; unsigned int step = 1U << vm_area_page_order(v); if (!counters) return; if (v->flags & VM_UNINITIALIZED) return; /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ smp_rmb(); memset(counters, 0, nr_node_ids * sizeof(unsigned int)); for (nr = 0; nr < v->nr_pages; nr += step) counters[page_to_nid(v->pages[nr])] += step; for_each_node_state(nr, N_HIGH_MEMORY) if (counters[nr]) seq_printf(m, " N%u=%u", nr, counters[nr]); } } static void show_purge_info(struct seq_file *m) { struct vmap_node *vn; struct vmap_area *va; int i; for (i = 0; i < nr_vmap_nodes; i++) { vn = &vmap_nodes[i]; spin_lock(&vn->lazy.lock); list_for_each_entry(va, &vn->lazy.head, list) { seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", (void *)va->va_start, (void *)va->va_end, va_size(va)); } spin_unlock(&vn->lazy.lock); } } static int vmalloc_info_show(struct seq_file *m, void *p) { struct vmap_node *vn; struct vmap_area *va; struct vm_struct *v; int i; for (i = 0; i < nr_vmap_nodes; i++) { vn = &vmap_nodes[i]; spin_lock(&vn->busy.lock); list_for_each_entry(va, &vn->busy.head, list) { if (!va->vm) { if (va->flags & VMAP_RAM) seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", (void *)va->va_start, (void *)va->va_end, va_size(va)); continue; } v = va->vm; seq_printf(m, "0x%pK-0x%pK %7ld", v->addr, v->addr + v->size, v->size); if (v->caller) seq_printf(m, " %pS", v->caller); if (v->nr_pages) seq_printf(m, " pages=%d", v->nr_pages); if (v->phys_addr) seq_printf(m, " phys=%pa", &v->phys_addr); if (v->flags & VM_IOREMAP) seq_puts(m, " ioremap"); if (v->flags & VM_SPARSE) seq_puts(m, " sparse"); if (v->flags & VM_ALLOC) seq_puts(m, " vmalloc"); if (v->flags & VM_MAP) seq_puts(m, " vmap"); if (v->flags & VM_USERMAP) seq_puts(m, " user"); if (v->flags & VM_DMA_COHERENT) seq_puts(m, " dma-coherent"); if 
(is_vmalloc_addr(v->pages)) seq_puts(m, " vpages"); show_numa_info(m, v); seq_putc(m, '\n'); } spin_unlock(&vn->busy.lock); } /* * As a final step, dump "unpurged" areas. */ show_purge_info(m); return 0; } static int __init proc_vmalloc_init(void) { void *priv_data = NULL; if (IS_ENABLED(CONFIG_NUMA)) priv_data = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); proc_create_single_data("vmallocinfo", 0400, NULL, vmalloc_info_show, priv_data); return 0; } module_init(proc_vmalloc_init); #endif static void __init vmap_init_free_space(void) { unsigned long vmap_start = 1; const unsigned long vmap_end = ULONG_MAX; struct vmap_area *free; struct vm_struct *busy; /* * B F B B B F * -|-----|.....|-----|-----|-----|.....|- * | The KVA space | * |<--------------------------------->| */ for (busy = vmlist; busy; busy = busy->next) { if ((unsigned long) busy->addr - vmap_start > 0) { free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); if (!WARN_ON_ONCE(!free)) { free->va_start = vmap_start; free->va_end = (unsigned long) busy->addr; insert_vmap_area_augment(free, NULL, &free_vmap_area_root, &free_vmap_area_list); } } vmap_start = (unsigned long) busy->addr + busy->size; } if (vmap_end - vmap_start > 0) { free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); if (!WARN_ON_ONCE(!free)) { free->va_start = vmap_start; free->va_end = vmap_end; insert_vmap_area_augment(free, NULL, &free_vmap_area_root, &free_vmap_area_list); } } } static void vmap_init_nodes(void) { struct vmap_node *vn; int i, n; #if BITS_PER_LONG == 64 /* * A high threshold of max nodes is fixed and bound to 128, * thus a scale factor is 1 for systems where number of cores * are less or equal to specified threshold. * * As for NUMA-aware notes. For bigger systems, for example * NUMA with multi-sockets, where we can end-up with thousands * of cores in total, a "sub-numa-clustering" should be added. * * In this case a NUMA domain is considered as a single entity * with dedicated sub-nodes in it which describe one group or * set of cores. Therefore a per-domain purging is supposed to * be added as well as a per-domain balancing. */ n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); if (n > 1) { vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN); if (vn) { /* Node partition is 16 pages. */ vmap_zone_size = (1 << 4) * PAGE_SIZE; nr_vmap_nodes = n; vmap_nodes = vn; } else { pr_err("Failed to allocate an array. Disable a node layer\n"); } } #endif for (n = 0; n < nr_vmap_nodes; n++) { vn = &vmap_nodes[n]; vn->busy.root = RB_ROOT; INIT_LIST_HEAD(&vn->busy.head); spin_lock_init(&vn->busy.lock); vn->lazy.root = RB_ROOT; INIT_LIST_HEAD(&vn->lazy.head); spin_lock_init(&vn->lazy.lock); for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { INIT_LIST_HEAD(&vn->pool[i].head); WRITE_ONCE(vn->pool[i].len, 0); } spin_lock_init(&vn->pool_lock); } } static unsigned long vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { unsigned long count; struct vmap_node *vn; int i, j; for (count = 0, i = 0; i < nr_vmap_nodes; i++) { vn = &vmap_nodes[i]; for (j = 0; j < MAX_VA_SIZE_PAGES; j++) count += READ_ONCE(vn->pool[j].len); } return count ? 
count : SHRINK_EMPTY; } static unsigned long vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { int i; for (i = 0; i < nr_vmap_nodes; i++) decay_va_pool_node(&vmap_nodes[i], true); return SHRINK_STOP; } void __init vmalloc_init(void) { struct shrinker *vmap_node_shrinker; struct vmap_area *va; struct vmap_node *vn; struct vm_struct *tmp; int i; /* * Create the cache for vmap_area objects. */ vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); for_each_possible_cpu(i) { struct vmap_block_queue *vbq; struct vfree_deferred *p; vbq = &per_cpu(vmap_block_queue, i); spin_lock_init(&vbq->lock); INIT_LIST_HEAD(&vbq->free); p = &per_cpu(vfree_deferred, i); init_llist_head(&p->list); INIT_WORK(&p->wq, delayed_vfree_work); xa_init(&vbq->vmap_blocks); } /* * Setup nodes before importing vmlist. */ vmap_init_nodes(); /* Import existing vmlist entries. */ for (tmp = vmlist; tmp; tmp = tmp->next) { va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); if (WARN_ON_ONCE(!va)) continue; va->va_start = (unsigned long)tmp->addr; va->va_end = va->va_start + tmp->size; va->vm = tmp; vn = addr_to_node(va->va_start); insert_vmap_area(va, &vn->busy.root, &vn->busy.head); } /* * Now we can initialize a free vmap space. */ vmap_init_free_space(); vmap_initialized = true; vmap_node_shrinker = shrinker_alloc(0, "vmap-node"); if (!vmap_node_shrinker) { pr_err("Failed to allocate vmap-node shrinker!\n"); return; } vmap_node_shrinker->count_objects = vmap_node_shrink_count; vmap_node_shrinker->scan_objects = vmap_node_shrink_scan; shrinker_register(vmap_node_shrinker); }
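/*
 * A minimal usage sketch (not from the kernel tree) for the vmalloc()
 * family documented above. The function and constant names here
 * (demo_vmalloc_usage, DEMO_BYTES) are hypothetical.
 */
#include <linux/vmalloc.h>

#define DEMO_BYTES (4UL << 20)	/* 4 MiB: large enough that kmalloc() is a poor fit */

static int demo_vmalloc_usage(void)
{
	/* vzalloc() is vmalloc() + __GFP_ZERO: virtually contiguous, zeroed. */
	char *buf = vzalloc(DEMO_BYTES);

	if (!buf)
		return -ENOMEM;

	buf[0] = 1;	/* use it like ordinary memory; backing pages may be scattered */

	/* vfree() unmaps and frees; from interrupt context it defers via vfree_atomic(). */
	vfree(buf);
	return 0;
}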
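/*
 * A sketch (hypothetical helper, demo_vmap_pages) of vmap() as described
 * in its kerneldoc above: glue @count separately allocated pages into one
 * virtually contiguous kernel range.
 */
static void *demo_vmap_pages(struct page **pages, unsigned int count)
{
	/* VM_MAP marks a vmap()ed area; PAGE_KERNEL gives normal RW kernel memory. */
	void *addr = vmap(pages, count, VM_MAP, PAGE_KERNEL);

	/*
	 * Without VM_MAP_PUT_PAGES the caller still owns @pages and must
	 * free them itself after vunmap(addr); with it, vfree(addr) drops
	 * both the array and the page references.
	 */
	return addr;
}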
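/*
 * A sketch (hypothetical helper, demo_grow) of vrealloc() from its
 * kerneldoc above: if the existing vm_area already has enough space the
 * same pointer comes back; otherwise a new area is allocated, the old
 * contents are copied, and the old area is vfree()d.
 */
static void *demo_grow(void *p, size_t new_size)
{
	/*
	 * As the kerneldoc notes, __GFP_ZERO is only honored if every
	 * call for this allocation, starting with the first, passes it.
	 */
	return vrealloc(p, new_size, GFP_KERNEL | __GFP_ZERO);
}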
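/*
 * A sketch (hypothetical names demo_buf, demo_mmap) of the usual pattern
 * behind remap_vmalloc_range() above: a driver exports a vmalloc_user()
 * buffer to userspace from its ->mmap() handler.
 */
static void *demo_buf;	/* assumed allocated with vmalloc_user() at init time */

static int demo_mmap(struct file *file, struct vm_area_struct *vma)
{
	/*
	 * vmalloc_user() zeroes the area and sets VM_USERMAP, the flag
	 * remap_vmalloc_range_partial() checks before inserting pages.
	 */
	return remap_vmalloc_range(vma, demo_buf, 0);
}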
// SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" struct pause_req_info { struct ethnl_req_info base; enum ethtool_mac_stats_src src; }; #define PAUSE_REQINFO(__req_base) \ container_of(__req_base, struct pause_req_info, base) struct pause_reply_data { struct ethnl_reply_data base; struct ethtool_pauseparam pauseparam; struct ethtool_pause_stats pausestat; }; #define PAUSE_REPDATA(__reply_base) \ container_of(__reply_base, struct pause_reply_data, base) const struct nla_policy ethnl_pause_get_policy[] = { [ETHTOOL_A_PAUSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), [ETHTOOL_A_PAUSE_STATS_SRC] = NLA_POLICY_MAX(NLA_U32, ETHTOOL_MAC_STATS_SRC_PMAC), }; static int pause_parse_request(struct ethnl_req_info *req_base, struct nlattr **tb, struct netlink_ext_ack *extack) { enum ethtool_mac_stats_src src = ETHTOOL_MAC_STATS_SRC_AGGREGATE; struct pause_req_info *req_info = PAUSE_REQINFO(req_base); if (tb[ETHTOOL_A_PAUSE_STATS_SRC]) { if (!(req_base->flags & ETHTOOL_FLAG_STATS)) { NL_SET_ERR_MSG_MOD(extack, "ETHTOOL_FLAG_STATS must be set when requesting a source of stats"); return -EINVAL; } src = nla_get_u32(tb[ETHTOOL_A_PAUSE_STATS_SRC]); } req_info->src = src; return 0; } static int pause_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { const struct pause_req_info *req_info = PAUSE_REQINFO(req_base); struct pause_reply_data *data = PAUSE_REPDATA(reply_base); enum ethtool_mac_stats_src src = req_info->src; struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_pauseparam) return -EOPNOTSUPP; ethtool_stats_init((u64 *)&data->pausestat, sizeof(data->pausestat) / 8); data->pausestat.src = src; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; if ((src == ETHTOOL_MAC_STATS_SRC_EMAC || src == ETHTOOL_MAC_STATS_SRC_PMAC) && !__ethtool_dev_mm_supported(dev)) { NL_SET_ERR_MSG_MOD(info->extack, "Device does not support MAC merge layer"); ethnl_ops_complete(dev); return -EOPNOTSUPP; } dev->ethtool_ops->get_pauseparam(dev, &data->pauseparam); if (req_base->flags & ETHTOOL_FLAG_STATS && dev->ethtool_ops->get_pause_stats) dev->ethtool_ops->get_pause_stats(dev, &data->pausestat); ethnl_ops_complete(dev); return 0; } static int pause_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { int n = nla_total_size(sizeof(u8)) + /* _PAUSE_AUTONEG */ nla_total_size(sizeof(u8)) + /* _PAUSE_RX */ nla_total_size(sizeof(u8)); /* _PAUSE_TX */ if (req_base->flags & ETHTOOL_FLAG_STATS) n += nla_total_size(0) + /* _PAUSE_STATS */ nla_total_size(sizeof(u32)) + /* _PAUSE_STATS_SRC */ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_PAUSE_STAT_CNT; return n; }
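/*
 * A sketch (hypothetical demo_* names, not part of this file) of the
 * driver side that pause_prepare_data() above calls into via
 * dev->ethtool_ops. Field names match struct ethtool_pauseparam and
 * struct ethtool_pause_stats.
 */
static void demo_get_pauseparam(struct net_device *dev,
				struct ethtool_pauseparam *pause)
{
	pause->autoneg = 1;	/* pause autonegotiation on */
	pause->rx_pause = 1;	/* honour received pause frames */
	pause->tx_pause = 0;	/* never send pause frames */
}

static void demo_get_pause_stats(struct net_device *dev,
				 struct ethtool_pause_stats *stats)
{
	/*
	 * Anything left at ETHTOOL_STAT_NOT_SET - the value
	 * ethtool_stats_init() wrote above - is silently skipped when the
	 * reply is filled, so a driver only sets what it can count.
	 */
	stats->tx_pause_frames = 0;	/* read from hardware in a real driver */
	stats->rx_pause_frames = 0;
}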
static int ethtool_put_stat(struct sk_buff *skb, u64 val, u16 attrtype, u16 padtype) { if (val == ETHTOOL_STAT_NOT_SET) return 0; if (nla_put_u64_64bit(skb, attrtype, val, padtype)) return -EMSGSIZE; return 0; } static int pause_put_stats(struct sk_buff *skb, const struct ethtool_pause_stats *pause_stats) { const u16 pad = ETHTOOL_A_PAUSE_STAT_PAD; struct nlattr *nest; if (nla_put_u32(skb, ETHTOOL_A_PAUSE_STATS_SRC, pause_stats->src)) return -EMSGSIZE; nest = nla_nest_start(skb, ETHTOOL_A_PAUSE_STATS); if (!nest) return -EMSGSIZE; if (ethtool_put_stat(skb, pause_stats->tx_pause_frames, ETHTOOL_A_PAUSE_STAT_TX_FRAMES, pad) || ethtool_put_stat(skb, pause_stats->rx_pause_frames, ETHTOOL_A_PAUSE_STAT_RX_FRAMES, pad)) goto err_cancel; nla_nest_end(skb, nest); return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int pause_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct pause_reply_data *data = PAUSE_REPDATA(reply_base); const struct ethtool_pauseparam *pauseparam = &data->pauseparam; if (nla_put_u8(skb, ETHTOOL_A_PAUSE_AUTONEG, !!pauseparam->autoneg) || nla_put_u8(skb, ETHTOOL_A_PAUSE_RX, !!pauseparam->rx_pause) || nla_put_u8(skb, ETHTOOL_A_PAUSE_TX, !!pauseparam->tx_pause)) return -EMSGSIZE; if (req_base->flags & ETHTOOL_FLAG_STATS && pause_put_stats(skb, &data->pausestat)) return -EMSGSIZE; return 0; } /* PAUSE_SET */ const struct nla_policy ethnl_pause_set_policy[] = { [ETHTOOL_A_PAUSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_RX] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_TX] = { .type = NLA_U8 }, }; static int ethnl_set_pause_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; return ops->get_pauseparam && ops->set_pauseparam ? 1 : -EOPNOTSUPP; } static int ethnl_set_pause(struct ethnl_req_info *req_info, struct genl_info *info) { struct net_device *dev = req_info->dev; struct ethtool_pauseparam params = {}; struct nlattr **tb = info->attrs; bool mod = false; int ret; dev->ethtool_ops->get_pauseparam(dev, &params); ethnl_update_bool32(&params.autoneg, tb[ETHTOOL_A_PAUSE_AUTONEG], &mod); ethnl_update_bool32(&params.rx_pause, tb[ETHTOOL_A_PAUSE_RX], &mod); ethnl_update_bool32(&params.tx_pause, tb[ETHTOOL_A_PAUSE_TX], &mod); if (!mod) return 0; ret = dev->ethtool_ops->set_pauseparam(dev, &params); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_pause_request_ops = { .request_cmd = ETHTOOL_MSG_PAUSE_GET, .reply_cmd = ETHTOOL_MSG_PAUSE_GET_REPLY, .hdr_attr = ETHTOOL_A_PAUSE_HEADER, .req_info_size = sizeof(struct pause_req_info), .reply_data_size = sizeof(struct pause_reply_data), .parse_request = pause_parse_request, .prepare_data = pause_prepare_data, .reply_size = pause_reply_size, .fill_reply = pause_fill_reply, .set_validate = ethnl_set_pause_validate, .set = ethnl_set_pause, .set_ntf_cmd = ETHTOOL_MSG_PAUSE_NTF, };
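/*
 * A sketch (hypothetical demo_* names, continuing the example above) of
 * the set path driven by ethnl_set_pause(): the core reads the current
 * parameters with ->get_pauseparam(), folds in the netlink attributes,
 * and only calls ->set_pauseparam() when something changed; the
 * "return 1" in ethnl_set_pause() is what triggers the
 * ETHTOOL_MSG_PAUSE_NTF notification.
 */
static int demo_set_pauseparam(struct net_device *dev,
			       struct ethtool_pauseparam *pause)
{
	if (pause->autoneg)
		return -EOPNOTSUPP;	/* say the MAC cannot renegotiate pause */

	/* program pause->rx_pause / pause->tx_pause into the MAC here */
	return 0;
}

static const struct ethtool_ops demo_ethtool_ops = {
	.get_pauseparam	 = demo_get_pauseparam,
	.set_pauseparam	 = demo_set_pauseparam,
	.get_pause_stats = demo_get_pause_stats,
};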
// SPDX-License-Identifier: GPL-2.0-or-later #include <net/genetlink.h> #include "br_private.h" #include "br_private_cfm.h" static const struct nla_policy br_cfm_mep_create_policy[IFLA_BRIDGE_CFM_MEP_CREATE_MAX + 1] = { [IFLA_BRIDGE_CFM_MEP_CREATE_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX] = { .type = NLA_U32 }, }; static const struct nla_policy br_cfm_mep_delete_policy[IFLA_BRIDGE_CFM_MEP_DELETE_MAX + 1] = { [IFLA_BRIDGE_CFM_MEP_DELETE_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE] = {
.type = NLA_U32 }, }; static const struct nla_policy br_cfm_mep_config_policy[IFLA_BRIDGE_CFM_MEP_CONFIG_MAX + 1] = { [IFLA_BRIDGE_CFM_MEP_CONFIG_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC] = NLA_POLICY_ETH_ADDR, [IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL] = NLA_POLICY_MAX(NLA_U32, 7), [IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID] = NLA_POLICY_MAX(NLA_U32, 0x1FFF), }; static const struct nla_policy br_cfm_cc_config_policy[IFLA_BRIDGE_CFM_CC_CONFIG_MAX + 1] = { [IFLA_BRIDGE_CFM_CC_CONFIG_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID] = { .type = NLA_BINARY, .len = CFM_MAID_LENGTH }, }; static const struct nla_policy br_cfm_cc_peer_mep_policy[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1] = { [IFLA_BRIDGE_CFM_CC_PEER_MEP_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_PEER_MEPID] = NLA_POLICY_MAX(NLA_U32, 0x1FFF), }; static const struct nla_policy br_cfm_cc_rdi_policy[IFLA_BRIDGE_CFM_CC_RDI_MAX + 1] = { [IFLA_BRIDGE_CFM_CC_RDI_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_CC_RDI_INSTANCE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_RDI_RDI] = { .type = NLA_U32 }, }; static const struct nla_policy br_cfm_cc_ccm_tx_policy[IFLA_BRIDGE_CFM_CC_CCM_TX_MAX + 1] = { [IFLA_BRIDGE_CFM_CC_CCM_TX_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC] = NLA_POLICY_ETH_ADDR, [IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE] = { .type = NLA_U8 }, [IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE] = { .type = NLA_U8 }, }; static const struct nla_policy br_cfm_policy[IFLA_BRIDGE_CFM_MAX + 1] = { [IFLA_BRIDGE_CFM_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_MEP_CREATE] = NLA_POLICY_NESTED(br_cfm_mep_create_policy), [IFLA_BRIDGE_CFM_MEP_DELETE] = NLA_POLICY_NESTED(br_cfm_mep_delete_policy), [IFLA_BRIDGE_CFM_MEP_CONFIG] = NLA_POLICY_NESTED(br_cfm_mep_config_policy), [IFLA_BRIDGE_CFM_CC_CONFIG] = NLA_POLICY_NESTED(br_cfm_cc_config_policy), [IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD] = NLA_POLICY_NESTED(br_cfm_cc_peer_mep_policy), [IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE] = NLA_POLICY_NESTED(br_cfm_cc_peer_mep_policy), [IFLA_BRIDGE_CFM_CC_RDI] = NLA_POLICY_NESTED(br_cfm_cc_rdi_policy), [IFLA_BRIDGE_CFM_CC_CCM_TX] = NLA_POLICY_NESTED(br_cfm_cc_ccm_tx_policy), }; static int br_mep_create_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_CREATE_MAX + 1]; struct br_cfm_mep_create create; u32 instance; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_CREATE_MAX, attr, br_cfm_mep_create_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN]) { NL_SET_ERR_MSG_MOD(extack, "Missing DOMAIN attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION]) { NL_SET_ERR_MSG_MOD(extack, "Missing DIRECTION attribute"); return -EINVAL; } if 
(!tb[IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX]) { NL_SET_ERR_MSG_MOD(extack, "Missing IFINDEX attribute"); return -EINVAL; } memset(&create, 0, sizeof(create)); instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE]); create.domain = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN]); create.direction = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION]); create.ifindex = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX]); return br_cfm_mep_create(br, instance, &create, extack); } static int br_mep_delete_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_DELETE_MAX + 1]; u32 instance; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_DELETE_MAX, attr, br_cfm_mep_delete_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE]); return br_cfm_mep_delete(br, instance, extack); } static int br_mep_config_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MAX + 1]; struct br_cfm_mep_config config; u32 instance; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_CONFIG_MAX, attr, br_cfm_mep_config_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC]) { NL_SET_ERR_MSG_MOD(extack, "Missing UNICAST_MAC attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL]) { NL_SET_ERR_MSG_MOD(extack, "Missing MDLEVEL attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID]) { NL_SET_ERR_MSG_MOD(extack, "Missing MEPID attribute"); return -EINVAL; } memset(&config, 0, sizeof(config)); instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE]); nla_memcpy(&config.unicast_mac.addr, tb[IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC], sizeof(config.unicast_mac.addr)); config.mdlevel = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL]); config.mepid = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID]); return br_cfm_mep_config_set(br, instance, &config, extack); } static int br_cc_config_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_CC_CONFIG_MAX + 1]; struct br_cfm_cc_config config; u32 instance; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_CONFIG_MAX, attr, br_cfm_cc_config_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE]) { NL_SET_ERR_MSG_MOD(extack, "Missing ENABLE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL]) { NL_SET_ERR_MSG_MOD(extack, "Missing INTERVAL attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID]) { NL_SET_ERR_MSG_MOD(extack, "Missing MAID attribute"); return -EINVAL; } memset(&config, 0, sizeof(config)); instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE]); config.enable = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE]); config.exp_interval = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL]); nla_memcpy(&config.exp_maid.data, tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID], sizeof(config.exp_maid.data)); return br_cfm_cc_config_set(br, instance, &config, extack); } static int 
br_cc_peer_mep_add_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1]; u32 instance, peer_mep_id; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX, attr, br_cfm_cc_peer_mep_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]) { NL_SET_ERR_MSG_MOD(extack, "Missing PEER_MEP_ID attribute"); return -EINVAL; } instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]); peer_mep_id = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]); return br_cfm_cc_peer_mep_add(br, instance, peer_mep_id, extack); } static int br_cc_peer_mep_remove_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1]; u32 instance, peer_mep_id; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX, attr, br_cfm_cc_peer_mep_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]) { NL_SET_ERR_MSG_MOD(extack, "Missing PEER_MEP_ID attribute"); return -EINVAL; } instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]); peer_mep_id = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]); return br_cfm_cc_peer_mep_remove(br, instance, peer_mep_id, extack); } static int br_cc_rdi_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_CC_RDI_MAX + 1]; u32 instance, rdi; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_RDI_MAX, attr, br_cfm_cc_rdi_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_CC_RDI_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_RDI_RDI]) { NL_SET_ERR_MSG_MOD(extack, "Missing RDI attribute"); return -EINVAL; } instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_RDI_INSTANCE]); rdi = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_RDI_RDI]); return br_cfm_cc_rdi_set(br, instance, rdi, extack); } static int br_cc_ccm_tx_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_CC_CCM_TX_MAX + 1]; struct br_cfm_cc_ccm_tx_info tx_info; u32 instance; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_CCM_TX_MAX, attr, br_cfm_cc_ccm_tx_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC]) { NL_SET_ERR_MSG_MOD(extack, "Missing DMAC attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE]) { NL_SET_ERR_MSG_MOD(extack, "Missing SEQ_NO_UPDATE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD]) { NL_SET_ERR_MSG_MOD(extack, "Missing PERIOD attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV]) { NL_SET_ERR_MSG_MOD(extack, "Missing IF_TLV attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE]) { NL_SET_ERR_MSG_MOD(extack, "Missing IF_TLV_VALUE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV]) { NL_SET_ERR_MSG_MOD(extack, "Missing PORT_TLV attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE]) { NL_SET_ERR_MSG_MOD(extack, "Missing PORT_TLV_VALUE attribute"); 
return -EINVAL; } memset(&tx_info, 0, sizeof(tx_info)); instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE]); nla_memcpy(&tx_info.dmac.addr, tb[IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC], sizeof(tx_info.dmac.addr)); tx_info.seq_no_update = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE]); tx_info.period = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD]); tx_info.if_tlv = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV]); tx_info.if_tlv_value = nla_get_u8(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE]); tx_info.port_tlv = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV]); tx_info.port_tlv_value = nla_get_u8(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE]); return br_cfm_cc_ccm_tx(br, instance, &tx_info, extack); } int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_MAX + 1]; int err; /* When this function is called for a port then the br pointer is * invalid, therefore set the br to point correctly */ if (p) br = p->br; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MAX, attr, br_cfm_policy, extack); if (err) return err; if (tb[IFLA_BRIDGE_CFM_MEP_CREATE]) { err = br_mep_create_parse(br, tb[IFLA_BRIDGE_CFM_MEP_CREATE], extack); if (err) return err; } if (tb[IFLA_BRIDGE_CFM_MEP_DELETE]) { err = br_mep_delete_parse(br, tb[IFLA_BRIDGE_CFM_MEP_DELETE], extack); if (err) return err; } if (tb[IFLA_BRIDGE_CFM_MEP_CONFIG]) { err = br_mep_config_parse(br, tb[IFLA_BRIDGE_CFM_MEP_CONFIG], extack); if (err) return err; } if (tb[IFLA_BRIDGE_CFM_CC_CONFIG]) { err = br_cc_config_parse(br, tb[IFLA_BRIDGE_CFM_CC_CONFIG], extack); if (err) return err; } if (tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD]) { err = br_cc_peer_mep_add_parse(br, tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD], extack); if (err) return err; } if (tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE]) { err = br_cc_peer_mep_remove_parse(br, tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE], extack); if (err) return err; } if (tb[IFLA_BRIDGE_CFM_CC_RDI]) { err = br_cc_rdi_parse(br, tb[IFLA_BRIDGE_CFM_CC_RDI], extack); if (err) return err; } if (tb[IFLA_BRIDGE_CFM_CC_CCM_TX]) { err = br_cc_ccm_tx_parse(br, tb[IFLA_BRIDGE_CFM_CC_CCM_TX], extack); if (err) return err; } return 0; } int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br) { struct br_cfm_peer_mep *peer_mep; struct br_cfm_mep *mep; struct nlattr *tb; hlist_for_each_entry_rcu(mep, &br->mep_list, head) { tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_CREATE_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN, mep->create.domain)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION, mep->create.direction)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX, mep->create.ifindex)) goto nla_put_failure; nla_nest_end(skb, tb); tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC, sizeof(mep->config.unicast_mac.addr), mep->config.unicast_mac.addr)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL, mep->config.mdlevel)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID, mep->config.mepid)) goto nla_put_failure; nla_nest_end(skb, tb); tb = nla_nest_start(skb,
IFLA_BRIDGE_CFM_CC_CONFIG_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE, mep->cc_config.enable)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL, mep->cc_config.exp_interval)) goto nla_put_failure; if (nla_put(skb, IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID, sizeof(mep->cc_config.exp_maid.data), mep->cc_config.exp_maid.data)) goto nla_put_failure; nla_nest_end(skb, tb); tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_RDI_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_RDI_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_RDI_RDI, mep->rdi)) goto nla_put_failure; nla_nest_end(skb, tb); tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC, sizeof(mep->cc_ccm_tx_info.dmac), mep->cc_ccm_tx_info.dmac.addr)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE, mep->cc_ccm_tx_info.seq_no_update)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD, mep->cc_ccm_tx_info.period)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV, mep->cc_ccm_tx_info.if_tlv)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE, mep->cc_ccm_tx_info.if_tlv_value)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV, mep->cc_ccm_tx_info.port_tlv)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE, mep->cc_ccm_tx_info.port_tlv_value)) goto nla_put_failure; nla_nest_end(skb, tb); hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head) { tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_PEER_MEP_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_MEPID, peer_mep->mepid)) goto nla_put_failure; nla_nest_end(skb, tb); } } return 0; nla_put_failure: nla_nest_cancel(skb, tb); nla_info_failure: return -EMSGSIZE; } int br_cfm_status_fill_info(struct sk_buff *skb, struct net_bridge *br, bool getlink) { struct br_cfm_peer_mep *peer_mep; struct br_cfm_mep *mep; struct nlattr *tb; hlist_for_each_entry_rcu(mep, &br->mep_list, head) { tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_STATUS_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_STATUS_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_STATUS_OPCODE_UNEXP_SEEN, mep->status.opcode_unexp_seen)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_STATUS_VERSION_UNEXP_SEEN, mep->status.version_unexp_seen)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_STATUS_RX_LEVEL_LOW_SEEN, mep->status.rx_level_low_seen)) goto nla_put_failure; /* Only clear if this is a GETLINK */ if (getlink) { /* Clear all 'seen' indications */ mep->status.opcode_unexp_seen = false; mep->status.version_unexp_seen = false; mep->status.rx_level_low_seen = false; } nla_nest_end(skb, tb); hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head) { tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_INSTANCE, mep->instance)) 
goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_PEER_MEPID, peer_mep->mepid)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_CCM_DEFECT, peer_mep->cc_status.ccm_defect)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_RDI, peer_mep->cc_status.rdi)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_PORT_TLV_VALUE, peer_mep->cc_status.port_tlv_value)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_IF_TLV_VALUE, peer_mep->cc_status.if_tlv_value)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEEN, peer_mep->cc_status.seen)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_TLV_SEEN, peer_mep->cc_status.tlv_seen)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEQ_UNEXP_SEEN, peer_mep->cc_status.seq_unexp_seen)) goto nla_put_failure; if (getlink) { /* Only clear if this is a GETLINK */ /* Clear all 'seen' indications */ peer_mep->cc_status.seen = false; peer_mep->cc_status.tlv_seen = false; peer_mep->cc_status.seq_unexp_seen = false; } nla_nest_end(skb, tb); } } return 0; nla_put_failure: nla_nest_cancel(skb, tb); nla_info_failure: return -EMSGSIZE; }
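The bridge CFM netlink glue above only parses attributes; building them is the sender's job. The following is a hypothetical userspace sketch, not taken from the kernel tree, of how the four MEP_CREATE attributes that br_mep_create_parse() treats as mandatory could be assembled with libmnl, assuming IFLA_BRIDGE_CFM is carried inside IFLA_AF_SPEC of an RTM_SETLINK request (message setup and error handling omitted):

/* Hypothetical sketch: nest the attributes br_mep_create_parse() expects.
 * Assumes nlh was already prepared as an RTM_SETLINK request for the
 * bridge device. */
#include <stdint.h>
#include <libmnl/libmnl.h>
#include <linux/if_link.h>
#include <linux/if_bridge.h>

static void put_mep_create(struct nlmsghdr *nlh, uint32_t instance,
                           uint32_t domain, uint32_t direction,
                           uint32_t ifindex)
{
        struct nlattr *afspec, *cfm, *create;

        afspec = mnl_attr_nest_start(nlh, IFLA_AF_SPEC);
        cfm = mnl_attr_nest_start(nlh, IFLA_BRIDGE_CFM);
        create = mnl_attr_nest_start(nlh, IFLA_BRIDGE_CFM_MEP_CREATE);
        /* The four attributes the parser rejects the request without: */
        mnl_attr_put_u32(nlh, IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE, instance);
        mnl_attr_put_u32(nlh, IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN, domain);
        mnl_attr_put_u32(nlh, IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION, direction);
        mnl_attr_put_u32(nlh, IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX, ifindex);
        mnl_attr_nest_end(nlh, create);
        mnl_attr_nest_end(nlh, cfm);
        mnl_attr_nest_end(nlh, afspec);
}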
// SPDX-License-Identifier: GPL-2.0-only /* * Generic netlink handshake service * * Author: Chuck Lever <chuck.lever@oracle.com> * * Copyright (c) 2023, Oracle and/or its affiliates. */ #include <linux/types.h> #include <linux/socket.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/mm.h> #include <net/sock.h> #include <net/genetlink.h> #include <net/netns/generic.h> #include <kunit/visibility.h> #include <uapi/linux/handshake.h> #include "handshake.h" #include "genl.h" #include <trace/events/handshake.h> /** * handshake_genl_notify - Notify handlers that a request is waiting * @net: target network namespace * @proto: handshake protocol * @flags: memory allocation control flags * * Returns zero on success or a negative errno if notification failed. */ int handshake_genl_notify(struct net *net, const struct handshake_proto *proto, gfp_t flags) { struct sk_buff *msg; void *hdr; /* Disable notifications during unit testing */ if (!test_bit(HANDSHAKE_F_PROTO_NOTIFY, &proto->hp_flags)) return 0; if (!genl_has_listeners(&handshake_nl_family, net, proto->hp_handler_class)) return -ESRCH; msg = genlmsg_new(GENLMSG_DEFAULT_SIZE, flags); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &handshake_nl_family, 0, HANDSHAKE_CMD_READY); if (!hdr) goto out_free; if (nla_put_u32(msg, HANDSHAKE_A_ACCEPT_HANDLER_CLASS, proto->hp_handler_class) < 0) { genlmsg_cancel(msg, hdr); goto out_free; } genlmsg_end(msg, hdr); return genlmsg_multicast_netns(&handshake_nl_family, net, msg, 0, proto->hp_handler_class, flags); out_free: nlmsg_free(msg); return -EMSGSIZE; } /** * handshake_genl_put - Create a generic netlink message header * @msg: buffer in which to create the header * @info: generic netlink message context * * Returns a ready-to-use header, or NULL.
*/ struct nlmsghdr *handshake_genl_put(struct sk_buff *msg, struct genl_info *info) { return genlmsg_put(msg, info->snd_portid, info->snd_seq, &handshake_nl_family, 0, info->genlhdr->cmd); } EXPORT_SYMBOL(handshake_genl_put); int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct handshake_net *hn = handshake_pernet(net); struct handshake_req *req = NULL; struct socket *sock; int class, fd, err; err = -EOPNOTSUPP; if (!hn) goto out_status; err = -EINVAL; if (GENL_REQ_ATTR_CHECK(info, HANDSHAKE_A_ACCEPT_HANDLER_CLASS)) goto out_status; class = nla_get_u32(info->attrs[HANDSHAKE_A_ACCEPT_HANDLER_CLASS]); err = -EAGAIN; req = handshake_req_next(hn, class); if (!req) goto out_status; sock = req->hr_sk->sk_socket; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) { err = fd; goto out_complete; } err = req->hr_proto->hp_accept(req, info, fd); if (err) { put_unused_fd(fd); goto out_complete; } fd_install(fd, get_file(sock->file)); trace_handshake_cmd_accept(net, req, req->hr_sk, fd); return 0; out_complete: handshake_complete(req, -EIO, NULL); out_status: trace_handshake_cmd_accept_err(net, req, NULL, err); return err; } int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct handshake_req *req; struct socket *sock; int fd, status, err; if (GENL_REQ_ATTR_CHECK(info, HANDSHAKE_A_DONE_SOCKFD)) return -EINVAL; fd = nla_get_s32(info->attrs[HANDSHAKE_A_DONE_SOCKFD]); sock = sockfd_lookup(fd, &err); if (!sock) return err; req = handshake_req_hash_lookup(sock->sk); if (!req) { err = -EBUSY; trace_handshake_cmd_done_err(net, req, sock->sk, err); sockfd_put(sock); return err; } trace_handshake_cmd_done(net, req, sock->sk, fd); status = -EIO; if (info->attrs[HANDSHAKE_A_DONE_STATUS]) status = nla_get_u32(info->attrs[HANDSHAKE_A_DONE_STATUS]); handshake_complete(req, status, info); sockfd_put(sock); return 0; } static unsigned int handshake_net_id; static int __net_init handshake_net_init(struct net *net) { struct handshake_net *hn = net_generic(net, handshake_net_id); unsigned long tmp; struct sysinfo si; /* * Arbitrary limit to prevent handshakes that do not make * progress from clogging up the system. The cap scales up * with the amount of physical memory on the system. */ si_meminfo(&si); tmp = si.totalram / (25 * si.mem_unit); hn->hn_pending_max = clamp(tmp, 3UL, 50UL); spin_lock_init(&hn->hn_lock); hn->hn_pending = 0; hn->hn_flags = 0; INIT_LIST_HEAD(&hn->hn_requests); return 0; } static void __net_exit handshake_net_exit(struct net *net) { struct handshake_net *hn = net_generic(net, handshake_net_id); struct handshake_req *req; LIST_HEAD(requests); /* * Drain the net's pending list. Requests that have been * accepted and are in progress will be destroyed when * the socket is closed. */ spin_lock(&hn->hn_lock); set_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags); list_splice_init(&hn->hn_requests, &requests); spin_unlock(&hn->hn_lock); while (!list_empty(&requests)) { req = list_first_entry(&requests, struct handshake_req, hr_list); list_del(&req->hr_list); /* * Requests on this list have not yet been * accepted, so they do not have an fd to put.
*/ handshake_complete(req, -ETIMEDOUT, NULL); } } static struct pernet_operations handshake_genl_net_ops = { .init = handshake_net_init, .exit = handshake_net_exit, .id = &handshake_net_id, .size = sizeof(struct handshake_net), }; /** * handshake_pernet - Get the handshake private per-net structure * @net: network namespace * * Returns a pointer to the net's private per-net structure for the * handshake module, or NULL if handshake_init() failed. */ struct handshake_net *handshake_pernet(struct net *net) { return handshake_net_id ? net_generic(net, handshake_net_id) : NULL; } EXPORT_SYMBOL_IF_KUNIT(handshake_pernet); static int __init handshake_init(void) { int ret; ret = handshake_req_hash_init(); if (ret) { pr_warn("handshake: hash initialization failed (%d)\n", ret); return ret; } ret = genl_register_family(&handshake_nl_family); if (ret) { pr_warn("handshake: netlink registration failed (%d)\n", ret); handshake_req_hash_destroy(); return ret; } /* * ORDER: register_pernet_subsys must be done last. * * If initialization does not make it past pernet_subsys * registration, then handshake_net_id will remain 0. That * shunts the handshake consumer API to return ENOTSUPP * to prevent it from dereferencing something that hasn't * been allocated. */ ret = register_pernet_subsys(&handshake_genl_net_ops); if (ret) { pr_warn("handshake: pernet registration failed (%d)\n", ret); genl_unregister_family(&handshake_nl_family); handshake_req_hash_destroy(); } return ret; } static void __exit handshake_exit(void) { unregister_pernet_subsys(&handshake_genl_net_ops); handshake_net_id = 0; handshake_req_hash_destroy(); genl_unregister_family(&handshake_nl_family); } module_init(handshake_init); module_exit(handshake_exit);
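To make the hn_pending_max computation in handshake_net_init() concrete: the cap is totalram / (25 * mem_unit), clamped to the range [3, 50]. A standalone userspace sketch of that arithmetic (an illustration, not kernel code):

/* Mirrors the pending-handshake cap math shown above. */
#include <stdio.h>

static unsigned long pending_max(unsigned long totalram_pages,
                                 unsigned long mem_unit)
{
        unsigned long tmp = totalram_pages / (25 * mem_unit);

        if (tmp < 3UL)
                tmp = 3UL;
        if (tmp > 50UL)
                tmp = 50UL;
        return tmp;
}

int main(void)
{
        /* 4 GiB of 4 KiB pages: 1048576 / (25 * 4096) = 10 */
        printf("%lu\n", pending_max(1048576UL, 4096UL));
        /* 64 GiB of 4 KiB pages: 16777216 / 102400 = 163 -> clamped to 50 */
        printf("%lu\n", pending_max(16777216UL, 4096UL));
        return 0;
}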
// SPDX-License-Identifier: GPL-2.0 /* * Performance events callchain code, extracted from core.c: * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> */ #include <linux/perf_event.h> #include <linux/slab.h> #include <linux/sched/task_stack.h> #include <linux/uprobes.h> #include "internal.h" struct callchain_cpus_entries { struct rcu_head rcu_head; struct perf_callchain_entry *cpu_entries[]; }; int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK; static const int six_hundred_forty_kb = 640 * 1024; static inline size_t perf_callchain_entry__sizeof(void) { return (sizeof(struct perf_callchain_entry) + sizeof(__u64) * (sysctl_perf_event_max_stack + sysctl_perf_event_max_contexts_per_stack)); } static DEFINE_PER_CPU(u8, callchain_recursion[PERF_NR_CONTEXTS]); static atomic_t nr_callchain_events; static DEFINE_MUTEX(callchain_mutex); static struct callchain_cpus_entries *callchain_cpus_entries; __weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } __weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } static void release_callchain_buffers_rcu(struct rcu_head *head) { struct callchain_cpus_entries *entries; int cpu; entries = container_of(head, struct callchain_cpus_entries, rcu_head); for_each_possible_cpu(cpu) kfree(entries->cpu_entries[cpu]); kfree(entries); } static void release_callchain_buffers(void) { struct callchain_cpus_entries *entries; entries = callchain_cpus_entries; RCU_INIT_POINTER(callchain_cpus_entries, NULL); call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); } static int alloc_callchain_buffers(void) { int cpu; int size; struct callchain_cpus_entries *entries; /* * We can't use the percpu allocation API for data that can be * accessed from NMI. Use a temporary manual per cpu allocation * until that gets sorted out.
*/ size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); entries = kzalloc(size, GFP_KERNEL); if (!entries) return -ENOMEM; size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS; for_each_possible_cpu(cpu) { entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); if (!entries->cpu_entries[cpu]) goto fail; } rcu_assign_pointer(callchain_cpus_entries, entries); return 0; fail: for_each_possible_cpu(cpu) kfree(entries->cpu_entries[cpu]); kfree(entries); return -ENOMEM; } int get_callchain_buffers(int event_max_stack) { int err = 0; int count; mutex_lock(&callchain_mutex); count = atomic_inc_return(&nr_callchain_events); if (WARN_ON_ONCE(count < 1)) { err = -EINVAL; goto exit; } /* * If requesting per event more than the global cap, * return a different error to help userspace figure * this out. * * And also do it here so that we have &callchain_mutex held. */ if (event_max_stack > sysctl_perf_event_max_stack) { err = -EOVERFLOW; goto exit; } if (count == 1) err = alloc_callchain_buffers(); exit: if (err) atomic_dec(&nr_callchain_events); mutex_unlock(&callchain_mutex); return err; } void put_callchain_buffers(void) { if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { release_callchain_buffers(); mutex_unlock(&callchain_mutex); } } struct perf_callchain_entry *get_callchain_entry(int *rctx) { int cpu; struct callchain_cpus_entries *entries; *rctx = get_recursion_context(this_cpu_ptr(callchain_recursion)); if (*rctx == -1) return NULL; entries = rcu_dereference(callchain_cpus_entries); if (!entries) { put_recursion_context(this_cpu_ptr(callchain_recursion), *rctx); return NULL; } cpu = smp_processor_id(); return (((void *)entries->cpu_entries[cpu]) + (*rctx * perf_callchain_entry__sizeof())); } void put_callchain_entry(int rctx) { put_recursion_context(this_cpu_ptr(callchain_recursion), rctx); } static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entry, int start_entry_idx) { #ifdef CONFIG_UPROBES struct uprobe_task *utask = current->utask; struct return_instance *ri; __u64 *cur_ip, *last_ip, tramp_addr; if (likely(!utask || !utask->return_instances)) return; cur_ip = &entry->ip[start_entry_idx]; last_ip = &entry->ip[entry->nr - 1]; ri = utask->return_instances; tramp_addr = uprobe_get_trampoline_vaddr(); /* * If there are pending uretprobes for the current thread, they are * recorded in a list inside utask->return_instances; each such * pending uretprobe replaces traced user function's return address on * the stack, so when stack trace is captured, instead of seeing * actual function's return address, we'll have one or many uretprobe * trampoline addresses in the stack trace, which are not helpful and * misleading to users. * So here we go over the pending list of uretprobes, and each * encountered trampoline address is replaced with actual return * address. 
*/ while (ri && cur_ip <= last_ip) { if (*cur_ip == tramp_addr) { *cur_ip = ri->orig_ret_vaddr; ri = ri->next; } cur_ip++; } #endif } struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark) { struct perf_callchain_entry *entry; struct perf_callchain_entry_ctx ctx; int rctx, start_entry_idx; entry = get_callchain_entry(&rctx); if (!entry) return NULL; ctx.entry = entry; ctx.max_stack = max_stack; ctx.nr = entry->nr = init_nr; ctx.contexts = 0; ctx.contexts_maxed = false; if (kernel && !user_mode(regs)) { if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL); perf_callchain_kernel(&ctx, regs); } if (user) { if (!user_mode(regs)) { if (current->mm) regs = task_pt_regs(current); else regs = NULL; } if (regs) { if (crosstask) goto exit_put; if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); start_entry_idx = entry->nr; perf_callchain_user(&ctx, regs); fixup_uretprobe_trampoline_entries(entry, start_entry_idx); } } exit_put: put_callchain_entry(rctx); return entry; } static int perf_event_max_stack_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *value = table->data; int new_value = *value, ret; struct ctl_table new_table = *table; new_table.data = &new_value; ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos); if (ret || !write) return ret; mutex_lock(&callchain_mutex); if (atomic_read(&nr_callchain_events)) ret = -EBUSY; else *value = new_value; mutex_unlock(&callchain_mutex); return ret; } static const struct ctl_table callchain_sysctl_table[] = { { .procname = "perf_event_max_stack", .data = &sysctl_perf_event_max_stack, .maxlen = sizeof(sysctl_perf_event_max_stack), .mode = 0644, .proc_handler = perf_event_max_stack_handler, .extra1 = SYSCTL_ZERO, .extra2 = (void *)&six_hundred_forty_kb, }, { .procname = "perf_event_max_contexts_per_stack", .data = &sysctl_perf_event_max_contexts_per_stack, .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), .mode = 0644, .proc_handler = perf_event_max_stack_handler, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE_THOUSAND, }, }; static int __init init_callchain_sysctls(void) { register_sysctl_init("kernel", callchain_sysctl_table); return 0; } core_initcall(init_callchain_sysctls);
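A back-of-the-envelope check of the buffer sizing above: assuming the uapi defaults PERF_MAX_STACK_DEPTH = 127 and PERF_MAX_CONTEXTS_PER_STACK = 8, with PERF_NR_CONTEXTS = 4 recursion contexts and struct perf_callchain_entry being a __u64 count followed by a flexible __u64 ip[] array, each entry is 8 + 8 * 135 = 1088 bytes and each CPU buffer is 4 * 1088 = 4352 bytes. A standalone sketch of that math (constants are assumptions, not read from the kernel headers):

#include <stdio.h>

int main(void)
{
        unsigned long max_stack = 127, max_ctx_per_stack = 8, nr_contexts = 4;
        /* perf_callchain_entry__sizeof(): ->nr plus the ip[] slots */
        unsigned long entry = sizeof(unsigned long long)
                + sizeof(unsigned long long) * (max_stack + max_ctx_per_stack);

        printf("entry size: %lu bytes\n", entry);            /* 1088 */
        printf("per-cpu buffer: %lu bytes\n", entry * nr_contexts); /* 4352 */
        return 0;
}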
// SPDX-License-Identifier: GPL-2.0-only /* * linux/net/sunrpc/timer.c * * Estimate RPC request round trip time. * * Based on packet round-trip and variance estimator algorithms described * in appendix A of "Congestion Avoidance and Control" by Van Jacobson * and Michael J. Karels (ACM Computer Communication Review; Proceedings * of the Sigcomm '88 Symposium in Stanford, CA, August, 1988). * * This RTT estimator is used only for RPC over datagram protocols. * * Copyright (C) 2002 Trond Myklebust <trond.myklebust@fys.uio.no> */ #include <asm/param.h> #include <linux/types.h> #include <linux/unistd.h> #include <linux/module.h> #include <linux/sunrpc/clnt.h> #define RPC_RTO_MAX (60*HZ) #define RPC_RTO_INIT (HZ/5) #define RPC_RTO_MIN (HZ/10) /** * rpc_init_rtt - Initialize an RPC RTT estimator context * @rt: context to initialize * @timeo: initial timeout value, in jiffies * */ void rpc_init_rtt(struct rpc_rtt *rt, unsigned long timeo) { unsigned long init = 0; unsigned int i; rt->timeo = timeo; if (timeo > RPC_RTO_INIT) init = (timeo - RPC_RTO_INIT) << 3; for (i = 0; i < 5; i++) { rt->srtt[i] = init; rt->sdrtt[i] = RPC_RTO_INIT; rt->ntimeouts[i] = 0; } } EXPORT_SYMBOL_GPL(rpc_init_rtt); /** * rpc_update_rtt - Update an RPC RTT estimator context * @rt: context to update * @timer: timer array index (request type) * @m: recent actual RTT, in jiffies * * NB: When computing the smoothed RTT and standard deviation, * be careful not to produce negative intermediate results. */ void rpc_update_rtt(struct rpc_rtt *rt, unsigned int timer, long m) { long *srtt, *sdrtt; if (timer-- == 0) return; /* jiffies wrapped; ignore this one */ if (m < 0) return; if (m == 0) m = 1L; srtt = (long *)&rt->srtt[timer]; m -= *srtt >> 3; *srtt += m; if (m < 0) m = -m; sdrtt = (long *)&rt->sdrtt[timer]; m -= *sdrtt >> 2; *sdrtt += m; /* Set lower bound on the variance */ if (*sdrtt < RPC_RTO_MIN) *sdrtt = RPC_RTO_MIN; } EXPORT_SYMBOL_GPL(rpc_update_rtt); /** * rpc_calc_rto - Provide an estimated timeout value * @rt: context to use for calculation * @timer: timer array index (request type) * * Estimate RTO for an NFS RPC sent via an unreliable datagram. Use * the mean and mean deviation of RTT for the appropriate type of RPC * for frequently issued RPCs, and a fixed default for the others. * * The justification for doing "other" this way is that these RPCs * happen so infrequently that timer estimation would probably be * stale. Also, since many of these RPCs are non-idempotent, a * conservative timeout is desired. * * getattr, lookup, * read, write, commit - A+4D * other - timeo */ unsigned long rpc_calc_rto(struct rpc_rtt *rt, unsigned int timer) { unsigned long res; if (timer-- == 0) return rt->timeo; res = ((rt->srtt[timer] + 7) >> 3) + rt->sdrtt[timer]; if (res > RPC_RTO_MAX) res = RPC_RTO_MAX; return res; } EXPORT_SYMBOL_GPL(rpc_calc_rto);
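A userspace re-implementation sketch of the estimator above may help show the fixed-point Jacobson/Karels update in action. It mirrors rpc_update_rtt()/rpc_calc_rto() for a single timer slot, assuming HZ = 100 (so RPC_RTO_INIT = 20, RPC_RTO_MIN = 10, RPC_RTO_MAX = 6000 jiffies). Feeding a steady 80-jiffy sample drives srtt to 80 << 3 = 640 and the deviation term down to its floor, giving an RTO of 90 jiffies:

#include <stdio.h>

static long srtt, sdrtt = 20; /* RPC_RTO_INIT with HZ = 100 */

static void update(long m) /* one rpc_update_rtt() step */
{
        m -= srtt >> 3;          /* error against smoothed RTT (scaled <<3) */
        srtt += m;
        if (m < 0)
                m = -m;
        m -= sdrtt >> 2;         /* error against mean deviation (scaled <<2) */
        sdrtt += m;
        if (sdrtt < 10)          /* RPC_RTO_MIN */
                sdrtt = 10;
}

static long rto(void) /* rpc_calc_rto(): A + 4D, capped */
{
        long res = ((srtt + 7) >> 3) + sdrtt;

        return res > 6000 ? 6000 : res; /* RPC_RTO_MAX */
}

int main(void)
{
        for (int i = 0; i < 100; i++)
                update(80);
        /* settles at srtt=640 sdrtt=10 rto=90 */
        printf("srtt=%ld sdrtt=%ld rto=%ld\n", srtt, sdrtt, rto());
        return 0;
}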
/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ #ifndef _LINUX_RSEQ_H #define _LINUX_RSEQ_H #ifdef CONFIG_RSEQ #include <linux/preempt.h> #include <linux/sched.h> /* * Map the event mask on the user-space ABI enum rseq_cs_flags * for direct mask checks. */ enum rseq_event_mask_bits { RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, }; enum rseq_event_mask { RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), }; static inline void rseq_set_notify_resume(struct task_struct *t) { if (t->rseq) set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); } void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { if (current->rseq) __rseq_handle_notify_resume(ksig, regs); } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { preempt_disable(); __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask); preempt_enable(); rseq_handle_notify_resume(ksig, regs); } /* rseq_preempt() requires preemption to be disabled. */ static inline void rseq_preempt(struct task_struct *t) { __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); rseq_set_notify_resume(t); } /* rseq_migrate() requires preemption to be disabled. */ static inline void rseq_migrate(struct task_struct *t) { __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); rseq_set_notify_resume(t); } /* * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. */ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { if (clone_flags & CLONE_VM) { t->rseq = NULL; t->rseq_len = 0; t->rseq_sig = 0; t->rseq_event_mask = 0; } else { t->rseq = current->rseq; t->rseq_len = current->rseq_len; t->rseq_sig = current->rseq_sig; t->rseq_event_mask = current->rseq_event_mask; } } static inline void rseq_execve(struct task_struct *t) { t->rseq = NULL; t->rseq_len = 0; t->rseq_sig = 0; t->rseq_event_mask = 0; } #else static inline void rseq_set_notify_resume(struct task_struct *t) { } static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_preempt(struct task_struct *t) { } static inline void rseq_migrate(struct task_struct *t) { } static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { } static inline void rseq_execve(struct task_struct *t) { } #endif #ifdef CONFIG_DEBUG_RSEQ void rseq_syscall(struct pt_regs *regs); #else static inline void rseq_syscall(struct pt_regs *regs) { } #endif #endif /* _LINUX_RSEQ_H */
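The header above is the kernel-internal half of the ABI; the userspace half is registration through the rseq(2) syscall. A minimal, illustrative registration sketch follows (the signature value is arbitrary but must stay fixed per process; on glibc 2.35+ the C library may already have registered an area for each thread, in which case the raw syscall fails with EBUSY):

#define _GNU_SOURCE
#include <linux/rseq.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define RSEQ_SIG 0x53053053 /* arbitrary, fixed-per-process signature */

/* Registration is per thread; a plain static suffices for the main thread. */
static struct rseq rs __attribute__((aligned(32)));

int main(void)
{
        if (syscall(__NR_rseq, &rs, sizeof(rs), 0, RSEQ_SIG)) {
                perror("rseq"); /* EBUSY if libc already registered */
                return 1;
        }
        /* The kernel now keeps cpu_id current across migrations; see
         * rseq_set_notify_resume()/__rseq_handle_notify_resume() above. */
        printf("running on cpu %u\n", rs.cpu_id);
        return 0;
}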
#include <linux/bpf.h> #include <linux/vmalloc.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/idr.h> #include <linux/namei.h> #include <linux/user_namespace.h> #include <linux/security.h> static bool bpf_ns_capable(struct user_namespace *ns, int cap) { return ns_capable(ns, cap) || (cap != CAP_SYS_ADMIN && ns_capable(ns, CAP_SYS_ADMIN)); } bool bpf_token_capable(const struct bpf_token *token, int cap) { struct user_namespace *userns; /* BPF token allows ns_capable() level of capabilities */ userns = token ? token->userns : &init_user_ns; if (!bpf_ns_capable(userns, cap)) return false; if (token && security_bpf_token_capable(token, cap) < 0) return false; return true; } void bpf_token_inc(struct bpf_token *token) { atomic64_inc(&token->refcnt); } static void bpf_token_free(struct bpf_token *token) { security_bpf_token_free(token); put_user_ns(token->userns); kfree(token); } static void bpf_token_put_deferred(struct work_struct *work) { struct bpf_token *token = container_of(work, struct bpf_token, work); bpf_token_free(token); } void bpf_token_put(struct bpf_token *token) { if (!token) return; if (!atomic64_dec_and_test(&token->refcnt)) return; INIT_WORK(&token->work, bpf_token_put_deferred); schedule_work(&token->work); } static int bpf_token_release(struct inode *inode, struct file *filp) { struct bpf_token *token = filp->private_data; bpf_token_put(token); return 0; } static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp) { struct bpf_token *token = filp->private_data; u64 mask; BUILD_BUG_ON(__MAX_BPF_CMD >= 64); mask = BIT_ULL(__MAX_BPF_CMD) - 1; if ((token->allowed_cmds & mask) == mask) seq_printf(m, "allowed_cmds:\tany\n"); else seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds); BUILD_BUG_ON(__MAX_BPF_MAP_TYPE >= 64); mask = BIT_ULL(__MAX_BPF_MAP_TYPE) - 1; if ((token->allowed_maps & mask) == mask) seq_printf(m, "allowed_maps:\tany\n"); else seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps); BUILD_BUG_ON(__MAX_BPF_PROG_TYPE >= 64); mask = BIT_ULL(__MAX_BPF_PROG_TYPE) - 1; if ((token->allowed_progs & mask) == mask) seq_printf(m, "allowed_progs:\tany\n"); else seq_printf(m, "allowed_progs:\t0x%llx\n", token->allowed_progs); BUILD_BUG_ON(__MAX_BPF_ATTACH_TYPE >= 64); mask = BIT_ULL(__MAX_BPF_ATTACH_TYPE) - 1; if ((token->allowed_attachs & mask) == mask) seq_printf(m, "allowed_attachs:\tany\n"); else seq_printf(m, "allowed_attachs:\t0x%llx\n", token->allowed_attachs); } #define BPF_TOKEN_INODE_NAME "bpf-token" static const struct inode_operations bpf_token_iops
= { }; static const struct file_operations bpf_token_fops = { .release = bpf_token_release, .show_fdinfo = bpf_token_show_fdinfo, }; int bpf_token_create(union bpf_attr *attr) { struct bpf_mount_opts *mnt_opts; struct bpf_token *token = NULL; struct user_namespace *userns; struct inode *inode; struct file *file; CLASS(fd, f)(attr->token_create.bpffs_fd); struct path path; struct super_block *sb; umode_t mode; int err, fd; if (fd_empty(f)) return -EBADF; path = fd_file(f)->f_path; sb = path.dentry->d_sb; if (path.dentry != sb->s_root) return -EINVAL; if (sb->s_op != &bpf_super_ops) return -EINVAL; err = path_permission(&path, MAY_ACCESS); if (err) return err; userns = sb->s_user_ns; /* * Enforce that creators of BPF tokens are in the same user * namespace as the BPF FS instance. This makes reasoning about * permissions a lot easier and we can always relax this later. */ if (current_user_ns() != userns) return -EPERM; if (!ns_capable(userns, CAP_BPF)) return -EPERM; /* Creating BPF token in init_user_ns doesn't make much sense. */ if (current_user_ns() == &init_user_ns) return -EOPNOTSUPP; mnt_opts = sb->s_fs_info; if (mnt_opts->delegate_cmds == 0 && mnt_opts->delegate_maps == 0 && mnt_opts->delegate_progs == 0 && mnt_opts->delegate_attachs == 0) return -ENOENT; /* no BPF token delegation is set up */ mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); inode = bpf_get_inode(sb, NULL, mode); if (IS_ERR(inode)) return PTR_ERR(inode); inode->i_op = &bpf_token_iops; inode->i_fop = &bpf_token_fops; clear_nlink(inode); /* make sure it is unlinked */ file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops); if (IS_ERR(file)) { iput(inode); return PTR_ERR(file); } token = kzalloc(sizeof(*token), GFP_USER); if (!token) { err = -ENOMEM; goto out_file; } atomic64_set(&token->refcnt, 1); /* remember bpffs owning userns for future ns_capable() checks */ token->userns = get_user_ns(userns); token->allowed_cmds = mnt_opts->delegate_cmds; token->allowed_maps = mnt_opts->delegate_maps; token->allowed_progs = mnt_opts->delegate_progs; token->allowed_attachs = mnt_opts->delegate_attachs; err = security_bpf_token_create(token, attr, &path); if (err) goto out_token; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) { err = fd; goto out_token; } file->private_data = token; fd_install(fd, file); return fd; out_token: bpf_token_free(token); out_file: fput(file); return err; } struct bpf_token *bpf_token_get_from_fd(u32 ufd) { CLASS(fd, f)(ufd); struct bpf_token *token; if (fd_empty(f)) return ERR_PTR(-EBADF); if (fd_file(f)->f_op != &bpf_token_fops) return ERR_PTR(-EINVAL); token = fd_file(f)->private_data; bpf_token_inc(token); return token; } bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd) { if (!token) return false; if (!(token->allowed_cmds & BIT_ULL(cmd))) return false; return security_bpf_token_cmd(token, cmd) == 0; } bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type) { if (!token || type >= __MAX_BPF_MAP_TYPE) return false; return token->allowed_maps & BIT_ULL(type); } bool bpf_token_allow_prog_type(const struct bpf_token *token, enum bpf_prog_type prog_type, enum bpf_attach_type attach_type) { if (!token || prog_type >= __MAX_BPF_PROG_TYPE || attach_type >= __MAX_BPF_ATTACH_TYPE) return false; return (token->allowed_progs & BIT_ULL(prog_type)) && (token->allowed_attachs & BIT_ULL(attach_type)); }
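For context, creating a token from userspace is a single BPF_TOKEN_CREATE syscall over an fd referring to a bpffs superblock root, matching the checks in bpf_token_create() above. An illustrative sketch (it assumes a bpffs instance mounted at /sys/fs/bpf with delegate_* mount options, run inside a non-init user namespace; otherwise the kernel returns -EPERM or -EOPNOTSUPP as the code above shows):

#include <fcntl.h>
#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        union bpf_attr attr;
        int bpffs_fd, token_fd;

        /* An O_PATH fd on the bpffs root is enough for the f_path checks. */
        bpffs_fd = open("/sys/fs/bpf", O_PATH);
        if (bpffs_fd < 0) {
                perror("open");
                return 1;
        }
        memset(&attr, 0, sizeof(attr));
        attr.token_create.bpffs_fd = bpffs_fd;
        token_fd = syscall(__NR_bpf, BPF_TOKEN_CREATE, &attr, sizeof(attr));
        if (token_fd < 0) {
                perror("BPF_TOKEN_CREATE");
                return 1;
        }
        printf("token fd: %d\n", token_fd); /* inspect via /proc/self/fdinfo */
        return 0;
}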
// SPDX-License-Identifier: GPL-2.0-only /* * Module for modifying the secmark field of the skb, for use by * security subsystems. * * Based on the nfmark match by: * (C) 1999-2001 Marc Boucher <marc@mbsi.ca> * * (C) 2006,2008 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/security.h> #include <linux/skbuff.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_SECMARK.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Morris <jmorris@redhat.com>"); MODULE_DESCRIPTION("Xtables: packet security mark modification"); MODULE_ALIAS("ipt_SECMARK"); MODULE_ALIAS("ip6t_SECMARK"); static u8 mode; static unsigned int secmark_tg(struct sk_buff *skb, const struct xt_secmark_target_info_v1 *info) { u32 secmark = 0; switch (mode) { case SECMARK_MODE_SEL: secmark = info->secid; break; default: BUG(); } skb->secmark = secmark; return XT_CONTINUE; } static int checkentry_lsm(struct xt_secmark_target_info_v1 *info) { int err; info->secctx[SECMARK_SECCTX_MAX - 1] = '\0'; info->secid = 0; err = security_secctx_to_secid(info->secctx, strlen(info->secctx), &info->secid); if (err) { if (err == -EINVAL) pr_info_ratelimited("invalid security context \'%s\'\n", info->secctx); return err; } if (!info->secid) { pr_info_ratelimited("unable to map security context \'%s\'\n", info->secctx); return -ENOENT; } err = security_secmark_relabel_packet(info->secid); if (err) { pr_info_ratelimited("unable to obtain relabeling permission\n"); return err; } security_secmark_refcount_inc(); return 0; } static int secmark_tg_check(const char *table, struct xt_secmark_target_info_v1 *info) { int err; if (strcmp(table, "mangle") != 0 && strcmp(table, "security") != 0) { pr_info_ratelimited("only valid in \'mangle\' or \'security\' table, not \'%s\'\n", table); return -EINVAL; } if (mode && mode != info->mode) { pr_info_ratelimited("mode already set to %hu cannot mix with rules for mode %hu\n", mode, info->mode); return -EINVAL; } switch (info->mode) { case SECMARK_MODE_SEL: break; default: pr_info_ratelimited("invalid mode: %hu\n", info->mode); return -EINVAL; } err = checkentry_lsm(info); if (err) return err; if (!mode) mode = info->mode; return 0; } static void secmark_tg_destroy(const struct xt_tgdtor_param *par) { switch (mode) { case SECMARK_MODE_SEL: security_secmark_refcount_dec(); } } static int secmark_tg_check_v0(const struct xt_tgchk_param *par) { struct xt_secmark_target_info *info = par->targinfo; struct xt_secmark_target_info_v1 newinfo = { .mode = info->mode, }; int ret; memcpy(newinfo.secctx, info->secctx, SECMARK_SECCTX_MAX); ret = secmark_tg_check(par->table, &newinfo); info->secid = newinfo.secid; return ret; } static unsigned int secmark_tg_v0(struct
sk_buff *skb, const struct xt_action_param *par) { const struct xt_secmark_target_info *info = par->targinfo; struct xt_secmark_target_info_v1 newinfo = { .secid = info->secid, }; return secmark_tg(skb, &newinfo); } static int secmark_tg_check_v1(const struct xt_tgchk_param *par) { return secmark_tg_check(par->table, par->targinfo); } static unsigned int secmark_tg_v1(struct sk_buff *skb, const struct xt_action_param *par) { return secmark_tg(skb, par->targinfo); } static struct xt_target secmark_tg_reg[] __read_mostly = { { .name = "SECMARK", .revision = 0, .family = NFPROTO_IPV4, .checkentry = secmark_tg_check_v0, .destroy = secmark_tg_destroy, .target = secmark_tg_v0, .targetsize = sizeof(struct xt_secmark_target_info), .me = THIS_MODULE, }, { .name = "SECMARK", .revision = 1, .family = NFPROTO_IPV4, .checkentry = secmark_tg_check_v1, .destroy = secmark_tg_destroy, .target = secmark_tg_v1, .targetsize = sizeof(struct xt_secmark_target_info_v1), .usersize = offsetof(struct xt_secmark_target_info_v1, secid), .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "SECMARK", .revision = 0, .family = NFPROTO_IPV6, .checkentry = secmark_tg_check_v0, .destroy = secmark_tg_destroy, .target = secmark_tg_v0, .targetsize = sizeof(struct xt_secmark_target_info), .me = THIS_MODULE, }, { .name = "SECMARK", .revision = 1, .family = NFPROTO_IPV6, .checkentry = secmark_tg_check_v1, .destroy = secmark_tg_destroy, .target = secmark_tg_v1, .targetsize = sizeof(struct xt_secmark_target_info_v1), .usersize = offsetof(struct xt_secmark_target_info_v1, secid), .me = THIS_MODULE, }, #endif }; static int __init secmark_tg_init(void) { return xt_register_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg)); } static void __exit secmark_tg_exit(void) { xt_unregister_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg)); } module_init(secmark_tg_init); module_exit(secmark_tg_exit);
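In practice this target is configured from userspace through the iptables SECMARK extension, whose --selctx option supplies the context string that checkentry_lsm() above resolves into a secid. An illustrative rule (the SELinux context here is made up for the example) would be: iptables -t security -A INPUT -p tcp --dport 2049 -j SECMARK --selctx system_u:object_r:nfs_packet_t:s0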
/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/include/linux/sunrpc/clnt.h * * Declarations for the high-level RPC client interface * * Copyright (C) 1995, 1996, Olaf Kirch <okir@monad.swb.de> */ #ifndef _LINUX_SUNRPC_CLNT_H #define _LINUX_SUNRPC_CLNT_H #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/refcount.h> #include <linux/sunrpc/msg_prot.h> #include <linux/sunrpc/sched.h> #include <linux/sunrpc/xprt.h> #include <linux/sunrpc/auth.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/xdr.h> #include <linux/sunrpc/timer.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <asm/signal.h> #include <linux/path.h> #include <net/ipv6.h> #include <linux/sunrpc/xprtmultipath.h> struct rpc_inode; struct rpc_sysfs_client { struct kobject kobject; struct net *net; struct rpc_clnt *clnt; struct rpc_xprt_switch *xprt_switch; }; /* * The high-level client handle */ struct rpc_clnt { refcount_t cl_count; /* Number of references */ unsigned int cl_clid; /* client id */ struct list_head cl_clients; /* Global list of clients */ struct list_head cl_tasks; /* List of tasks */ atomic_t cl_pid; /* task PID counter */ spinlock_t cl_lock; /* spinlock */ struct rpc_xprt __rcu * cl_xprt; /* transport */ const struct rpc_procinfo *cl_procinfo; /* procedure info */ u32 cl_prog, /* RPC program number */ cl_vers, /* RPC version number */ cl_maxproc; /* max procedure number */ struct rpc_auth * cl_auth; /* authenticator */ struct rpc_stat * cl_stats; /* per-program statistics */ struct rpc_iostats * cl_metrics; /* per-client statistics */ unsigned int cl_softrtry : 1,/* soft timeouts */ cl_softerr : 1,/* Timeouts return errors */ cl_discrtry : 1,/* disconnect before retry */ cl_noretranstimeo: 1,/* No retransmit timeouts */ cl_autobind : 1,/* use getport() */ cl_chatty : 1,/* be verbose */ cl_shutdown : 1,/* rpc immediate -EIO */ cl_netunreach_fatal : 1; /* Treat ENETUNREACH errors as fatal */ struct xprtsec_parms cl_xprtsec; /* transport security policy */ struct rpc_rtt * cl_rtt; /* RTO estimator data */ const struct rpc_timeout *cl_timeout; /* Timeout strategy */ atomic_t cl_swapper; /* swapfile count */ int cl_nodelen; /* nodename length */ char cl_nodename[UNX_MAXNODENAME+1]; struct rpc_pipe_dir_head cl_pipedir_objects; struct rpc_clnt * cl_parent; /* Points to parent of clones */ struct rpc_rtt cl_rtt_default; struct rpc_timeout cl_timeout_default; const struct rpc_program *cl_program; const char * cl_principal; /* use for machine cred */ #if
IS_ENABLED(CONFIG_SUNRPC_DEBUG) struct dentry *cl_debugfs; /* debugfs directory */ #endif struct rpc_sysfs_client *cl_sysfs; /* sysfs directory */ /* cl_work is only needed after cl_xpi is no longer used, * and the two are of similar size */ union { struct rpc_xprt_iter cl_xpi; struct work_struct cl_work; }; const struct cred *cl_cred; unsigned int cl_max_connect; /* max number of transports not to the same IP */ struct super_block *pipefs_sb; atomic_t cl_task_count; }; /* * General RPC program info */ #define RPC_MAXVERSION 4 struct rpc_program { const char * name; /* protocol name */ u32 number; /* program number */ unsigned int nrvers; /* number of versions */ const struct rpc_version ** version; /* version array */ struct rpc_stat * stats; /* statistics */ const char * pipe_dir_name; /* path to rpc_pipefs dir */ }; struct rpc_version { u32 number; /* version number */ unsigned int nrprocs; /* number of procs */ const struct rpc_procinfo *procs; /* procedure array */ unsigned int *counts; /* call counts */ }; /* * Procedure information */ struct rpc_procinfo { u32 p_proc; /* RPC procedure number */ kxdreproc_t p_encode; /* XDR encode function */ kxdrdproc_t p_decode; /* XDR decode function */ unsigned int p_arglen; /* argument hdr length (u32) */ unsigned int p_replen; /* reply hdr length (u32) */ unsigned int p_timer; /* Which RTT timer to use */ u32 p_statidx; /* Which procedure to account */ const char * p_name; /* name of procedure */ }; struct rpc_create_args { struct net *net; int protocol; struct sockaddr *address; size_t addrsize; struct sockaddr *saddress; const struct rpc_timeout *timeout; const char *servername; const char *nodename; const struct rpc_program *program; struct rpc_stat *stats; u32 prognumber; /* overrides program->number */ u32 version; rpc_authflavor_t authflavor; u32 nconnect; unsigned long flags; char *client_name; struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ const struct cred *cred; unsigned int max_connect; struct xprtsec_parms xprtsec; unsigned long connect_timeout; unsigned long reconnect_timeout; }; struct rpc_add_xprt_test { void (*add_xprt_test)(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *calldata); void *data; }; /* Values for "flags" field */ #define RPC_CLNT_CREATE_HARDRTRY (1UL << 0) #define RPC_CLNT_CREATE_AUTOBIND (1UL << 2) #define RPC_CLNT_CREATE_NONPRIVPORT (1UL << 3) #define RPC_CLNT_CREATE_NOPING (1UL << 4) #define RPC_CLNT_CREATE_DISCRTRY (1UL << 5) #define RPC_CLNT_CREATE_QUIET (1UL << 6) #define RPC_CLNT_CREATE_INFINITE_SLOTS (1UL << 7) #define RPC_CLNT_CREATE_NO_IDLE_TIMEOUT (1UL << 8) #define RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT (1UL << 9) #define RPC_CLNT_CREATE_SOFTERR (1UL << 10) #define RPC_CLNT_CREATE_REUSEPORT (1UL << 11) #define RPC_CLNT_CREATE_CONNECTED (1UL << 12) #define RPC_CLNT_CREATE_NETUNREACH_FATAL (1UL << 13) struct rpc_clnt *rpc_create(struct rpc_create_args *args); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, const struct rpc_program *, u32); struct rpc_clnt *rpc_clone_client(struct rpc_clnt *); struct rpc_clnt *rpc_clone_client_set_auth(struct rpc_clnt *, rpc_authflavor_t); int rpc_switch_client_transport(struct rpc_clnt *, struct xprt_create *, const struct rpc_timeout *); void rpc_shutdown_client(struct rpc_clnt *); void rpc_release_client(struct rpc_clnt *); void rpc_task_release_transport(struct rpc_task *); void rpc_task_release_client(struct rpc_task *); struct rpc_xprt *rpc_task_get_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt); int rpcb_create_local(struct net *); void
rpcb_put_local(struct net *); int rpcb_register(struct net *, u32, u32, int, unsigned short); int rpcb_v4_register(struct net *net, const u32 program, const u32 version, const struct sockaddr *address, const char *netid); void rpcb_getport_async(struct rpc_task *); void rpc_prepare_reply_pages(struct rpc_rqst *req, struct page **pages, unsigned int base, unsigned int len, unsigned int hdrsize); void rpc_call_start(struct rpc_task *); int rpc_call_async(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags, const struct rpc_call_ops *tk_ops, void *calldata); int rpc_call_sync(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags); struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int flags); int rpc_restart_call_prepare(struct rpc_task *); int rpc_restart_call(struct rpc_task *); void rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int); struct net * rpc_net_ns(struct rpc_clnt *); size_t rpc_max_payload(struct rpc_clnt *); size_t rpc_max_bc_payload(struct rpc_clnt *); unsigned int rpc_num_bc_slots(struct rpc_clnt *); void rpc_force_rebind(struct rpc_clnt *); size_t rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t); const char *rpc_peeraddr2str(struct rpc_clnt *, enum rpc_display_format_t); int rpc_localaddr(struct rpc_clnt *, struct sockaddr *, size_t); int rpc_clnt_iterate_for_each_xprt(struct rpc_clnt *clnt, int (*fn)(struct rpc_clnt *, struct rpc_xprt *, void *), void *data); int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt_switch *xps, struct rpc_xprt *xprt, void *dummy); int rpc_clnt_add_xprt(struct rpc_clnt *, struct xprt_create *, int (*setup)(struct rpc_clnt *, struct rpc_xprt_switch *, struct rpc_xprt *, void *), void *data); void rpc_set_connect_timeout(struct rpc_clnt *clnt, unsigned long connect_timeout, unsigned long reconnect_timeout); int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *, struct rpc_xprt_switch *, struct rpc_xprt *, void *); void rpc_clnt_manage_trunked_xprts(struct rpc_clnt *); void rpc_clnt_probe_trunked_xprts(struct rpc_clnt *, struct rpc_add_xprt_test *); const char *rpc_proc_name(const struct rpc_task *task); void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *); void rpc_clnt_xprt_switch_remove_xprt(struct rpc_clnt *, struct rpc_xprt *); bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, const struct sockaddr *sap); void rpc_clnt_xprt_set_online(struct rpc_clnt *clnt, struct rpc_xprt *xprt); void rpc_clnt_disconnect(struct rpc_clnt *clnt); void rpc_cleanup_clids(void); static inline int rpc_reply_expected(struct rpc_task *task) { return (task->tk_msg.rpc_proc != NULL) && (task->tk_msg.rpc_proc->p_decode != NULL); } static inline void rpc_task_close_connection(struct rpc_task *task) { if (task->tk_xprt) xprt_force_disconnect(task->tk_xprt); } #endif /* _LINUX_SUNRPC_CLNT_H */
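To illustrate the creation API declared above: a kernel consumer fills struct rpc_create_args and calls rpc_create(), in the style of NFS or lockd. A hedged sketch under those assumptions (the program table demo_program and the server name are hypothetical, supplied by the caller):

#include <linux/in.h>
#include <linux/sunrpc/clnt.h>

static struct rpc_clnt *demo_create_client(struct net *net,
                                           struct sockaddr_in *sin,
                                           const struct rpc_program *demo_program)
{
        struct rpc_create_args args = {
                .net            = net,
                .protocol       = XPRT_TRANSPORT_TCP,
                .address        = (struct sockaddr *)sin,
                .addrsize       = sizeof(*sin),
                .servername     = "demo-server",        /* placeholder */
                .program        = demo_program,         /* caller-provided */
                .version        = 1,
                .authflavor     = RPC_AUTH_UNIX,
                /* skip the NULL ping so creation cannot block on the net */
                .flags          = RPC_CLNT_CREATE_NOPING,
        };

        /* Returns an ERR_PTR() on failure; release with rpc_shutdown_client(). */
        return rpc_create(&args);
}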
// SPDX-License-Identifier: BSD-3-Clause /* * linux/net/sunrpc/auth_gss/auth_gss.c * * RPCSEC_GSS client authentication. * * Copyright (c) 2000 The Regents of the University of Michigan. * All rights reserved.
* * Dug Song <dugsong@monkey.org> * Andy Adamson <andros@umich.edu> */ #include <linux/module.h> #include <linux/init.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/pagemap.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/auth.h> #include <linux/sunrpc/auth_gss.h> #include <linux/sunrpc/gss_krb5.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/gss_err.h> #include <linux/workqueue.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/sunrpc/gss_api.h> #include <linux/uaccess.h> #include <linux/hashtable.h> #include "auth_gss_internal.h" #include "../netns.h" #include <trace/events/rpcgss.h> static const struct rpc_authops authgss_ops; static const struct rpc_credops gss_credops; static const struct rpc_credops gss_nullops; #define GSS_RETRY_EXPIRED 5 static unsigned int gss_expired_cred_retry_delay = GSS_RETRY_EXPIRED; #define GSS_KEY_EXPIRE_TIMEO 240 static unsigned int gss_key_expire_timeo = GSS_KEY_EXPIRE_TIMEO; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif /* * This compile-time check verifies that we will not exceed the * slack space allotted by the client and server auth_gss code * before they call gss_wrap(). */ #define GSS_KRB5_MAX_SLACK_NEEDED \ (GSS_KRB5_TOK_HDR_LEN /* gss token header */ \ + GSS_KRB5_MAX_CKSUM_LEN /* gss token checksum */ \ + GSS_KRB5_MAX_BLOCKSIZE /* confounder */ \ + GSS_KRB5_MAX_BLOCKSIZE /* possible padding */ \ + GSS_KRB5_TOK_HDR_LEN /* encrypted hdr in v2 token */ \ + GSS_KRB5_MAX_CKSUM_LEN /* encryption hmac */ \ + XDR_UNIT * 2 /* RPC verifier */ \ + GSS_KRB5_TOK_HDR_LEN \ + GSS_KRB5_MAX_CKSUM_LEN) #define GSS_CRED_SLACK (RPC_MAX_AUTH_SIZE * 2) /* length of a krb5 verifier (48), plus data added before arguments when * using integrity (two 4-byte integers): */ #define GSS_VERF_SLACK 100 static DEFINE_HASHTABLE(gss_auth_hash_table, 4); static DEFINE_SPINLOCK(gss_auth_hash_lock); struct gss_pipe { struct rpc_pipe_dir_object pdo; struct rpc_pipe *pipe; struct rpc_clnt *clnt; const char *name; struct kref kref; }; struct gss_auth { struct kref kref; struct hlist_node hash; struct rpc_auth rpc_auth; struct gss_api_mech *mech; enum rpc_gss_svc service; struct rpc_clnt *client; struct net *net; netns_tracker ns_tracker; /* * There are two upcall pipes; dentry[1], named "gssd", is used * for the new text-based upcall; dentry[0] is named after the * mechanism (for example, "krb5") and exists for * backwards-compatibility with older gssd's. */ struct gss_pipe *gss_pipe[2]; const char *target_name; }; /* pipe_version >= 0 if and only if someone has a pipe open. */ static DEFINE_SPINLOCK(pipe_version_lock); static struct rpc_wait_queue pipe_version_rpc_waitqueue; static DECLARE_WAIT_QUEUE_HEAD(pipe_version_waitqueue); static void gss_put_auth(struct gss_auth *gss_auth); static void gss_free_ctx(struct gss_cl_ctx *); static const struct rpc_pipe_ops gss_upcall_ops_v0; static const struct rpc_pipe_ops gss_upcall_ops_v1; static inline struct gss_cl_ctx * gss_get_ctx(struct gss_cl_ctx *ctx) { refcount_inc(&ctx->count); return ctx; } static inline void gss_put_ctx(struct gss_cl_ctx *ctx) { if (refcount_dec_and_test(&ctx->count)) gss_free_ctx(ctx); } /* gss_cred_set_ctx: * called by gss_upcall_callback and gss_create_upcall in order * to set the gss context. The actual exchange of an old context * and a new one is protected by the pipe->lock. 
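 *
 * Ordering note: in the function below, the new ctx is published with
 * rcu_assign_pointer() before RPCAUTH_CRED_UPTODATE is set, and
 * smp_mb__before_atomic() orders that flag store before
 * RPCAUTH_CRED_NEW is cleared, so a concurrent lookup can never see a
 * credential that is marked usable but still lacks a context.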
*/ static void gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); if (!test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags)) return; gss_get_ctx(ctx); rcu_assign_pointer(gss_cred->gc_ctx, ctx); set_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); smp_mb__before_atomic(); clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags); } static struct gss_cl_ctx * gss_cred_get_ctx(struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = NULL; rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); if (ctx) gss_get_ctx(ctx); rcu_read_unlock(); return ctx; } static struct gss_cl_ctx * gss_alloc_context(void) { struct gss_cl_ctx *ctx; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (ctx != NULL) { ctx->gc_proc = RPC_GSS_PROC_DATA; ctx->gc_seq = 1; /* NetApp 6.4R1 doesn't accept seq. no. 0 */ spin_lock_init(&ctx->gc_seq_lock); refcount_set(&ctx->count,1); } return ctx; } #define GSSD_MIN_TIMEOUT (60 * 60) static const void * gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct gss_api_mech *gm) { const void *q; unsigned int seclen; unsigned int timeout; unsigned long now = jiffies; u32 window_size; int ret; /* First unsigned int gives the remaining lifetime in seconds of the * credential - e.g. the remaining TGT lifetime for Kerberos or * the -t value passed to GSSD. */ p = simple_get_bytes(p, end, &timeout, sizeof(timeout)); if (IS_ERR(p)) goto err; if (timeout == 0) timeout = GSSD_MIN_TIMEOUT; ctx->gc_expiry = now + ((unsigned long)timeout * HZ); /* Sequence number window. Determines the maximum number of * simultaneous requests */ p = simple_get_bytes(p, end, &window_size, sizeof(window_size)); if (IS_ERR(p)) goto err; ctx->gc_win = window_size; /* gssd signals an error by passing ctx->gc_win = 0: */ if (ctx->gc_win == 0) { /* * in which case, p points to an error code. Anything other * than -EKEYEXPIRED gets converted to -EACCES. */ p = simple_get_bytes(p, end, &ret, sizeof(ret)); if (!IS_ERR(p)) p = (ret == -EKEYEXPIRED) ? ERR_PTR(-EKEYEXPIRED) : ERR_PTR(-EACCES); goto err; } /* copy the opaque wire context */ p = simple_get_netobj(p, end, &ctx->gc_wire_ctx); if (IS_ERR(p)) goto err; /* import the opaque security context */ p = simple_get_bytes(p, end, &seclen, sizeof(seclen)); if (IS_ERR(p)) goto err; q = (const void *)((const char *)p + seclen); if (unlikely(q > end || q < p)) { p = ERR_PTR(-EFAULT); goto err; } ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, NULL, GFP_KERNEL); if (ret < 0) { trace_rpcgss_import_ctx(ret); p = ERR_PTR(ret); goto err; } /* is there any trailing data? */ if (q == end) { p = q; goto done; } /* pull in acceptor name (if there is one) */ p = simple_get_netobj(q, end, &ctx->gc_acceptor); if (IS_ERR(p)) goto err; done: trace_rpcgss_context(window_size, ctx->gc_expiry, now, timeout, ctx->gc_acceptor.len, ctx->gc_acceptor.data); err: return p; } /* XXX: Need some documentation about why UPCALL_BUF_LEN is so small. * Is user space expecting no more than UPCALL_BUF_LEN bytes? * Note that there are now _two_ NI_MAXHOST sized data items * being passed in this string. 
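 *
 * For scale: a v1 upcall built by gss_encode_v1_msg() looks roughly
 * like the following (all values illustrative):
 *
 *   mech=krb5 uid=1000 target=nfs@server.example.com service=nfs
 *       srchost=client.example.com enctypes=18,17
 *
 * where the target= and srchost= values are the two NI_MAXHOST sized
 * items mentioned above.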
*/ #define UPCALL_BUF_LEN 256 struct gss_upcall_msg { refcount_t count; kuid_t uid; const char *service_name; struct rpc_pipe_msg msg; struct list_head list; struct gss_auth *auth; struct rpc_pipe *pipe; struct rpc_wait_queue rpc_waitqueue; wait_queue_head_t waitqueue; struct gss_cl_ctx *ctx; char databuf[UPCALL_BUF_LEN]; }; static int get_pipe_version(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int ret; spin_lock(&pipe_version_lock); if (sn->pipe_version >= 0) { atomic_inc(&sn->pipe_users); ret = sn->pipe_version; } else ret = -EAGAIN; spin_unlock(&pipe_version_lock); return ret; } static void put_pipe_version(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); if (atomic_dec_and_lock(&sn->pipe_users, &pipe_version_lock)) { sn->pipe_version = -1; spin_unlock(&pipe_version_lock); } } static void gss_release_msg(struct gss_upcall_msg *gss_msg) { struct net *net = gss_msg->auth->net; if (!refcount_dec_and_test(&gss_msg->count)) return; put_pipe_version(net); BUG_ON(!list_empty(&gss_msg->list)); if (gss_msg->ctx != NULL) gss_put_ctx(gss_msg->ctx); rpc_destroy_wait_queue(&gss_msg->rpc_waitqueue); gss_put_auth(gss_msg->auth); kfree_const(gss_msg->service_name); kfree(gss_msg); } static struct gss_upcall_msg * __gss_find_upcall(struct rpc_pipe *pipe, kuid_t uid, const struct gss_auth *auth) { struct gss_upcall_msg *pos; list_for_each_entry(pos, &pipe->in_downcall, list) { if (!uid_eq(pos->uid, uid)) continue; if (pos->auth->service != auth->service) continue; refcount_inc(&pos->count); return pos; } return NULL; } /* Try to add an upcall to the pipefs queue. * If an upcall owned by our uid already exists, then we return a reference * to that upcall instead of adding the new upcall. */ static inline struct gss_upcall_msg * gss_add_msg(struct gss_upcall_msg *gss_msg) { struct rpc_pipe *pipe = gss_msg->pipe; struct gss_upcall_msg *old; spin_lock(&pipe->lock); old = __gss_find_upcall(pipe, gss_msg->uid, gss_msg->auth); if (old == NULL) { refcount_inc(&gss_msg->count); list_add(&gss_msg->list, &pipe->in_downcall); } else gss_msg = old; spin_unlock(&pipe->lock); return gss_msg; } static void __gss_unhash_msg(struct gss_upcall_msg *gss_msg) { list_del_init(&gss_msg->list); rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); wake_up_all(&gss_msg->waitqueue); refcount_dec(&gss_msg->count); } static void gss_unhash_msg(struct gss_upcall_msg *gss_msg) { struct rpc_pipe *pipe = gss_msg->pipe; if (list_empty(&gss_msg->list)) return; spin_lock(&pipe->lock); if (!list_empty(&gss_msg->list)) __gss_unhash_msg(gss_msg); spin_unlock(&pipe->lock); } static void gss_handle_downcall_result(struct gss_cred *gss_cred, struct gss_upcall_msg *gss_msg) { switch (gss_msg->msg.errno) { case 0: if (gss_msg->ctx == NULL) break; clear_bit(RPCAUTH_CRED_NEGATIVE, &gss_cred->gc_base.cr_flags); gss_cred_set_ctx(&gss_cred->gc_base, gss_msg->ctx); break; case -EKEYEXPIRED: set_bit(RPCAUTH_CRED_NEGATIVE, &gss_cred->gc_base.cr_flags); } gss_cred->gc_upcall_timestamp = jiffies; gss_cred->gc_upcall = NULL; rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); } static void gss_upcall_callback(struct rpc_task *task) { struct gss_cred *gss_cred = container_of(task->tk_rqstp->rq_cred, struct gss_cred, gc_base); struct gss_upcall_msg *gss_msg = gss_cred->gc_upcall; struct rpc_pipe *pipe = gss_msg->pipe; spin_lock(&pipe->lock); gss_handle_downcall_result(gss_cred, gss_msg); spin_unlock(&pipe->lock); task->tk_status = gss_msg->msg.errno; gss_release_msg(gss_msg); 
} static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg, const struct cred *cred) { struct user_namespace *userns = cred->user_ns; uid_t uid = from_kuid_munged(userns, gss_msg->uid); memcpy(gss_msg->databuf, &uid, sizeof(uid)); gss_msg->msg.data = gss_msg->databuf; gss_msg->msg.len = sizeof(uid); BUILD_BUG_ON(sizeof(uid) > sizeof(gss_msg->databuf)); } static ssize_t gss_v0_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *buf, size_t buflen) { struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg); if (msg->copied == 0) gss_encode_v0_msg(gss_msg, file->f_cred); return rpc_pipe_generic_upcall(file, msg, buf, buflen); } static int gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, const char *service_name, const char *target_name, const struct cred *cred) { struct user_namespace *userns = cred->user_ns; struct gss_api_mech *mech = gss_msg->auth->mech; char *p = gss_msg->databuf; size_t buflen = sizeof(gss_msg->databuf); int len; len = scnprintf(p, buflen, "mech=%s uid=%d", mech->gm_name, from_kuid_munged(userns, gss_msg->uid)); buflen -= len; p += len; gss_msg->msg.len = len; /* * target= is a full service principal that names the remote * identity that we are authenticating to. */ if (target_name) { len = scnprintf(p, buflen, " target=%s", target_name); buflen -= len; p += len; gss_msg->msg.len += len; } /* * gssd uses service= and srchost= to select a matching key from * the system's keytab to use as the source principal. * * service= is the service name part of the source principal, * or "*" (meaning choose any). * * srchost= is the hostname part of the source principal. When * not provided, gssd uses the local hostname. */ if (service_name) { char *c = strchr(service_name, '@'); if (!c) len = scnprintf(p, buflen, " service=%s", service_name); else len = scnprintf(p, buflen, " service=%.*s srchost=%s", (int)(c - service_name), service_name, c + 1); buflen -= len; p += len; gss_msg->msg.len += len; } if (mech->gm_upcall_enctypes) { len = scnprintf(p, buflen, " enctypes=%s", mech->gm_upcall_enctypes); buflen -= len; p += len; gss_msg->msg.len += len; } trace_rpcgss_upcall_msg(gss_msg->databuf); len = scnprintf(p, buflen, "\n"); if (len == 0) goto out_overflow; gss_msg->msg.len += len; gss_msg->msg.data = gss_msg->databuf; return 0; out_overflow: WARN_ON_ONCE(1); return -ENOMEM; } static ssize_t gss_v1_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *buf, size_t buflen) { struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg); int err; if (msg->copied == 0) { err = gss_encode_v1_msg(gss_msg, gss_msg->service_name, gss_msg->auth->target_name, file->f_cred); if (err) return err; } return rpc_pipe_generic_upcall(file, msg, buf, buflen); } static struct gss_upcall_msg * gss_alloc_msg(struct gss_auth *gss_auth, kuid_t uid, const char *service_name) { struct gss_upcall_msg *gss_msg; int vers; int err = -ENOMEM; gss_msg = kzalloc(sizeof(*gss_msg), GFP_KERNEL); if (gss_msg == NULL) goto err; vers = get_pipe_version(gss_auth->net); err = vers; if (err < 0) goto err_free_msg; gss_msg->pipe = gss_auth->gss_pipe[vers]->pipe; INIT_LIST_HEAD(&gss_msg->list); rpc_init_wait_queue(&gss_msg->rpc_waitqueue, "RPCSEC_GSS upcall waitq"); init_waitqueue_head(&gss_msg->waitqueue); refcount_set(&gss_msg->count, 1); gss_msg->uid = uid; gss_msg->auth = gss_auth; kref_get(&gss_auth->kref); if (service_name) { gss_msg->service_name = kstrdup_const(service_name, GFP_KERNEL); if (!gss_msg->service_name) { err = -ENOMEM; goto 
err_put_pipe_version; } } return gss_msg; err_put_pipe_version: put_pipe_version(gss_auth->net); err_free_msg: kfree(gss_msg); err: return ERR_PTR(err); } static struct gss_upcall_msg * gss_setup_upcall(struct gss_auth *gss_auth, struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_upcall_msg *gss_new, *gss_msg; kuid_t uid = cred->cr_cred->fsuid; gss_new = gss_alloc_msg(gss_auth, uid, gss_cred->gc_principal); if (IS_ERR(gss_new)) return gss_new; gss_msg = gss_add_msg(gss_new); if (gss_msg == gss_new) { int res; refcount_inc(&gss_msg->count); res = rpc_queue_upcall(gss_new->pipe, &gss_new->msg); if (res) { gss_unhash_msg(gss_new); refcount_dec(&gss_msg->count); gss_release_msg(gss_new); gss_msg = ERR_PTR(res); } } else gss_release_msg(gss_new); return gss_msg; } static void warn_gssd(void) { dprintk("AUTH_GSS upcall failed. Please check user daemon is running.\n"); } static inline int gss_refresh_upcall(struct rpc_task *task) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_upcall_msg *gss_msg; struct rpc_pipe *pipe; int err = 0; gss_msg = gss_setup_upcall(gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { /* XXX: warning on the first, under the assumption we * shouldn't normally hit this case on a refresh. */ warn_gssd(); rpc_sleep_on_timeout(&pipe_version_rpc_waitqueue, task, NULL, jiffies + (15 * HZ)); err = -EAGAIN; goto out; } if (IS_ERR(gss_msg)) { err = PTR_ERR(gss_msg); goto out; } pipe = gss_msg->pipe; spin_lock(&pipe->lock); if (gss_cred->gc_upcall != NULL) rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL); else if (gss_msg->ctx == NULL && gss_msg->msg.errno >= 0) { gss_cred->gc_upcall = gss_msg; /* gss_upcall_callback will release the reference to gss_upcall_msg */ refcount_inc(&gss_msg->count); rpc_sleep_on(&gss_msg->rpc_waitqueue, task, gss_upcall_callback); } else { gss_handle_downcall_result(gss_cred, gss_msg); err = gss_msg->msg.errno; } spin_unlock(&pipe->lock); gss_release_msg(gss_msg); out: trace_rpcgss_upcall_result(from_kuid(&init_user_ns, cred->cr_cred->fsuid), err); return err; } static inline int gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) { struct net *net = gss_auth->net; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_pipe *pipe; struct rpc_cred *cred = &gss_cred->gc_base; struct gss_upcall_msg *gss_msg; DEFINE_WAIT(wait); int err; retry: err = 0; /* if gssd is down, just skip upcalling altogether */ if (!gssd_running(net)) { warn_gssd(); err = -EACCES; goto out; } gss_msg = gss_setup_upcall(gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { err = wait_event_interruptible_timeout(pipe_version_waitqueue, sn->pipe_version >= 0, 15 * HZ); if (sn->pipe_version < 0) { warn_gssd(); err = -EACCES; } if (err < 0) goto out; goto retry; } if (IS_ERR(gss_msg)) { err = PTR_ERR(gss_msg); goto out; } pipe = gss_msg->pipe; for (;;) { prepare_to_wait(&gss_msg->waitqueue, &wait, TASK_KILLABLE); spin_lock(&pipe->lock); if (gss_msg->ctx != NULL || gss_msg->msg.errno < 0) { break; } spin_unlock(&pipe->lock); if (fatal_signal_pending(current)) { err = -ERESTARTSYS; goto out_intr; } schedule(); } if (gss_msg->ctx) { trace_rpcgss_ctx_init(gss_cred); gss_cred_set_ctx(cred, gss_msg->ctx); } else { err = gss_msg->msg.errno; } spin_unlock(&pipe->lock); out_intr: finish_wait(&gss_msg->waitqueue, &wait); 
gss_release_msg(gss_msg); out: trace_rpcgss_upcall_result(from_kuid(&init_user_ns, cred->cr_cred->fsuid), err); return err; } static struct gss_upcall_msg * gss_find_downcall(struct rpc_pipe *pipe, kuid_t uid) { struct gss_upcall_msg *pos; list_for_each_entry(pos, &pipe->in_downcall, list) { if (!uid_eq(pos->uid, uid)) continue; if (!rpc_msg_is_inflight(&pos->msg)) continue; refcount_inc(&pos->count); return pos; } return NULL; } #define MSG_BUF_MAXSIZE 1024 static ssize_t gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { const void *p, *end; void *buf; struct gss_upcall_msg *gss_msg; struct rpc_pipe *pipe = RPC_I(file_inode(filp))->pipe; struct gss_cl_ctx *ctx; uid_t id; kuid_t uid; ssize_t err = -EFBIG; if (mlen > MSG_BUF_MAXSIZE) goto out; err = -ENOMEM; buf = kmalloc(mlen, GFP_KERNEL); if (!buf) goto out; err = -EFAULT; if (copy_from_user(buf, src, mlen)) goto err; end = (const void *)((char *)buf + mlen); p = simple_get_bytes(buf, end, &id, sizeof(id)); if (IS_ERR(p)) { err = PTR_ERR(p); goto err; } uid = make_kuid(current_user_ns(), id); if (!uid_valid(uid)) { err = -EINVAL; goto err; } err = -ENOMEM; ctx = gss_alloc_context(); if (ctx == NULL) goto err; err = -ENOENT; /* Find a matching upcall */ spin_lock(&pipe->lock); gss_msg = gss_find_downcall(pipe, uid); if (gss_msg == NULL) { spin_unlock(&pipe->lock); goto err_put_ctx; } list_del_init(&gss_msg->list); spin_unlock(&pipe->lock); p = gss_fill_context(p, end, ctx, gss_msg->auth->mech); if (IS_ERR(p)) { err = PTR_ERR(p); switch (err) { case -EACCES: case -EKEYEXPIRED: gss_msg->msg.errno = err; err = mlen; break; case -EFAULT: case -ENOMEM: case -EINVAL: case -ENOSYS: gss_msg->msg.errno = -EAGAIN; break; default: printk(KERN_CRIT "%s: bad return from " "gss_fill_context: %zd\n", __func__, err); gss_msg->msg.errno = -EIO; } goto err_release_msg; } gss_msg->ctx = gss_get_ctx(ctx); err = mlen; err_release_msg: spin_lock(&pipe->lock); __gss_unhash_msg(gss_msg); spin_unlock(&pipe->lock); gss_release_msg(gss_msg); err_put_ctx: gss_put_ctx(ctx); err: kfree(buf); out: return err; } static int gss_pipe_open(struct inode *inode, int new_version) { struct net *net = inode->i_sb->s_fs_info; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int ret = 0; spin_lock(&pipe_version_lock); if (sn->pipe_version < 0) { /* First open of any gss pipe determines the version: */ sn->pipe_version = new_version; rpc_wake_up(&pipe_version_rpc_waitqueue); wake_up(&pipe_version_waitqueue); } else if (sn->pipe_version != new_version) { /* Trying to open a pipe of a different version */ ret = -EBUSY; goto out; } atomic_inc(&sn->pipe_users); out: spin_unlock(&pipe_version_lock); return ret; } static int gss_pipe_open_v0(struct inode *inode) { return gss_pipe_open(inode, 0); } static int gss_pipe_open_v1(struct inode *inode) { return gss_pipe_open(inode, 1); } static void gss_pipe_release(struct inode *inode) { struct net *net = inode->i_sb->s_fs_info; struct rpc_pipe *pipe = RPC_I(inode)->pipe; struct gss_upcall_msg *gss_msg; restart: spin_lock(&pipe->lock); list_for_each_entry(gss_msg, &pipe->in_downcall, list) { if (!list_empty(&gss_msg->msg.list)) continue; gss_msg->msg.errno = -EPIPE; refcount_inc(&gss_msg->count); __gss_unhash_msg(gss_msg); spin_unlock(&pipe->lock); gss_release_msg(gss_msg); goto restart; } spin_unlock(&pipe->lock); put_pipe_version(net); } static void gss_pipe_destroy_msg(struct rpc_pipe_msg *msg) { struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg); if (msg->errno < 0) { 
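/* The upcall errored out: take a temporary reference so the message
 * survives being unhashed (unhashing drops the pipe list's reference),
 * then put that temporary reference once we are done with it. */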
refcount_inc(&gss_msg->count); gss_unhash_msg(gss_msg); if (msg->errno == -ETIMEDOUT) warn_gssd(); gss_release_msg(gss_msg); } gss_release_msg(gss_msg); } static void gss_pipe_dentry_destroy(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct gss_pipe *gss_pipe = pdo->pdo_data; struct rpc_pipe *pipe = gss_pipe->pipe; if (pipe->dentry != NULL) { rpc_unlink(pipe->dentry); pipe->dentry = NULL; } } static int gss_pipe_dentry_create(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct gss_pipe *p = pdo->pdo_data; struct dentry *dentry; dentry = rpc_mkpipe_dentry(dir, p->name, p->clnt, p->pipe); if (IS_ERR(dentry)) return PTR_ERR(dentry); p->pipe->dentry = dentry; return 0; } static const struct rpc_pipe_dir_object_ops gss_pipe_dir_object_ops = { .create = gss_pipe_dentry_create, .destroy = gss_pipe_dentry_destroy, }; static struct gss_pipe *gss_pipe_alloc(struct rpc_clnt *clnt, const char *name, const struct rpc_pipe_ops *upcall_ops) { struct gss_pipe *p; int err = -ENOMEM; p = kmalloc(sizeof(*p), GFP_KERNEL); if (p == NULL) goto err; p->pipe = rpc_mkpipe_data(upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); if (IS_ERR(p->pipe)) { err = PTR_ERR(p->pipe); goto err_free_gss_pipe; } p->name = name; p->clnt = clnt; kref_init(&p->kref); rpc_init_pipe_dir_object(&p->pdo, &gss_pipe_dir_object_ops, p); return p; err_free_gss_pipe: kfree(p); err: return ERR_PTR(err); } struct gss_alloc_pdo { struct rpc_clnt *clnt; const char *name; const struct rpc_pipe_ops *upcall_ops; }; static int gss_pipe_match_pdo(struct rpc_pipe_dir_object *pdo, void *data) { struct gss_pipe *gss_pipe; struct gss_alloc_pdo *args = data; if (pdo->pdo_ops != &gss_pipe_dir_object_ops) return 0; gss_pipe = container_of(pdo, struct gss_pipe, pdo); if (strcmp(gss_pipe->name, args->name) != 0) return 0; if (!kref_get_unless_zero(&gss_pipe->kref)) return 0; return 1; } static struct rpc_pipe_dir_object *gss_pipe_alloc_pdo(void *data) { struct gss_pipe *gss_pipe; struct gss_alloc_pdo *args = data; gss_pipe = gss_pipe_alloc(args->clnt, args->name, args->upcall_ops); if (!IS_ERR(gss_pipe)) return &gss_pipe->pdo; return NULL; } static struct gss_pipe *gss_pipe_get(struct rpc_clnt *clnt, const char *name, const struct rpc_pipe_ops *upcall_ops) { struct net *net = rpc_net_ns(clnt); struct rpc_pipe_dir_object *pdo; struct gss_alloc_pdo args = { .clnt = clnt, .name = name, .upcall_ops = upcall_ops, }; pdo = rpc_find_or_alloc_pipe_dir_object(net, &clnt->cl_pipedir_objects, gss_pipe_match_pdo, gss_pipe_alloc_pdo, &args); if (pdo != NULL) return container_of(pdo, struct gss_pipe, pdo); return ERR_PTR(-ENOMEM); } static void __gss_pipe_free(struct gss_pipe *p) { struct rpc_clnt *clnt = p->clnt; struct net *net = rpc_net_ns(clnt); rpc_remove_pipe_dir_object(net, &clnt->cl_pipedir_objects, &p->pdo); rpc_destroy_pipe_data(p->pipe); kfree(p); } static void __gss_pipe_release(struct kref *kref) { struct gss_pipe *p = container_of(kref, struct gss_pipe, kref); __gss_pipe_free(p); } static void gss_pipe_free(struct gss_pipe *p) { if (p != NULL) kref_put(&p->kref, __gss_pipe_release); } /* * NOTE: we have the opportunity to use different * parameters based on the input flavor (which must be a pseudoflavor) */ static struct gss_auth * gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { rpc_authflavor_t flavor = args->pseudoflavor; struct gss_auth *gss_auth; struct gss_pipe *gss_pipe; struct rpc_auth * auth; int err = -ENOMEM; /* XXX? 
*/ if (!try_module_get(THIS_MODULE)) return ERR_PTR(err); if (!(gss_auth = kmalloc(sizeof(*gss_auth), GFP_KERNEL))) goto out_dec; INIT_HLIST_NODE(&gss_auth->hash); gss_auth->target_name = NULL; if (args->target_name) { gss_auth->target_name = kstrdup(args->target_name, GFP_KERNEL); if (gss_auth->target_name == NULL) goto err_free; } gss_auth->client = clnt; gss_auth->net = get_net_track(rpc_net_ns(clnt), &gss_auth->ns_tracker, GFP_KERNEL); err = -EINVAL; gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor); if (!gss_auth->mech) goto err_put_net; gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor); if (gss_auth->service == 0) goto err_put_mech; if (!gssd_running(gss_auth->net)) goto err_put_mech; auth = &gss_auth->rpc_auth; auth->au_cslack = GSS_CRED_SLACK >> 2; BUILD_BUG_ON(GSS_KRB5_MAX_SLACK_NEEDED > RPC_MAX_AUTH_SIZE); auth->au_rslack = GSS_KRB5_MAX_SLACK_NEEDED >> 2; auth->au_verfsize = GSS_VERF_SLACK >> 2; auth->au_ralign = GSS_VERF_SLACK >> 2; __set_bit(RPCAUTH_AUTH_UPDATE_SLACK, &auth->au_flags); auth->au_ops = &authgss_ops; auth->au_flavor = flavor; if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor)) __set_bit(RPCAUTH_AUTH_DATATOUCH, &auth->au_flags); refcount_set(&auth->au_count, 1); kref_init(&gss_auth->kref); err = rpcauth_init_credcache(auth); if (err) goto err_put_mech; /* * Note: if we created the old pipe first, then someone who * examined the directory at the right moment might conclude * that we supported only the old pipe. So we instead create * the new pipe first. */ gss_pipe = gss_pipe_get(clnt, "gssd", &gss_upcall_ops_v1); if (IS_ERR(gss_pipe)) { err = PTR_ERR(gss_pipe); goto err_destroy_credcache; } gss_auth->gss_pipe[1] = gss_pipe; gss_pipe = gss_pipe_get(clnt, gss_auth->mech->gm_name, &gss_upcall_ops_v0); if (IS_ERR(gss_pipe)) { err = PTR_ERR(gss_pipe); goto err_destroy_pipe_1; } gss_auth->gss_pipe[0] = gss_pipe; return gss_auth; err_destroy_pipe_1: gss_pipe_free(gss_auth->gss_pipe[1]); err_destroy_credcache: rpcauth_destroy_credcache(auth); err_put_mech: gss_mech_put(gss_auth->mech); err_put_net: put_net_track(gss_auth->net, &gss_auth->ns_tracker); err_free: kfree(gss_auth->target_name); kfree(gss_auth); out_dec: module_put(THIS_MODULE); trace_rpcgss_createauth(flavor, err); return ERR_PTR(err); } static void gss_free(struct gss_auth *gss_auth) { gss_pipe_free(gss_auth->gss_pipe[0]); gss_pipe_free(gss_auth->gss_pipe[1]); gss_mech_put(gss_auth->mech); put_net_track(gss_auth->net, &gss_auth->ns_tracker); kfree(gss_auth->target_name); kfree(gss_auth); module_put(THIS_MODULE); } static void gss_free_callback(struct kref *kref) { struct gss_auth *gss_auth = container_of(kref, struct gss_auth, kref); gss_free(gss_auth); } static void gss_put_auth(struct gss_auth *gss_auth) { kref_put(&gss_auth->kref, gss_free_callback); } static void gss_destroy(struct rpc_auth *auth) { struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth); if (hash_hashed(&gss_auth->hash)) { spin_lock(&gss_auth_hash_lock); hash_del(&gss_auth->hash); spin_unlock(&gss_auth_hash_lock); } gss_pipe_free(gss_auth->gss_pipe[0]); gss_auth->gss_pipe[0] = NULL; gss_pipe_free(gss_auth->gss_pipe[1]); gss_auth->gss_pipe[1] = NULL; rpcauth_destroy_credcache(auth); gss_put_auth(gss_auth); } /* * Auths may be shared between rpc clients that were cloned from a * common client with the same xprt, if they also share the flavor and * target_name. 
* * The auth is looked up from the oldest parent sharing the same * cl_xprt, and the auth itself references only that common parent * (which is guaranteed to last as long as any of its descendants). */ static struct gss_auth * gss_auth_find_or_add_hashed(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt, struct gss_auth *new) { struct gss_auth *gss_auth; unsigned long hashval = (unsigned long)clnt; spin_lock(&gss_auth_hash_lock); hash_for_each_possible(gss_auth_hash_table, gss_auth, hash, hashval) { if (gss_auth->client != clnt) continue; if (gss_auth->rpc_auth.au_flavor != args->pseudoflavor) continue; if (gss_auth->target_name != args->target_name) { if (gss_auth->target_name == NULL) continue; if (args->target_name == NULL) continue; if (strcmp(gss_auth->target_name, args->target_name)) continue; } if (!refcount_inc_not_zero(&gss_auth->rpc_auth.au_count)) continue; goto out; } if (new) hash_add(gss_auth_hash_table, &new->hash, hashval); gss_auth = new; out: spin_unlock(&gss_auth_hash_lock); return gss_auth; } static struct gss_auth * gss_create_hashed(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { struct gss_auth *gss_auth; struct gss_auth *new; gss_auth = gss_auth_find_or_add_hashed(args, clnt, NULL); if (gss_auth != NULL) goto out; new = gss_create_new(args, clnt); if (IS_ERR(new)) return new; gss_auth = gss_auth_find_or_add_hashed(args, clnt, new); if (gss_auth != new) gss_destroy(&new->rpc_auth); out: return gss_auth; } static struct rpc_auth * gss_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { struct gss_auth *gss_auth; struct rpc_xprt_switch *xps = rcu_access_pointer(clnt->cl_xpi.xpi_xpswitch); while (clnt != clnt->cl_parent) { struct rpc_clnt *parent = clnt->cl_parent; /* Find the original parent for this transport */ if (rcu_access_pointer(parent->cl_xpi.xpi_xpswitch) != xps) break; clnt = parent; } gss_auth = gss_create_hashed(args, clnt); if (IS_ERR(gss_auth)) return ERR_CAST(gss_auth); return &gss_auth->rpc_auth; } static struct gss_cred * gss_dup_cred(struct gss_auth *gss_auth, struct gss_cred *gss_cred) { struct gss_cred *new; /* Make a copy of the cred so that we can reference count it */ new = kzalloc(sizeof(*gss_cred), GFP_KERNEL); if (new) { struct auth_cred acred = { .cred = gss_cred->gc_base.cr_cred, }; struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1); rpcauth_init_cred(&new->gc_base, &acred, &gss_auth->rpc_auth, &gss_nullops); new->gc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE; new->gc_service = gss_cred->gc_service; new->gc_principal = gss_cred->gc_principal; kref_get(&gss_auth->kref); rcu_assign_pointer(new->gc_ctx, ctx); gss_get_ctx(ctx); } return new; } /* * gss_send_destroy_context will cause the RPCSEC_GSS to send a NULL RPC call * to the server with the GSS control procedure field set to * RPC_GSS_PROC_DESTROY. This should normally cause the server to release * all RPCSEC_GSS state associated with that context. 
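 *
 * The destruction is carried by an ordinary NULL procedure call whose
 * credential has gss_proc set to RPC_GSS_PROC_DESTROY; the cloned cred
 * built below uses gss_nullops, since no refresh or further context
 * handling is wanted for a dying context.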
*/ static void gss_send_destroy_context(struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1); struct gss_cred *new; struct rpc_task *task; new = gss_dup_cred(gss_auth, gss_cred); if (new) { ctx->gc_proc = RPC_GSS_PROC_DESTROY; trace_rpcgss_ctx_destroy(gss_cred); task = rpc_call_null(gss_auth->client, &new->gc_base, RPC_TASK_ASYNC); if (!IS_ERR(task)) rpc_put_task(task); put_rpccred(&new->gc_base); } } /* gss_destroy_cred (and gss_free_ctx) are used to clean up after failure * to create a new cred or context, so they check that things have been * allocated before freeing them. */ static void gss_do_free_ctx(struct gss_cl_ctx *ctx) { gss_delete_sec_context(&ctx->gc_gss_ctx); kfree(ctx->gc_wire_ctx.data); kfree(ctx->gc_acceptor.data); kfree(ctx); } static void gss_free_ctx_callback(struct rcu_head *head) { struct gss_cl_ctx *ctx = container_of(head, struct gss_cl_ctx, gc_rcu); gss_do_free_ctx(ctx); } static void gss_free_ctx(struct gss_cl_ctx *ctx) { call_rcu(&ctx->gc_rcu, gss_free_ctx_callback); } static void gss_free_cred(struct gss_cred *gss_cred) { kfree(gss_cred); } static void gss_free_cred_callback(struct rcu_head *head) { struct gss_cred *gss_cred = container_of(head, struct gss_cred, gc_base.cr_rcu); gss_free_cred(gss_cred); } static void gss_destroy_nullcred(struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1); RCU_INIT_POINTER(gss_cred->gc_ctx, NULL); put_cred(cred->cr_cred); call_rcu(&cred->cr_rcu, gss_free_cred_callback); if (ctx) gss_put_ctx(ctx); gss_put_auth(gss_auth); } static void gss_destroy_cred(struct rpc_cred *cred) { if (test_and_clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) gss_send_destroy_context(cred); gss_destroy_nullcred(cred); } static int gss_hash_cred(struct auth_cred *acred, unsigned int hashbits) { return hash_64(from_kuid(&init_user_ns, acred->cred->fsuid), hashbits); } /* * Lookup RPCSEC_GSS cred for the current process */ static struct rpc_cred *gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { return rpcauth_lookup_credcache(auth, acred, flags, rpc_task_gfp_mask()); } static struct rpc_cred * gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp) { struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth); struct gss_cred *cred = NULL; int err = -ENOMEM; if (!(cred = kzalloc(sizeof(*cred), gfp))) goto out_err; rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops); /* * Note: in order to force a call to call_refresh(), we deliberately * fail to flag the credential as RPCAUTH_CRED_UPTODATE. 
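 *
 * Leaving the cred flagged RPCAUTH_CRED_NEW means the generic refresh
 * path will invoke gss_refresh(), which in turn drives the gssd upcall
 * (gss_refresh_upcall()) to negotiate the initial GSS context.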
*/ cred->gc_base.cr_flags = 1UL << RPCAUTH_CRED_NEW; cred->gc_service = gss_auth->service; cred->gc_principal = acred->principal; kref_get(&gss_auth->kref); return &cred->gc_base; out_err: return ERR_PTR(err); } static int gss_cred_init(struct rpc_auth *auth, struct rpc_cred *cred) { struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth); struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); int err; do { err = gss_create_upcall(gss_auth, gss_cred); } while (err == -EAGAIN); return err; } static char * gss_stringify_acceptor(struct rpc_cred *cred) { char *string = NULL; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx; unsigned int len; struct xdr_netobj *acceptor; rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); if (!ctx) goto out; len = ctx->gc_acceptor.len; rcu_read_unlock(); /* no point if there's no string */ if (!len) return NULL; realloc: string = kmalloc(len + 1, GFP_KERNEL); if (!string) return NULL; rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); /* did the ctx disappear or was it replaced by one with no acceptor? */ if (!ctx || !ctx->gc_acceptor.len) { kfree(string); string = NULL; goto out; } acceptor = &ctx->gc_acceptor; /* * Did we find a new acceptor that's longer than the original? Allocate * a longer buffer and try again. */ if (len < acceptor->len) { len = acceptor->len; rcu_read_unlock(); kfree(string); goto realloc; } memcpy(string, acceptor->data, acceptor->len); string[acceptor->len] = '\0'; out: rcu_read_unlock(); return string; } /* * Returns -EACCES if the GSS context is NULL or will expire within the * timeout (seconds) */ static int gss_key_timeout(struct rpc_cred *rc) { struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base); struct gss_cl_ctx *ctx; unsigned long timeout = jiffies + (gss_key_expire_timeo * HZ); int ret = 0; rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); if (!ctx || time_after(timeout, ctx->gc_expiry)) ret = -EACCES; rcu_read_unlock(); return ret; } static int gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags) { struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base); struct gss_cl_ctx *ctx; int ret; if (test_bit(RPCAUTH_CRED_NEW, &rc->cr_flags)) goto out; /* Don't match with creds that have expired. */ rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); if (!ctx || time_after(jiffies, ctx->gc_expiry)) { rcu_read_unlock(); return 0; } rcu_read_unlock(); if (!test_bit(RPCAUTH_CRED_UPTODATE, &rc->cr_flags)) return 0; out: if (acred->principal != NULL) { if (gss_cred->gc_principal == NULL) return 0; ret = strcmp(acred->principal, gss_cred->gc_principal) == 0; } else { if (gss_cred->gc_principal != NULL) return 0; ret = uid_eq(rc->cr_cred->fsuid, acred->cred->fsuid); } return ret; } /* * Marshal credentials. * * The expensive part is computing the verifier. We can't cache a * pre-computed version of the verifier because the seqno, which * is different every time, is included in the MIC. 
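 *
 * The credential body built below follows RFC 2203:
 *
 *   flavor    AUTH_GSS (rpc_auth_gss)
 *   length    (filled in once the body is encoded)
 *   version   RPC_GSS_VERSION
 *   gss_proc  ctx->gc_proc
 *   seq_num   req->rq_seqno
 *   service   gss_cred->gc_service
 *   handle    ctx->gc_wire_ctx (opaque)
 *
 * and the verifier body is the MIC computed over the request bytes
 * from the xid through the end of the credential.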
*/ static int gss_marshal(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_cred *cred = req->rq_cred; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); __be32 *p, *cred_len; u32 maj_stat = 0; struct xdr_netobj mic; struct kvec iov; struct xdr_buf verf_buf; int status; /* Credential */ p = xdr_reserve_space(xdr, 7 * sizeof(*p) + ctx->gc_wire_ctx.len); if (!p) goto marshal_failed; *p++ = rpc_auth_gss; cred_len = p++; spin_lock(&ctx->gc_seq_lock); req->rq_seqno = (ctx->gc_seq < MAXSEQ) ? ctx->gc_seq++ : MAXSEQ; spin_unlock(&ctx->gc_seq_lock); if (req->rq_seqno == MAXSEQ) goto expired; trace_rpcgss_seqno(task); *p++ = cpu_to_be32(RPC_GSS_VERSION); *p++ = cpu_to_be32(ctx->gc_proc); *p++ = cpu_to_be32(req->rq_seqno); *p++ = cpu_to_be32(gss_cred->gc_service); p = xdr_encode_netobj(p, &ctx->gc_wire_ctx); *cred_len = cpu_to_be32((p - (cred_len + 1)) << 2); /* Verifier */ /* We compute the checksum for the verifier over the xdr-encoded bytes * starting with the xid and ending at the end of the credential: */ iov.iov_base = req->rq_snd_buf.head[0].iov_base; iov.iov_len = (u8 *)p - (u8 *)iov.iov_base; xdr_buf_from_iov(&iov, &verf_buf); p = xdr_reserve_space(xdr, sizeof(*p)); if (!p) goto marshal_failed; *p++ = rpc_auth_gss; mic.data = (u8 *)(p + 1); maj_stat = gss_get_mic(ctx->gc_gss_ctx, &verf_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) goto expired; else if (maj_stat != 0) goto bad_mic; if (xdr_stream_encode_opaque_inline(xdr, (void **)&p, mic.len) < 0) goto marshal_failed; status = 0; out: gss_put_ctx(ctx); return status; expired: clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); status = -EKEYEXPIRED; goto out; marshal_failed: status = -EMSGSIZE; goto out; bad_mic: trace_rpcgss_get_mic(task, maj_stat); status = -EIO; goto out; } static int gss_renew_cred(struct rpc_task *task) { struct rpc_cred *oldcred = task->tk_rqstp->rq_cred; struct gss_cred *gss_cred = container_of(oldcred, struct gss_cred, gc_base); struct rpc_auth *auth = oldcred->cr_auth; struct auth_cred acred = { .cred = oldcred->cr_cred, .principal = gss_cred->gc_principal, }; struct rpc_cred *new; new = gss_lookup_cred(auth, &acred, RPCAUTH_LOOKUP_NEW); if (IS_ERR(new)) return PTR_ERR(new); task->tk_rqstp->rq_cred = new; put_rpccred(oldcred); return 0; } static int gss_cred_is_negative_entry(struct rpc_cred *cred) { if (test_bit(RPCAUTH_CRED_NEGATIVE, &cred->cr_flags)) { unsigned long now = jiffies; unsigned long begin, expire; struct gss_cred *gss_cred; gss_cred = container_of(cred, struct gss_cred, gc_base); begin = gss_cred->gc_upcall_timestamp; expire = begin + gss_expired_cred_retry_delay * HZ; if (time_in_range_open(now, begin, expire)) return 1; } return 0; } /* * Refresh credentials. 
XXX - finish */ static int gss_refresh(struct rpc_task *task) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; int ret = 0; if (gss_cred_is_negative_entry(cred)) return -EKEYEXPIRED; if (!test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) && !test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags)) { ret = gss_renew_cred(task); if (ret < 0) goto out; cred = task->tk_rqstp->rq_cred; } if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags)) ret = gss_refresh_upcall(task); out: return ret; } /* Dummy refresh routine: used only when destroying the context */ static int gss_refresh_null(struct rpc_task *task) { return 0; } static int gss_validate(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); __be32 *p, *seq = NULL; struct kvec iov; struct xdr_buf verf_buf; struct xdr_netobj mic; u32 len, maj_stat; int status; p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (!p) goto validate_failed; if (*p++ != rpc_auth_gss) goto validate_failed; len = be32_to_cpup(p); if (len > RPC_MAX_AUTH_SIZE) goto validate_failed; p = xdr_inline_decode(xdr, len); if (!p) goto validate_failed; seq = kmalloc(4, GFP_KERNEL); if (!seq) goto validate_failed; *seq = cpu_to_be32(task->tk_rqstp->rq_seqno); iov.iov_base = seq; iov.iov_len = 4; xdr_buf_from_iov(&iov, &verf_buf); mic.data = (u8 *)p; mic.len = len; maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat) goto bad_mic; /* We leave it to unwrap to calculate au_rslack. For now we just * calculate the length of the verifier: */ if (test_bit(RPCAUTH_AUTH_UPDATE_SLACK, &cred->cr_auth->au_flags)) cred->cr_auth->au_verfsize = XDR_QUADLEN(len) + 2; status = 0; out: gss_put_ctx(ctx); kfree(seq); return status; validate_failed: status = -EIO; goto out; bad_mic: trace_rpcgss_verify_mic(task, maj_stat); status = -EACCES; goto out; } static noinline_for_stack int gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_rqst *rqstp = task->tk_rqstp; struct xdr_buf integ_buf, *snd_buf = &rqstp->rq_snd_buf; struct xdr_netobj mic; __be32 *p, *integ_len; u32 offset, maj_stat; p = xdr_reserve_space(xdr, 2 * sizeof(*p)); if (!p) goto wrap_failed; integ_len = p++; *p = cpu_to_be32(rqstp->rq_seqno); if (rpcauth_wrap_req_encode(task, xdr)) goto wrap_failed; offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; if (xdr_buf_subsegment(snd_buf, &integ_buf, offset, snd_buf->len - offset)) goto wrap_failed; *integ_len = cpu_to_be32(integ_buf.len); p = xdr_reserve_space(xdr, 0); if (!p) goto wrap_failed; mic.data = (u8 *)(p + 1); maj_stat = gss_get_mic(ctx->gc_gss_ctx, &integ_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); else if (maj_stat) goto bad_mic; /* Check that the trailing MIC fit in the buffer, after the fact */ if (xdr_stream_encode_opaque_inline(xdr, (void **)&p, mic.len) < 0) goto wrap_failed; return 0; wrap_failed: return -EMSGSIZE; bad_mic: trace_rpcgss_get_mic(task, maj_stat); return -EIO; } static void priv_release_snd_buf(struct rpc_rqst *rqstp) { int i; for (i=0; i < rqstp->rq_enc_pages_num; i++) __free_page(rqstp->rq_enc_pages[i]); kfree(rqstp->rq_enc_pages); rqstp->rq_release_snd_buf = NULL; } static int alloc_enc_pages(struct rpc_rqst *rqstp) { struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; int first, last, i; if (rqstp->rq_release_snd_buf) 
rqstp->rq_release_snd_buf(rqstp); if (snd_buf->page_len == 0) { rqstp->rq_enc_pages_num = 0; return 0; } first = snd_buf->page_base >> PAGE_SHIFT; last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_SHIFT; rqstp->rq_enc_pages_num = last - first + 1 + 1; rqstp->rq_enc_pages = kmalloc_array(rqstp->rq_enc_pages_num, sizeof(struct page *), GFP_KERNEL); if (!rqstp->rq_enc_pages) goto out; for (i=0; i < rqstp->rq_enc_pages_num; i++) { rqstp->rq_enc_pages[i] = alloc_page(GFP_KERNEL); if (rqstp->rq_enc_pages[i] == NULL) goto out_free; } rqstp->rq_release_snd_buf = priv_release_snd_buf; return 0; out_free: rqstp->rq_enc_pages_num = i; priv_release_snd_buf(rqstp); out: return -EAGAIN; } static noinline_for_stack int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_rqst *rqstp = task->tk_rqstp; struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; u32 pad, offset, maj_stat; int status; __be32 *p, *opaque_len; struct page **inpages; int first; struct kvec *iov; status = -EIO; p = xdr_reserve_space(xdr, 2 * sizeof(*p)); if (!p) goto wrap_failed; opaque_len = p++; *p = cpu_to_be32(rqstp->rq_seqno); if (rpcauth_wrap_req_encode(task, xdr)) goto wrap_failed; status = alloc_enc_pages(rqstp); if (unlikely(status)) goto wrap_failed; first = snd_buf->page_base >> PAGE_SHIFT; inpages = snd_buf->pages + first; snd_buf->pages = rqstp->rq_enc_pages; snd_buf->page_base -= first << PAGE_SHIFT; /* * Move the tail into its own page, in case gss_wrap needs * more space in the head when wrapping. * * Still... Why can't gss_wrap just slide the tail down? */ if (snd_buf->page_len || snd_buf->tail[0].iov_len) { char *tmp; tmp = page_address(rqstp->rq_enc_pages[rqstp->rq_enc_pages_num - 1]); memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len); snd_buf->tail[0].iov_base = tmp; } offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; maj_stat = gss_wrap(ctx->gc_gss_ctx, offset, snd_buf, inpages); /* slack space should prevent this ever happening: */ if (unlikely(snd_buf->len > snd_buf->buflen)) { status = -EIO; goto wrap_failed; } /* We're assuming that when GSS_S_CONTEXT_EXPIRED, the encryption was * done anyway, so it's safe to put the request on the wire: */ if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); else if (maj_stat) goto bad_wrap; *opaque_len = cpu_to_be32(snd_buf->len - offset); /* guess whether the pad goes into the head or the tail: */ if (snd_buf->page_len || snd_buf->tail[0].iov_len) iov = snd_buf->tail; else iov = snd_buf->head; p = iov->iov_base + iov->iov_len; pad = xdr_pad_size(snd_buf->len - offset); memset(p, 0, pad); iov->iov_len += pad; snd_buf->len += pad; return 0; wrap_failed: return status; bad_wrap: trace_rpcgss_wrap(task, maj_stat); return -EIO; } static int gss_wrap_req(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); int status; status = -EIO; if (ctx->gc_proc != RPC_GSS_PROC_DATA) { /* The spec seems a little ambiguous here, but I think that not * wrapping context destruction requests makes the most sense. 
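 *
 * (RFC 2203 defines integrity and privacy wrapping only for the data
 * exchange phase, so control requests such as RPC_GSS_PROC_DESTROY
 * are encoded below without any wrapping.)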
static int gss_wrap_req(struct rpc_task *task, struct xdr_stream *xdr)
{
	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
	struct gss_cred *gss_cred = container_of(cred, struct gss_cred,
						 gc_base);
	struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
	int status;

	status = -EIO;
	if (ctx->gc_proc != RPC_GSS_PROC_DATA) {
		/* The spec seems a little ambiguous here, but I think that not
		 * wrapping context destruction requests makes the most sense.
		 */
		status = rpcauth_wrap_req_encode(task, xdr);
		goto out;
	}
	switch (gss_cred->gc_service) {
	case RPC_GSS_SVC_NONE:
		status = rpcauth_wrap_req_encode(task, xdr);
		break;
	case RPC_GSS_SVC_INTEGRITY:
		status = gss_wrap_req_integ(cred, ctx, task, xdr);
		break;
	case RPC_GSS_SVC_PRIVACY:
		status = gss_wrap_req_priv(cred, ctx, task, xdr);
		break;
	default:
		status = -EIO;
	}
out:
	gss_put_ctx(ctx);
	return status;
}

/**
 * gss_update_rslack - Possibly update RPC receive buffer size estimates
 * @task: rpc_task for incoming RPC Reply being unwrapped
 * @cred: controlling rpc_cred for @task
 * @before: XDR words needed before each RPC Reply message
 * @after: XDR words needed following each RPC Reply message
 */
static void gss_update_rslack(struct rpc_task *task, struct rpc_cred *cred,
			      unsigned int before, unsigned int after)
{
	struct rpc_auth *auth = cred->cr_auth;

	if (test_and_clear_bit(RPCAUTH_AUTH_UPDATE_SLACK, &auth->au_flags)) {
		auth->au_ralign = auth->au_verfsize + before;
		auth->au_rslack = auth->au_verfsize + after;
		trace_rpcgss_update_slack(task, auth);
	}
}

static int gss_unwrap_resp_auth(struct rpc_task *task, struct rpc_cred *cred)
{
	gss_update_rslack(task, cred, 0, 0);
	return 0;
}
/*
 * RFC 2203, Section 5.3.2.2
 *
 *	struct rpc_gss_integ_data {
 *		opaque databody_integ<>;
 *		opaque checksum<>;
 *	};
 *
 *	struct rpc_gss_data_t {
 *		unsigned int seq_num;
 *		proc_req_arg_t arg;
 *	};
 */
static noinline_for_stack int
gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred,
		      struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp,
		      struct xdr_stream *xdr)
{
	struct xdr_buf gss_data, *rcv_buf = &rqstp->rq_rcv_buf;
	u32 len, offset, seqno, maj_stat;
	struct xdr_netobj mic;
	int ret;

	ret = -EIO;
	mic.data = NULL;

	/* opaque databody_integ<>; */
	if (xdr_stream_decode_u32(xdr, &len))
		goto unwrap_failed;
	if (len & 3)
		goto unwrap_failed;
	offset = rcv_buf->len - xdr_stream_remaining(xdr);
	if (xdr_stream_decode_u32(xdr, &seqno))
		goto unwrap_failed;
	if (seqno != rqstp->rq_seqno)
		goto bad_seqno;
	if (xdr_buf_subsegment(rcv_buf, &gss_data, offset, len))
		goto unwrap_failed;

	/*
	 * The xdr_stream now points to the beginning of the
	 * upper layer payload, to be passed below to
	 * rpcauth_unwrap_resp_decode(). The checksum, which
	 * follows the upper layer payload in @rcv_buf, is
	 * located and parsed without updating the xdr_stream.
	 */

	/* opaque checksum<>; */
	offset += len;
	if (xdr_decode_word(rcv_buf, offset, &len))
		goto unwrap_failed;
	offset += sizeof(__be32);
	if (offset + len > rcv_buf->len)
		goto unwrap_failed;
	mic.len = len;
	mic.data = kmalloc(len, GFP_KERNEL);
	if (ZERO_OR_NULL_PTR(mic.data))
		goto unwrap_failed;
	if (read_bytes_from_xdr_buf(rcv_buf, offset, mic.data, mic.len))
		goto unwrap_failed;

	maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &gss_data, &mic);
	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
	if (maj_stat != GSS_S_COMPLETE)
		goto bad_mic;

	gss_update_rslack(task, cred, 2, 2 + 1 + XDR_QUADLEN(mic.len));
	ret = 0;

out:
	kfree(mic.data);
	return ret;

unwrap_failed:
	trace_rpcgss_unwrap_failed(task);
	goto out;
bad_seqno:
	trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, seqno);
	goto out;
bad_mic:
	trace_rpcgss_verify_mic(task, maj_stat);
	goto out;
}
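/*
 * Editor's illustrative sketch (not part of auth_gss.c): the checksum parse
 * in gss_unwrap_resp_integ() above reads a big-endian 32-bit length word and
 * bounds-checks "offset + len" against the buffer before touching the data.
 * The same shape, over a flat byte array instead of an xdr_buf:
 */
#if 0	/* standalone demo */
#include <stdint.h>
#include <stdio.h>

/* decode a length-prefixed XDR opaque at "offset"; return 0 on success */
static int demo_decode_opaque(const uint8_t *buf, size_t buflen,
			      size_t offset, const uint8_t **data,
			      uint32_t *len)
{
	if (offset + 4 > buflen)
		return -1;
	/* XDR length words are big-endian */
	*len = ((uint32_t)buf[offset] << 24) |
	       ((uint32_t)buf[offset + 1] << 16) |
	       ((uint32_t)buf[offset + 2] << 8) |
	       (uint32_t)buf[offset + 3];
	offset += 4;
	if (offset + *len > buflen)	/* reject truncated opaques */
		return -1;
	*data = buf + offset;
	return 0;
}

int main(void)
{
	/* length 3, payload "abc", one byte of XDR pad */
	const uint8_t buf[] = { 0, 0, 0, 3, 'a', 'b', 'c', 0 };
	const uint8_t *data;
	uint32_t len;

	if (!demo_decode_opaque(buf, sizeof(buf), 0, &data, &len))
		printf("opaque of %u bytes: %.3s\n", len, data);
	return 0;
}
#endif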
static noinline_for_stack int
gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred,
		     struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp,
		     struct xdr_stream *xdr)
{
	struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf;
	struct kvec *head = rqstp->rq_rcv_buf.head;
	u32 offset, opaque_len, maj_stat;
	__be32 *p;

	p = xdr_inline_decode(xdr, 2 * sizeof(*p));
	if (unlikely(!p))
		goto unwrap_failed;
	opaque_len = be32_to_cpup(p++);
	offset = (u8 *)(p) - (u8 *)head->iov_base;
	if (offset + opaque_len > rcv_buf->len)
		goto unwrap_failed;

	maj_stat = gss_unwrap(ctx->gc_gss_ctx, offset,
			      offset + opaque_len, rcv_buf);
	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
	if (maj_stat != GSS_S_COMPLETE)
		goto bad_unwrap;
	/* gss_unwrap decrypted the sequence number */
	if (be32_to_cpup(p++) != rqstp->rq_seqno)
		goto bad_seqno;

	/* gss_unwrap redacts the opaque blob from the head iovec.
	 * rcv_buf has changed, thus the stream needs to be reset.
	 */
	xdr_init_decode(xdr, rcv_buf, p, rqstp);

	gss_update_rslack(task, cred, 2 + ctx->gc_gss_ctx->align,
			  2 + ctx->gc_gss_ctx->slack);

	return 0;
unwrap_failed:
	trace_rpcgss_unwrap_failed(task);
	return -EIO;
bad_seqno:
	trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, be32_to_cpup(--p));
	return -EIO;
bad_unwrap:
	trace_rpcgss_unwrap(task, maj_stat);
	return -EIO;
}

static bool gss_seq_is_newer(u32 new, u32 old)
{
	return (s32)(new - old) > 0;
}

static bool gss_xmit_need_reencode(struct rpc_task *task)
{
	struct rpc_rqst *req = task->tk_rqstp;
	struct rpc_cred *cred = req->rq_cred;
	struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
	u32 win, seq_xmit = 0;
	bool ret = true;

	if (!ctx)
		goto out;

	if (gss_seq_is_newer(req->rq_seqno, READ_ONCE(ctx->gc_seq)))
		goto out_ctx;

	seq_xmit = READ_ONCE(ctx->gc_seq_xmit);
	while (gss_seq_is_newer(req->rq_seqno, seq_xmit)) {
		u32 tmp = seq_xmit;

		seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, req->rq_seqno);
		if (seq_xmit == tmp) {
			ret = false;
			goto out_ctx;
		}
	}

	win = ctx->gc_win;
	if (win > 0)
		ret = !gss_seq_is_newer(req->rq_seqno, seq_xmit - win);

out_ctx:
	gss_put_ctx(ctx);
out:
	trace_rpcgss_need_reencode(task, seq_xmit, ret);
	return ret;
}
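/*
 * Editor's illustrative sketch (not part of auth_gss.c): gss_seq_is_newer()
 * above uses RFC 1982-style serial arithmetic, so the "newer" test keeps
 * giving the right answer when the 32-bit GSS sequence counter wraps:
 */
#if 0	/* standalone demo */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool demo_seq_is_newer(uint32_t new, uint32_t old)
{
	/* the signed view of the unsigned difference handles wraparound */
	return (int32_t)(new - old) > 0;
}

int main(void)
{
	assert(demo_seq_is_newer(2, 1));		/* plain ordering */
	assert(!demo_seq_is_newer(1, 2));
	/* across the 2^32 wrap: 5 is still "newer" than 0xfffffffb */
	assert(demo_seq_is_newer(5, 0xfffffffbU));
	return 0;
}
#endif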
static int gss_unwrap_resp(struct rpc_task *task, struct xdr_stream *xdr)
{
	struct rpc_rqst *rqstp = task->tk_rqstp;
	struct rpc_cred *cred = rqstp->rq_cred;
	struct gss_cred *gss_cred = container_of(cred, struct gss_cred,
						 gc_base);
	struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
	int status = -EIO;

	if (ctx->gc_proc != RPC_GSS_PROC_DATA)
		goto out_decode;
	switch (gss_cred->gc_service) {
	case RPC_GSS_SVC_NONE:
		status = gss_unwrap_resp_auth(task, cred);
		break;
	case RPC_GSS_SVC_INTEGRITY:
		status = gss_unwrap_resp_integ(task, cred, ctx, rqstp, xdr);
		break;
	case RPC_GSS_SVC_PRIVACY:
		status = gss_unwrap_resp_priv(task, cred, ctx, rqstp, xdr);
		break;
	}
	if (status)
		goto out;

out_decode:
	status = rpcauth_unwrap_resp_decode(task, xdr);
out:
	gss_put_ctx(ctx);
	return status;
}

static const struct rpc_authops authgss_ops = {
	.owner			= THIS_MODULE,
	.au_flavor		= RPC_AUTH_GSS,
	.au_name		= "RPCSEC_GSS",
	.create			= gss_create,
	.destroy		= gss_destroy,
	.hash_cred		= gss_hash_cred,
	.lookup_cred		= gss_lookup_cred,
	.crcreate		= gss_create_cred,
	.info2flavor		= gss_mech_info2flavor,
	.flavor2info		= gss_mech_flavor2info,
};

static const struct rpc_credops gss_credops = {
	.cr_name		= "AUTH_GSS",
	.crdestroy		= gss_destroy_cred,
	.cr_init		= gss_cred_init,
	.crmatch		= gss_match,
	.crmarshal		= gss_marshal,
	.crrefresh		= gss_refresh,
	.crvalidate		= gss_validate,
	.crwrap_req		= gss_wrap_req,
	.crunwrap_resp		= gss_unwrap_resp,
	.crkey_timeout		= gss_key_timeout,
	.crstringify_acceptor	= gss_stringify_acceptor,
	.crneed_reencode	= gss_xmit_need_reencode,
};

static const struct rpc_credops gss_nullops = {
	.cr_name		= "AUTH_GSS",
	.crdestroy		= gss_destroy_nullcred,
	.crmatch		= gss_match,
	.crmarshal		= gss_marshal,
	.crrefresh		= gss_refresh_null,
	.crvalidate		= gss_validate,
	.crwrap_req		= gss_wrap_req,
	.crunwrap_resp		= gss_unwrap_resp,
	.crstringify_acceptor	= gss_stringify_acceptor,
};

static const struct rpc_pipe_ops gss_upcall_ops_v0 = {
	.upcall		= gss_v0_upcall,
	.downcall	= gss_pipe_downcall,
	.destroy_msg	= gss_pipe_destroy_msg,
	.open_pipe	= gss_pipe_open_v0,
	.release_pipe	= gss_pipe_release,
};

static const struct rpc_pipe_ops gss_upcall_ops_v1 = {
	.upcall		= gss_v1_upcall,
	.downcall	= gss_pipe_downcall,
	.destroy_msg	= gss_pipe_destroy_msg,
	.open_pipe	= gss_pipe_open_v1,
	.release_pipe	= gss_pipe_release,
};

static __net_init int rpcsec_gss_init_net(struct net *net)
{
	return gss_svc_init_net(net);
}

static __net_exit void rpcsec_gss_exit_net(struct net *net)
{
	gss_svc_shutdown_net(net);
}

static struct pernet_operations rpcsec_gss_net_ops = {
	.init = rpcsec_gss_init_net,
	.exit = rpcsec_gss_exit_net,
};

/*
 * Initialize RPCSEC_GSS module
 */
static int __init init_rpcsec_gss(void)
{
	int err = 0;

	err = rpcauth_register(&authgss_ops);
	if (err)
		goto out;
	err = gss_svc_init();
	if (err)
		goto out_unregister;
	err = register_pernet_subsys(&rpcsec_gss_net_ops);
	if (err)
		goto out_svc_exit;
	rpc_init_wait_queue(&pipe_version_rpc_waitqueue, "gss pipe version");
	return 0;
out_svc_exit:
	gss_svc_shutdown();
out_unregister:
	rpcauth_unregister(&authgss_ops);
out:
	return err;
}

static void __exit exit_rpcsec_gss(void)
{
	unregister_pernet_subsys(&rpcsec_gss_net_ops);
	gss_svc_shutdown();
	rpcauth_unregister(&authgss_ops);
	rcu_barrier(); /* Wait for completion of call_rcu()'s */
}

MODULE_ALIAS("rpc-auth-6");
MODULE_DESCRIPTION("Sun RPC Kerberos RPCSEC_GSS client authentication");
MODULE_LICENSE("GPL");
module_param_named(expired_cred_retry_delay, gss_expired_cred_retry_delay,
		   uint, 0644);
MODULE_PARM_DESC(expired_cred_retry_delay, "Timeout (in seconds) until "
		 "the RPC engine retries an expired credential");

module_param_named(key_expire_timeo, gss_key_expire_timeo, uint, 0644);
MODULE_PARM_DESC(key_expire_timeo, "Time (in seconds) at the end of a "
		 "credential keys lifetime where the NFS layer cleans up "
		 "prior to key expiration");

module_init(init_rpcsec_gss)
module_exit(exit_rpcsec_gss)
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TIME64_H
#define _LINUX_TIME64_H

#include <linux/math64.h>
#include <vdso/time64.h>

typedef __s64 time64_t;
typedef __u64 timeu64_t;

#include <uapi/linux/time.h>

struct timespec64 {
	time64_t	tv_sec;		/* seconds */
	long		tv_nsec;	/* nanoseconds */
};

struct itimerspec64 {
	struct timespec64 it_interval;
	struct timespec64 it_value;
};

/* Parameters used to convert the timespec values: */
#define PSEC_PER_NSEC			1000L

/* Located here for timespec[64]_valid_strict */
#define TIME64_MAX			((s64)~((u64)1 << 63))
#define TIME64_MIN			(-TIME64_MAX - 1)

#define KTIME_MAX			((s64)~((u64)1 << 63))
#define KTIME_MIN			(-KTIME_MAX - 1)
#define KTIME_SEC_MAX			(KTIME_MAX / NSEC_PER_SEC)
#define KTIME_SEC_MIN			(KTIME_MIN / NSEC_PER_SEC)

/*
 * Limits for settimeofday():
 *
 * To prevent setting the time close to the wraparound point time setting
 * is limited so a reasonable uptime can be accommodated. Uptime of 30 years
 * should be really sufficient, which means the cutoff is 2232. At that
 * point the cutoff is just a small part of the larger problem.
 */
#define TIME_UPTIME_SEC_MAX		(30LL * 365 * 24 * 3600)
#define TIME_SETTOD_SEC_MAX		(KTIME_SEC_MAX - TIME_UPTIME_SEC_MAX)

static inline int timespec64_equal(const struct timespec64 *a,
				   const struct timespec64 *b)
{
	return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec);
}

static inline bool timespec64_is_epoch(const struct timespec64 *ts)
{
	return ts->tv_sec == 0 && ts->tv_nsec == 0;
}

/*
 * lhs < rhs:  return <0
 * lhs == rhs: return 0
 * lhs > rhs:  return >0
 */
static inline int timespec64_compare(const struct timespec64 *lhs,
				     const struct timespec64 *rhs)
{
	if (lhs->tv_sec < rhs->tv_sec)
		return -1;
	if (lhs->tv_sec > rhs->tv_sec)
		return 1;
	return lhs->tv_nsec - rhs->tv_nsec;
}

extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec);

static inline struct timespec64 timespec64_add(struct timespec64 lhs,
					       struct timespec64 rhs)
{
	struct timespec64 ts_delta;

	set_normalized_timespec64(&ts_delta, lhs.tv_sec + rhs.tv_sec,
				  lhs.tv_nsec + rhs.tv_nsec);
	return ts_delta;
}

/*
 * sub = lhs - rhs, in normalized form
 */
static inline struct timespec64 timespec64_sub(struct timespec64 lhs,
					       struct timespec64 rhs)
{
	struct timespec64 ts_delta;

	set_normalized_timespec64(&ts_delta, lhs.tv_sec - rhs.tv_sec,
				  lhs.tv_nsec - rhs.tv_nsec);
	return ts_delta;
}

/*
 * Returns true if the timespec64 is normalized, false if denormalized:
 */
static inline bool timespec64_valid(const struct timespec64 *ts)
{
	/* Dates before 1970 are bogus */
	if (ts->tv_sec < 0)
		return false;
	/* Can't have more nanoseconds than a second */
	if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
		return false;
	return true;
}
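/*
 * Editor's illustrative sketch (not part of time64.h): timespec64_add() and
 * timespec64_sub() above lean on the out-of-line set_normalized_timespec64()
 * to fold a possibly out-of-range tv_nsec back into [0, NSEC_PER_SEC). A
 * minimal userspace equivalent of that normalization step; the NSEC_PER_SEC
 * value matches the kernel's.
 */
#if 0	/* standalone demo */
#include <stdint.h>
#include <stdio.h>

#define DEMO_NSEC_PER_SEC 1000000000L

struct demo_ts { int64_t tv_sec; long tv_nsec; };

static void demo_set_normalized(struct demo_ts *ts, int64_t sec, int64_t nsec)
{
	while (nsec >= DEMO_NSEC_PER_SEC) {	/* carry into seconds */
		nsec -= DEMO_NSEC_PER_SEC;
		++sec;
	}
	while (nsec < 0) {			/* borrow from seconds */
		nsec += DEMO_NSEC_PER_SEC;
		--sec;
	}
	ts->tv_sec = sec;
	ts->tv_nsec = nsec;
}

int main(void)
{
	struct demo_ts d;

	/* (3 s, 900 ms) - (1 s, 950 ms) normalizes to 1.950000000 */
	demo_set_normalized(&d, 3 - 1, 900000000L - 950000000L);
	printf("%lld.%09ld\n", (long long)d.tv_sec, d.tv_nsec);
	return 0;
}
#endif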
static inline bool timespec64_valid_strict(const struct timespec64 *ts)
{
	if (!timespec64_valid(ts))
		return false;
	/* Disallow values that could overflow ktime_t */
	if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX)
		return false;
	return true;
}

static inline bool timespec64_valid_settod(const struct timespec64 *ts)
{
	if (!timespec64_valid(ts))
		return false;
	/* Disallow values which cause overflow issues vs. CLOCK_REALTIME */
	if ((unsigned long long)ts->tv_sec >= TIME_SETTOD_SEC_MAX)
		return false;
	return true;
}

/**
 * timespec64_to_ns - Convert timespec64 to nanoseconds
 * @ts:		pointer to the timespec64 variable to be converted
 *
 * Returns the scalar nanosecond representation of the timespec64
 * parameter.
 */
static inline s64 timespec64_to_ns(const struct timespec64 *ts)
{
	/* Prevent multiplication overflow / underflow */
	if (ts->tv_sec >= KTIME_SEC_MAX)
		return KTIME_MAX;

	if (ts->tv_sec <= KTIME_SEC_MIN)
		return KTIME_MIN;

	return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec;
}

/**
 * ns_to_timespec64 - Convert nanoseconds to timespec64
 * @nsec:	the nanoseconds value to be converted
 *
 * Returns the timespec64 representation of the nsec parameter.
 */
extern struct timespec64 ns_to_timespec64(s64 nsec);

/**
 * timespec64_add_ns - Adds nanoseconds to a timespec64
 * @a:		pointer to timespec64 to be incremented
 * @ns:		unsigned nanoseconds value to be added
 *
 * This must always be inlined because it's used from the x86-64 vdso,
 * which cannot call other kernel functions.
 */
static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns)
{
	a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns);
	a->tv_nsec = ns;
}

/*
 * timespec64_add_safe assumes both values are positive and checks for
 * overflow. It will return TIME64_MAX in case of overflow.
 */
extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
					     const struct timespec64 rhs);

#endif /* _LINUX_TIME64_H */
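/*
 * Editor's illustrative sketch (not part of time64.h): timespec64_to_ns()
 * above saturates at KTIME_MAX/KTIME_MIN rather than letting the
 * seconds-to-nanoseconds multiply overflow. The same clamping logic,
 * standalone, with constants that match the kernel's definitions:
 */
#if 0	/* standalone demo */
#include <stdint.h>
#include <stdio.h>

#define DEMO_NSEC_PER_SEC	1000000000LL
#define DEMO_KTIME_MAX		INT64_MAX
#define DEMO_KTIME_MIN		(-DEMO_KTIME_MAX - 1)
#define DEMO_KTIME_SEC_MAX	(DEMO_KTIME_MAX / DEMO_NSEC_PER_SEC)
#define DEMO_KTIME_SEC_MIN	(DEMO_KTIME_MIN / DEMO_NSEC_PER_SEC)

static int64_t demo_timespec_to_ns(int64_t sec, long nsec)
{
	if (sec >= DEMO_KTIME_SEC_MAX)	/* clamp instead of overflowing */
		return DEMO_KTIME_MAX;
	if (sec <= DEMO_KTIME_SEC_MIN)
		return DEMO_KTIME_MIN;
	return sec * DEMO_NSEC_PER_SEC + nsec;
}

int main(void)
{
	printf("%lld\n", (long long)demo_timespec_to_ns(1, 5));	/* 1000000005 */
	printf("%lld\n", (long long)demo_timespec_to_ns(INT64_MAX, 0)); /* clamped */
	return 0;
}
#endif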
6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 // SPDX-License-Identifier: GPL-2.0 /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. 
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:
 *		Pedro Roque	:	Fast Retransmit/Recovery.
 *					Two receive queues.
 *					Retransmit queue handled by TCP.
 *					Better retransmit timer handling.
 *					New congestion avoidance.
 *					Header prediction.
 *					Variable renaming.
 *
 *		Eric		:	Fast Retransmit.
 *		Randy Scott	:	MSS option defines.
 *		Eric Schenk	:	Fixes to slow start algorithm.
 *		Eric Schenk	:	Yet another double ACK bug.
 *		Eric Schenk	:	Delayed ACK bug fixes.
 *		Eric Schenk	:	Floyd style fast retrans war avoidance.
 *		David S. Miller	:	Don't allow zero congestion window.
 *		Eric Schenk	:	Fix retransmitter so that it sends
 *					next packet on ack of previous packet.
 *		Andi Kleen	:	Moved open_request checking here
 *					and process RSTs for open_requests.
 *		Andi Kleen	:	Better prune_queue, and other fixes.
 *		Andrey Savochkin:	Fix RTT measurements in the presence of
 *					timestamps.
 *		Andrey Savochkin:	Check sequence numbers correctly when
 *					removing SACKs due to in sequence incoming
 *					data segments.
 *		Andi Kleen:		Make sure we never ack data there is not
 *					enough room for. Also make this condition
 *					a fatal error if it might still happen.
 *		Andi Kleen:		Add tcp_measure_rcv_mss to make
 *					connections with MSS<min(MTU,ann. MSS)
 *					work without delayed acks.
 *		Andi Kleen:		Process packets with PSH set in the
 *					fast path.
 *		J Hadi Salim:		ECN support
 *		Andrei Gurtov,
 *		Pasi Sarolahti,
 *		Panu Kuhlberg:		Experimental audit of TCP (re)transmission
 *					engine. Lots of bugs are found.
 *		Pasi Sarolahti:		F-RTO for dealing with spurious RTOs
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/kernel.h>
#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/proto_memory.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
#include <linux/unaligned.h>
#include <linux/errqueue.h>
#include <trace/events/tcp.h>
#include <linux/jump_label_ratelimit.h>
#include <net/busy_poll.h>
#include <net/mptcp.h>

int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
#define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
#define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
#define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
#define FLAG_RETRANS_DATA_ACKED	0x08 /* "" "" some of which was retransmitted.	*/
#define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/
#define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
#define FLAG_ECE		0x40 /* ECE in this ACK				*/
#define FLAG_LOST_RETRANS	0x80 /* This ACK marks some retransmission lost */
#define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked	*/
#define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
#define FLAG_SET_XMIT_TIMER	0x1000 /* Set TLP or RTO timer */
#define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */
#define FLAG_UPDATE_TS_RECENT	0x4000 /* tcp_replace_ts_recent() */
#define FLAG_NO_CHALLENGE_ACK	0x8000 /* do not call tcp_send_challenge_ack()	*/
#define FLAG_ACK_MAYBE_DELAYED	0x10000 /* Likely a delayed ACK */
#define FLAG_DSACK_TLP		0x20000 /* DSACK for tail loss probe */
#define FLAG_TS_PROGRESS	0x40000 /* Positive timestamp delta */

#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK)
#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)

#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))

#define REXMIT_NONE	0 /* no loss recovery to do */
#define REXMIT_LOST	1 /* retransmit packets marked lost */
#define REXMIT_NEW	2 /* FRTO-style transmit of unsent/new packets */

#if IS_ENABLED(CONFIG_TLS_DEVICE)
static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ);

void clean_acked_data_enable(struct tcp_sock *tp,
			     void (*cad)(struct sock *sk, u32 ack_seq))
{
	tp->tcp_clean_acked = cad;
	static_branch_deferred_inc(&clean_acked_data_enabled);
}
EXPORT_SYMBOL_GPL(clean_acked_data_enable);

void clean_acked_data_disable(struct tcp_sock *tp)
{
	static_branch_slow_dec_deferred(&clean_acked_data_enabled);
	tp->tcp_clean_acked = NULL;
}
EXPORT_SYMBOL_GPL(clean_acked_data_disable);

void clean_acked_data_flush(void)
{
	static_key_deferred_flush(&clean_acked_data_enabled);
}
EXPORT_SYMBOL_GPL(clean_acked_data_flush);
#endif
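/*
 * Editor's illustrative sketch (not part of tcp_input.c): the composite
 * masks above classify an incoming ACK by OR-ing per-event bits. For
 * instance, FLAG_NOT_DUP is set when the ACK carried data, moved the
 * window, or acknowledged new data, i.e. when it cannot be a pure
 * duplicate ACK. The bit algebra, standalone:
 */
#if 0	/* standalone demo; flag values copied from the defines above */
#include <assert.h>

#define D_FLAG_DATA		0x01
#define D_FLAG_WIN_UPDATE	0x02
#define D_FLAG_DATA_ACKED	0x04
#define D_FLAG_SYN_ACKED	0x10
#define D_FLAG_ACKED		(D_FLAG_DATA_ACKED | D_FLAG_SYN_ACKED)
#define D_FLAG_NOT_DUP		(D_FLAG_DATA | D_FLAG_WIN_UPDATE | D_FLAG_ACKED)

int main(void)
{
	int flag = D_FLAG_WIN_UPDATE;		/* pure window update */

	assert(flag & D_FLAG_NOT_DUP);		/* not a duplicate ACK */
	flag = 0;				/* nothing new at all */
	assert(!(flag & D_FLAG_NOT_DUP));	/* a candidate dupack */
	return 0;
}
#endif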
*/ switch (sk->sk_state) { case TCP_SYN_RECV: case TCP_SYN_SENT: case TCP_LISTEN: return; } sock_owned_by_me(sk); memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB; sock_ops.is_fullsock = 1; sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); } static void bpf_skops_established(struct sock *sk, int bpf_op, struct sk_buff *skb) { struct bpf_sock_ops_kern sock_ops; sock_owned_by_me(sk); memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = bpf_op; sock_ops.is_fullsock = 1; sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ if (skb) bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); } #else static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) { } static void bpf_skops_established(struct sock *sk, int bpf_op, struct sk_buff *skb) { } #endif static __cold void tcp_gro_dev_warn(const struct sock *sk, const struct sk_buff *skb, unsigned int len) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif); if (!dev || len >= READ_ONCE(dev->mtu)) pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n", dev ? dev->name : "Unknown driver"); rcu_read_unlock(); } /* Adapt the MSS value used to make delayed ack decision to the * real world. */ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); const unsigned int lss = icsk->icsk_ack.last_seg_size; unsigned int len; icsk->icsk_ack.last_seg_size = 0; /* skb->len may jitter because of SACKs, even if peer * sends good full-sized frames. */ len = skb_shinfo(skb)->gso_size ? : skb->len; if (len >= icsk->icsk_ack.rcv_mss) { /* Note: divides are still a bit expensive. * For the moment, only adjust scaling_ratio * when we update icsk_ack.rcv_mss. */ if (unlikely(len != icsk->icsk_ack.rcv_mss)) { u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE; u8 old_ratio = tcp_sk(sk)->scaling_ratio; do_div(val, skb->truesize); tcp_sk(sk)->scaling_ratio = val ? val : 1; if (old_ratio != tcp_sk(sk)->scaling_ratio) { struct tcp_sock *tp = tcp_sk(sk); val = tcp_win_from_space(sk, sk->sk_rcvbuf); tcp_set_window_clamp(sk, val); if (tp->window_clamp < tp->rcvq_space.space) tp->rcvq_space.space = tp->window_clamp; } } icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, tcp_sk(sk)->advmss); /* Account for possibly-removed options */ DO_ONCE_LITE_IF(len > icsk->icsk_ack.rcv_mss + MAX_TCP_OPTION_SPACE, tcp_gro_dev_warn, sk, skb, len); /* If the skb has a len of exactly 1*MSS and has the PSH bit * set then it is likely the end of an application write. So * more data may not be arriving soon, and yet the data sender * may be waiting for an ACK if cwnd-bound or using TX zero * copy. So we set ICSK_ACK_PUSHED here so that * tcp_cleanup_rbuf() will send an ACK immediately if the app * reads all of the data and is not ping-pong. If len > MSS * then this logic does not matter (and does not hurt) because * tcp_cleanup_rbuf() will always ACK immediately if the app * reads data and there is more than an MSS of unACKed data. */ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH) icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; } else { /* Otherwise, we make more careful check taking into account, * that SACKs block is variable. 
* * "len" is invariant segment length, including TCP header. */ len += skb->data - skb_transport_header(skb); if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) || /* If PSH is not set, packet should be * full sized, provided peer TCP is not badly broken. * This observation (if it is correct 8)) allows * to handle super-low mtu links fairly. */ (len >= TCP_MIN_MSS + sizeof(struct tcphdr) && !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) { /* Subtract also invariant (if peer is RFC compliant), * tcp header plus fixed timestamp option length. * Resulting "len" is MSS free of SACK jitter. */ len -= tcp_sk(sk)->tcp_header_len; icsk->icsk_ack.last_seg_size = len; if (len == lss) { icsk->icsk_ack.rcv_mss = len; return; } } if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2; icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; } } static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks) { struct inet_connection_sock *icsk = inet_csk(sk); unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); if (quickacks == 0) quickacks = 2; quickacks = min(quickacks, max_quickacks); if (quickacks > icsk->icsk_ack.quick) icsk->icsk_ack.quick = quickacks; } static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_incr_quickack(sk, max_quickacks); inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } /* Send ACKs quickly, if "quick" count is not exhausted * and the session is not interactive. */ static bool tcp_in_quickack_mode(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); return icsk->icsk_ack.dst_quick_ack || (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk)); } static void tcp_ecn_queue_cwr(struct tcp_sock *tp) { if (tcp_ecn_mode_rfc3168(tp)) tp->ecn_flags |= TCP_ECN_QUEUE_CWR; } static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb) { if (tcp_hdr(skb)->cwr) { tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR; /* If the sender is telling us it has entered CWR, then its * cwnd may be very low (even just 1 packet), so we should ACK * immediately. */ if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } } static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) { tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; } static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); if (tcp_ecn_disabled(tp)) return; switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { case INET_ECN_NOT_ECT: /* Funny extension: if ECT is not set on a segment, * and we already seen ECT on a previous segment, * it is probably a retransmit. 
*/ if (tp->ecn_flags & TCP_ECN_SEEN) tcp_enter_quickack_mode(sk, 2); break; case INET_ECN_CE: if (tcp_ca_needs_ecn(sk)) tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { /* Better not delay acks, sender can have a very low cwnd */ tcp_enter_quickack_mode(sk, 2); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } tp->ecn_flags |= TCP_ECN_SEEN; break; default: if (tcp_ca_needs_ecn(sk)) tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); tp->ecn_flags |= TCP_ECN_SEEN; break; } } static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) { if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr)) tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); } static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) { if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr)) tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); } static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) { if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp)) return true; return false; } static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) { tp->delivered_ce += ecn_count; } /* Updates the delivered and delivered_ce counts */ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, bool ece_ack) { tp->delivered += delivered; if (ece_ack) tcp_count_delivered_ce(tp, delivered); } /* Buffer size and advertised window tuning. * * 1. Tuning sk->sk_sndbuf, when connection enters established state. */ static void tcp_sndbuf_expand(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; int sndmem, per_mss; u32 nr_segs; /* Worst case is non GSO/TSO : each frame consumes one skb * and skb->head is kmalloced using power of two area of memory */ per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + MAX_TCP_HEADER + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); per_mss = roundup_pow_of_two(per_mss) + SKB_DATA_ALIGN(sizeof(struct sk_buff)); nr_segs = max_t(u32, TCP_INIT_CWND, tcp_snd_cwnd(tp)); nr_segs = max_t(u32, nr_segs, tp->reordering + 1); /* Fast Recovery (RFC 5681 3.2) : * Cubic needs 1.7 factor, rounded to 2 to include * extra cushion (application might react slowly to EPOLLOUT) */ sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2; sndmem *= nr_segs * per_mss; if (sk->sk_sndbuf < sndmem) WRITE_ONCE(sk->sk_sndbuf, min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2]))); } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) * * All of tcp_full_space() is split into two parts: a "network" buffer, allocated * forward and advertised in the receiver window (tp->rcv_wnd), and an * "application buffer", required to isolate scheduling/application * latencies from the network. * window_clamp is the maximal advertised window. It can be less than * tcp_full_space(), in this case tcp_full_space() - window_clamp * is reserved for the "application" buffer. The smaller window_clamp is, * the smoother our behaviour from the viewpoint of the network, but the lower * the throughput and the higher the sensitivity of the connection to losses. 8) * * rcv_ssthresh is a stricter window_clamp used during the "slow start" * phase to predict further behaviour of this connection. * It is used for two goals: * - to enforce header prediction at the sender, even when the application * requires some significant "application buffer". It is check #1. * - to prevent pruning of the receive queue because of misprediction * of the receiver window. Check #2.
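* * A worked example (numbers ours, purely illustrative): if tcp_full_space() is 1 MB and window_clamp is 768 KB, then 256 KB is reserved as "application buffer" and at most 768 KB is ever advertised; rcv_ssthresh starts lower still and only grows toward window_clamp as the receiver proves it can keep up (checks #1 and #2 above).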
* * The scheme does not work when sender sends good segments opening * window and then starts to feed us spaghetti. But it should work * in common situations. Otherwise, we have to rely on queue collapsing. */ /* Slow part of check#2. */ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb, unsigned int skbtruesize) { const struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ int truesize = tcp_win_from_space(sk, skbtruesize) >> 1; int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) return 2 * inet_csk(sk)->icsk_ack.rcv_mss; truesize >>= 1; window >>= 1; } return 0; } /* Even if skb appears to have a bad len/truesize ratio, TCP coalescing * can play nice with us, as sk_buff and skb->head might be either * freed or shared with up to MAX_SKB_FRAGS segments. * Only give a boost to drivers using page frag(s) to hold the frame(s), * and if no payload was pulled in skb->head before reaching us. */ static u32 truesize_adjust(bool adjust, const struct sk_buff *skb) { u32 truesize = skb->truesize; if (adjust && !skb_headlen(skb)) { truesize -= SKB_TRUESIZE(skb_end_offset(skb)); /* paranoid check, some drivers might be buggy */ if (unlikely((int)truesize < (int)skb->len)) truesize = skb->truesize; } return truesize; } static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb, bool adjust) { struct tcp_sock *tp = tcp_sk(sk); int room; room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh; if (room <= 0) return; /* Check #1 */ if (!tcp_under_memory_pressure(sk)) { unsigned int truesize = truesize_adjust(adjust, skb); int incr; /* Check #2. Increase window, if skb with such overhead * will fit to rcvbuf in future. */ if (tcp_win_from_space(sk, truesize) <= skb->len) incr = 2 * tp->advmss; else incr = __tcp_grow_window(sk, skb, truesize); if (incr) { incr = max_t(int, incr, 2 * skb->len); tp->rcv_ssthresh += min(room, incr); inet_csk(sk)->icsk_ack.quick |= 1; } } else { /* Under pressure: * Adjust rcv_ssthresh according to reserved mem */ tcp_adjust_rcv_ssthresh(sk); } } /* 3. Try to fixup all. It is made immediately after connection enters * established state. */ static void tcp_init_buffer_space(struct sock *sk) { int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win); struct tcp_sock *tp = tcp_sk(sk); int maxwin; if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) tcp_sndbuf_expand(sk); tcp_mstamp_refresh(tp); tp->rcvq_space.time = tp->tcp_mstamp; tp->rcvq_space.seq = tp->copied_seq; maxwin = tcp_full_space(sk); if (tp->window_clamp >= maxwin) { WRITE_ONCE(tp->window_clamp, maxwin); if (tcp_app_win && maxwin > 4 * tp->advmss) WRITE_ONCE(tp->window_clamp, max(maxwin - (maxwin >> tcp_app_win), 4 * tp->advmss)); } /* Force reservation of one segment. */ if (tcp_app_win && tp->window_clamp > 2 * tp->advmss && tp->window_clamp + tp->advmss > maxwin) WRITE_ONCE(tp->window_clamp, max(2 * tp->advmss, maxwin - tp->advmss)); tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); tp->snd_cwnd_stamp = tcp_jiffies32; tp->rcvq_space.space = min3(tp->rcv_ssthresh, tp->rcv_wnd, (u32)TCP_INIT_CWND * tp->advmss); } /* 4. Recalculate window clamp after socket hit its memory bounds. 
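* Illustration of the recalculation below (numbers ours, not from the source): if sk_rmem_alloc has outgrown sk_rcvbuf, rcv_ssthresh is pulled down to at most 2 * advmss (and never above window_clamp), so the advertised window stops growing until the receive queue drains.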
*/ static void tcp_clamp_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct net *net = sock_net(sk); int rmem2; icsk->icsk_ack.quick = 0; rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); if (sk->sk_rcvbuf < rmem2 && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { WRITE_ONCE(sk->sk_rcvbuf, min(atomic_read(&sk->sk_rmem_alloc), rmem2)); } if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); } /* Initialize RCV_MSS value. * RCV_MSS is our guess about the MSS used by the peer. * We don't have any direct information about the MSS. * It's better to underestimate the RCV_MSS rather than overestimate it. * Overestimating makes us ACK less frequently than needed. * Underestimations are easier to detect and fix by tcp_measure_rcv_mss(). */ void tcp_initialize_rcv_mss(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); hint = min(hint, tp->rcv_wnd / 2); hint = min(hint, TCP_MSS_DEFAULT); hint = max(hint, TCP_MIN_MSS); inet_csk(sk)->icsk_ack.rcv_mss = hint; } EXPORT_IPV6_MOD(tcp_initialize_rcv_mss); /* Receiver "autotuning" code. * * The algorithm for RTT estimation w/o timestamps is based on * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. * <https://public.lanl.gov/radiant/pubs.html#DRS> * * More detail on this code can be found at * <http://staff.psc.edu/jheffner/>, * though this reference is out of date. A new paper * is pending. */ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) { u32 new_sample, old_sample = tp->rcv_rtt_est.rtt_us; long m = sample << 3; if (old_sample == 0 || m < old_sample) { new_sample = m; } else { /* If we sample in larger samples in the non-timestamp * case, we could grossly overestimate the RTT especially * with chatty applications or bulk transfer apps which * are stalled on filesystem I/O. * * Also, since we are only going for a minimum in the * non-timestamp case, we do not smooth things out * else with timestamps disabled convergence takes too * long. */ if (win_dep) return; /* Do not use this sample if the receive queue is not empty.
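* * Worked example of the update that follows (numbers ours, illustrative only): rtt_us is kept scaled by 8, so old_sample = 800 means an estimate of 100 us. A new sample of 200 us gives new_sample = 800 - (800 >> 3) + 200 = 900, i.e. the estimate moves 1/8 of the way toward the sample, to 112.5 us.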
*/ if (tp->rcv_nxt != tp->copied_seq) return; new_sample = old_sample - (old_sample >> 3) + sample; } tp->rcv_rtt_est.rtt_us = new_sample; } static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) { u32 delta_us; if (tp->rcv_rtt_est.time == 0) goto new_measure; if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) return; delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time); if (!delta_us) delta_us = 1; tcp_rcv_rtt_update(tp, delta_us, 1); new_measure: tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; tp->rcv_rtt_est.time = tp->tcp_mstamp; } static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp, u32 min_delta) { u32 delta, delta_us; delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr; if (tp->tcp_usec_ts) return delta; if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { if (!delta) delta = min_delta; delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); return delta_us; } return -1; } static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr) return; tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr; if (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) { s32 delta = tcp_rtt_tsopt_us(tp, 0); if (delta > 0) tcp_rcv_rtt_update(tp, delta, 0); } } static void tcp_rcvbuf_grow(struct sock *sk) { const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); int rcvwin, rcvbuf, cap; if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) || (sk->sk_userlocks & SOCK_RCVBUF_LOCK)) return; /* slow start: allow the sender to double its rate. */ rcvwin = tp->rcvq_space.space << 1; if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt; cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); rcvbuf = min_t(u32, tcp_space_from_win(sk, rcvwin), cap); if (rcvbuf > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); /* Make the window clamp follow along. */ WRITE_ONCE(tp->window_clamp, tcp_win_from_space(sk, rcvbuf)); } } /* * This function should be called every time data is copied to user space. * It calculates the appropriate TCP receive buffer space. */ void tcp_rcv_space_adjust(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int time, inq, copied; trace_tcp_rcv_space_adjust(sk); tcp_mstamp_refresh(tp); time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) return; /* Number of bytes copied to user in last RTT */ copied = tp->copied_seq - tp->rcvq_space.seq; /* Number of bytes in receive queue. */ inq = tp->rcv_nxt - tp->copied_seq; copied -= inq; if (copied <= tp->rcvq_space.space) goto new_measure; trace_tcp_rcvbuf_grow(sk, time); tp->rcvq_space.space = copied; tcp_rcvbuf_grow(sk); new_measure: tp->rcvq_space.seq = tp->copied_seq; tp->rcvq_space.time = tp->tcp_mstamp; } static void tcp_save_lrcv_flowlabel(struct sock *sk, const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) struct inet_connection_sock *icsk = inet_csk(sk); if (skb->protocol == htons(ETH_P_IPV6)) icsk->icsk_ack.lrcv_flowlabel = ntohl(ip6_flowlabel(ipv6_hdr(skb))); #endif } /* There is something which you must keep in mind when you analyze the * behavior of the tp->ato delayed ack timeout interval. When a * connection starts up, we want to ack as quickly as possible. The * problem is that "good" TCP's do slow start at the beginning of data * transmission. 
This means that until we send the first few ACKs the * sender will sit on his end and only queue most of his data, because * he can only send snd_cwnd unacked packets at any given time. For * each ACK we send, he increments snd_cwnd and transmits more of his * queue. -DaveM */ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); u32 now; inet_csk_schedule_ack(sk); tcp_measure_rcv_mss(sk, skb); tcp_rcv_rtt_measure(tp); now = tcp_jiffies32; if (!icsk->icsk_ack.ato) { /* The _first_ data packet received, initialize * the delayed ACK engine. */ tcp_incr_quickack(sk, TCP_MAX_QUICKACKS); icsk->icsk_ack.ato = TCP_ATO_MIN; } else { int m = now - icsk->icsk_ack.lrcvtime; if (m <= TCP_ATO_MIN / 2) { /* The fastest case is the first. */ icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2; } else if (m < icsk->icsk_ack.ato) { icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m; if (icsk->icsk_ack.ato > icsk->icsk_rto) icsk->icsk_ack.ato = icsk->icsk_rto; } else if (m > icsk->icsk_rto) { /* Too long a gap. Apparently the sender failed to * restart the window, so we send ACKs quickly. */ tcp_incr_quickack(sk, TCP_MAX_QUICKACKS); } } icsk->icsk_ack.lrcvtime = now; tcp_save_lrcv_flowlabel(sk, skb); tcp_data_ecn_check(sk, skb); if (skb->len >= 128) tcp_grow_window(sk, skb, true); } /* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 * piece by Van Jacobson. * NOTE: the next three routines used to be one big routine. * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) { struct tcp_sock *tp = tcp_sk(sk); long m = mrtt_us; /* RTT */ u32 srtt = tp->srtt_us; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. * This is designed to be as fast as possible. * m stands for "measurement". * * In a 1990 paper the rto value is changed to: * RTO = rtt + 4 * mdev * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase * too slowly, when it should be increased quickly, decrease too quickly * etc. I guess in BSD RTO takes ONE value, so it absolutely * does not matter how we _calculate_ it. It seems it was a trap * that VJ failed to avoid. 8) */ if (srtt != 0) { m -= (srtt >> 3); /* m is now error in rtt est */ srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ m -= (tp->mdev_us >> 2); /* similar update on mdev */ /* This is similar to one of the Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use a finer gain * for mdev in this case (alpha*beta). * Like Eifel it also prevents growth of rto, * but it also limits too-fast rto decreases, * happening in pure Eifel.
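* * A worked pass through the srtt update above (numbers ours, illustrative only): srtt_us is kept scaled by 8, so srtt = 800 means an estimate of 100 us. A measurement m = 120 us gives m -= srtt >> 3, i.e. m = 20, then srtt += m yields 820, a new estimate of 102.5 us -- exactly rtt = 7/8 rtt + 1/8 new. The mdev branches below apply the same fixed-point trick with gain 1/4, shrunk further (m >>= 3) when the RTT decreases, as the Eifel note explains.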
*/ if (m > 0) m >>= 3; } else { m -= (tp->mdev_us >> 2); /* similar update on mdev */ } tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ if (tp->mdev_us > tp->mdev_max_us) { tp->mdev_max_us = tp->mdev_us; if (tp->mdev_max_us > tp->rttvar_us) tp->rttvar_us = tp->mdev_max_us; } if (after(tp->snd_una, tp->rtt_seq)) { if (tp->mdev_max_us < tp->rttvar_us) tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; tp->rtt_seq = tp->snd_nxt; tp->mdev_max_us = tcp_rto_min_us(sk); tcp_bpf_rtt(sk, mrtt_us, srtt); } } else { /* no previous measure. */ srtt = m << 3; /* take the measured time to be rtt */ tp->mdev_us = m << 1; /* make sure rto = 3*rtt */ tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); tp->mdev_max_us = tp->rttvar_us; tp->rtt_seq = tp->snd_nxt; tcp_bpf_rtt(sk, mrtt_us, srtt); } tp->srtt_us = max(1U, srtt); } static void tcp_update_pacing_rate(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); u64 rate; /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3); /* current rate is (cwnd * mss) / srtt * In Slow Start [1], set sk_pacing_rate to 200 % of the current rate. * In Congestion Avoidance phase, set it to 120 % of the current rate. * * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh) * If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching * the end of slow start and should slow down. */ if (tcp_snd_cwnd(tp) < tp->snd_ssthresh / 2) rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio); else rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio); rate *= max(tcp_snd_cwnd(tp), tp->packets_out); if (likely(tp->srtt_us)) do_div(rate, tp->srtt_us); /* WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate * without any lock. We want to make sure the compiler won't store * intermediate values in this location. */ WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate))); } /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ static void tcp_set_rto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); /* Old crap is replaced with new one. 8) * * More seriously: * 1. If rtt variance happened to be less than 50 msec, it is a hallucination. * It cannot be less due to utterly erratic ACK generation made * at least by solaris and freebsd. "Erratic ACKs" have _nothing_ * to do with delayed acks, because at cwnd>2 true delack timeout * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some circumstances. */ inet_csk(sk)->icsk_rto = __tcp_set_rto(tp); /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, * all the algo is pure shit and should be replaced * with a correct one. That is exactly what we pretend to do. */ /* NOTE: clamping at TCP_RTO_MIN is not required, current algo * guarantees that rto is higher. */ tcp_bound_rto(sk); } __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); if (!cwnd) cwnd = TCP_INIT_CWND; return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } struct tcp_sacktag_state { /* Timestamps for earliest and latest never-retransmitted segment * that was SACKed. RTO needs the earliest RTT to stay conservative, * but congestion control should still get an accurate delay signal. */ u64 first_sackt; u64 last_sackt; u32 reord; u32 sack_delivered; int flag; unsigned int mss_now; struct rate_sample *rate; }; /* Take notice that the peer is sending D-SACKs.
Skip the update of data delivery * and spurious retransmission information if this DSACK is unlikely to have been caused by * the sender's action: * - The DSACKed sequence range is larger than the maximum receiver's window. * - The total no. of DSACKed segments exceeds the total no. of retransmitted segs. */ static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq, u32 end_seq, struct tcp_sacktag_state *state) { u32 seq_len, dup_segs = 1; if (!before(start_seq, end_seq)) return 0; seq_len = end_seq - start_seq; /* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */ if (seq_len > tp->max_window) return 0; if (seq_len > tp->mss_cache) dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache); else if (tp->tlp_high_seq && tp->tlp_high_seq == end_seq) state->flag |= FLAG_DSACK_TLP; tp->dsack_dups += dup_segs; /* Skip the DSACK if dup segs weren't retransmitted by sender */ if (tp->dsack_dups > tp->total_retrans) return 0; tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; /* We increase the RACK ordering window in rounds where we receive * DSACKs that may have been due to reordering causing RACK to trigger * a spurious fast recovery. Thus RACK ignores DSACKs that happen * without having seen reordering, or that match TLP probes (TLP * is timer-driven, not triggered by RACK). */ if (tp->reord_seen && !(state->flag & FLAG_DSACK_TLP)) tp->rack.dsack_seen = 1; state->flag |= FLAG_DSACKING_ACK; /* A spurious retransmission is delivered */ state->sack_delivered += dup_segs; return dup_segs; } /* It's reordering when a higher sequence was delivered (i.e. sacked) before * some lower never-retransmitted sequence ("low_seq"). The maximum reordering * distance is approximated in full-mss packet distance ("reordering"). */ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, const int ts) { struct tcp_sock *tp = tcp_sk(sk); const u32 mss = tp->mss_cache; u32 fack, metric; fack = tcp_highest_sack_seq(tp); if (!before(low_seq, fack)) return; metric = fack - low_seq; if ((metric > tp->reordering * mss) && mss) { #if FASTRETRANS_DEBUG > 1 pr_debug("Disorder%d %d %u f%u s%u rr%d\n", tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, tp->reordering, 0, tp->sacked_out, tp->undo_marker ? tp->undo_retrans : 0); #endif tp->reordering = min_t(u32, (metric + mss - 1) / mss, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)); } /* This exciting event is worth remembering. 8) */ tp->reord_seen++; NET_INC_STATS(sock_net(sk), ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); } /* This must be called before lost_out or retrans_out are updated * on a new loss, because we want to know if all skbs previously * known to be lost have already been retransmitted, indicating * that this newly lost skb is our next skb to retransmit. */ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) { if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) || (tp->retransmit_skb_hint && before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->retransmit_skb_hint)->seq))) tp->retransmit_skb_hint = skb; } /* Sum the number of packets on the wire we have marked as lost, and * notify the congestion control module that the given skb was marked lost.
*/ static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) { tp->lost += tcp_skb_pcount(skb); } void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) { __u8 sacked = TCP_SKB_CB(skb)->sacked; struct tcp_sock *tp = tcp_sk(sk); if (sacked & TCPCB_SACKED_ACKED) return; tcp_verify_retransmit_hint(tp, skb); if (sacked & TCPCB_LOST) { if (sacked & TCPCB_SACKED_RETRANS) { /* Account for retransmits that are lost again */ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT, tcp_skb_pcount(skb)); tcp_notify_skb_loss_event(tp, skb); } } else { tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tcp_notify_skb_loss_event(tp, skb); } } /* This procedure tags the retransmission queue when SACKs arrive. * * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). * Packets in queue with these bits set are counted in variables * sacked_out, retrans_out and lost_out, correspondingly. * * Valid combinations are: * Tag InFlight Description * 0 1 - orig segment is in flight. * S 0 - nothing flies, orig reached receiver. * L 0 - nothing flies, orig lost by net. * R 2 - both orig and retransmit are in flight. * L|R 1 - orig is lost, retransmit is in flight. * S|R 1 - orig reached receiver, retrans is still in flight. * (L|S|R is logically valid, it could occur when L|R is sacked, * but it is equivalent to plain S and code short-circuits it to S. * L|S is logically invalid, it would mean -1 packet in flight 8)) * * These 6 states form a finite state machine, controlled by the following events: * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue()) * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) * 3. Loss detection event of two flavors: * A. Scoreboard estimator decided the packet is lost. * A'. Reno "three dupacks" marks head of queue lost. * B. SACK arrives sacking SND.NXT at the moment, when the * segment was retransmitted. * 4. D-SACK added new rule: D-SACK changes any tag to S. * * It is pleasant to note that the state diagram turns out to be commutative, * so that we are allowed not to be bothered by the order of our actions, * when multiple events arrive simultaneously. (see the function below). * * Reordering detection. * -------------------- * The reordering metric is the maximal distance by which a packet can be displaced * in the packet stream. With SACKs we can estimate it: * * 1. SACK fills an old hole and the corresponding segment was not * ever retransmitted -> reordering. Alas, we cannot use it * when the segment was retransmitted. * 2. The last flaw is solved with D-SACK. A D-SACK arrives * for a retransmitted and already SACKed segment -> reordering. * Both of these heuristics are not used in Loss state, when we cannot * account for retransmits accurately. * * SACK block validation. * ---------------------- * * SACK block range validation checks that the received SACK block fits within * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT. * Note that SND.UNA is not included in the range though being valid because * it means that the receiver is rather inconsistent with itself reporting * SACK reneging when it should advance SND.UNA. Such a SACK block is * perfectly valid, however, in light of RFC2018, which explicitly states * that "SACK block MUST reflect the newest segment. Even if the newest * segment is going to be discarded ...", not that it looks very clever * in case of the head skb.
Due to potential receiver-driven attacks, we * choose to avoid immediate execution of a walk in the write queue due to * reneging and defer the head skb's loss recovery to the standard loss recovery * procedure that will eventually trigger (nothing forbids us doing this). * * This also implements blockage of start_seq wrap-around. The problem lies in the * fact that though start_seq (s) is before end_seq (i.e., not reversed), * there's no guarantee that it will be before snd_nxt (n). The problem * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt * wrap (s_w): * * <- outs wnd -> <- wrapzone -> * u e n u_w e_w s n_w * | | | | | | | * |<------------+------+----- TCP seqno space --------------+---------->| * ...-- <2^31 ->| |<--------... * ...---- >2^31 ------>| |<--------... * * Current code wouldn't be vulnerable but it's better still to discard such * crazy SACK blocks. Doing this check for start_seq alone closes the somewhat * similar case (end_seq after snd_nxt wrap), as the earlier reversed check in * the snd_nxt wrap -> snd_una region will then become "well defined", i.e., * equal to the ideal case (infinite seqno space without wrap caused issues). * * With D-SACK the lower bound is extended to cover sequence space below * SND.UNA down to undo_marker, which is the last point of interest. Yet * again, a D-SACK block must not go across snd_una (for the same reason as * for the normal SACK blocks, explained above). But there all simplicity * ends, TCP might receive valid D-SACKs below that. As long as they reside * fully below undo_marker they do not affect behavior in any way and can * therefore be safely ignored. In rare cases (which are more or less * theoretical ones), the D-SACK will nicely cross that boundary due to skb * fragmentation and packet reordering past the skb's retransmission. To consider * them correctly, the acceptable range must be extended even more, though * the exact amount is rather hard to quantify. However, tp->max_window can * be used as an exaggerated estimate. */ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, u32 start_seq, u32 end_seq) { /* Too far in future, or reversed (interpretation is ambiguous) */ if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq)) return false; /* Nasty start_seq wrap-around check (see comments above) */ if (!before(start_seq, tp->snd_nxt)) return false; /* In outstanding window? ...This is valid exit for D-SACKs too. * start_seq == snd_una is non-sensical (see comments above) */ if (after(start_seq, tp->snd_una)) return true; if (!is_dsack || !tp->undo_marker) return false; /* ...Then it's D-SACK, and must reside below snd_una completely */ if (after(end_seq, tp->snd_una)) return false; if (!before(start_seq, tp->undo_marker)) return true; /* Too old */ if (!after(end_seq, tp->undo_marker)) return false; /* Undo_marker boundary crossing (overestimates a lot). Known already: * start_seq < undo_marker and end_seq >= undo_marker.
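* * Concretely (example ours, not from the source): the return statement below accepts the crossing D-SACK only if start_seq lies within tp->max_window of end_seq; with max_window = 64 KB, a block claiming to span a megabyte below undo_marker is discarded as noise.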
*/ return !before(start_seq, end_seq - tp->max_window); } static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, u32 prior_snd_una, struct tcp_sacktag_state *state) { struct tcp_sock *tp = tcp_sk(sk); u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); u32 dup_segs; if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV); } else if (num_sacks > 1) { u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq); u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq); if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1)) return false; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV); } else { return false; } dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state); if (!dup_segs) { /* Skip dubious DSACK */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS); return false; } NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs); /* D-SACK for already forgotten data... Do dumb counting. */ if (tp->undo_marker && tp->undo_retrans > 0 && !after(end_seq_0, prior_snd_una) && after(end_seq_0, tp->undo_marker)) tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs); return true; } /* Check if the skb is fully within the SACK block. In the presence of GSO skbs, * the incoming SACK may not exactly match, but we can find a smaller, * MSS-aligned portion of it that matches. Therefore we might need to fragment, * which may fail and create some hassle (the caller must handle error * returns). * * FIXME: this could be merged to shift decision code */ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, u32 start_seq, u32 end_seq) { int err; bool in_sack; unsigned int pkt_len; unsigned int mss; in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && !before(end_seq, TCP_SKB_CB(skb)->end_seq); if (tcp_skb_pcount(skb) > 1 && !in_sack && after(TCP_SKB_CB(skb)->end_seq, start_seq)) { mss = tcp_skb_mss(skb); in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); if (!in_sack) { pkt_len = start_seq - TCP_SKB_CB(skb)->seq; if (pkt_len < mss) pkt_len = mss; } else { pkt_len = end_seq - TCP_SKB_CB(skb)->seq; if (pkt_len < mss) return -EINVAL; } /* Round if necessary so that SACKs cover only full MSSes * and/or the remaining small portion (if present) */ if (pkt_len > mss) { unsigned int new_len = (pkt_len / mss) * mss; if (!in_sack && new_len < pkt_len) new_len += mss; pkt_len = new_len; } if (pkt_len >= skb->len && !in_sack) return 0; err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, pkt_len, mss, GFP_ATOMIC); if (err < 0) return err; } return in_sack; } /* Mark the given newly-SACKed range as such, adjusting counters and hints. */ static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, u32 start_seq, u32 end_seq, int dup_sack, int pcount, u64 xmit_time) { struct tcp_sock *tp = tcp_sk(sk); /* Account D-SACK for retransmitted packet. */ if (dup_sack && (sacked & TCPCB_RETRANS)) { if (tp->undo_marker && tp->undo_retrans > 0 && after(end_seq, tp->undo_marker)) tp->undo_retrans = max_t(int, 0, tp->undo_retrans - pcount); if ((sacked & TCPCB_SACKED_ACKED) && before(start_seq, state->reord)) state->reord = start_seq; } /* Nothing to do; the acked frame is about to be dropped (was ACKed).
*/ if (!after(end_seq, tp->snd_una)) return sacked; if (!(sacked & TCPCB_SACKED_ACKED)) { tcp_rack_advance(tp, sacked, end_seq, xmit_time); if (sacked & TCPCB_SACKED_RETRANS) { /* If the segment is not tagged as lost, * we do not clear RETRANS, believing * that retransmission is still in flight. */ if (sacked & TCPCB_LOST) { sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); tp->lost_out -= pcount; tp->retrans_out -= pcount; } } else { if (!(sacked & TCPCB_RETRANS)) { /* New sack for not retransmitted frame, * which was in hole. It is reordering. */ if (before(start_seq, tcp_highest_sack_seq(tp)) && before(start_seq, state->reord)) state->reord = start_seq; if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; if (state->first_sackt == 0) state->first_sackt = xmit_time; state->last_sackt = xmit_time; } if (sacked & TCPCB_LOST) { sacked &= ~TCPCB_LOST; tp->lost_out -= pcount; } } sacked |= TCPCB_SACKED_ACKED; state->flag |= FLAG_DATA_SACKED; tp->sacked_out += pcount; /* Out-of-order packets delivered */ state->sack_delivered += pcount; /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ if (tp->lost_skb_hint && before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) tp->lost_cnt_hint += pcount; } /* D-SACK. We can detect redundant retransmission in S|R and plain R * frames and clear it. undo_retrans is decreased above, L|R frames * are accounted above as well. */ if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) { sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= pcount; } return sacked; } /* Shift newly-SACKed bytes from this skb to the immediately previous * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. */ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, struct sk_buff *skb, struct tcp_sacktag_state *state, unsigned int pcount, int shifted, int mss, bool dup_sack) { struct tcp_sock *tp = tcp_sk(sk); u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */ u32 end_seq = start_seq + shifted; /* end of newly-SACKed */ BUG_ON(!pcount); /* Adjust counters and hints for the newly sacked sequence * range but discard the return value since prev is already * marked. We must tag the range first because the seq * advancement below implicitly advances * tcp_highest_sack_seq() when skb is highest_sack. */ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, start_seq, end_seq, dup_sack, pcount, tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); if (skb == tp->lost_skb_hint) tp->lost_cnt_hint += pcount; TCP_SKB_CB(prev)->end_seq += shifted; TCP_SKB_CB(skb)->seq += shifted; tcp_skb_pcount_add(prev, pcount); WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); tcp_skb_pcount_add(skb, -pcount); /* When we're adding to gso_segs == 1, gso_size will be zero, * in theory this shouldn't be necessary but as long as DSACK * code can come after this skb later on it's better to keep * setting gso_size to something. */ if (!TCP_SKB_CB(prev)->tcp_gso_size) TCP_SKB_CB(prev)->tcp_gso_size = mss; /* CHECKME: To clear or not to clear? Mimics normal skb currently */ if (tcp_skb_pcount(skb) <= 1) TCP_SKB_CB(skb)->tcp_gso_size = 0; /* Difference in this won't matter, both ACKed by the same cumul. 
ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); if (skb->len > 0) { BUG_ON(!tcp_skb_pcount(skb)); NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED); return false; } /* Whole SKB was eaten :-) */ if (skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = prev; if (skb == tp->lost_skb_hint) { tp->lost_skb_hint = prev; tp->lost_cnt_hint -= tcp_skb_pcount(prev); } TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) TCP_SKB_CB(prev)->end_seq++; if (skb == tcp_highest_sack(sk)) tcp_advance_highest_sack(sk, skb); tcp_skb_collapse_tstamp(prev, skb); if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp)) TCP_SKB_CB(prev)->tx.delivered_mstamp = 0; tcp_rtx_queue_unlink_and_free(skb, sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED); return true; } /* I wish gso_size would have a bit more sane initialization than * something-or-zero which complicates things */ static int tcp_skb_seglen(const struct sk_buff *skb) { return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb); } /* Shifting pages past head area doesn't work */ static int skb_can_shift(const struct sk_buff *skb) { return !skb_headlen(skb) && skb_is_nonlinear(skb); } int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen) { /* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE) * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need * to make sure not storing more than 65535 * 8 bytes per skb, * even if current MSS is bigger. */ if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE)) return 0; if (unlikely(tcp_skb_pcount(to) + pcount > 65535)) return 0; return skb_shift(to, from, shiftlen); } /* Try collapsing SACK blocks spanning across multiple skbs to a single * skb. */ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, struct tcp_sacktag_state *state, u32 start_seq, u32 end_seq, bool dup_sack) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *prev; int mss; int pcount = 0; int len; int in_sack; /* Normally R but no L won't result in plain S */ if (!dup_sack && (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS) goto fallback; if (!skb_can_shift(skb)) goto fallback; /* This frame is about to be dropped (was ACKed). */ if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) goto fallback; /* Can only happen with delayed DSACK + discard craziness */ prev = skb_rb_prev(skb); if (!prev) goto fallback; if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) goto fallback; if (!tcp_skb_can_collapse(prev, skb)) goto fallback; in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && !before(end_seq, TCP_SKB_CB(skb)->end_seq); if (in_sack) { len = skb->len; pcount = tcp_skb_pcount(skb); mss = tcp_skb_seglen(skb); /* TODO: Fix DSACKs to not fragment already SACKed and we can * drop this restriction as unnecessary */ if (mss != tcp_skb_seglen(prev)) goto fallback; } else { if (!after(TCP_SKB_CB(skb)->end_seq, start_seq)) goto noop; /* CHECKME: This is non-MSS split case only?, this will * cause skipped skbs due to advancing loop btw, original * has that feature too */ if (tcp_skb_pcount(skb) <= 1) goto noop; in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); if (!in_sack) { /* TODO: head merge to next could be attempted here * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)), * though it might not be worth of the additional hassle * * ...we can probably just fallback to what was done * previously. 
We could try merging non-SACKed ones * as well but it probably isn't going to buy off * because later SACKs might again split them, and * it would make skb timestamp tracking considerably * harder problem. */ goto fallback; } len = end_seq - TCP_SKB_CB(skb)->seq; BUG_ON(len < 0); BUG_ON(len > skb->len); /* MSS boundaries should be honoured or else pcount will * severely break even though it makes things bit trickier. * Optimize common case to avoid most of the divides */ mss = tcp_skb_mss(skb); /* TODO: Fix DSACKs to not fragment already SACKed and we can * drop this restriction as unnecessary */ if (mss != tcp_skb_seglen(prev)) goto fallback; if (len == mss) { pcount = 1; } else if (len < mss) { goto noop; } else { pcount = len / mss; len = pcount * mss; } } /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */ if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una)) goto fallback; if (!tcp_skb_shift(prev, skb, pcount, len)) goto fallback; if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack)) goto out; /* Hole filled allows collapsing with the next as well, this is very * useful when hole on every nth skb pattern happens */ skb = skb_rb_next(prev); if (!skb) goto out; if (!skb_can_shift(skb) || ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) || (mss != tcp_skb_seglen(skb))) goto out; if (!tcp_skb_can_collapse(prev, skb)) goto out; len = skb->len; pcount = tcp_skb_pcount(skb); if (tcp_skb_shift(prev, skb, pcount, len)) tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, 0); out: return prev; noop: return skb; fallback: NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK); return NULL; } static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, struct tcp_sack_block *next_dup, struct tcp_sacktag_state *state, u32 start_seq, u32 end_seq, bool dup_sack_in) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *tmp; skb_rbtree_walk_from(skb) { int in_sack = 0; bool dup_sack = dup_sack_in; /* queue is in-order => we can short-circuit the walk early */ if (!before(TCP_SKB_CB(skb)->seq, end_seq)) break; if (next_dup && before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) { in_sack = tcp_match_skb_to_sack(sk, skb, next_dup->start_seq, next_dup->end_seq); if (in_sack > 0) dup_sack = true; } /* skb reference here is a bit tricky to get right, since * shifting can eat and free both this skb and the next, * so not even _safe variant of the loop is enough. 
*/ if (in_sack <= 0) { tmp = tcp_shift_skb_data(sk, skb, state, start_seq, end_seq, dup_sack); if (tmp) { if (tmp != skb) { skb = tmp; continue; } in_sack = 0; } else { in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, end_seq); } } if (unlikely(in_sack < 0)) break; if (in_sack) { TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, dup_sack, tcp_skb_pcount(skb), tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) list_del_init(&skb->tcp_tsorted_anchor); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) tcp_advance_highest_sack(sk, skb); } } return skb; } static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq) { struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node; struct sk_buff *skb; while (*p) { parent = *p; skb = rb_to_skb(parent); if (before(seq, TCP_SKB_CB(skb)->seq)) { p = &parent->rb_left; continue; } if (!before(seq, TCP_SKB_CB(skb)->end_seq)) { p = &parent->rb_right; continue; } return skb; } return NULL; } static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, u32 skip_to_seq) { if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq)) return skb; return tcp_sacktag_bsearch(sk, skip_to_seq); } static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, struct sock *sk, struct tcp_sack_block *next_dup, struct tcp_sacktag_state *state, u32 skip_to_seq) { if (!next_dup) return skb; if (before(next_dup->start_seq, skip_to_seq)) { skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq); skb = tcp_sacktag_walk(skb, sk, NULL, state, next_dup->start_seq, next_dup->end_seq, 1); } return skb; } static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache) { return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); } static int tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, u32 prior_snd_una, struct tcp_sacktag_state *state) { struct tcp_sock *tp = tcp_sk(sk); const unsigned char *ptr = (skb_transport_header(ack_skb) + TCP_SKB_CB(ack_skb)->sacked); struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); struct tcp_sack_block sp[TCP_NUM_SACKS]; struct tcp_sack_block *cache; struct sk_buff *skb; int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); int used_sacks; bool found_dup_sack = false; int i, j; int first_sack_index; state->flag = 0; state->reord = tp->snd_nxt; if (!tp->sacked_out) tcp_highest_sack_reset(sk); found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una, state); /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can * contain valid SACK info. 
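* * For example (numbers ours, illustrative only): with max_window = 64 KB, an ACK whose ack_seq lags prior_snd_una by more than 64 KB is dropped outright by the check below, while a merely slightly-old ACK is still mined for usable SACK blocks.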
*/ if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window)) return 0; if (!tp->packets_out) goto out; used_sacks = 0; first_sack_index = 0; for (i = 0; i < num_sacks; i++) { bool dup_sack = !i && found_dup_sack; sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq); sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq); if (!tcp_is_sackblock_valid(tp, dup_sack, sp[used_sacks].start_seq, sp[used_sacks].end_seq)) { int mib_idx; if (dup_sack) { if (!tp->undo_marker) mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO; else mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD; } else { /* Don't count old ones caused by ACK reordering */ if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) && !after(sp[used_sacks].end_seq, tp->snd_una)) continue; mib_idx = LINUX_MIB_TCPSACKDISCARD; } NET_INC_STATS(sock_net(sk), mib_idx); if (i == 0) first_sack_index = -1; continue; } /* Ignore very old stuff early */ if (!after(sp[used_sacks].end_seq, prior_snd_una)) { if (i == 0) first_sack_index = -1; continue; } used_sacks++; } /* order SACK blocks to allow an in-order walk of the retrans queue */ for (i = used_sacks - 1; i > 0; i--) { for (j = 0; j < i; j++) { if (after(sp[j].start_seq, sp[j + 1].start_seq)) { swap(sp[j], sp[j + 1]); /* Track where the first SACK block goes to */ if (j == first_sack_index) first_sack_index = j + 1; } } } state->mss_now = tcp_current_mss(sk); skb = NULL; i = 0; if (!tp->sacked_out) { /* It's already past, so skip checking against it */ cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); } else { cache = tp->recv_sack_cache; /* Skip empty blocks at the head of the cache */ while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq && !cache->end_seq) cache++; } while (i < used_sacks) { u32 start_seq = sp[i].start_seq; u32 end_seq = sp[i].end_seq; bool dup_sack = (found_dup_sack && (i == first_sack_index)); struct tcp_sack_block *next_dup = NULL; if (found_dup_sack && ((i + 1) == first_sack_index)) next_dup = &sp[i + 1]; /* Skip too early cached blocks */ while (tcp_sack_cache_ok(tp, cache) && !before(start_seq, cache->end_seq)) cache++; /* Can we skip some work by looking at recv_sack_cache? */ if (tcp_sack_cache_ok(tp, cache) && !dup_sack && after(end_seq, cache->start_seq)) { /* Head todo? */ if (before(start_seq, cache->start_seq)) { skb = tcp_sacktag_skip(skb, sk, start_seq); skb = tcp_sacktag_walk(skb, sk, next_dup, state, start_seq, cache->start_seq, dup_sack); } /* Rest of the block already fully processed? */ if (!after(end_seq, cache->end_seq)) goto advance_sp; skb = tcp_maybe_skipping_dsack(skb, sk, next_dup, state, cache->end_seq); /* ...tail remains todo... */ if (tcp_highest_sack_seq(tp) == cache->end_seq) { /* ...but a better entrypoint exists!
*/ skb = tcp_highest_sack(sk); if (!skb) break; cache++; goto walk; } skb = tcp_sacktag_skip(skb, sk, cache->end_seq); /* Check overlap against next cached too (past this one already) */ cache++; continue; } if (!before(start_seq, tcp_highest_sack_seq(tp))) { skb = tcp_highest_sack(sk); if (!skb) break; } skb = tcp_sacktag_skip(skb, sk, start_seq); walk: skb = tcp_sacktag_walk(skb, sk, next_dup, state, start_seq, end_seq, dup_sack); advance_sp: i++; } /* Clear the head of the cache sack blocks so we can skip it next time */ for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) { tp->recv_sack_cache[i].start_seq = 0; tp->recv_sack_cache[i].end_seq = 0; } for (j = 0; j < used_sacks; j++) tp->recv_sack_cache[i++] = sp[j]; if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss || tp->undo_marker) tcp_check_sack_reordering(sk, state->reord, 0); tcp_verify_left_out(tp); out: #if FASTRETRANS_DEBUG > 0 WARN_ON((int)tp->sacked_out < 0); WARN_ON((int)tp->lost_out < 0); WARN_ON((int)tp->retrans_out < 0); WARN_ON((int)tcp_packets_in_flight(tp) < 0); #endif return state->flag; } /* Limits sacked_out so that sum with lost_out isn't ever larger than * packets_out. Returns false if sacked_out adjustment wasn't necessary. */ static bool tcp_limit_reno_sacked(struct tcp_sock *tp) { u32 holes; holes = max(tp->lost_out, 1U); holes = min(holes, tp->packets_out); if ((tp->sacked_out + holes) > tp->packets_out) { tp->sacked_out = tp->packets_out - holes; return true; } return false; } /* If we receive more dupacks than we expected when counting segments * under the assumption of absent reordering, interpret this as reordering. * The only other reason could be a bug in the receiver's TCP. */ static void tcp_check_reno_reordering(struct sock *sk, const int addend) { struct tcp_sock *tp = tcp_sk(sk); if (!tcp_limit_reno_sacked(tp)) return; tp->reordering = min_t(u32, tp->packets_out + addend, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)); tp->reord_seen++; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); } /* Emulate SACKs for a SACKless connection: account for a new dupack. */ static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack) { if (num_dupack) { struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; s32 delivered; tp->sacked_out += num_dupack; tcp_check_reno_reordering(sk, 0); delivered = tp->sacked_out - prior_sacked; if (delivered > 0) tcp_count_delivered(tp, delivered, ece_ack); tcp_verify_left_out(tp); } } /* Account for an ACK ACKing some data in the Reno Recovery phase. */ static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack) { struct tcp_sock *tp = tcp_sk(sk); if (acked > 0) { /* One ACK acked the hole. The rest eat duplicate ACKs. */ tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1), ece_ack); if (acked - 1 >= tp->sacked_out) tp->sacked_out = 0; else tp->sacked_out -= acked - 1; } tcp_check_reno_reordering(sk, acked); tcp_verify_left_out(tp); } static inline void tcp_reset_reno_sack(struct tcp_sock *tp) { tp->sacked_out = 0; } void tcp_clear_retrans(struct tcp_sock *tp) { tp->retrans_out = 0; tp->lost_out = 0; tp->undo_marker = 0; tp->undo_retrans = -1; tp->sacked_out = 0; tp->rto_stamp = 0; tp->total_rto = 0; tp->total_rto_recoveries = 0; tp->total_rto_time = 0; } static inline void tcp_init_undo(struct tcp_sock *tp) { tp->undo_marker = tp->snd_una; /* Retransmission still in flight may cause DSACKs later.
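* * In other words (summary ours, not from the source): undo_retrans is seeded below with every retransmission currently in flight; each D-SACK covering one of them decrements it, and reaching 0 means every retransmit has been proven spurious, so the cwnd reduction may be undone. -1 is the "nothing outstanding" sentinel.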
*/ /* First, account for regular retransmits in flight: */ tp->undo_retrans = tp->retrans_out; /* Next, account for TLP retransmits in flight: */ if (tp->tlp_high_seq && tp->tlp_retrans) tp->undo_retrans++; /* Finally, avoid 0, because undo_retrans==0 means "can undo now": */ if (!tp->undo_retrans) tp->undo_retrans = -1; } static bool tcp_is_rack(const struct sock *sk) { return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & TCP_RACK_LOSS_DETECTION; } /* If we detect SACK reneging, forget all SACK information * and reset tags completely, otherwise preserve SACKs. If receiver * dropped its ofo queue, we will know this due to reneging detection. */ static void tcp_timeout_mark_lost(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb, *head; bool is_reneg; /* is receiver reneging on SACKs? */ head = tcp_rtx_queue_head(sk); is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED); if (is_reneg) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); tp->sacked_out = 0; /* Mark SACK reneging until we recover from this loss event. */ tp->is_sack_reneg = 1; } else if (tcp_is_reno(tp)) { tcp_reset_reno_sack(tp); } skb = head; skb_rbtree_walk_from(skb) { if (is_reneg) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; else if (tcp_is_rack(sk) && skb != head && tcp_rack_skb_timeout(tp, skb, 0) > 0) continue; /* Don't mark recently sent ones lost yet */ tcp_mark_skb_lost(sk, skb); } tcp_verify_left_out(tp); tcp_clear_all_retrans_hints(tp); } /* Enter Loss state. */ void tcp_enter_loss(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; u8 reordering; tcp_timeout_mark_lost(sk); /* Reduce ssthresh if it has not yet been made inside this window. */ if (icsk->icsk_ca_state <= TCP_CA_Disorder || !after(tp->high_seq, tp->snd_una) || (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->prior_cwnd = tcp_snd_cwnd(tp); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_LOSS); tcp_init_undo(tp); } tcp_snd_cwnd_set(tp, tcp_packets_in_flight(tp) + 1); tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_jiffies32; /* Timeout in disordered state after receiving substantial DUPACKs * suggests that the degree of reordering is over-estimated. */ reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering); if (icsk->icsk_ca_state <= TCP_CA_Disorder && tp->sacked_out >= reordering) tp->reordering = min_t(unsigned int, tp->reordering, reordering); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; tp->tlp_high_seq = 0; tcp_ecn_queue_cwr(tp); /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous * loss recovery is underway except recurring timeout(s) on * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing */ tp->frto = READ_ONCE(net->ipv4.sysctl_tcp_frto) && (new_recovery || icsk->icsk_retransmits) && !inet_csk(sk)->icsk_mtup.probe_size; } /* If ACK arrived pointing to a remembered SACK, it means that our * remembered SACKs do not reflect real state of receiver i.e. * receiver _host_ is heavily congested (or buggy). * * To avoid big spurious retransmission bursts due to transient SACK * scoreboard oddities that look like reneging, we give the receiver a * little time (max(RTT/2, 10ms)) to send us some more ACKs that will * restore sanity to the SACK scoreboard. 
 If the apparent reneging
 * persists until this RTO then we'll clear the SACK scoreboard.
 */
static bool tcp_check_sack_reneging(struct sock *sk, int *ack_flag)
{
	if (*ack_flag & FLAG_SACK_RENEGING &&
	    *ack_flag & FLAG_SND_UNA_ADVANCED) {
		struct tcp_sock *tp = tcp_sk(sk);
		unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
					  msecs_to_jiffies(10));

		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, false);
		*ack_flag &= ~FLAG_SET_XMIT_TIMER;
		return true;
	}
	return false;
}

/* Heuristics to calculate number of duplicate ACKs. There's no dupACKs
 * counter when SACK is enabled (without SACK, sacked_out is used for
 * that purpose).
 *
 * With reordering, holes may still be in flight, so RFC3517 recovery
 * uses pure sacked_out (total number of SACKed segments) even though
 * it violates the RFC that uses duplicate ACKs. Often these are equal,
 * but when e.g. out-of-window ACKs or packet duplication occurs,
 * they differ. Since neither occurs due to loss, TCP should really
 * ignore them.
 */
static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
{
	return tp->sacked_out + 1;
}

/* Linux NewReno/SACK/ECN state machine.
 * --------------------------------------
 *
 * "Open"	Normal state, no dubious events, fast path.
 * "Disorder"	In all the respects it is "Open",
 *		but requires a bit more attention. It is entered when
 *		we see some SACKs or dupacks. It is split off "Open"
 *		mainly to move some processing from fast path to slow one.
 * "CWR"	CWND was reduced due to some Congestion Notification event.
 *		It can be ECN, ICMP source quench, local device congestion.
 * "Recovery"	CWND was reduced, we are fast-retransmitting.
 * "Loss"	CWND was reduced due to RTO timeout or SACK reneging.
 *
 * tcp_fastretrans_alert() is entered:
 * - each incoming ACK, if state is not "Open"
 * - when arrived ACK is unusual, namely:
 *	* SACK
 *	* Duplicate ACK.
 *	* ECN ECE.
 *
 * Counting packets in flight is pretty simple.
 *
 *	in_flight = packets_out - left_out + retrans_out
 *
 *	packets_out is SND.NXT-SND.UNA counted in packets.
 *
 *	retrans_out is number of retransmitted segments.
 *
 *	left_out is number of segments left network, but not ACKed yet.
 *
 *		left_out = sacked_out + lost_out
 *
 *	sacked_out: Packets, which arrived to receiver out of order
 *		and hence not ACKed. With SACKs this number is simply
 *		amount of SACKed data. Even without SACKs
 *		it is easy to give pretty reliable estimate of this number,
 *		counting duplicate ACKs.
 *
 *	lost_out: Packets lost by network. TCP has no explicit
 *		"loss notification" feedback from network (for now).
 *		It means that this number can be only _guessed_.
 *		Actually, it is the heuristics to predict lossage that
 *		distinguishes different algorithms.
 *
 *	F.e. after RTO, when all the queue is considered as lost,
 *	lost_out = packets_out and in_flight = retrans_out.
 *
 *	Essentially, we have now a few algorithms detecting
 *	lost packets.
 *
 *	If the receiver supports SACK:
 *
 *	RFC6675/3517: It is the conventional algorithm. A packet is
 *	considered lost if the number of higher sequence packets
 *	SACKed is greater than or equal to the DUPACK threshold
 *	(reordering). This is implemented in tcp_mark_head_lost and
 *	tcp_update_scoreboard.
 *
 *	RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
 *	(2017-) that checks timing instead of counting DUPACKs.
 *	Essentially a packet is considered lost if it's not S/ACKed
 *	after RTT + reordering_window, where both metrics are
 *	dynamically measured and adjusted. This is implemented in
 *	tcp_rack_mark_lost.
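 *
 *	(Example with assumed numbers, editor's note: with a measured
 *	RTT of 50 ms and a reordering window of 12 ms, RACK declares a
 *	packet lost if it is still un-S/ACKed roughly 62 ms after it
 *	was last (re)sent, with no DUPACK counting involved.)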
 *
 *	If the receiver does not support SACK:
 *
 *	NewReno (RFC6582): in Recovery we assume that one segment
 *	is lost (classic Reno). While we are in Recovery and
 *	a partial ACK arrives, we assume that one more packet
 *	is lost (NewReno). These heuristics are the same in NewReno
 *	and SACK.
 *
 * Really tricky (and requiring careful tuning) part of algorithm
 * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
 * The first determines the moment _when_ we should reduce CWND and,
 * hence, slow down forward transmission. In fact, it determines the moment
 * when we decide that hole is caused by loss, rather than by a reorder.
 *
 * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
 * holes, caused by lost packets.
 *
 * And the most logically complicated part of algorithm is undo
 * heuristics. We detect false retransmits due to both too early
 * fast retransmit (reordering) and underestimated RTO, analyzing
 * timestamps and D-SACKs. When we detect that some segments were
 * retransmitted by mistake and CWND reduction was wrong, we undo
 * window reduction and abort recovery phase. This logic is hidden
 * inside several functions named tcp_try_undo_<something>.
 */

/* This function decides, when we should leave Disordered state
 * and enter Recovery phase, reducing congestion window.
 *
 * Main question: may we further continue forward transmission
 * with the same cwnd?
 */
static bool tcp_time_to_recover(struct sock *sk, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Trick#1: The loss is proven. */
	if (tp->lost_out)
		return true;

	/* Not-A-Trick#2: Classic rule... */
	if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
		return true;

	return false;
}

/* Detect loss in event "A" above by marking head of queue up as lost.
 * For RFC3517 SACK, a segment is considered lost if it
 * has at least tp->reordering SACKed segments above it; "packets" refers to
 * the maximum SACKed segments to pass before reaching this limit.
 */
static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int cnt;
	/* Use SACK to deduce losses of new sequences sent during recovery */
	const u32 loss_high = tp->snd_nxt;

	WARN_ON(packets > tp->packets_out);
	skb = tp->lost_skb_hint;
	if (skb) {
		/* Head already handled? */
		if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
			return;
		cnt = tp->lost_cnt_hint;
	} else {
		skb = tcp_rtx_queue_head(sk);
		cnt = 0;
	}

	skb_rbtree_walk_from(skb) {
		/* TODO: do this better */
		/* this is not the most efficient way to do this...
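		 * (It is O(packets) in the worst case, but the
		 * lost_skb_hint/lost_cnt_hint pair saved below lets the
		 * next invocation resume the rbtree walk where this one
		 * stopped instead of rescanning from the queue head.)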
		 */
		tp->lost_skb_hint = skb;
		tp->lost_cnt_hint = cnt;

		if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
			break;

		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
			cnt += tcp_skb_pcount(skb);

		if (cnt > packets)
			break;

		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST))
			tcp_mark_skb_lost(sk, skb);

		if (mark_head)
			break;
	}
	tcp_verify_left_out(tp);
}

/* Account newly detected lost packet(s) */
static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tcp_is_sack(tp)) {
		int sacked_upto = tp->sacked_out - tp->reordering;

		if (sacked_upto >= 0)
			tcp_mark_head_lost(sk, sacked_upto, 0);
		else if (fast_rexmit)
			tcp_mark_head_lost(sk, 1, 1);
	}
}

static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
{
	return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
	       before(tp->rx_opt.rcv_tsecr, when);
}

/* skb is spurious retransmitted if the returned timestamp echo
 * reply is prior to the skb transmission time
 */
static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
				     const struct sk_buff *skb)
{
	return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
	       tcp_tsopt_ecr_before(tp, tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb));
}

/* Nothing was retransmitted or returned timestamp is less
 * than timestamp of the first retransmission.
 */
static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
{
	const struct sock *sk = (const struct sock *)tp;

	if (tp->retrans_stamp &&
	    tcp_tsopt_ecr_before(tp, tp->retrans_stamp))
		return true;	/* got echoed TS before first retransmission */

	/* Check if nothing was retransmitted (retrans_stamp==0), which may
	 * happen in fast recovery due to TSQ. But we ignore zero retrans_stamp
	 * in TCP_SYN_SENT, since when we set FLAG_SYN_ACKED we also clear
	 * retrans_stamp even if we had retransmitted the SYN.
	 */
	if (!tp->retrans_stamp &&	  /* no record of a retransmit/SYN? */
	    sk->sk_state != TCP_SYN_SENT) /* not the FLAG_SYN_ACKED case? */
		return true;		  /* nothing was retransmitted */

	return false;
}

/* Undo procedures. */

/* We can clear retrans_stamp when there are no retransmissions in the
 * window. It would seem that it is trivially available for us in
 * tp->retrans_out, however, that kind of assumption doesn't consider
 * what will happen if errors occur when sending retransmission for the
 * second time. ...It could be that such a segment has only
 * TCPCB_EVER_RETRANS set at the present time. It seems that checking
 * the head skb is enough except for some reneging corner cases that
 * are not worth the effort.
 *
 * Main reason for all this complexity is the fact that connection dying
 * time now depends on the validity of the retrans_stamp, in particular,
 * that successive retransmissions of a segment must not advance
 * retrans_stamp under any conditions.
 */
static bool tcp_any_retrans_done(const struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	if (tp->retrans_out)
		return true;

	skb = tcp_rtx_queue_head(sk);
	if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
		return true;

	return false;
}

/* If loss recovery is finished and there are no retransmits out in the
 * network, then we clear retrans_stamp so that upon the next loss recovery
 * retransmits_timed_out() and timestamp-undo are using the correct value.
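 * (Example, editor's note: if retrans_stamp were left stale, a later
 * RTO episode would measure its time spent retransmitting from the
 * previous recovery's first retransmit and could declare ETIMEDOUT far
 * too early; zeroing it here makes the next episode start its own
 * clock.)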
*/ static void tcp_retrans_stamp_cleanup(struct sock *sk) { if (!tcp_any_retrans_done(sk)) tcp_sk(sk)->retrans_stamp = 0; } static void DBGUNDO(struct sock *sk, const char *msg) { #if FASTRETRANS_DEBUG > 1 struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); if (sk->sk_family == AF_INET) { pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", msg, &inet->inet_daddr, ntohs(inet->inet_dport), tcp_snd_cwnd(tp), tcp_left_out(tp), tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", msg, &sk->sk_v6_daddr, ntohs(inet->inet_dport), tcp_snd_cwnd(tp), tcp_left_out(tp), tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); } #endif #endif } static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) { struct tcp_sock *tp = tcp_sk(sk); if (unmark_loss) { struct sk_buff *skb; skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } tp->lost_out = 0; tcp_clear_all_retrans_hints(tp); } if (tp->prior_ssthresh) { const struct inet_connection_sock *icsk = inet_csk(sk); tcp_snd_cwnd_set(tp, icsk->icsk_ca_ops->undo_cwnd(sk)); if (tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; tcp_ecn_withdraw_cwr(tp); } } tp->snd_cwnd_stamp = tcp_jiffies32; tp->undo_marker = 0; tp->rack.advanced = 1; /* Force RACK to re-exam losses */ } static inline bool tcp_may_undo(const struct tcp_sock *tp) { return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); } static bool tcp_is_non_sack_preventing_reopen(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { /* Hold old state until something *above* high_seq * is ACKed. For Reno it is MUST to prevent false * fast retransmits (RFC2582). SACK TCP is safe. */ if (!tcp_any_retrans_done(sk)) tp->retrans_stamp = 0; return true; } return false; } /* People celebrate: "We love our President!" */ static bool tcp_try_undo_recovery(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (tcp_may_undo(tp)) { int mib_idx; /* Happy end! We did not retransmit anything * or our original transmission succeeded. */ DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); tcp_undo_cwnd_reduction(sk, false); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) mib_idx = LINUX_MIB_TCPLOSSUNDO; else mib_idx = LINUX_MIB_TCPFULLUNDO; NET_INC_STATS(sock_net(sk), mib_idx); } else if (tp->rack.reo_wnd_persist) { tp->rack.reo_wnd_persist--; } if (tcp_is_non_sack_preventing_reopen(sk)) return true; tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; return false; } /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ static bool tcp_try_undo_dsack(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (tp->undo_marker && !tp->undo_retrans) { tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH, tp->rack.reo_wnd_persist + 1); DBGUNDO(sk, "D-SACK"); tcp_undo_cwnd_reduction(sk, false); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); return true; } return false; } /* Undo during loss recovery after partial ACK or using F-RTO. 
*/ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) { struct tcp_sock *tp = tcp_sk(sk); if (frto_undo || tcp_may_undo(tp)) { tcp_undo_cwnd_reduction(sk, true); DBGUNDO(sk, "partial loss"); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); if (frto_undo) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); inet_csk(sk)->icsk_retransmits = 0; if (tcp_is_non_sack_preventing_reopen(sk)) return true; if (frto_undo || tcp_is_sack(tp)) { tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; } return true; } return false; } /* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937. * It computes the number of packets to send (sndcnt) based on packets newly * delivered: * 1) If the packets in flight is larger than ssthresh, PRR spreads the * cwnd reductions across a full RTT. * 2) Otherwise PRR uses packet conservation to send as much as delivered. * But when SND_UNA is acked without further losses, * slow starts cwnd up to ssthresh to speed up the recovery. */ static void tcp_init_cwnd_reduction(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); tp->high_seq = tp->snd_nxt; tp->tlp_high_seq = 0; tp->snd_cwnd_cnt = 0; tp->prior_cwnd = tcp_snd_cwnd(tp); tp->prr_delivered = 0; tp->prr_out = 0; tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); tcp_ecn_queue_cwr(tp); } void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag) { struct tcp_sock *tp = tcp_sk(sk); int sndcnt = 0; int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp); if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd)) return; trace_tcp_cwnd_reduction_tp(sk, newly_acked_sacked, newly_lost, flag); tp->prr_delivered += newly_acked_sacked; if (delta < 0) { u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + tp->prior_cwnd - 1; sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; } else { sndcnt = max_t(int, tp->prr_delivered - tp->prr_out, newly_acked_sacked); if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost) sndcnt++; sndcnt = min(delta, sndcnt); } /* Force a fast retransmit upon entering fast recovery */ sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1)); tcp_snd_cwnd_set(tp, tcp_packets_in_flight(tp) + sndcnt); } static inline void tcp_end_cwnd_reduction(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (inet_csk(sk)->icsk_ca_ops->cong_control) return; /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH && (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) { tcp_snd_cwnd_set(tp, tp->snd_ssthresh); tp->snd_cwnd_stamp = tcp_jiffies32; } tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } /* Enter CWR state. 
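 * (Editor's note on the PRR machinery above, with assumed numbers:
 * prior_cwnd = 10 and ssthresh = 5 make tcp_cwnd_reduction() grant
 * roughly sndcnt = 5 * prr_delivered / 10 - prr_out sends per ACK, so
 * cwnd glides from 10 down to 5 over about one RTT instead of dropping
 * in a single step.)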
Disable cwnd undo since congestion is proven with ECN */ void tcp_enter_cwr(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); tp->prior_ssthresh = 0; if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { tp->undo_marker = 0; tcp_init_cwnd_reduction(sk); tcp_set_ca_state(sk, TCP_CA_CWR); } } EXPORT_SYMBOL(tcp_enter_cwr); static void tcp_try_keep_open(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int state = TCP_CA_Open; if (tcp_left_out(tp) || tcp_any_retrans_done(sk)) state = TCP_CA_Disorder; if (inet_csk(sk)->icsk_ca_state != state) { tcp_set_ca_state(sk, state); tp->high_seq = tp->snd_nxt; } } static void tcp_try_to_open(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); tcp_verify_left_out(tp); if (!tcp_any_retrans_done(sk)) tp->retrans_stamp = 0; if (flag & FLAG_ECE) tcp_enter_cwr(sk); if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); } } static void tcp_mtup_probe_failed(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1; icsk->icsk_mtup.probe_size = 0; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL); } static void tcp_mtup_probe_success(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); u64 val; tp->prior_ssthresh = tcp_current_ssthresh(sk); val = (u64)tcp_snd_cwnd(tp) * tcp_mss_to_mtu(sk, tp->mss_cache); do_div(val, icsk->icsk_mtup.probe_size); DEBUG_NET_WARN_ON_ONCE((u32)val != val); tcp_snd_cwnd_set(tp, max_t(u32, 1U, val)); tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_jiffies32; tp->snd_ssthresh = tcp_current_ssthresh(sk); icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; icsk->icsk_mtup.probe_size = 0; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS); } /* Sometimes we deduce that packets have been dropped due to reasons other than * congestion, like path MTU reductions or failed client TFO attempts. In these * cases we call this function to retransmit as many packets as cwnd allows, * without reducing cwnd. Given that retransmits will set retrans_stamp to a * non-zero value (and may do so in a later calling context due to TSQ), we * also enter CA_Loss so that we track when all retransmitted packets are ACKed * and clear retrans_stamp when that happens (to ensure later recurring RTOs * are using the correct retrans_stamp and don't declare ETIMEDOUT * prematurely). */ static void tcp_non_congestion_loss_retransmit(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); if (icsk->icsk_ca_state != TCP_CA_Loss) { tp->high_seq = tp->snd_nxt; tp->snd_ssthresh = tcp_current_ssthresh(sk); tp->prior_ssthresh = 0; tp->undo_marker = 0; tcp_set_ca_state(sk, TCP_CA_Loss); } tcp_xmit_retransmit_queue(sk); } /* Do a simple retransmit without using the backoff mechanisms in * tcp_timer. This is used for path mtu discovery. * The socket is already locked here. */ void tcp_simple_retransmit(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int mss; /* A fastopen SYN request is stored as two separate packets within * the retransmit queue, this is done by tcp_send_syn_data(). * As a result simply checking the MSS of the frames in the queue * will not work for the SYN packet. * * Us being here is an indication of a path MTU issue so we can * assume that the fastopen SYN was lost and just mark all the * frames in the retransmit queue as lost. 
 We will use an MSS of
	 * -1 to mark all frames as lost, otherwise compute the current MSS.
	 */
	if (tp->syn_data && sk->sk_state == TCP_SYN_SENT)
		mss = -1;
	else
		mss = tcp_current_mss(sk);

	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
		if (tcp_skb_seglen(skb) > mss)
			tcp_mark_skb_lost(sk, skb);
	}

	tcp_clear_retrans_hints_partial(tp);

	if (!tp->lost_out)
		return;

	if (tcp_is_reno(tp))
		tcp_limit_reno_sacked(tp);

	tcp_verify_left_out(tp);

	/* Don't muck with the congestion window here.
	 * Reason is that we do not increase amount of _data_
	 * in network, but units changed and effective
	 * cwnd/ssthresh really reduced now.
	 */
	tcp_non_congestion_loss_retransmit(sk);
}
EXPORT_IPV6_MOD(tcp_simple_retransmit);

void tcp_enter_recovery(struct sock *sk, bool ece_ack)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int mib_idx;

	/* Start the clock with our fast retransmit, for undo and ETIMEDOUT. */
	tcp_retrans_stamp_cleanup(sk);

	if (tcp_is_reno(tp))
		mib_idx = LINUX_MIB_TCPRENORECOVERY;
	else
		mib_idx = LINUX_MIB_TCPSACKRECOVERY;

	NET_INC_STATS(sock_net(sk), mib_idx);

	tp->prior_ssthresh = 0;
	tcp_init_undo(tp);

	if (!tcp_in_cwnd_reduction(sk)) {
		if (!ece_ack)
			tp->prior_ssthresh = tcp_current_ssthresh(sk);
		tcp_init_cwnd_reduction(sk);
	}
	tcp_set_ca_state(sk, TCP_CA_Recovery);
}

static void tcp_update_rto_time(struct tcp_sock *tp)
{
	if (tp->rto_stamp) {
		tp->total_rto_time += tcp_time_stamp_ms(tp) - tp->rto_stamp;
		tp->rto_stamp = 0;
	}
}

/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
 * recovered or spurious. Otherwise retransmits more on partial ACKs.
 */
static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
			     int *rexmit)
{
	struct tcp_sock *tp = tcp_sk(sk);
	bool recovered = !before(tp->snd_una, tp->high_seq);

	if ((flag & FLAG_SND_UNA_ADVANCED ||
	     rcu_access_pointer(tp->fastopen_rsk)) &&
	    tcp_try_undo_loss(sk, false))
		return;

	if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
		/* Step 3.b. A timeout is spurious if not all data are
		 * lost, i.e., never-retransmitted data are (s)acked.
		 */
		if ((flag & FLAG_ORIG_SACK_ACKED) &&
		    tcp_try_undo_loss(sk, true))
			return;

		if (after(tp->snd_nxt, tp->high_seq)) {
			if (flag & FLAG_DATA_SACKED || num_dupack)
				tp->frto = 0; /* Step 3.a. loss was real */
		} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
			tp->high_seq = tp->snd_nxt;
			/* Step 2.b. Try send new data (but deferred until cwnd
			 * is updated in tcp_ack()). Otherwise fall back to
			 * the conventional recovery.
			 */
			if (!tcp_write_queue_empty(sk) &&
			    after(tcp_wnd_end(tp), tp->snd_nxt)) {
				*rexmit = REXMIT_NEW;
				return;
			}
			tp->frto = 0;
		}
	}

	if (recovered) {
		/* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
		tcp_try_undo_recovery(sk);
		return;
	}
	if (tcp_is_reno(tp)) {
		/* A Reno DUPACK means new data in F-RTO step 2.b above are
		 * delivered. Lower inflight to clock out (re)transmissions.
		 */
		if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
			tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE);
		else if (flag & FLAG_SND_UNA_ADVANCED)
			tcp_reset_reno_sack(tp);
	}
	*rexmit = REXMIT_LOST;
}

static bool tcp_force_fast_retransmit(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	return after(tcp_highest_sack_seq(tp),
		     tp->snd_una + tp->reordering * tp->mss_cache);
}

/* Undo during fast recovery after partial ACK. */
static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una,
				 bool *do_lost)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->undo_marker && tcp_packet_delayed(tp)) {
		/* Plain luck! Hole is filled with delayed
		 * packet, rather than with a retransmit. Check reordering.
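		 * (That is, tcp_packet_delayed() saw a timestamp echo
		 * older than retrans_stamp, proving snd_una advanced
		 * thanks to the original, delayed skb rather than the
		 * retransmit.)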
*/ tcp_check_sack_reordering(sk, prior_snd_una, 1); /* We are getting evidence that the reordering degree is higher * than we realized. If there are no retransmits out then we * can undo. Otherwise we clock out new packets but do not * mark more packets lost or retransmit more. */ if (tp->retrans_out) return true; if (!tcp_any_retrans_done(sk)) tp->retrans_stamp = 0; DBGUNDO(sk, "partial recovery"); tcp_undo_cwnd_reduction(sk, true); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); tcp_try_keep_open(sk); } else { /* Partial ACK arrived. Force fast retransmit. */ *do_lost = tcp_force_fast_retransmit(sk); } return false; } static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag) { struct tcp_sock *tp = tcp_sk(sk); if (tcp_rtx_queue_empty(sk)) return; if (unlikely(tcp_is_reno(tp))) { tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED); } else if (tcp_is_rack(sk)) { u32 prior_retrans = tp->retrans_out; if (tcp_rack_mark_lost(sk)) *ack_flag &= ~FLAG_SET_XMIT_TIMER; if (prior_retrans > tp->retrans_out) *ack_flag |= FLAG_LOST_RETRANS; } } /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and * packets lost by network. * * Besides that it updates the congestion state when packet loss or ECN * is detected. But it does not reduce the cwnd, it is done by the * congestion control later. * * It does _not_ decide what to send, it is made in function * tcp_xmit_retransmit_queue(). */ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, int num_dupack, int *ack_flag, int *rexmit) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int fast_rexmit = 0, flag = *ack_flag; bool ece_ack = flag & FLAG_ECE; bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) && tcp_force_fast_retransmit(sk)); if (!tp->packets_out && tp->sacked_out) tp->sacked_out = 0; /* Now state machine starts. * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ if (ece_ack) tp->prior_ssthresh = 0; /* B. In all the states check for reneging SACKs. */ if (tcp_check_sack_reneging(sk, ack_flag)) return; /* C. Check consistency of the current state. */ tcp_verify_left_out(tp); /* D. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ if (icsk->icsk_ca_state == TCP_CA_Open) { WARN_ON(tp->retrans_out != 0 && !tp->syn_data); tp->retrans_stamp = 0; } else if (!before(tp->snd_una, tp->high_seq)) { switch (icsk->icsk_ca_state) { case TCP_CA_CWR: /* CWR is to be held something *above* high_seq * is ACKed for CWR bit to reach receiver. */ if (tp->snd_una != tp->high_seq) { tcp_end_cwnd_reduction(sk); tcp_set_ca_state(sk, TCP_CA_Open); } break; case TCP_CA_Recovery: if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); if (tcp_try_undo_recovery(sk)) return; tcp_end_cwnd_reduction(sk); break; } } /* E. Process state. */ switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: if (!(flag & FLAG_SND_UNA_ADVANCED)) { if (tcp_is_reno(tp)) tcp_add_reno_sack(sk, num_dupack, ece_ack); } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost)) return; if (tcp_try_undo_dsack(sk)) tcp_try_to_open(sk, flag); tcp_identify_packet_loss(sk, ack_flag); if (icsk->icsk_ca_state != TCP_CA_Recovery) { if (!tcp_time_to_recover(sk, flag)) return; /* Undo reverts the recovery state. If loss is evident, * starts a new recovery (e.g. 
reordering then loss); */ tcp_enter_recovery(sk, ece_ack); } break; case TCP_CA_Loss: tcp_process_loss(sk, flag, num_dupack, rexmit); if (icsk->icsk_ca_state != TCP_CA_Loss) tcp_update_rto_time(tp); tcp_identify_packet_loss(sk, ack_flag); if (!(icsk->icsk_ca_state == TCP_CA_Open || (*ack_flag & FLAG_LOST_RETRANS))) return; /* Change state if cwnd is undone or retransmits are lost */ fallthrough; default: if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); tcp_add_reno_sack(sk, num_dupack, ece_ack); } if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); tcp_identify_packet_loss(sk, ack_flag); if (!tcp_time_to_recover(sk, flag)) { tcp_try_to_open(sk, flag); return; } /* MTU probe failure: don't reduce cwnd */ if (icsk->icsk_ca_state < TCP_CA_CWR && icsk->icsk_mtup.probe_size && tp->snd_una == tp->mtu_probe.probe_seq_start) { tcp_mtup_probe_failed(sk); /* Restores the reduction we did in tcp_mtup_probe() */ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); tcp_simple_retransmit(sk); return; } /* Otherwise enter Recovery state */ tcp_enter_recovery(sk, ece_ack); fast_rexmit = 1; } if (!tcp_is_rack(sk) && do_lost) tcp_update_scoreboard(sk, fast_rexmit); *rexmit = REXMIT_LOST; } static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag) { u32 wlen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen) * HZ; struct tcp_sock *tp = tcp_sk(sk); if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) { /* If the remote keeps returning delayed ACKs, eventually * the min filter would pick it up and overestimate the * prop. delay when it expires. Skip suspected delayed ACKs. */ return; } minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32, rtt_us ? : jiffies_to_usecs(1)); } static bool tcp_ack_update_rtt(struct sock *sk, const int flag, long seq_rtt_us, long sack_rtt_us, long ca_rtt_us, struct rate_sample *rs) { const struct tcp_sock *tp = tcp_sk(sk); /* Prefer RTT measured from ACK's timing to TS-ECR. This is because * broken middle-boxes or peers may corrupt TS-ECR fields. But * Karn's algorithm forbids taking RTT if some retransmitted data * is acked (RFC6298). */ if (seq_rtt_us < 0) seq_rtt_us = sack_rtt_us; /* RTTM Rule: A TSecr value received in a segment is used to * update the averaged RTT measurement only if the segment * acknowledges some new data, i.e., only if it advances the * left edge of the send window. * See draft-ietf-tcplw-high-performance-00, section 3.3. */ if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED) seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp, 1); rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */ if (seq_rtt_us < 0) return false; /* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is * always taken together with ACK, SACK, or TS-opts. Any negative * values will be skipped with the seq_rtt_us < 0 check above. */ tcp_update_rtt_min(sk, ca_rtt_us, flag); tcp_rtt_estimator(sk, seq_rtt_us); tcp_set_rto(sk); /* RFC6298: only reset backoff on valid RTT measurement. */ inet_csk(sk)->icsk_backoff = 0; return true; } /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. 
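 * (Karn's rule applies here too, editor's note: the sample below is
 * taken only when req->num_retrans is zero, so an RTT measured across
 * a retransmitted SYNACK never pollutes the estimator.)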
*/ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) { struct rate_sample rs; long rtt_us = -1L; if (req && !req->num_retrans && tcp_rsk(req)->snt_synack) rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack); tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us, &rs); } static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { const struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ca_ops->cong_avoid(sk, ack, acked); tcp_sk(sk)->snd_cwnd_stamp = tcp_jiffies32; } /* Restart timer after forward progress on connection. * RFC2988 recommends to restart timer to now+rto. */ void tcp_rearm_rto(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); /* If the retrans timer is currently being used by Fast Open * for SYN-ACK retrans purpose, stay put. */ if (rcu_access_pointer(tp->fastopen_rsk)) return; if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else { u32 rto = inet_csk(sk)->icsk_rto; /* Offset the time elapsed after installing regular RTO */ if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { s64 delta_us = tcp_rto_delta_us(sk); /* delta_us may not be positive if the socket is locked * when the retrans timer fires and is rescheduled. */ rto = usecs_to_jiffies(max_t(int, delta_us, 1)); } tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, true); } } /* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */ static void tcp_set_xmit_timer(struct sock *sk) { if (!tcp_schedule_loss_probe(sk, true)) tcp_rearm_rto(sk); } /* If we get here, the whole TSO packet has not been acked. */ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); u32 packets_acked; BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)); packets_acked = tcp_skb_pcount(skb); if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) return 0; packets_acked -= tcp_skb_pcount(skb); if (packets_acked) { BUG_ON(tcp_skb_pcount(skb) == 0); BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)); } return packets_acked; } static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, const struct sk_buff *ack_skb, u32 prior_snd_una) { const struct skb_shared_info *shinfo; /* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */ if (likely(!TCP_SKB_CB(skb)->txstamp_ack)) return; shinfo = skb_shinfo(skb); if (!before(shinfo->tskey, prior_snd_una) && before(shinfo->tskey, tcp_sk(sk)->snd_una)) { tcp_skb_tsorted_save(skb) { __skb_tstamp_tx(skb, ack_skb, NULL, sk, SCM_TSTAMP_ACK); } tcp_skb_tsorted_restore(skb); } } /* Remove acknowledged frames from the retransmission queue. If our packet * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. 
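 * (A TSO skb can straddle snd_una; in that case tcp_tso_acked() above
 * trims the acked head, reports how many packets that freed, and the
 * remainder stays on the rtx queue as a partially-acked skb.)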
*/ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, u32 prior_fack, u32 prior_snd_una, struct tcp_sacktag_state *sack, bool ece_ack) { const struct inet_connection_sock *icsk = inet_csk(sk); u64 first_ackt, last_ackt; struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */ struct sk_buff *skb, *next; bool fully_acked = true; long sack_rtt_us = -1L; long seq_rtt_us = -1L; long ca_rtt_us = -1L; u32 pkts_acked = 0; bool rtt_update; int flag = 0; first_ackt = 0; for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); const u32 start_seq = scb->seq; u8 sacked = scb->sacked; u32 acked_pcount; /* Determine how many packets and what bytes were acked, tso and else */ if (after(scb->end_seq, tp->snd_una)) { if (tcp_skb_pcount(skb) == 1 || !after(tp->snd_una, scb->seq)) break; acked_pcount = tcp_tso_acked(sk, skb); if (!acked_pcount) break; fully_acked = false; } else { acked_pcount = tcp_skb_pcount(skb); } if (unlikely(sacked & TCPCB_RETRANS)) { if (sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= acked_pcount; flag |= FLAG_RETRANS_DATA_ACKED; } else if (!(sacked & TCPCB_SACKED_ACKED)) { last_ackt = tcp_skb_timestamp_us(skb); WARN_ON_ONCE(last_ackt == 0); if (!first_ackt) first_ackt = last_ackt; if (before(start_seq, reord)) reord = start_seq; if (!after(scb->end_seq, tp->high_seq)) flag |= FLAG_ORIG_SACK_ACKED; } if (sacked & TCPCB_SACKED_ACKED) { tp->sacked_out -= acked_pcount; } else if (tcp_is_sack(tp)) { tcp_count_delivered(tp, acked_pcount, ece_ack); if (!tcp_skb_spurious_retrans(tp, skb)) tcp_rack_advance(tp, sacked, scb->end_seq, tcp_skb_timestamp_us(skb)); } if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; tp->packets_out -= acked_pcount; pkts_acked += acked_pcount; tcp_rate_skb_delivered(sk, skb, sack->rate); /* Initial outgoing SYN's get put onto the write_queue * just like anything else we transmit. It is not * true data, and if we misinform our callers that * this ACK acks real data, we will erroneously exit * connection startup slow start one packet too * quickly. This is severely frowned upon behavior. */ if (likely(!(scb->tcp_flags & TCPHDR_SYN))) { flag |= FLAG_DATA_ACKED; } else { flag |= FLAG_SYN_ACKED; tp->retrans_stamp = 0; } if (!fully_acked) break; tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una); next = skb_rb_next(skb); if (unlikely(skb == tp->retransmit_skb_hint)) tp->retransmit_skb_hint = NULL; if (unlikely(skb == tp->lost_skb_hint)) tp->lost_skb_hint = NULL; tcp_highest_sack_replace(sk, skb, next); tcp_rtx_queue_unlink_and_free(skb, sk); } if (!skb) tcp_chrono_stop(sk, TCP_CHRONO_BUSY); if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) tp->snd_up = tp->snd_una; if (skb) { tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una); if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) flag |= FLAG_SACK_RENEGING; } if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt); if (pkts_acked == 1 && fully_acked && !prior_sacked && (tp->snd_una - prior_snd_una) < tp->mss_cache && sack->rate->prior_delivered + 1 == tp->delivered && !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) { /* Conservatively mark a delayed ACK. It's typically * from a lone runt packet over the round trip to * a receiver w/o out-of-order or CE events. 
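			 * (Every guard above points at one sub-MSS
			 * segment whose ACK was the round trip's only
			 * delivery; such a sample likely includes the
			 * peer's delayed-ACK timer, and
			 * tcp_update_rtt_min() will ignore it for
			 * min-RTT tracking.)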
			 */
			flag |= FLAG_ACK_MAYBE_DELAYED;
		}
	}
	if (sack->first_sackt) {
		sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt);
		ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt);
	}
	rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
					ca_rtt_us, sack->rate);

	if (flag & FLAG_ACKED) {
		flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */
		if (unlikely(icsk->icsk_mtup.probe_size &&
			     !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
			tcp_mtup_probe_success(sk);
		}

		if (tcp_is_reno(tp)) {
			tcp_remove_reno_sacks(sk, pkts_acked, ece_ack);

			/* If any of the cumulatively ACKed segments was
			 * retransmitted, non-SACK case cannot confirm that
			 * progress was due to original transmission due to
			 * lack of TCPCB_SACKED_ACKED bits even if some of
			 * the packets may have been never retransmitted.
			 */
			if (flag & FLAG_RETRANS_DATA_ACKED)
				flag &= ~FLAG_ORIG_SACK_ACKED;
		} else {
			int delta;

			/* Non-retransmitted hole got filled? That's reordering */
			if (before(reord, prior_fack))
				tcp_check_sack_reordering(sk, reord, 0);

			delta = prior_sacked - tp->sacked_out;
			tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
		}
	} else if (skb && rtt_update && sack_rtt_us >= 0 &&
		   sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
						    tcp_skb_timestamp_us(skb))) {
		/* Do not re-arm RTO if the sack RTT is measured from data sent
		 * after when the head was last (re)transmitted. Otherwise the
		 * timeout may continue to extend in loss recovery.
		 */
		flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */
	}

	if (icsk->icsk_ca_ops->pkts_acked) {
		struct ack_sample sample = { .pkts_acked = pkts_acked,
					     .rtt_us = sack->rate->rtt_us };

		sample.in_flight = tp->mss_cache *
			(tp->delivered - sack->rate->prior_delivered);
		icsk->icsk_ca_ops->pkts_acked(sk, &sample);
	}

#if FASTRETRANS_DEBUG > 0
	WARN_ON((int)tp->sacked_out < 0);
	WARN_ON((int)tp->lost_out < 0);
	WARN_ON((int)tp->retrans_out < 0);
	if (!tp->packets_out && tcp_is_sack(tp)) {
		icsk = inet_csk(sk);
		if (tp->lost_out) {
			pr_debug("Leak l=%u %d\n",
				 tp->lost_out, icsk->icsk_ca_state);
			tp->lost_out = 0;
		}
		if (tp->sacked_out) {
			pr_debug("Leak s=%u %d\n",
				 tp->sacked_out, icsk->icsk_ca_state);
			tp->sacked_out = 0;
		}
		if (tp->retrans_out) {
			pr_debug("Leak r=%u %d\n",
				 tp->retrans_out, icsk->icsk_ca_state);
			tp->retrans_out = 0;
		}
	}
#endif
	return flag;
}

static void tcp_ack_probe(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct sk_buff *head = tcp_send_head(sk);
	const struct tcp_sock *tp = tcp_sk(sk);

	/* Is there a usable window open? */
	if (!head)
		return;
	if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
		icsk->icsk_backoff = 0;
		icsk->icsk_probes_tstamp = 0;
		inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
		/* Socket must be waked up by subsequent tcp_data_snd_check().
		 * This function is not for random use!
		 */
	} else {
		unsigned long when = tcp_probe0_when(sk, tcp_rto_max(sk));

		when = tcp_clamp_probe0_to_user_timeout(sk, when);
		tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, true);
	}
}

static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
{
	return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
	       inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
}

/* Decide whether to run the increase function of congestion control. */
static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
	/* If reordering is high then always grow cwnd whenever data is
	 * delivered regardless of its ordering. Otherwise stay conservative
	 * and only grow cwnd on in-order delivery (RFC5681).
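	 * (Concretely: once tp->reordering exceeds the
	 * sysctl_tcp_reordering default of 3, cwnd may grow on
	 * FLAG_FORWARD_PROGRESS alone, i.e. on SACKed as well as
	 * cumulatively ACKed data.)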
A stretched ACK w/ * new SACK or ECE mark may first advance cwnd here and later reduce * cwnd in tcp_fastretrans_alert() based on more states. */ if (tcp_sk(sk)->reordering > READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering)) return flag & FLAG_FORWARD_PROGRESS; return flag & FLAG_DATA_ACKED; } /* The "ultimate" congestion control function that aims to replace the rigid * cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction). * It's called toward the end of processing an ACK with precise rate * information. All transmission or retransmission are delayed afterwards. */ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, int flag, const struct rate_sample *rs) { const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->cong_control) { icsk->icsk_ca_ops->cong_control(sk, ack, flag, rs); return; } if (tcp_in_cwnd_reduction(sk)) { /* Reduce cwnd if state mandates */ tcp_cwnd_reduction(sk, acked_sacked, rs->losses, flag); } else if (tcp_may_raise_cwnd(sk, flag)) { /* Advance cwnd if state allows */ tcp_cong_avoid(sk, ack, acked_sacked); } tcp_update_pacing_rate(sk); } /* Check that window update is acceptable. * The function assumes that snd_una<=ack<=snd_next. */ static inline bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, const u32 ack_seq, const u32 nwin) { return after(ack, tp->snd_una) || after(ack_seq, tp->snd_wl1) || (ack_seq == tp->snd_wl1 && (nwin > tp->snd_wnd || !nwin)); } static void tcp_snd_sne_update(struct tcp_sock *tp, u32 ack) { #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao; if (!static_branch_unlikely(&tcp_ao_needed.key)) return; ao = rcu_dereference_protected(tp->ao_info, lockdep_sock_is_held((struct sock *)tp)); if (ao && ack < tp->snd_una) { ao->snd_sne++; trace_tcp_ao_snd_sne_update((struct sock *)tp, ao->snd_sne); } #endif } /* If we update tp->snd_una, also update tp->bytes_acked */ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) { u32 delta = ack - tp->snd_una; sock_owned_by_me((struct sock *)tp); tp->bytes_acked += delta; tcp_snd_sne_update(tp, ack); tp->snd_una = ack; } static void tcp_rcv_sne_update(struct tcp_sock *tp, u32 seq) { #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao; if (!static_branch_unlikely(&tcp_ao_needed.key)) return; ao = rcu_dereference_protected(tp->ao_info, lockdep_sock_is_held((struct sock *)tp)); if (ao && seq < tp->rcv_nxt) { ao->rcv_sne++; trace_tcp_ao_rcv_sne_update((struct sock *)tp, ao->rcv_sne); } #endif } /* If we update tp->rcv_nxt, also update tp->bytes_received */ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) { u32 delta = seq - tp->rcv_nxt; sock_owned_by_me((struct sock *)tp); tp->bytes_received += delta; tcp_rcv_sne_update(tp, seq); WRITE_ONCE(tp->rcv_nxt, seq); } /* Update our send window. * * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 * and in FreeBSD. NetBSD's one is even worse.) is wrong. */ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack, u32 ack_seq) { struct tcp_sock *tp = tcp_sk(sk); int flag = 0; u32 nwin = ntohs(tcp_hdr(skb)->window); if (likely(!tcp_hdr(skb)->syn)) nwin <<= tp->rx_opt.snd_wscale; if (tcp_may_update_window(tp, ack, ack_seq, nwin)) { flag |= FLAG_WIN_UPDATE; tcp_update_wl(tp, ack_seq); if (tp->snd_wnd != nwin) { tp->snd_wnd = nwin; /* Note, it is the only place, where * fast path is recovered for sending TCP. 
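			 * (Zeroing tp->pred_flags below disables header
			 * prediction until tcp_fast_path_check() re-arms
			 * it, forcing slow-path processing while the
			 * window is in flux.)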
*/ tp->pred_flags = 0; tcp_fast_path_check(sk); if (!tcp_write_queue_empty(sk)) tcp_slow_start_after_idle_check(sk); if (nwin > tp->max_window) { tp->max_window = nwin; tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie); } } } tcp_snd_una_update(tp, ack); return flag; } static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, u32 *last_oow_ack_time) { /* Paired with the WRITE_ONCE() in this function. */ u32 val = READ_ONCE(*last_oow_ack_time); if (val) { s32 elapsed = (s32)(tcp_jiffies32 - val); if (0 <= elapsed && elapsed < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) { NET_INC_STATS(net, mib_idx); return true; /* rate-limited: don't send yet! */ } } /* Paired with the prior READ_ONCE() and with itself, * as we might be lockless. */ WRITE_ONCE(*last_oow_ack_time, tcp_jiffies32); return false; /* not rate-limited: go ahead, send dupack now! */ } /* Return true if we're currently rate-limiting out-of-window ACKs and * thus shouldn't send a dupack right now. We rate-limit dupacks in * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS * attacks that send repeated SYNs or ACKs for the same connection. To * do this, we do not send a duplicate SYNACK or ACK if the remote * endpoint is sending out-of-window SYNs or pure ACKs at a high rate. */ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, int mib_idx, u32 *last_oow_ack_time) { /* Data packets without SYNs are not likely part of an ACK loop. */ if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) && !tcp_hdr(skb)->syn) return false; return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time); } /* RFC 5961 7 [ACK Throttling] */ static void tcp_send_challenge_ack(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); u32 count, now, ack_limit; /* First check our per-socket dupack rate limit. */ if (__tcp_oow_rate_limited(net, LINUX_MIB_TCPACKSKIPPEDCHALLENGE, &tp->last_oow_ack_time)) return; ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit); if (ack_limit == INT_MAX) goto send_ack; /* Then check host-wide RFC 5961 rate limit. */ now = jiffies / HZ; if (now != READ_ONCE(net->ipv4.tcp_challenge_timestamp)) { u32 half = (ack_limit + 1) >> 1; WRITE_ONCE(net->ipv4.tcp_challenge_timestamp, now); WRITE_ONCE(net->ipv4.tcp_challenge_count, get_random_u32_inclusive(half, ack_limit + half - 1)); } count = READ_ONCE(net->ipv4.tcp_challenge_count); if (count > 0) { WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1); send_ack: NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK); tcp_send_ack(sk); } } static void tcp_store_ts_recent(struct tcp_sock *tp) { tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; tp->rx_opt.ts_recent_stamp = ktime_get_seconds(); } static int __tcp_replace_ts_recent(struct tcp_sock *tp, s32 tstamp_delta) { tcp_store_ts_recent(tp); return tstamp_delta > 0 ? FLAG_TS_PROGRESS : 0; } static int tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) { s32 delta; if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) { /* PAWS bug workaround wrt. ACK frames, the PAWS discard * extra check below makes sure this can only happen * for pure ACK frames. -DaveM * * Not only, also it occurs for expired timestamps. */ if (tcp_paws_check(&tp->rx_opt, 0)) { delta = tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent; return __tcp_replace_ts_recent(tp, delta); } } return 0; } /* This routine deals with acks during a TLP episode and ends an episode by * resetting tlp_high_seq. 
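 * (Summary of the cases below, editor's note: a DSACK for the probe or
 * a pure dupack means both the original and the probe arrived, so the
 * episode simply ends; an ACK beyond tlp_high_seq without such
 * evidence means the probe repaired a real loss, and cwnd is reduced
 * once via the init/end cwnd-reduction pair.)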
Ref: TLP algorithm in draft-ietf-tcpm-rack */ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) { struct tcp_sock *tp = tcp_sk(sk); if (before(ack, tp->tlp_high_seq)) return; if (!tp->tlp_retrans) { /* TLP of new data has been acknowledged */ tp->tlp_high_seq = 0; } else if (flag & FLAG_DSACK_TLP) { /* This DSACK means original and TLP probe arrived; no loss */ tp->tlp_high_seq = 0; } else if (after(ack, tp->tlp_high_seq)) { /* ACK advances: there was a loss, so reduce cwnd. Reset * tlp_high_seq in tcp_init_cwnd_reduction() */ tcp_init_cwnd_reduction(sk); tcp_set_ca_state(sk, TCP_CA_CWR); tcp_end_cwnd_reduction(sk); tcp_try_keep_open(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBERECOVERY); } else if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP | FLAG_DATA_SACKED))) { /* Pure dupack: original and TLP probe arrived; no loss */ tp->tlp_high_seq = 0; } } static void tcp_in_ack_event(struct sock *sk, int flag) { const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->in_ack_event) { u32 ack_ev_flags = 0; if (flag & FLAG_WIN_UPDATE) ack_ev_flags |= CA_ACK_WIN_UPDATE; if (flag & FLAG_SLOWPATH) { ack_ev_flags |= CA_ACK_SLOWPATH; if (flag & FLAG_ECE) ack_ev_flags |= CA_ACK_ECE; } icsk->icsk_ca_ops->in_ack_event(sk, ack_ev_flags); } } /* Congestion control has updated the cwnd already. So if we're in * loss recovery then now we do any new sends (for FRTO) or * retransmits (for CA_Loss or CA_recovery) that make sense. */ static void tcp_xmit_recovery(struct sock *sk, int rexmit) { struct tcp_sock *tp = tcp_sk(sk); if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT) return; if (unlikely(rexmit == REXMIT_NEW)) { __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF); if (after(tp->snd_nxt, tp->high_seq)) return; tp->frto = 0; } tcp_xmit_retransmit_queue(sk); } /* Returns the number of packets newly acked or sacked by the current ACK */ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) { const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); u32 delivered; delivered = tp->delivered - prior_delivered; NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); if (flag & FLAG_ECE) NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); return delivered; } /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_sacktag_state sack_state; struct rate_sample rs = { .prior_delivered = 0 }; u32 prior_snd_una = tp->snd_una; bool is_sack_reneg = tp->is_sack_reneg; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; int num_dupack = 0; int prior_packets = tp->packets_out; u32 delivered = tp->delivered; u32 lost = tp->lost; int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ u32 prior_fack; sack_state.first_sackt = 0; sack_state.rate = &rs; sack_state.sack_delivered = 0; /* We very likely will need to access rtx queue. */ prefetch(sk->tcp_rtx_queue.rb_node); /* If the ack is older than previous acks * then we can probably ignore it. */ if (before(ack, prior_snd_una)) { u32 max_window; /* do not accept ACK for bytes we never sent. 
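		 * (Per RFC 5961, editor's note: an old ACK within
		 * max_window of snd_una is simply processed as stale,
		 * while one older than that window draws at most a
		 * rate-limited challenge ACK, blunting blind
		 * data-injection probes.)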
*/ max_window = min_t(u64, tp->max_window, tp->bytes_acked); /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ if (before(ack, prior_snd_una - max_window)) { if (!(flag & FLAG_NO_CHALLENGE_ACK)) tcp_send_challenge_ack(sk); return -SKB_DROP_REASON_TCP_TOO_OLD_ACK; } goto old_ack; } /* If the ack includes data we haven't sent yet, discard * this segment (RFC793 Section 3.9). */ if (after(ack, tp->snd_nxt)) return -SKB_DROP_REASON_TCP_ACK_UNSENT_DATA; if (after(ack, prior_snd_una)) { flag |= FLAG_SND_UNA_ADVANCED; icsk->icsk_retransmits = 0; #if IS_ENABLED(CONFIG_TLS_DEVICE) if (static_branch_unlikely(&clean_acked_data_enabled.key)) if (tp->tcp_clean_acked) tp->tcp_clean_acked(sk, ack); #endif } prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; rs.prior_in_flight = tcp_packets_in_flight(tp); /* ts_recent update must be made after we are sure that the packet * is in window. */ if (flag & FLAG_UPDATE_TS_RECENT) flag |= tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) == FLAG_SND_UNA_ADVANCED) { /* Window is constant, pure forward advance. * No more checks are required. * Note, we use the fact that SND.UNA>=SND.WL2. */ tcp_update_wl(tp, ack_seq); tcp_snd_una_update(tp, ack); flag |= FLAG_WIN_UPDATE; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS); } else { if (ack_seq != TCP_SKB_CB(skb)->end_seq) flag |= FLAG_DATA; else NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS); flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); if (TCP_SKB_CB(skb)->sacked) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) flag |= FLAG_ECE; if (sack_state.sack_delivered) tcp_count_delivered(tp, sack_state.sack_delivered, flag & FLAG_ECE); } /* This is a deviation from RFC3168 since it states that: * "When the TCP data sender is ready to set the CWR bit after reducing * the congestion window, it SHOULD set the CWR bit only on the first * new data packet that it transmits." * We accept CWR on pure ACKs to be more robust * with widely-deployed TCP implementations that do this. */ tcp_ecn_accept_cwr(sk, skb); /* We passed data and got it acked, remove any soft error * log. Something worked... */ WRITE_ONCE(sk->sk_err_soft, 0); icsk->icsk_probes_out = 0; tp->rcv_tstamp = tcp_jiffies32; if (!prior_packets) goto no_queue; /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, skb, prior_fack, prior_snd_una, &sack_state, flag & FLAG_ECE); tcp_rack_update_reo_wnd(sk, &rs); tcp_in_ack_event(sk, flag); if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); if (tcp_ack_is_dubious(sk, flag)) { if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP | FLAG_DSACKING_ACK))) { num_dupack = 1; /* Consider if pure acks were aggregated in tcp_add_backlog() */ if (!(flag & FLAG_DATA)) num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs); } tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); } /* If needed, reset TLP/RTO timer when RACK doesn't set. 
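	 * (RACK clears FLAG_SET_XMIT_TIMER in tcp_identify_packet_loss()
	 * when it has already armed its own reordering timer, so the
	 * re-arm below only happens when nothing else scheduled one.)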
*/ if (flag & FLAG_SET_XMIT_TIMER) tcp_set_xmit_timer(sk); if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk); delivered = tcp_newly_delivered(sk, delivered, flag); lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); return 1; no_queue: tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) { tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); tcp_newly_delivered(sk, delivered, flag); } /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. */ tcp_ack_probe(sk); if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); return 1; old_ack: /* If data was SACKed, tag it and see if we should send more data. * If data was DSACKed, see if we can undo a cwnd reduction. */ if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); tcp_newly_delivered(sk, delivered, flag); tcp_xmit_recovery(sk, rexmit); } return 0; } static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, bool syn, struct tcp_fastopen_cookie *foc, bool exp_opt) { /* Valid only in SYN or SYN-ACK with an even length. */ if (!foc || !syn || len < 0 || (len & 1)) return; if (len >= TCP_FASTOPEN_COOKIE_MIN && len <= TCP_FASTOPEN_COOKIE_MAX) memcpy(foc->val, cookie, len); else if (len != 0) len = -1; foc->len = len; foc->exp = exp_opt; } static bool smc_parse_options(const struct tcphdr *th, struct tcp_options_received *opt_rx, const unsigned char *ptr, int opsize) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (th->syn && !(opsize & 1) && opsize >= TCPOLEN_EXP_SMC_BASE && get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) { opt_rx->smc_ok = 1; return true; } } #endif return false; } /* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped * value on success. */ u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) { const unsigned char *ptr = (const unsigned char *)(th + 1); int length = (th->doff * 4) - sizeof(struct tcphdr); u16 mss = 0; while (length > 0) { int opcode = *ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return mss; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: if (length < 2) return mss; opsize = *ptr++; if (opsize < 2) /* "silly options" */ return mss; if (opsize > length) return mss; /* fail on partial options */ if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) { u16 in_mss = get_unaligned_be16(ptr); if (in_mss) { if (user_mss && user_mss < in_mss) in_mss = user_mss; mss = in_mss; } } ptr += opsize - 2; length -= opsize; } } return mss; } /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when * the fast version below fails. 
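 * (Option wire-format refresher, editor's note: each option is a TLV
 * of one kind byte, one total-length byte and a payload; e.g. MSS is
 * kind 2, length 4, 16-bit value. EOL (0) and NOP (1) are single-byte
 * kinds, which is why the parser below handles them before reading an
 * opsize byte.)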
*/ void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc) { const unsigned char *ptr; const struct tcphdr *th = tcp_hdr(skb); int length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; opt_rx->saw_unknown = 0; while (length > 0) { int opcode = *ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: if (length < 2) return; opsize = *ptr++; if (opsize < 2) /* "silly options" */ return; if (opsize > length) return; /* don't parse partial options */ switch (opcode) { case TCPOPT_MSS: if (opsize == TCPOLEN_MSS && th->syn && !estab) { u16 in_mss = get_unaligned_be16(ptr); if (in_mss) { if (opt_rx->user_mss && opt_rx->user_mss < in_mss) in_mss = opt_rx->user_mss; opt_rx->mss_clamp = in_mss; } } break; case TCPOPT_WINDOW: if (opsize == TCPOLEN_WINDOW && th->syn && !estab && READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) { __u8 snd_wscale = *(__u8 *)ptr; opt_rx->wscale_ok = 1; if (snd_wscale > TCP_MAX_WSCALE) { net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n", __func__, snd_wscale, TCP_MAX_WSCALE); snd_wscale = TCP_MAX_WSCALE; } opt_rx->snd_wscale = snd_wscale; } break; case TCPOPT_TIMESTAMP: if ((opsize == TCPOLEN_TIMESTAMP) && ((estab && opt_rx->tstamp_ok) || (!estab && READ_ONCE(net->ipv4.sysctl_tcp_timestamps)))) { opt_rx->saw_tstamp = 1; opt_rx->rcv_tsval = get_unaligned_be32(ptr); opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4); } break; case TCPOPT_SACK_PERM: if (opsize == TCPOLEN_SACK_PERM && th->syn && !estab && READ_ONCE(net->ipv4.sysctl_tcp_sack)) { opt_rx->sack_ok = TCP_SACK_SEEN; tcp_sack_reset(opt_rx); } break; case TCPOPT_SACK: if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) && opt_rx->sack_ok) { TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th; } break; #ifdef CONFIG_TCP_MD5SIG case TCPOPT_MD5SIG: /* The MD5 Hash has already been * checked (see tcp_v{4,6}_rcv()). */ break; #endif #ifdef CONFIG_TCP_AO case TCPOPT_AO: /* TCP AO has already been checked * (see tcp_inbound_ao_hash()). */ break; #endif case TCPOPT_FASTOPEN: tcp_parse_fastopen_option( opsize - TCPOLEN_FASTOPEN_BASE, ptr, th->syn, foc, false); break; case TCPOPT_EXP: /* Fast Open option shares code 254 using a * 16 bits magic number. */ if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE && get_unaligned_be16(ptr) == TCPOPT_FASTOPEN_MAGIC) { tcp_parse_fastopen_option(opsize - TCPOLEN_EXP_FASTOPEN_BASE, ptr + 2, th->syn, foc, true); break; } if (smc_parse_options(th, opt_rx, ptr, opsize)) break; opt_rx->saw_unknown = 1; break; default: opt_rx->saw_unknown = 1; } ptr += opsize-2; length -= opsize; } } } EXPORT_SYMBOL(tcp_parse_options); static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th) { const __be32 *ptr = (const __be32 *)(th + 1); if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { tp->rx_opt.saw_tstamp = 1; ++ptr; tp->rx_opt.rcv_tsval = ntohl(*ptr); ++ptr; if (*ptr) tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset; else tp->rx_opt.rcv_tsecr = 0; return true; } return false; } /* Fast parse options. This hopes to only see timestamps. * If it is wrong it falls back on tcp_parse_options(). 
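 * (The aligned fast path matches the canonical 12-byte encoding
 * NOP NOP TIMESTAMP len=10 with a single 32-bit compare, i.e.
 * doff == (sizeof(tcphdr) + TCPOLEN_TSTAMP_ALIGNED) / 4, then pulls
 * TSval and TSecr with two aligned loads.)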
*/ static bool tcp_fast_parse_options(const struct net *net, const struct sk_buff *skb, const struct tcphdr *th, struct tcp_sock *tp) { /* In the spirit of fast parsing, compare doff directly to constant * values. Because equality is used, short doff can be ignored here. */ if (th->doff == (sizeof(*th) / 4)) { tp->rx_opt.saw_tstamp = 0; return false; } else if (tp->rx_opt.tstamp_ok && th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { if (tcp_parse_aligned_timestamp(tp, th)) return true; } tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL); if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tp->rx_opt.rcv_tsecr -= tp->tsoffset; return true; } #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) /* * Parse Signature options */ int tcp_do_parse_auth_options(const struct tcphdr *th, const u8 **md5_hash, const u8 **ao_hash) { int length = (th->doff << 2) - sizeof(*th); const u8 *ptr = (const u8 *)(th + 1); unsigned int minlen = TCPOLEN_MD5SIG; if (IS_ENABLED(CONFIG_TCP_AO)) minlen = sizeof(struct tcp_ao_hdr) + 1; *md5_hash = NULL; *ao_hash = NULL; /* If not enough data remaining, we can short cut */ while (length >= minlen) { int opcode = *ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return 0; case TCPOPT_NOP: length--; continue; default: opsize = *ptr++; if (opsize < 2 || opsize > length) return -EINVAL; if (opcode == TCPOPT_MD5SIG) { if (opsize != TCPOLEN_MD5SIG) return -EINVAL; if (unlikely(*md5_hash || *ao_hash)) return -EEXIST; *md5_hash = ptr; } else if (opcode == TCPOPT_AO) { if (opsize <= sizeof(struct tcp_ao_hdr)) return -EINVAL; if (unlikely(*md5_hash || *ao_hash)) return -EEXIST; *ao_hash = ptr; } } ptr += opsize - 2; length -= opsize; } return 0; } EXPORT_SYMBOL(tcp_do_parse_auth_options); #endif /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM * * It is not fatal. If this ACK does _not_ change critical state (seqs, window) * it can pass through stack. So, the following predicate verifies that * this segment is not used for anything but congestion avoidance or * fast retransmit. Moreover, we even are able to eliminate most of such * second order effects, if we apply some small "replay" window (~RTO) * to timestamp space. * * All these measures still do not guarantee that we reject wrapped ACKs * on networks with high bandwidth, when sequence space is recycled fastly, * but it guarantees that such events will be very rare and do not affect * connection seriously. This doesn't look nice, but alas, PAWS is really * buggy extension. * * [ Later note. Even worse! It is buggy for segments _with_ data. RFC * states that events when retransmit arrives after original data are rare. * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is * the biggest problem on large power networks even with minor reordering. * OK, let's give it small replay window. If peer clock is even 1hz, it is safe * up to bandwidth of 18Gigabit/sec. 8) ] */ /* Estimates max number of increments of remote peer TSval in * a replay window (based on our current RTO estimation). */ static u32 tcp_tsval_replay(const struct sock *sk) { /* If we use usec TS resolution, * then expect the remote peer to use the same resolution. */ if (tcp_sk(sk)->tcp_usec_ts) return inet_csk(sk)->icsk_rto * (USEC_PER_SEC / HZ); /* RFC 7323 recommends a TSval clock between 1ms and 1sec. * We know that some OS (including old linux) can use 1200 Hz. 
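 * Worked example (an illustration, not from the source): with HZ = 1000
 * and an icsk_rto of 200 jiffies (200 ms), the expression below gives
 * 200 * 1200 / 1000 = 240, i.e. we tolerate up to 240 increments of a
 * 1.2 kHz peer TSval clock within one replay window.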
*/ return inet_csk(sk)->icsk_rto * 1200 / HZ; } static enum skb_drop_reason tcp_disordered_ack_check(const struct sock *sk, const struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); const struct tcphdr *th = tcp_hdr(skb); SKB_DR_INIT(reason, TCP_RFC7323_PAWS); u32 ack = TCP_SKB_CB(skb)->ack_seq; u32 seq = TCP_SKB_CB(skb)->seq; /* 1. Is this not a pure ACK ? */ if (!th->ack || seq != TCP_SKB_CB(skb)->end_seq) return reason; /* 2. Is its sequence not the expected one ? */ if (seq != tp->rcv_nxt) return before(seq, tp->rcv_nxt) ? SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK : reason; /* 3. Is this not a duplicate ACK ? */ if (ack != tp->snd_una) return reason; /* 4. Is this updating the window ? */ if (tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale)) return reason; /* 5. Is this not in the replay window ? */ if ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > tcp_tsval_replay(sk)) return reason; return 0; } /* Check segment sequence number for validity. * * Segment controls are considered valid, if the segment * fits to the window after truncation to the window. Acceptability * of data (and SYN, FIN, of course) is checked separately. * See tcp_data_queue(), for example. * * Also, controls (RST is main one) are accepted using RCV.WUP instead * of RCV.NXT. Peer still did not advance his SND.UNA when we * delayed ACK, so that hisSND.UNA<=ourRCV.WUP. * (borrowed from freebsd) */ static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) { if (before(end_seq, tp->rcv_wup)) return SKB_DROP_REASON_TCP_OLD_SEQUENCE; if (after(seq, tp->rcv_nxt + tcp_receive_window(tp))) return SKB_DROP_REASON_TCP_INVALID_SEQUENCE; return SKB_NOT_DROPPED_YET; } void tcp_done_with_error(struct sock *sk, int err) { /* This barrier is coupled with smp_rmb() in tcp_poll() */ WRITE_ONCE(sk->sk_err, err); smp_wmb(); tcp_write_queue_purge(sk); tcp_done(sk); if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } EXPORT_IPV6_MOD(tcp_done_with_error); /* When we get a reset we do this. */ void tcp_reset(struct sock *sk, struct sk_buff *skb) { int err; trace_tcp_receive_reset(sk); /* mptcp can't tell us to ignore reset pkts, * so just ignore the return value of mptcp_incoming_options(). */ if (sk_is_mptcp(sk)) mptcp_incoming_options(sk, skb); /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->sk_state) { case TCP_SYN_SENT: err = ECONNREFUSED; break; case TCP_CLOSE_WAIT: err = EPIPE; break; case TCP_CLOSE: return; default: err = ECONNRESET; } tcp_done_with_error(sk, err); } /* * Process the FIN bit. This now behaves as it is supposed to work * and the FIN takes effect when it is validly part of sequence * space. Not before when we get holes. * * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT * (and thence onto LAST-ACK and finally, CLOSE, we never enter * TIME-WAIT) * * If we are in FINWAIT-1, a received FIN indicates simultaneous * close and we go into CLOSING (and later onto TIME-WAIT) * * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. */ void tcp_fin(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); inet_csk_schedule_ack(sk); WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN); sock_set_flag(sk, SOCK_DONE); switch (sk->sk_state) { case TCP_SYN_RECV: case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); inet_csk_enter_pingpong_mode(sk); break; case TCP_CLOSE_WAIT: case TCP_CLOSING: /* Received a retransmission of the FIN, do * nothing. 
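 * A duplicate FIN carries no new state transition: RFC 793 keeps the
 * connection in CLOSE-WAIT/CLOSING, and the regular (dup)ACK path
 * already answers the retransmission.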
*/ break; case TCP_LAST_ACK: /* RFC793: Remain in the LAST-ACK state. */ break; case TCP_FIN_WAIT1: /* This case occurs when a simultaneous close * happens, we must ack the received FIN and * enter the CLOSING state. */ tcp_send_ack(sk); tcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: /* Received a FIN -- send ACK and enter TIME_WAIT. */ tcp_send_ack(sk); tcp_time_wait(sk, TCP_TIME_WAIT, 0); break; default: /* Only TCP_LISTEN and TCP_CLOSE are left, in these * cases we should never reach this piece of code. */ pr_err("%s: Impossible, sk->sk_state=%d\n", __func__, sk->sk_state); break; } /* It _is_ possible, that we have something out-of-order _after_ FIN. * Probably, we should reset in this case. For now drop them. */ skb_rbtree_purge(&tp->out_of_order_queue); if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); /* Do not send POLL_HUP for half duplex close. */ if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); else sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); } } static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) { if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { if (before(seq, sp->start_seq)) sp->start_seq = seq; if (after(end_seq, sp->end_seq)) sp->end_seq = end_seq; return true; } return false; } static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) { int mib_idx; if (before(seq, tp->rcv_nxt)) mib_idx = LINUX_MIB_TCPDSACKOLDSENT; else mib_idx = LINUX_MIB_TCPDSACKOFOSENT; NET_INC_STATS(sock_net(sk), mib_idx); tp->rx_opt.dsack = 1; tp->duplicate_sack[0].start_seq = seq; tp->duplicate_sack[0].end_seq = end_seq; } } static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); if (!tp->rx_opt.dsack) tcp_dsack_set(sk, seq, end_seq); else tcp_sack_extend(tp->duplicate_sack, seq, end_seq); } static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) { /* When the ACK path fails or drops most ACKs, the sender would * timeout and spuriously retransmit the same segment repeatedly. * If it seems our ACKs are not reaching the other side, * based on receiving a duplicate data segment with new flowlabel * (suggesting the sender suffered an RTO), and we are not already * repathing due to our own RTO, then rehash the socket to repath our * packets. */ #if IS_ENABLED(CONFIG_IPV6) if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss && skb->protocol == htons(ETH_P_IPV6) && (tcp_sk(sk)->inet_conn.icsk_ack.lrcv_flowlabel != ntohl(ip6_flowlabel(ipv6_hdr(skb)))) && sk_rethink_txhash(sk)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); /* Save last flowlabel after a spurious retrans. 
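 * It becomes the baseline for the flowlabel comparison above
 * the next time a duplicate data segment arrives.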
 */
	tcp_save_lrcv_flowlabel(sk, skb);
#endif
}

static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
	    before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
		tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);

		if (tcp_is_sack(tp) &&
		    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
			u32 end_seq = TCP_SKB_CB(skb)->end_seq;

			tcp_rcv_spurious_retrans(sk, skb);
			if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
				end_seq = tp->rcv_nxt;
			tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
		}
	}

	tcp_send_ack(sk);
}

/* These routines update the SACK block as out-of-order packets arrive or
 * in-order packets close up the sequence space.
 */
static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
{
	int this_sack;
	struct tcp_sack_block *sp = &tp->selective_acks[0];
	struct tcp_sack_block *swalk = sp + 1;

	/* See if the recent change to the first SACK eats into
	 * or hits the sequence space of other SACK blocks, if so coalesce.
	 */
	for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
		if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
			int i;

			/* Zap SWALK, by moving every further SACK up by one slot.
			 * Decrease num_sacks.
			 */
			tp->rx_opt.num_sacks--;
			for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
				sp[i] = sp[i + 1];
			continue;
		}
		this_sack++;
		swalk++;
	}
}

void tcp_sack_compress_send_ack(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tp->compressed_ack)
		return;

	if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
		__sock_put(sk);

	/* Since we have to send one ack finally,
	 * subtract one from tp->compressed_ack to keep
	 * LINUX_MIB_TCPACKCOMPRESSED accurate.
	 */
	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
		      tp->compressed_ack - 1);

	tp->compressed_ack = 0;
	tcp_send_ack(sk);
}

/* Reasonable number of SACK blocks to include in the TCP SACK option.
 * The max is 4, but this becomes 3 if TCP timestamps are there.
 * Given that SACK packets might be lost, be conservative and use 2.
 */
#define TCP_SACK_BLOCKS_EXPECTED 2

static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_sack_block *sp = &tp->selective_acks[0];
	int cur_sacks = tp->rx_opt.num_sacks;
	int this_sack;

	if (!cur_sacks)
		goto new_sack;

	for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
		if (tcp_sack_extend(sp, seq, end_seq)) {
			if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
				tcp_sack_compress_send_ack(sk);
			/* Rotate this_sack to the first one. */
			for (; this_sack > 0; this_sack--, sp--)
				swap(*sp, *(sp - 1));
			if (cur_sacks > 1)
				tcp_sack_maybe_coalesce(tp);
			return;
		}
	}

	if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
		tcp_sack_compress_send_ack(sk);

	/* Could not find an adjacent existing SACK, build a new one,
	 * put it at the front, and shift everyone else down. We
	 * always know there is at least one SACK present already here.
	 *
	 * If the sack array is full, forget about the last one.
	 */
	if (this_sack >= TCP_NUM_SACKS) {
		this_sack--;
		tp->rx_opt.num_sacks--;
		sp--;
	}
	for (; this_sack > 0; this_sack--, sp--)
		*sp = *(sp - 1);

new_sack:
	/* Build the new head SACK, and we're done. */
	sp->start_seq = seq;
	sp->end_seq = end_seq;
	tp->rx_opt.num_sacks++;
}

/* RCV.NXT advances, some SACKs should be eaten. */
static void tcp_sack_remove(struct tcp_sock *tp)
{
	struct tcp_sack_block *sp = &tp->selective_acks[0];
	int num_sacks = tp->rx_opt.num_sacks;
	int this_sack;

	/* Empty ofo queue, hence, all the SACKs are eaten. Clear.
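 * (rcv_nxt has caught up with every range the blocks described,
 * so none of them can still be reporting out-of-order data)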
*/ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tp->rx_opt.num_sacks = 0; return; } for (this_sack = 0; this_sack < num_sacks;) { /* Check if the start of the sack is covered by RCV.NXT. */ if (!before(tp->rcv_nxt, sp->start_seq)) { int i; /* RCV.NXT must cover all the block! */ WARN_ON(before(tp->rcv_nxt, sp->end_seq)); /* Zap this SACK, by moving forward any other SACKS. */ for (i = this_sack+1; i < num_sacks; i++) tp->selective_acks[i-1] = tp->selective_acks[i]; num_sacks--; continue; } this_sack++; sp++; } tp->rx_opt.num_sacks = num_sacks; } /** * tcp_try_coalesce - try to merge skb to prior one * @sk: socket * @to: prior buffer * @from: buffer to add in queue * @fragstolen: pointer to boolean * * Before queueing skb @from after @to, try to merge them * to reduce overall memory use and queue lengths, if cost is small. * Packets in ofo or receive queues can stay a long time. * Better try to coalesce them right now to avoid future collapses. * Returns true if caller should free @from instead of queueing it */ static bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from, bool *fragstolen) { int delta; *fragstolen = false; /* Its possible this segment overlaps with prior segment in queue */ if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) return false; if (!tcp_skb_can_collapse_rx(to, from)) return false; if (!skb_try_coalesce(to, from, fragstolen, &delta)) return false; atomic_add(delta, &sk->sk_rmem_alloc); sk_mem_charge(sk, delta); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags; if (TCP_SKB_CB(from)->has_rxtstamp) { TCP_SKB_CB(to)->has_rxtstamp = true; to->tstamp = from->tstamp; skb_hwtstamps(to)->hwtstamp = skb_hwtstamps(from)->hwtstamp; } return true; } static bool tcp_ooo_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from, bool *fragstolen) { bool res = tcp_try_coalesce(sk, to, from, fragstolen); /* In case tcp_drop_reason() is called later, update to->gso_segs */ if (res) { u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) + max_t(u16, 1, skb_shinfo(from)->gso_segs); skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF); } return res; } noinline_for_tracing static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { sk_drops_add(sk, skb); sk_skb_reason_drop(sk, skb, reason); } /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. 
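 */

/*
 * Illustrative userspace sketch, not part of this file: the core of the
 * drain loop below, on a plain seq-sorted array standing in for the ofo
 * rbtree. The wrap-safe comparison mirrors the kernel's before().
 */
#include <stdint.h>
#include <stdbool.h>

struct demo_seg {
	uint32_t seq;
	uint32_t end_seq;
};

static inline bool demo_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

/* Consume the in-order prefix, returning the advanced rcv_nxt. */
static uint32_t demo_drain_ofo(struct demo_seg *q, int *n, uint32_t rcv_nxt)
{
	int used = 0;

	while (used < *n && !demo_before(rcv_nxt, q[used].seq)) {
		/* seq <= rcv_nxt: no gap in front of this segment */
		if (demo_before(rcv_nxt, q[used].end_seq))
			rcv_nxt = q[used].end_seq;	/* new bytes */
		/* else: segment is a full duplicate, nothing delivered */
		used++;
	}
	for (int j = used; j < *n; j++)	/* compact the queue */
		q[j - used] = q[j];
	*n -= used;
	return rcv_nxt;
}

/* e.g. with q = {[10,20), [20,30), [40,50)} and rcv_nxt = 10, this
 * returns 30 and leaves only [40,50) queued, since 30..40 is a gap.
 */

/* tcp_ofo_queue(), described above: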
*/ static void tcp_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); __u32 dsack_high = tp->rcv_nxt; bool fin, fragstolen, eaten; struct sk_buff *skb, *tail; struct rb_node *p; p = rb_first(&tp->out_of_order_queue); while (p) { skb = rb_to_skb(p); if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; if (before(TCP_SKB_CB(skb)->seq, dsack_high)) { __u32 dsack = dsack_high; if (before(TCP_SKB_CB(skb)->end_seq, dsack_high)) dsack_high = TCP_SKB_CB(skb)->end_seq; tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); } p = rb_next(p); rb_erase(&skb->rbnode, &tp->out_of_order_queue); if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_DROP); continue; } tail = skb_peek_tail(&sk->sk_receive_queue); eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) tcp_add_receive_queue(sk, skb); else kfree_skb_partial(skb, fragstolen); if (unlikely(fin)) { tcp_fin(sk); /* tcp_fin() purges tp->out_of_order_queue, * so we must end this loop right now. */ break; } } } static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb); static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb); static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, unsigned int size) { if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || !sk_rmem_schedule(sk, skb, size)) { if (tcp_prune_queue(sk, skb) < 0) return -1; while (!sk_rmem_schedule(sk, skb, size)) { if (!tcp_prune_ofo_queue(sk, skb)) return -1; } } return 0; } static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct rb_node **p, *parent; struct sk_buff *skb1; u32 seq, end_seq; bool fragstolen; tcp_save_lrcv_flowlabel(sk, skb); tcp_data_ecn_check(sk, skb); if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP); sk->sk_data_ready(sk); tcp_drop_reason(sk, skb, SKB_DROP_REASON_PROTO_MEM); return; } /* Disable header prediction. */ tp->pred_flags = 0; inet_csk_schedule_ack(sk); tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); seq = TCP_SKB_CB(skb)->seq; end_seq = TCP_SKB_CB(skb)->end_seq; p = &tp->out_of_order_queue.rb_node; if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { /* Initial out of order segment, build 1 SACK. */ if (tcp_is_sack(tp)) { tp->rx_opt.num_sacks = 1; tp->selective_acks[0].start_seq = seq; tp->selective_acks[0].end_seq = end_seq; } rb_link_node(&skb->rbnode, NULL, p); rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); tp->ooo_last_skb = skb; goto end; } /* In the typical case, we are adding an skb to the end of the list. * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. */ if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { coalesce_done: /* For non sack flows, do not grow window to force DUPACK * and trigger fast retransmit. */ if (tcp_is_sack(tp)) tcp_grow_window(sk, skb, true); kfree_skb_partial(skb, fragstolen); skb = NULL; goto add_sack; } /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */ if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) { parent = &tp->ooo_last_skb->rbnode; p = &parent->rb_right; goto insert; } /* Find place to insert this segment. Handle overlaps on the way. 
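 * This is a plain rbtree descent keyed on seq: full duplicates are
 * dropped with a DSACK, partial overlaps trim or replace the existing
 * node, and anything the new skb fully covers is removed afterwards in
 * the merge_right pass.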
*/ parent = NULL; while (*p) { parent = *p; skb1 = rb_to_skb(parent); if (before(seq, TCP_SKB_CB(skb1)->seq)) { p = &parent->rb_left; continue; } if (before(seq, TCP_SKB_CB(skb1)->end_seq)) { if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFOMERGE); skb = NULL; tcp_dsack_set(sk, seq, end_seq); goto add_sack; } if (after(seq, TCP_SKB_CB(skb1)->seq)) { /* Partial overlap. */ tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq); } else { /* skb's seq == skb1's seq and skb covers skb1. * Replace skb1 with skb. */ rb_replace_node(&skb1->rbnode, &skb->rbnode, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); tcp_drop_reason(sk, skb1, SKB_DROP_REASON_TCP_OFOMERGE); goto merge_right; } } else if (tcp_ooo_try_coalesce(sk, skb1, skb, &fragstolen)) { goto coalesce_done; } p = &parent->rb_right; } insert: /* Insert segment into RB tree. */ rb_link_node(&skb->rbnode, parent, p); rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); merge_right: /* Remove other segments covered by skb. */ while ((skb1 = skb_rb_next(skb)) != NULL) { if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) break; if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, end_seq); break; } rb_erase(&skb1->rbnode, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); tcp_drop_reason(sk, skb1, SKB_DROP_REASON_TCP_OFOMERGE); } /* If there is no skb after us, we are the last_skb ! */ if (!skb1) tp->ooo_last_skb = skb; add_sack: if (tcp_is_sack(tp)) tcp_sack_new_ofo_skb(sk, seq, end_seq); end: if (skb) { /* For non sack flows, do not grow window to force DUPACK * and trigger fast retransmit. */ if (tcp_is_sack(tp)) tcp_grow_window(sk, skb, false); skb_condense(skb); skb_set_owner_r(skb, sk); } tcp_rcvbuf_grow(sk); } static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, bool *fragstolen) { int eaten; struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); eaten = (tail && tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 
1 : 0; tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { tcp_add_receive_queue(sk, skb); skb_set_owner_r(skb, sk); } return eaten; } int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) { struct sk_buff *skb; int err = -ENOMEM; int data_len = 0; bool fragstolen; if (size == 0) return 0; if (size > PAGE_SIZE) { int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS); data_len = npages << PAGE_SHIFT; size = data_len + (size & ~PAGE_MASK); } skb = alloc_skb_with_frags(size - data_len, data_len, PAGE_ALLOC_COSTLY_ORDER, &err, sk->sk_allocation); if (!skb) goto err; skb_put(skb, size - data_len); skb->data_len = data_len; skb->len = size; if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); goto err_free; } err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); if (err) goto err_free; TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; if (tcp_queue_rcv(sk, skb, &fragstolen)) { WARN_ON_ONCE(fragstolen); /* should not happen */ __kfree_skb(skb); } return size; err_free: kfree_skb(skb); err: return err; } void tcp_data_ready(struct sock *sk) { if (tcp_epollin_ready(sk, sk->sk_rcvlowat) || sock_flag(sk, SOCK_DONE)) sk->sk_data_ready(sk); } static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); enum skb_drop_reason reason; bool fragstolen; int eaten; /* If a subflow has been reset, the packet should not continue * to be processed, drop the packet. */ if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) { __kfree_skb(skb); return; } if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { __kfree_skb(skb); return; } tcp_cleanup_skb(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); reason = SKB_DROP_REASON_NOT_SPECIFIED; tp->rx_opt.dsack = 0; /* Queue data for delivery to the user. * Packets in sequence go to the receive queue. * Out of sequence packets to the out_of_order_queue. */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { if (tcp_receive_window(tp) == 0) { /* Some stacks are known to send bare FIN packets * in a loop even if we send RWIN 0 in our ACK. * Accepting this FIN does not hurt memory pressure * because the FIN flag will simply be merged to the * receive queue tail skb in most cases. */ if (!skb->len && (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) goto queue_and_out; reason = SKB_DROP_REASON_TCP_ZEROWINDOW; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; } /* Ok. In sequence. In window. */ queue_and_out: if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { /* TODO: maybe ratelimit these WIN 0 ACK ? */ inet_csk(sk)->icsk_ack.pending |= (ICSK_ACK_NOMEM | ICSK_ACK_NOW); inet_csk_schedule_ack(sk); sk->sk_data_ready(sk); if (skb_queue_len(&sk->sk_receive_queue) && skb->len) { reason = SKB_DROP_REASON_PROTO_MEM; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); goto drop; } sk_forced_mem_schedule(sk, skb->truesize); } eaten = tcp_queue_rcv(sk, skb, &fragstolen); if (skb->len) tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tcp_ofo_queue(sk); /* RFC5681. 4.2. SHOULD send immediate ACK, when * gap in queue is filled. 
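 * An immediate ACK tells the sender the hole is closed without
 * waiting out the delayed-ACK timer, so loss recovery can finish
 * sooner.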
*/ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } if (tp->rx_opt.num_sacks) tcp_sack_remove(tp); tcp_fast_path_check(sk); if (eaten > 0) kfree_skb_partial(skb, fragstolen); if (!sock_flag(sk, SOCK_DEAD)) tcp_data_ready(sk); return; } if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { tcp_rcv_spurious_retrans(sk, skb); /* A retransmit, 2nd most common case. Force an immediate ack. */ reason = SKB_DROP_REASON_TCP_OLD_DATA; NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); out_of_window: tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); inet_csk_schedule_ack(sk); drop: tcp_drop_reason(sk, skb, reason); return; } /* Out of window. F.e. zero window probe. */ if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) { reason = SKB_DROP_REASON_TCP_OVERWINDOW; goto out_of_window; } if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { /* Partial packet, seq < rcv_next < end_seq */ tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); /* If window is closed, drop tail of packet. But after * remembering D-SACK for its head made in previous line. */ if (!tcp_receive_window(tp)) { reason = SKB_DROP_REASON_TCP_ZEROWINDOW; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; } goto queue_and_out; } tcp_data_queue_ofo(sk, skb); } static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list) { if (list) return !skb_queue_is_last(list, skb) ? skb->next : NULL; return skb_rb_next(skb); } static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, struct sk_buff_head *list, struct rb_root *root) { struct sk_buff *next = tcp_skb_next(skb, list); if (list) __skb_unlink(skb, list); else rb_erase(&skb->rbnode, root); __kfree_skb(skb); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); return next; } /* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct sk_buff *skb1; while (*p) { parent = *p; skb1 = rb_to_skb(parent); if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) p = &parent->rb_left; else p = &parent->rb_right; } rb_link_node(&skb->rbnode, parent, p); rb_insert_color(&skb->rbnode, root); } /* Collapse contiguous sequence of skbs head..tail with * sequence numbers start..end. * * If tail is NULL, this means until the end of the queue. * * Segments with FIN/SYN are not collapsed (only because this * simplifies code) */ static void tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root, struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end) { struct sk_buff *skb = head, *n; struct sk_buff_head tmp; bool end_of_skbs; /* First, check that queue is collapsible and find * the point where collapsing can be useful. */ restart: for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) { n = tcp_skb_next(skb, list); if (!skb_frags_readable(skb)) goto skip_this; /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { skb = tcp_collapse_one(sk, skb, list, root); if (!skb) break; goto restart; } /* The first skb to collapse is: * - not SYN/FIN and * - bloated or contains data before "start" or * overlaps to the next one and mptcp allow collapsing. 
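 * ("bloated" here means the skb's true memory cost dwarfs its payload:
 * tcp_win_from_space(sk, truesize) > len in the test below)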
*/ if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && (tcp_win_from_space(sk, skb->truesize) > skb->len || before(TCP_SKB_CB(skb)->seq, start))) { end_of_skbs = false; break; } if (n && n != tail && skb_frags_readable(n) && tcp_skb_can_collapse_rx(skb, n) && TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { end_of_skbs = false; break; } skip_this: /* Decided to skip this, advance start seq. */ start = TCP_SKB_CB(skb)->end_seq; } if (end_of_skbs || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) || !skb_frags_readable(skb)) return; __skb_queue_head_init(&tmp); while (before(start, end)) { int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); struct sk_buff *nskb; nskb = alloc_skb(copy, GFP_ATOMIC); if (!nskb) break; memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); skb_copy_decrypted(nskb, skb); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; if (list) __skb_queue_before(list, skb, nskb); else __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ skb_set_owner_r(nskb, sk); mptcp_skb_ext_move(nskb, skb); /* Copy data, releasing collapsed skbs. */ while (copy > 0) { int offset = start - TCP_SKB_CB(skb)->seq; int size = TCP_SKB_CB(skb)->end_seq - start; BUG_ON(offset < 0); if (size > 0) { size = min(copy, size); if (skb_copy_bits(skb, offset, skb_put(nskb, size), size)) BUG(); TCP_SKB_CB(nskb)->end_seq += size; copy -= size; start += size; } if (!before(start, TCP_SKB_CB(skb)->end_seq)) { skb = tcp_collapse_one(sk, skb, list, root); if (!skb || skb == tail || !tcp_skb_can_collapse_rx(nskb, skb) || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) || !skb_frags_readable(skb)) goto end; } } } end: skb_queue_walk_safe(&tmp, skb, n) tcp_rbtree_insert(root, skb); } /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs * and tcp_collapse() them until all the queue is collapsed. */ static void tcp_collapse_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 range_truesize, sum_tiny = 0; struct sk_buff *skb, *head; u32 start, end; skb = skb_rb_first(&tp->out_of_order_queue); new_range: if (!skb) { tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue); return; } start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; range_truesize = skb->truesize; for (head = skb;;) { skb = skb_rb_next(skb); /* Range is terminated when we see a gap or when * we are at the queue end. */ if (!skb || after(TCP_SKB_CB(skb)->seq, end) || before(TCP_SKB_CB(skb)->end_seq, start)) { /* Do not attempt collapsing tiny skbs */ if (range_truesize != head->truesize || end - start >= SKB_WITH_OVERHEAD(PAGE_SIZE)) { tcp_collapse(sk, NULL, &tp->out_of_order_queue, head, skb, start, end); } else { sum_tiny += range_truesize; if (sum_tiny > sk->sk_rcvbuf >> 3) return; } goto new_range; } range_truesize += skb->truesize; if (unlikely(before(TCP_SKB_CB(skb)->seq, start))) start = TCP_SKB_CB(skb)->seq; if (after(TCP_SKB_CB(skb)->end_seq, end)) end = TCP_SKB_CB(skb)->end_seq; } } /* * Clean the out-of-order queue to make room. * We drop high sequences packets to : * 1) Let a chance for holes to be filled. * This means we do not drop packets from ooo queue if their sequence * is before incoming packet sequence. * 2) not add too big latencies if thousands of packets sit there. * (But if application shrinks SO_RCVBUF, we could still end up * freeing whole queue here) * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks. * * Return true if queue has shrunk. 
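 */

/*
 * Illustrative userspace sketch, not part of this file: the tail-pruning
 * policy below, on a seq-sorted array. goal starts at one eighth of the
 * buffer budget (sk_rcvbuf >> 3), and pruning stops early when the
 * incoming sequence would sort after the current tail, since dropping
 * older holes to admit an even newer segment gains nothing. The kernel
 * version additionally refreshes goal while still under memory pressure.
 */
#include <stdint.h>
#include <stdbool.h>

struct demo_pseg {
	uint32_t seq;
	uint32_t truesize;
};

static bool demo_prune_tail(struct demo_pseg *q, int *n,
			    uint32_t in_seq, uint32_t rcvbuf)
{
	long goal = rcvbuf >> 3;	/* reclaim ~12.5% */
	bool pruned = false;

	while (*n > 0) {
		struct demo_pseg *tail = &q[*n - 1];

		/* incoming skb would land last in the queue: stop */
		if ((int32_t)(in_seq - tail->seq) > 0)
			break;
		goal -= tail->truesize;
		(*n)--;
		pruned = true;
		if (goal <= 0)
			break;
	}
	return pruned;
}

/* tcp_prune_ofo_queue(), described above: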
*/ static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb) { struct tcp_sock *tp = tcp_sk(sk); struct rb_node *node, *prev; bool pruned = false; int goal; if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) return false; goal = sk->sk_rcvbuf >> 3; node = &tp->ooo_last_skb->rbnode; do { struct sk_buff *skb = rb_to_skb(node); /* If incoming skb would land last in ofo queue, stop pruning. */ if (after(TCP_SKB_CB(in_skb)->seq, TCP_SKB_CB(skb)->seq)) break; pruned = true; prev = rb_prev(node); rb_erase(node, &tp->out_of_order_queue); goal -= skb->truesize; tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE); tp->ooo_last_skb = rb_to_skb(prev); if (!prev || goal <= 0) { if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !tcp_under_memory_pressure(sk)) break; goal = sk->sk_rcvbuf >> 3; } node = prev; } while (node); if (pruned) { NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); /* Reset SACK state. A conforming SACK implementation will * do the same at a timeout based retransmit. When a connection * is in a sad state like this, we care only about integrity * of the connection not performance. */ if (tp->rx_opt.sack_ok) tcp_sack_reset(&tp->rx_opt); } return pruned; } /* Reduce allocated memory if we can, trying to get * the socket within its memory limits again. * * Return less than zero if we should start dropping frames * until the socket owning process reads some of the data * to stabilize the situation. */ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb) { struct tcp_sock *tp = tcp_sk(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) tcp_clamp_window(sk); else if (tcp_under_memory_pressure(sk)) tcp_adjust_rcv_ssthresh(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; tcp_collapse_ofo_queue(sk); if (!skb_queue_empty(&sk->sk_receive_queue)) tcp_collapse(sk, &sk->sk_receive_queue, NULL, skb_peek(&sk->sk_receive_queue), NULL, tp->copied_seq, tp->rcv_nxt); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; /* Collapsing did not help, destructive actions follow. * This must not ever occur. */ tcp_prune_ofo_queue(sk, in_skb); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; /* If we are really being abused, tell the caller to silently * drop receive data on the floor. It will get retransmitted * and hopefully then we'll have sufficient space. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED); /* Massive buffer overcommit. */ tp->pred_flags = 0; return -1; } static bool tcp_should_expand_sndbuf(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); /* If the user specified a specific send buffer setting, do * not modify it. */ if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) return false; /* If we are under global TCP memory pressure, do not expand. */ if (tcp_under_memory_pressure(sk)) { int unused_mem = sk_unused_reserved_mem(sk); /* Adjust sndbuf according to reserved mem. But make sure * it never goes below SOCK_MIN_SNDBUF. * See sk_stream_moderate_sndbuf() for more details. */ if (unused_mem > SOCK_MIN_SNDBUF) WRITE_ONCE(sk->sk_sndbuf, unused_mem); return false; } /* If we are under soft global TCP memory pressure, do not expand. */ if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0)) return false; /* If we filled the congestion window, do not expand. 
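 * With packets in flight already at cwnd, the socket could not put a
 * larger buffer on the wire anyway; growing sndbuf would only pin memory.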
*/ if (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp)) return false; return true; } static void tcp_new_space(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (tcp_should_expand_sndbuf(sk)) { tcp_sndbuf_expand(sk); tp->snd_cwnd_stamp = tcp_jiffies32; } INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk); } /* Caller made space either from: * 1) Freeing skbs in rtx queues (after tp->snd_una has advanced) * 2) Sent skbs from output queue (and thus advancing tp->snd_nxt) * * We might be able to generate EPOLLOUT to the application if: * 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2 * 2) notsent amount (tp->write_seq - tp->snd_nxt) became * small enough that tcp_stream_memory_free() decides it * is time to generate EPOLLOUT. */ void tcp_check_space(struct sock *sk) { /* pairs with tcp_poll() */ smp_mb(); if (sk->sk_socket && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { tcp_new_space(sk); if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } } static inline void tcp_data_snd_check(struct sock *sk) { tcp_push_pending_frames(sk); tcp_check_space(sk); } /* * Check if sending an ack is needed. */ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) { struct tcp_sock *tp = tcp_sk(sk); unsigned long rtt, delay; /* More than one full frame received... */ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && /* ... and right edge of window advances far enough. * (tcp_recvmsg() will send ACK otherwise). * If application uses SO_RCVLOWAT, we want send ack now if * we have not received enough bytes to satisfy the condition. */ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || __tcp_select_window(sk) >= tp->rcv_wnd)) || /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) { /* If we are running from __release_sock() in user context, * Defer the ack until tcp_release_cb(). */ if (sock_owned_by_user_nocheck(sk) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) { set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags); return; } send_now: tcp_send_ack(sk); return; } if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tcp_send_delayed_ack(sk); return; } if (!tcp_is_sack(tp) || tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)) goto send_now; if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { tp->compressed_ack_rcv_nxt = tp->rcv_nxt; tp->dup_ack_counter = 0; } if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) { tp->dup_ack_counter++; goto send_now; } tp->compressed_ack++; if (hrtimer_is_queued(&tp->compressed_ack_timer)) return; /* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */ rtt = tp->rcv_rtt_est.rtt_us; if (tp->srtt_us && tp->srtt_us < rtt) rtt = tp->srtt_us; delay = min_t(unsigned long, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns), rtt * (NSEC_PER_USEC >> 3)/20); sock_hold(sk); hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns), HRTIMER_MODE_REL_PINNED_SOFT); } static inline void tcp_ack_snd_check(struct sock *sk) { if (!inet_csk_ack_scheduled(sk)) { /* We sent a data segment already. */ return; } __tcp_ack_snd_check(sk, 1); } /* * This routine is only called when we have urgent data * signaled. Its the 'slow' part of tcp_urg. It could be * moved inline now as tcp_urg is only called from one * place. We handle URGent data wrong. 
We have to - as * BSD still doesn't use the correction from RFC961. * For 1003.1g we should support a new option TCP_STDURG to permit * either form (or just set the sysctl tcp_stdurg). */ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) { struct tcp_sock *tp = tcp_sk(sk); u32 ptr = ntohs(th->urg_ptr); if (ptr && !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_stdurg)) ptr--; ptr += ntohl(th->seq); /* Ignore urgent data that we've already seen and read. */ if (after(tp->copied_seq, ptr)) return; /* Do not replay urg ptr. * * NOTE: interesting situation not covered by specs. * Misbehaving sender may send urg ptr, pointing to segment, * which we already have in ofo queue. We are not able to fetch * such data and will stay in TCP_URG_NOTYET until will be eaten * by recvmsg(). Seems, we are not obliged to handle such wicked * situations. But it is worth to think about possibility of some * DoSes using some hypothetical application level deadlock. */ if (before(ptr, tp->rcv_nxt)) return; /* Do we already have a newer (or duplicate) urgent pointer? */ if (tp->urg_data && !after(ptr, tp->urg_seq)) return; /* Tell the world about our new urgent pointer. */ sk_send_sigurg(sk); /* We may be adding urgent data when the last byte read was * urgent. To do this requires some care. We cannot just ignore * tp->copied_seq since we would read the last urgent byte again * as data, nor can we alter copied_seq until this data arrives * or we break the semantics of SIOCATMARK (and thus sockatmark()) * * NOTE. Double Dutch. Rendering to plain English: author of comment * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); * and expect that both A and B disappear from stream. This is _wrong_. * Though this happens in BSD with high probability, this is occasional. * Any application relying on this is buggy. Note also, that fix "works" * only in this artificial test. Insert some normal data between A and B and we will * decline of BSD again. Verdict: it is better to remove to trap * buggy users. */ if (tp->urg_seq == tp->copied_seq && tp->urg_data && !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) { struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); tp->copied_seq++; if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) { __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); } } WRITE_ONCE(tp->urg_data, TCP_URG_NOTYET); WRITE_ONCE(tp->urg_seq, ptr); /* Disable header prediction. */ tp->pred_flags = 0; } /* This is the 'fast' part of urgent handling. */ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) { struct tcp_sock *tp = tcp_sk(sk); /* Check if we get a new urgent pointer - normally not. */ if (unlikely(th->urg)) tcp_check_urg(sk, th); /* Do we wait for any urgent data? - normally not... */ if (unlikely(tp->urg_data == TCP_URG_NOTYET)) { u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) - th->syn; /* Is the urgent pointer pointing into this packet? */ if (ptr < skb->len) { u8 tmp; if (skb_copy_bits(skb, ptr, &tmp, 1)) BUG(); WRITE_ONCE(tp->urg_data, TCP_URG_VALID | tmp); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); } } } /* Accept RST for rcv_nxt - 1 after a FIN. * When tcp connections are abruptly terminated from Mac OSX (via ^C), a * FIN is sent followed by a RST packet. The RST is sent with the same * sequence number as the FIN, and thus according to RFC 5961 a challenge * ACK should be sent. However, Mac OSX rate limits replies to challenge * ACKs on the closed socket. 
In addition middleboxes can drop either the * challenge ACK or a subsequent RST. */ static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) && (1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK | TCPF_CLOSING)); } /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. */ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, int syn_inerr) { struct tcp_sock *tp = tcp_sk(sk); SKB_DR(reason); /* RFC1323: H1. Apply PAWS check first. */ if (!tcp_fast_parse_options(sock_net(sk), skb, th, tp) || !tp->rx_opt.saw_tstamp || tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW)) goto step1; reason = tcp_disordered_ack_check(sk, skb); if (!reason) goto step1; /* Reset is accepted even if it did not pass PAWS. */ if (th->rst) goto step1; if (unlikely(th->syn)) goto syn_challenge; /* Old ACK are common, increment PAWS_OLD_ACK * and do not send a dupack. */ if (reason == SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWS_OLD_ACK); goto discard; } NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDPAWS, &tp->last_oow_ack_time)) tcp_send_dupack(sk, skb); goto discard; step1: /* Step 1: check sequence number */ reason = tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); if (reason) { /* RFC793, page 37: "In all states except SYN-SENT, all reset * (RST) segments are validated by checking their SEQ-fields." * And page 69: "If an incoming segment is not acceptable, * an acknowledgment should be sent in reply (unless the RST * bit is set, if so drop the segment and return)". */ if (!th->rst) { if (th->syn) goto syn_challenge; if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSEQ, &tp->last_oow_ack_time)) tcp_send_dupack(sk, skb); } else if (tcp_reset_check(sk, skb)) { goto reset; } goto discard; } /* Step 2: check RST bit */ if (th->rst) { /* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a * FIN and SACK too if available): * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or * the right-most SACK block, * then * RESET the connection * else * Send a challenge ACK */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt || tcp_reset_check(sk, skb)) goto reset; if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) { struct tcp_sack_block *sp = &tp->selective_acks[0]; int max_sack = sp[0].end_seq; int this_sack; for (this_sack = 1; this_sack < tp->rx_opt.num_sacks; ++this_sack) { max_sack = after(sp[this_sack].end_seq, max_sack) ? 
sp[this_sack].end_seq : max_sack; } if (TCP_SKB_CB(skb)->seq == max_sack) goto reset; } /* Disable TFO if RST is out-of-order * and no data has been received * for current active TFO socket */ if (tp->syn_fastopen && !tp->data_segs_in && sk->sk_state == TCP_ESTABLISHED) tcp_fastopen_active_disable(sk); tcp_send_challenge_ack(sk); SKB_DR_SET(reason, TCP_RESET); goto discard; } /* step 3: check security and precedence [ignored] */ /* step 4: Check for a SYN * RFC 5961 4.2 : Send a challenge ack */ if (th->syn) { if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt && TCP_SKB_CB(skb)->ack_seq == tp->snd_nxt) goto pass; syn_challenge: if (syn_inerr) TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); tcp_send_challenge_ack(sk); SKB_DR_SET(reason, TCP_INVALID_SYN); goto discard; } pass: bpf_skops_parse_hdr(sk, skb); return true; discard: tcp_drop_reason(sk, skb, reason); return false; reset: tcp_reset(sk, skb); __kfree_skb(skb); return false; } /* * TCP receive function for the ESTABLISHED state. * * It is split into a fast path and a slow path. The fast path is * disabled when: * - A zero window was announced from us - zero window probing * is only handled properly in the slow path. * - Out of order segments arrived. * - Urgent data is expected. * - There is no buffer space left * - Unexpected TCP flags/window values/header lengths are received * (detected by checking the TCP header against pred_flags) * - Data is sent in both directions. Fast path only supports pure senders * or pure receivers (this means either the sequence number or the ack * value must stay constant) * - Unexpected TCP option. * * When these conditions are not satisfied it drops into a standard * receive procedure patterned after RFC793 to handle all cases. * The first three cases are guaranteed by proper pred_flags setting, * the rest is checked inline. Fast processing is turned on in * tcp_data_queue when everything is OK. */ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; const struct tcphdr *th = (const struct tcphdr *)skb->data; struct tcp_sock *tp = tcp_sk(sk); unsigned int len = skb->len; /* TCP congestion window tracking */ trace_tcp_probe(sk, skb); tcp_mstamp_refresh(tp); if (unlikely(!rcu_access_pointer(sk->sk_rx_dst))) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); /* * Header prediction. * The code loosely follows the one in the famous * "30 instruction TCP receive" Van Jacobson mail. * * Van's trick is to deposit buffers into socket queue * on a device interrupt, to call tcp_recv function * on the receive process context and checksum and copy * the buffer to user space. smart... * * Our current scheme is not silly either but we take the * extra cost of the net_bh soft interrupt processing... * We do checksum and copy also but from device to kernel. */ tp->rx_opt.saw_tstamp = 0; /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_prediction is to be made * 'S' will always be tp->tcp_header_len >> 2 * '?' will be 0 for the fast path, otherwise pred_flags is 0 to * turn it off (when there are holes in the receive * space for instance) * PSH flag is ignored. 
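 * Concrete instance (illustration): with timestamps negotiated the
 * header is 32 bytes, so S = 32 >> 2 = 8 and the predicted word is
 * 0x8010 << 16 | snd_wnd on the fast path; the PSH bit is masked off
 * by TCP_HP_BITS before the compare.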
*/ if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt && !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { int tcp_header_len = tp->tcp_header_len; s32 delta = 0; int flag = 0; /* Timestamp header prediction: tcp_header_len * is automatically equal to th->doff*4 due to pred_flags * match. */ /* Check timestamp */ if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { /* No? Slow path! */ if (!tcp_parse_aligned_timestamp(tp, th)) goto slow_path; delta = tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent; /* If PAWS failed, check it more carefully in slow path */ if (delta < 0) goto slow_path; /* DO NOT update ts_recent here, if checksum fails * and timestamp was corrupted part, it will result * in a hung connection since we will drop all * future packets due to the PAWS test. */ } if (len <= tcp_header_len) { /* Bulk data transfer: sender */ if (len == tcp_header_len) { /* Predicted packet is in window by definition. * seq == rcv_nxt and rcv_wup <= rcv_nxt. * Hence, check seq<=rcv_wup reduces to: */ if (tcp_header_len == (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && tp->rcv_nxt == tp->rcv_wup) flag |= __tcp_replace_ts_recent(tp, delta); /* We know that such packets are checksummed * on entry. */ tcp_ack(sk, skb, flag); __kfree_skb(skb); tcp_data_snd_check(sk); /* When receiving pure ack in fast path, update * last ts ecr directly instead of calling * tcp_rcv_rtt_measure_ts() */ tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr; return; } else { /* Header too small */ reason = SKB_DROP_REASON_PKT_TOO_SMALL; TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; } } else { int eaten = 0; bool fragstolen = false; if (tcp_checksum_complete(skb)) goto csum_error; if ((int)skb->truesize > sk->sk_forward_alloc) goto step5; /* Predicted packet is in window by definition. * seq == rcv_nxt and rcv_wup <= rcv_nxt. * Hence, check seq<=rcv_wup reduces to: */ if (tcp_header_len == (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && tp->rcv_nxt == tp->rcv_wup) flag |= __tcp_replace_ts_recent(tp, delta); tcp_rcv_rtt_measure_ts(sk, skb); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ tcp_cleanup_skb(skb); __skb_pull(skb, tcp_header_len); eaten = tcp_queue_rcv(sk, skb, &fragstolen); tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { /* Well, only one small jumplet in fast path... */ tcp_ack(sk, skb, flag | FLAG_DATA); tcp_data_snd_check(sk); if (!inet_csk_ack_scheduled(sk)) goto no_ack; } else { tcp_update_wl(tp, TCP_SKB_CB(skb)->seq); } __tcp_ack_snd_check(sk, 0); no_ack: if (eaten) kfree_skb_partial(skb, fragstolen); tcp_data_ready(sk); return; } } slow_path: if (len < (th->doff << 2) || tcp_checksum_complete(skb)) goto csum_error; if (!th->ack && !th->rst && !th->syn) { reason = SKB_DROP_REASON_TCP_FLAGS; goto discard; } /* * Standard slow path. */ if (!tcp_validate_incoming(sk, skb, th, 1)) return; step5: reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT); if ((int)reason < 0) { reason = -reason; goto discard; } tcp_rcv_rtt_measure_ts(sk, skb); /* Process urgent data. 
*/ tcp_urg(sk, skb, th); /* step 7: process the segment text */ tcp_data_queue(sk, skb); tcp_data_snd_check(sk); tcp_ack_snd_check(sk); return; csum_error: reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); discard: tcp_drop_reason(sk, skb, reason); } EXPORT_IPV6_MOD(tcp_rcv_established); void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); tcp_mtup_init(sk); icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); /* Initialize the congestion window to start the transfer. * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been * retransmitted. In light of RFC6298 more aggressive 1sec * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK * retransmission has occurred. */ if (tp->total_retrans > 1 && tp->undo_marker) tcp_snd_cwnd_set(tp, 1); else tcp_snd_cwnd_set(tp, tcp_init_cwnd(tp, __sk_dst_get(sk))); tp->snd_cwnd_stamp = tcp_jiffies32; bpf_skops_established(sk, bpf_op, skb); /* Initialize congestion control unless BPF initialized it already: */ if (!icsk->icsk_ca_initialized) tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); } void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); tcp_ao_finish_connect(sk, skb); tcp_set_state(sk, TCP_ESTABLISHED); icsk->icsk_ack.lrcvtime = tcp_jiffies32; if (skb) { icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); security_inet_conn_established(sk, skb); sk_mark_napi_id(sk, skb); } tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb); /* Prevent spurious tcp_cwnd_restart() on first data * packet. */ tp->lsndtime = tcp_jiffies32; if (sock_flag(sk, SOCK_KEEPOPEN)) tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); if (!tp->rx_opt.snd_wscale) __tcp_fast_path_on(tp, tp->snd_wnd); else tp->pred_flags = 0; } static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, struct tcp_fastopen_cookie *cookie) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL; u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; bool syn_drop = false; if (mss == tp->rx_opt.user_mss) { struct tcp_options_received opt; /* Get original SYNACK MSS value if user MSS sets mss_clamp */ tcp_clear_options(&opt); opt.user_mss = opt.mss_clamp = 0; tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL); mss = opt.mss_clamp; } if (!tp->syn_fastopen) { /* Ignore an unsolicited cookie */ cookie->len = -1; } else if (tp->total_retrans) { /* SYN timed out and the SYN-ACK neither has a cookie nor * acknowledges data. Presumably the remote received only * the retransmitted (regular) SYNs: either the original * SYN-data or the corresponding SYN-ACK was dropped. */ syn_drop = (cookie->len < 0 && data); } else if (cookie->len < 0 && !tp->syn_data) { /* We requested a cookie but didn't get it. If we did not use * the (old) exp opt format then try so next time (try_exp=1). * Otherwise we go back to use the RFC7413 opt (try_exp=2). */ try_exp = tp->syn_fastopen_exp ? 
2 : 1; } tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); if (data) { /* Retransmit unacked data in SYN */ if (tp->total_retrans) tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED; else tp->fastopen_client_fail = TFO_DATA_NOT_ACKED; skb_rbtree_walk_from(data) tcp_mark_skb_lost(sk, data); tcp_non_congestion_loss_retransmit(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); return true; } tp->syn_data_acked = tp->syn_data; if (tp->syn_data_acked) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); /* SYN-data is counted as two separate packets in tcp_ack() */ if (tp->delivered > 1) --tp->delivered; } tcp_fastopen_add_skb(sk, synack); return false; } static void smc_check_reset_syn(struct tcp_sock *tp) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (tp->syn_smc && !tp->rx_opt.smc_ok) tp->syn_smc = 0; } #endif } static void tcp_try_undo_spurious_syn(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 syn_stamp; /* undo_marker is set when SYN or SYNACK times out. The timeout is * spurious if the ACK's timestamp option echo value matches the * original SYN timestamp. */ syn_stamp = tp->retrans_stamp; if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp && syn_stamp == tp->rx_opt.rcv_tsecr) tp->undo_marker = 0; } static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_cookie foc = { .len = -1 }; int saved_clamp = tp->rx_opt.mss_clamp; bool fastopen_fail; SKB_DR(reason); tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc); if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tp->rx_opt.rcv_tsecr -= tp->tsoffset; if (th->ack) { /* rfc793: * "If the state is SYN-SENT then * first check the ACK bit * If the ACK bit is set * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send * a reset (unless the RST bit is set, if so drop * the segment and return)" */ if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { /* Previous FIN/ACK or RST/ACK might be ignored. */ if (icsk->icsk_retransmits == 0) tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, TCP_TIMEOUT_MIN, false); SKB_DR_SET(reason, TCP_INVALID_ACK_SEQUENCE); goto reset_and_undo; } if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, tcp_time_stamp_ts(tp))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED); SKB_DR_SET(reason, TCP_RFC7323_PAWS); goto reset_and_undo; } /* Now ACK is acceptable. * * "If the RST bit is set * If the ACK was acceptable then signal the user "error: * connection reset", drop the segment, enter CLOSED state, * delete TCB, and return." */ if (th->rst) { tcp_reset(sk, skb); consume: __kfree_skb(skb); return 0; } /* rfc793: * "fifth, if neither of the SYN or RST bits is set then * drop the segment and return." * * See note below! * --ANK(990513) */ if (!th->syn) { SKB_DR_SET(reason, TCP_FLAGS); goto discard_and_undo; } /* rfc793: * "If the SYN bit is on ... * are acceptable then ... * (our SYN has been ACKed), change the connection * state to ESTABLISHED..." */ tcp_ecn_rcv_synack(tp, th); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_try_undo_spurious_syn(sk); tcp_ack(sk, skb, FLAG_SLOWPATH); /* Ok.. it's good. Set up sequence numbers and * move to established. 
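 * rcv_nxt and rcv_wup start at the peer's ISN + 1 because the SYN
 * itself consumes one sequence number.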
*/ WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. */ tp->snd_wnd = ntohs(th->window); if (!tp->rx_opt.wscale_ok) { tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; WRITE_ONCE(tp->window_clamp, min(tp->window_clamp, 65535U)); } if (tp->rx_opt.saw_tstamp) { tp->rx_opt.tstamp_ok = 1; tp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; tcp_store_ts_recent(tp); } else { tp->tcp_header_len = sizeof(struct tcphdr); } tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); /* Remember, tcp_poll() does not lock socket! * Change state from SYN-SENT only after copied_seq * is initialized. */ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); smc_check_reset_syn(tp); smp_mb(); tcp_finish_connect(sk, skb); fastopen_fail = (tp->syn_fastopen || tp->syn_data) && tcp_rcv_fastopen_synack(sk, skb, &foc); if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); } if (fastopen_fail) return -1; if (sk->sk_write_pending || READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept) || inet_csk_in_pingpong_mode(sk)) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. * * It may be deleted, but with this feature tcpdumps * look so _wonderfully_ clever, that I was not able * to stand against the temptation 8) --ANK */ inet_csk_schedule_ack(sk); tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); tcp_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX, false); goto consume; } tcp_send_ack(sk); return -1; } /* No ACK in the segment */ if (th->rst) { /* rfc793: * "If the RST bit is set * * Otherwise (no ACK) drop the segment and return." */ SKB_DR_SET(reason, TCP_RESET); goto discard_and_undo; } /* PAWS check. */ if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && tcp_paws_reject(&tp->rx_opt, 0)) { SKB_DR_SET(reason, TCP_RFC7323_PAWS); goto discard_and_undo; } if (th->syn) { /* We see SYN without ACK. It is attempt of * simultaneous connect with crossed SYNs. * Particularly, it can be connect to self. */ #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao; ao = rcu_dereference_protected(tp->ao_info, lockdep_sock_is_held(sk)); if (ao) { WRITE_ONCE(ao->risn, th->seq); ao->rcv_sne = 0; } #endif tcp_set_state(sk, TCP_SYN_RECV); if (tp->rx_opt.saw_tstamp) { tp->rx_opt.tstamp_ok = 1; tcp_store_ts_recent(tp); tp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else { tp->tcp_header_len = sizeof(struct tcphdr); } WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. */ tp->snd_wnd = ntohs(th->window); tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->max_window = tp->snd_wnd; tcp_ecn_rcv_syn(tp, th); tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); tcp_send_synack(sk); #if 0 /* Note, we could accept data and URG from this segment. * There are no obstacles to make this (except that we must * either change tcp_recvmsg() to prevent it from returning data * before 3WHS completes per RFC793, or employ TCP Fast Open). * * However, if we ignore data in ACKless segments sometimes, * we have no reasons to accept it sometimes. * Also, seems the code doing it in step6 of tcp_rcv_state_process * is not flawless. So, discard packet for sanity. * Uncomment this return to process the data. 
*/ return -1; #else goto consume; #endif } /* "fifth, if neither of the SYN or RST bits is set then * drop the segment and return." */ discard_and_undo: tcp_clear_options(&tp->rx_opt); tp->rx_opt.mss_clamp = saved_clamp; tcp_drop_reason(sk, skb, reason); return 0; reset_and_undo: tcp_clear_options(&tp->rx_opt); tp->rx_opt.mss_clamp = saved_clamp; /* we can reuse/return @reason to its caller to handle the exception */ return reason; } static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct request_sock *req; /* If we are still handling the SYNACK RTO, see if timestamp ECR allows * undo. If peer SACKs triggered fast recovery, we can't undo here. */ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss && !tp->packets_out) tcp_try_undo_recovery(sk); tcp_update_rto_time(tp); inet_csk(sk)->icsk_retransmits = 0; /* In tcp_fastopen_synack_timer() on the first SYNACK RTO we set * retrans_stamp but don't enter CA_Loss, so in case that happened we * need to zero retrans_stamp here to prevent spurious * retransmits_timed_out(). However, if the ACK of our SYNACK caused us * to enter CA_Recovery then we need to leave retrans_stamp as it was * set entering CA_Recovery, for correct retransmits_timed_out() and * undo behavior. */ tcp_retrans_stamp_cleanup(sk); /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1, * we no longer need req so release it. */ req = rcu_dereference_protected(tp->fastopen_rsk, lockdep_sock_is_held(sk)); reqsk_fastopen_remove(sk, req, false); /* Re-arm the timer because data may have been sent out. * This is similar to the regular data transmission case * when new data has just been ack'ed. * * (TFO) - we could try to be more aggressive and * retransmitting any data sooner based on when they * are sent out. */ tcp_rearm_rto(sk); } /* * This function implements the receiving procedure of RFC 793 for * all states except ESTABLISHED and TIME_WAIT. * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be * address independent. */ enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); const struct tcphdr *th = tcp_hdr(skb); struct request_sock *req; int queued = 0; SKB_DR(reason); switch (sk->sk_state) { case TCP_CLOSE: SKB_DR_SET(reason, TCP_CLOSE); goto discard; case TCP_LISTEN: if (th->ack) return SKB_DROP_REASON_TCP_FLAGS; if (th->rst) { SKB_DR_SET(reason, TCP_RESET); goto discard; } if (th->syn) { if (th->fin) { SKB_DR_SET(reason, TCP_FLAGS); goto discard; } /* It is possible that we process SYN packets from backlog, * so we need to make sure to disable BH and RCU right there. */ rcu_read_lock(); local_bh_disable(); icsk->icsk_af_ops->conn_request(sk, skb); local_bh_enable(); rcu_read_unlock(); consume_skb(skb); return 0; } SKB_DR_SET(reason, TCP_FLAGS); goto discard; case TCP_SYN_SENT: tp->rx_opt.saw_tstamp = 0; tcp_mstamp_refresh(tp); queued = tcp_rcv_synsent_state_process(sk, skb, th); if (queued >= 0) return queued; /* Do step6 onward by hand. 
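 * ("Step 6 onward" refers to RFC 793's segment-processing order, the
 * same numbering used elsewhere in this file: step 6 checks the URG
 * bit, step 7 processes the segment text; hence the tcp_urg() call
 * and the send-side checks that follow.)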
*/ tcp_urg(sk, skb, th); __kfree_skb(skb); tcp_data_snd_check(sk); return 0; } tcp_mstamp_refresh(tp); tp->rx_opt.saw_tstamp = 0; req = rcu_dereference_protected(tp->fastopen_rsk, lockdep_sock_is_held(sk)); if (req) { bool req_stolen; WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); SKB_DR_SET(reason, TCP_FASTOPEN); if (!tcp_check_req(sk, skb, req, true, &req_stolen, &reason)) goto discard; } if (!th->ack && !th->rst && !th->syn) { SKB_DR_SET(reason, TCP_FLAGS); goto discard; } if (!tcp_validate_incoming(sk, skb, th, 0)) return 0; /* step 5: check the ACK field */ reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT | FLAG_NO_CHALLENGE_ACK); if ((int)reason <= 0) { if (sk->sk_state == TCP_SYN_RECV) { /* send one RST */ if (!reason) return SKB_DROP_REASON_TCP_OLD_ACK; return -reason; } /* accept old ack during closing */ if ((int)reason < 0) { tcp_send_challenge_ack(sk); reason = -reason; goto discard; } } SKB_DR_SET(reason, NOT_SPECIFIED); switch (sk->sk_state) { case TCP_SYN_RECV: tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */ if (!tp->srtt_us) tcp_synack_rtt_meas(sk, req); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; if (req) { tcp_rcv_synrecv_state_fastopen(sk); } else { tcp_try_undo_spurious_syn(sk); tp->retrans_stamp = 0; tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, skb); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); } tcp_ao_established(sk); smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); sk->sk_state_change(sk); /* Note, that this wakeup is only for marginal crossed SYN case. * Passively open sockets are not waked up, because * sk->sk_sleep == NULL and sk->sk_socket == NULL. */ if (sk->sk_socket) sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); tp->snd_una = TCP_SKB_CB(skb)->ack_seq; tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); if (!inet_csk(sk)->icsk_ca_ops->cong_control) tcp_update_pacing_rate(sk); /* Prevent spurious tcp_cwnd_restart() on first data packet */ tp->lsndtime = tcp_jiffies32; tcp_initialize_rcv_mss(sk); tcp_fast_path_on(tp); if (sk->sk_shutdown & SEND_SHUTDOWN) tcp_shutdown(sk, SEND_SHUTDOWN); break; case TCP_FIN_WAIT1: { int tmo; if (req) tcp_rcv_synrecv_state_fastopen(sk); if (tp->snd_una != tp->write_seq) break; tcp_set_state(sk, TCP_FIN_WAIT2); WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | SEND_SHUTDOWN); sk_dst_confirm(sk); if (!sock_flag(sk, SOCK_DEAD)) { /* Wake up lingering close() */ sk->sk_state_change(sk); break; } if (READ_ONCE(tp->linger2) < 0) { tcp_done(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); return SKB_DROP_REASON_TCP_ABORT_ON_DATA; } if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { /* Receive out of order FIN after close() */ if (tp->syn_fastopen && th->fin) tcp_fastopen_active_disable(sk); tcp_done(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); return SKB_DROP_REASON_TCP_ABORT_ON_DATA; } tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else if (th->fin || sock_owned_by_user(sk)) { /* Bad case. We could lose such FIN otherwise. * It is not a big problem, but it looks confusing * and not so rare event. We still can lose it now, * if it spins in bh_lock_sock(), but it is really * marginal case. 
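 *
 * (Hedged editor's note on the timer arithmetic above: with the
 * stock tcp_fin_timeout of 60 seconds, tcp_fin_time() typically does
 * not exceed TCP_TIMEWAIT_LEN, so the first branch is normally
 * skipped and this branch only runs when a FIN is already queued or
 * the socket is owned by user context; everything else falls through
 * to tcp_time_wait() below.)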
*/ tcp_reset_keepalive_timer(sk, tmo); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto consume; } break; } case TCP_CLOSING: if (tp->snd_una == tp->write_seq) { tcp_time_wait(sk, TCP_TIME_WAIT, 0); goto consume; } break; case TCP_LAST_ACK: if (tp->snd_una == tp->write_seq) { tcp_update_metrics(sk); tcp_done(sk); goto consume; } break; } /* step 6: check the URG bit */ tcp_urg(sk, skb, th); /* step 7: process the segment text */ switch (sk->sk_state) { case TCP_CLOSE_WAIT: case TCP_CLOSING: case TCP_LAST_ACK: if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { /* If a subflow has been reset, the packet should not * continue to be processed, drop the packet. */ if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) goto discard; break; } fallthrough; case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: /* RFC 793 says to queue data in these states, * RFC 1122 says we MUST send a reset. * BSD 4.4 also does reset. */ if (sk->sk_shutdown & RCV_SHUTDOWN) { if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); tcp_reset(sk, skb); return SKB_DROP_REASON_TCP_ABORT_ON_DATA; } } fallthrough; case TCP_ESTABLISHED: tcp_data_queue(sk, skb); queued = 1; break; } /* tcp_data could move socket to TIME-WAIT */ if (sk->sk_state != TCP_CLOSE) { tcp_data_snd_check(sk); tcp_ack_snd_check(sk); } if (!queued) { discard: tcp_drop_reason(sk, skb, reason); } return 0; consume: __kfree_skb(skb); return 0; } EXPORT_IPV6_MOD(tcp_rcv_state_process); static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) { struct inet_request_sock *ireq = inet_rsk(req); if (family == AF_INET) net_dbg_ratelimited("drop open request from %pI4/%u\n", &ireq->ir_rmt_addr, port); #if IS_ENABLED(CONFIG_IPV6) else if (family == AF_INET6) net_dbg_ratelimited("drop open request from %pI6/%u\n", &ireq->ir_v6_rmt_addr, port); #endif } /* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set * * If we receive a SYN packet with these bits set, it means a * network is playing bad games with TOS bits. In order to * avoid possible false congestion notifications, we disable * TCP ECN negotiation. * * Exception: tcp_ca wants ECN. This is required for DCTCP * congestion control: Linux DCTCP asserts ECT on all packets, * including SYN, which is most optimal solution; however, * others, such as FreeBSD do not. * * Exception: At least one of the reserved bits of the TCP header (th->res1) is * set, indicating the use of a future TCP extension (such as AccECN). See * RFC8311 §4.3 which updates RFC3168 to allow the development of such * extensions. 
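 *
 * Summarizing the decision below: a SYN carrying both ECE and CWR
 * negotiates ECN when (a) the sysctl or a dst feature permits it and
 * the SYN either was not ECT-marked or has a reserved header bit set,
 * or (b) the congestion-control module (kernel or BPF) explicitly
 * wants ECN.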
*/ static void tcp_ecn_create_request(struct request_sock *req, const struct sk_buff *skb, const struct sock *listen_sk, const struct dst_entry *dst) { const struct tcphdr *th = tcp_hdr(skb); const struct net *net = sock_net(listen_sk); bool th_ecn = th->ece && th->cwr; bool ect, ecn_ok; u32 ecn_ok_dst; if (!th_ecn) return; ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); ecn_ok = READ_ONCE(net->ipv4.sysctl_tcp_ecn) || ecn_ok_dst; if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || (ecn_ok_dst & DST_FEATURE_ECN_CA) || tcp_bpf_ca_needs_ecn((struct sock *)req)) inet_rsk(req)->ecn_ok = 1; } static void tcp_openreq_init(struct request_sock *req, const struct tcp_options_received *rx_opt, struct sk_buff *skb, const struct sock *sk) { struct inet_request_sock *ireq = inet_rsk(req); req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */ tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_rsk(req)->snt_synack = 0; tcp_rsk(req)->snt_tsval_first = 0; tcp_rsk(req)->last_oow_ack_time = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; ireq->tstamp_ok = rx_opt->tstamp_ok; ireq->sack_ok = rx_opt->sack_ok; ireq->snd_wscale = rx_opt->snd_wscale; ireq->wscale_ok = rx_opt->wscale_ok; ireq->acked = 0; ireq->ecn_ok = 0; ireq->ir_rmt_port = tcp_hdr(skb)->source; ireq->ir_num = ntohs(tcp_hdr(skb)->dest); ireq->ir_mark = inet_request_mark(sk, skb); #if IS_ENABLED(CONFIG_SMC) ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested && tcp_sk(sk)->smc_hs_congested(sk)); #endif } /* * Return true if a syncookie should be sent */ static bool tcp_syn_flood_action(struct sock *sk, const char *proto) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const char *msg = "Dropping request"; struct net *net = sock_net(sk); bool want_cookie = false; u8 syncookies; syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); #ifdef CONFIG_SYN_COOKIES if (syncookies) { msg = "Sending cookies"; want_cookie = true; __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); } else #endif __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); if (!READ_ONCE(queue->synflood_warned) && syncookies != 2 && xchg(&queue->synflood_warned, 1) == 0) { if (IS_ENABLED(CONFIG_IPV6) && sk->sk_family == AF_INET6) { net_info_ratelimited("%s: Possible SYN flooding on port [%pI6c]:%u. %s.\n", proto, inet6_rcv_saddr(sk), sk->sk_num, msg); } else { net_info_ratelimited("%s: Possible SYN flooding on port %pI4:%u. %s.\n", proto, &sk->sk_rcv_saddr, sk->sk_num, msg); } } return want_cookie; } static void tcp_reqsk_record_syn(const struct sock *sk, struct request_sock *req, const struct sk_buff *skb) { if (tcp_sk(sk)->save_syn) { u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb); struct saved_syn *saved_syn; u32 mac_hdrlen; void *base; if (tcp_sk(sk)->save_syn == 2) { /* Save full header. */ base = skb_mac_header(skb); mac_hdrlen = skb_mac_header_len(skb); len += mac_hdrlen; } else { base = skb_network_header(skb); mac_hdrlen = 0; } saved_syn = kmalloc(struct_size(saved_syn, data, len), GFP_ATOMIC); if (saved_syn) { saved_syn->mac_hdrlen = mac_hdrlen; saved_syn->network_hdrlen = skb_network_header_len(skb); saved_syn->tcp_hdrlen = tcp_hdrlen(skb); memcpy(saved_syn->data, base, len); req->saved_syn = saved_syn; } } } /* If a SYN cookie is required and supported, returns a clamped MSS value to be * used for SYN cookie generation. 
*/ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct tcphdr *th) { struct tcp_sock *tp = tcp_sk(sk); u16 mss; if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) != 2 && !inet_csk_reqsk_queue_is_full(sk)) return 0; if (!tcp_syn_flood_action(sk, rsk_ops->slab_name)) return 0; if (sk_acceptq_is_full(sk)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); return 0; } mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss); if (!mss) mss = af_ops->mss_clamp; return mss; } EXPORT_IPV6_MOD_GPL(tcp_get_syncookie_mss); int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) { struct tcp_fastopen_cookie foc = { .len = -1 }; struct tcp_options_received tmp_opt; struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct sock *fastopen_sk = NULL; struct request_sock *req; bool want_cookie = false; struct dst_entry *dst; struct flowi fl; u8 syncookies; u32 isn; #ifdef CONFIG_TCP_AO const struct tcp_ao_hdr *aoh; #endif isn = __this_cpu_read(tcp_tw_isn); if (isn) { /* TW buckets are converted to open requests without * limitations, they conserve resources and peer is * evidently real one. */ __this_cpu_write(tcp_tw_isn, 0); } else { syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); if (syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) { want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name); if (!want_cookie) goto drop; } } if (sk_acceptq_is_full(sk)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); goto drop; } req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie); if (!req) goto drop; req->syncookie = want_cookie; tcp_rsk(req)->af_specific = af_ops; tcp_rsk(req)->ts_off = 0; tcp_rsk(req)->req_usec_ts = false; #if IS_ENABLED(CONFIG_MPTCP) tcp_rsk(req)->is_mptcp = 0; #endif tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = af_ops->mss_clamp; tmp_opt.user_mss = tp->rx_opt.user_mss; tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, want_cookie ? NULL : &foc); if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); if (IS_ENABLED(CONFIG_SMC) && want_cookie) tmp_opt.smc_ok = 0; tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb, sk); inet_rsk(req)->no_srccheck = inet_test_bit(TRANSPARENT, sk); /* Note: tcp_v6_init_req() might override ir_iif for link locals */ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); dst = af_ops->route_req(sk, skb, &fl, req, isn); if (!dst) goto drop_and_free; if (tmp_opt.tstamp_ok) { tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst); tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb); } if (!want_cookie && !isn) { int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog); /* Kill the following clause, if you dislike this way. */ if (!syncookies && (max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (max_syn_backlog >> 2)) && !tcp_peer_is_proven(req, dst)) { /* Without syncookies last quarter of * backlog is filled with destinations, * proven to be alive. * It means that we continue to communicate * to destinations, already remembered * to the moment of synflood. 
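 *
 * (Worked example: with max_syn_backlog = 256 this clause fires once
 * fewer than 64 request slots remain, i.e. the queue is over three
 * quarters full, and it only drops destinations with no prior proof
 * of liveness.)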
*/ pr_drop_req(req, ntohs(tcp_hdr(skb)->source), rsk_ops->family); goto drop_and_release; } isn = af_ops->init_seq(skb); } tcp_ecn_create_request(req, skb, sk, dst); if (want_cookie) { isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); if (!tmp_opt.tstamp_ok) inet_rsk(req)->ecn_ok = 0; } #ifdef CONFIG_TCP_AO if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) goto drop_and_release; /* Invalid TCP options */ if (aoh) { tcp_rsk(req)->used_tcp_ao = true; tcp_rsk(req)->ao_rcv_next = aoh->keyid; tcp_rsk(req)->ao_keyid = aoh->rnext_keyid; } else { tcp_rsk(req)->used_tcp_ao = false; } #endif tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; tcp_openreq_init_rwin(req, sk, dst); sk_rx_queue_set(req_to_sk(req), skb); if (!want_cookie) { tcp_reqsk_record_syn(sk, req, skb); fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); } if (fastopen_sk) { af_ops->send_synack(fastopen_sk, dst, &fl, req, &foc, TCP_SYNACK_FASTOPEN, skb); /* Add the child socket directly into the accept queue */ if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) { reqsk_fastopen_remove(fastopen_sk, req, false); bh_unlock_sock(fastopen_sk); sock_put(fastopen_sk); goto drop_and_free; } sk->sk_data_ready(sk); bh_unlock_sock(fastopen_sk); sock_put(fastopen_sk); } else { tcp_rsk(req)->tfo_listener = false; if (!want_cookie) { req->timeout = tcp_timeout_init((struct sock *)req); if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, req->timeout))) { reqsk_free(req); dst_release(dst); return 0; } } af_ops->send_synack(sk, dst, &fl, req, &foc, !want_cookie ? TCP_SYNACK_NORMAL : TCP_SYNACK_COOKIE, skb); if (want_cookie) { reqsk_free(req); return 0; } } reqsk_put(req); return 0; drop_and_release: dst_release(dst); drop_and_free: __reqsk_free(req); drop: tcp_listendrop(sk); return 0; } EXPORT_IPV6_MOD(tcp_conn_request);
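The Fast Open machinery above (tcp_rcv_fastopen_synack() on the active side, tcp_try_fastopen() reached from tcp_conn_request() on the passive side) is driven from user space by MSG_FASTOPEN on the client and the TCP_FASTOPEN listener option. What follows is a minimal, hedged client sketch, not part of the kernel sources: the destination address is a placeholder and the MSG_FASTOPEN fallback define is only for older libc headers.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000	/* from linux/socket.h */
#endif

int main(void)
{
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port   = htons(80),
	};
	const char req[] = "GET / HTTP/1.0\r\n\r\n";
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);	/* placeholder */

	/* sendto() with MSG_FASTOPEN on an unconnected socket performs
	 * connect+send in one call. Without a cached cookie the first
	 * attempt sends a plain SYN that requests one; later attempts
	 * carry SYN-data, which is what tcp_rcv_fastopen_synack()
	 * validates on the client side. */
	if (sendto(fd, req, strlen(req), MSG_FASTOPEN,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("sendto");

	close(fd);
	return 0;
}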
/*
 * Copyright (c) 2007, 2020 Oracle and/or its affiliates.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */

#include "rds.h"

/*
 * XXX
 *  - build with sparse
 *  - should we detect duplicate keys on a socket?  hmm.
 *  - an rdma is an mlock, apply rlimit?
 */

/*
 * get the number of pages by looking at the page indices that the start and
 * end addresses fall in.
 *
 * Returns 0 if the vec is invalid.  It is invalid if the number of bytes
 * causes the address to wrap or overflows an unsigned int.  This comes
 * from being stored in the 'length' member of 'struct scatterlist'.
 */
static unsigned int rds_pages_in_vec(struct rds_iovec *vec)
{
	if ((vec->addr + vec->bytes <= vec->addr) ||
	    (vec->bytes > (u64)UINT_MAX))
		return 0;

	return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) -
		(vec->addr >> PAGE_SHIFT);
}

static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key,
				       struct rds_mr *insert)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct rds_mr *mr;

	while (*p) {
		parent = *p;
		mr = rb_entry(parent, struct rds_mr, r_rb_node);

		if (key < mr->r_key)
			p = &(*p)->rb_left;
		else if (key > mr->r_key)
			p = &(*p)->rb_right;
		else
			return mr;
	}

	if (insert) {
		rb_link_node(&insert->r_rb_node, parent, p);
		rb_insert_color(&insert->r_rb_node, root);
		kref_get(&insert->r_kref);
	}

	return NULL;
}

/*
 * Destroy the transport-specific part of a MR.
*/ static void rds_destroy_mr(struct rds_mr *mr) { struct rds_sock *rs = mr->r_sock; void *trans_private = NULL; unsigned long flags; rdsdebug("RDS: destroy mr key is %x refcnt %u\n", mr->r_key, kref_read(&mr->r_kref)); spin_lock_irqsave(&rs->rs_rdma_lock, flags); if (!RB_EMPTY_NODE(&mr->r_rb_node)) rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); trans_private = mr->r_trans_private; mr->r_trans_private = NULL; spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (trans_private) mr->r_trans->free_mr(trans_private, mr->r_invalidate); } void __rds_put_mr_final(struct kref *kref) { struct rds_mr *mr = container_of(kref, struct rds_mr, r_kref); rds_destroy_mr(mr); kfree(mr); } /* * By the time this is called we can't have any more ioctls called on * the socket so we don't need to worry about racing with others. */ void rds_rdma_drop_keys(struct rds_sock *rs) { struct rds_mr *mr; struct rb_node *node; unsigned long flags; /* Release any MRs associated with this socket */ spin_lock_irqsave(&rs->rs_rdma_lock, flags); while ((node = rb_first(&rs->rs_rdma_keys))) { mr = rb_entry(node, struct rds_mr, r_rb_node); if (mr->r_trans == rs->rs_transport) mr->r_invalidate = 0; rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); kref_put(&mr->r_kref, __rds_put_mr_final); spin_lock_irqsave(&rs->rs_rdma_lock, flags); } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (rs->rs_transport && rs->rs_transport->flush_mrs) rs->rs_transport->flush_mrs(); } /* * Helper function to pin user pages. */ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, struct page **pages, int write) { unsigned int gup_flags = FOLL_LONGTERM; int ret; if (write) gup_flags |= FOLL_WRITE; ret = pin_user_pages_fast(user_addr, nr_pages, gup_flags, pages); if (ret >= 0 && ret < nr_pages) { unpin_user_pages(pages, ret); ret = -EFAULT; } return ret; } static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, u64 *cookie_ret, struct rds_mr **mr_ret, struct rds_conn_path *cp) { struct rds_mr *mr = NULL, *found; struct scatterlist *sg = NULL; unsigned int nr_pages; struct page **pages = NULL; void *trans_private; unsigned long flags; rds_rdma_cookie_t cookie; unsigned int nents = 0; int need_odp = 0; long i; int ret; if (ipv6_addr_any(&rs->rs_bound_addr) || !rs->rs_transport) { ret = -ENOTCONN; /* XXX not a great errno */ goto out; } if (!rs->rs_transport->get_mr) { ret = -EOPNOTSUPP; goto out; } /* If the combination of the addr and size requested for this memory * region causes an integer overflow, return error. */ if (((args->vec.addr + args->vec.bytes) < args->vec.addr) || PAGE_ALIGN(args->vec.addr + args->vec.bytes) < (args->vec.addr + args->vec.bytes)) { ret = -EINVAL; goto out; } if (!can_do_mlock()) { ret = -EPERM; goto out; } nr_pages = rds_pages_in_vec(&args->vec); if (nr_pages == 0) { ret = -EINVAL; goto out; } /* Restrict the size of mr irrespective of underlying transport * To account for unaligned mr regions, subtract one from nr_pages */ if ((nr_pages - 1) > (RDS_MAX_MSG_SIZE >> PAGE_SHIFT)) { ret = -EMSGSIZE; goto out; } rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n", args->vec.addr, args->vec.bytes, nr_pages); /* XXX clamp nr_pages to limit the size of this alloc? 
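 * (Note: nr_pages is already bounded by the RDS_MAX_MSG_SIZE check
 * above, so this allocation of page pointers cannot be asked for an
 * arbitrarily large array.)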
*/ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) { ret = -ENOMEM; goto out; } mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); if (!mr) { ret = -ENOMEM; goto out; } kref_init(&mr->r_kref); RB_CLEAR_NODE(&mr->r_rb_node); mr->r_trans = rs->rs_transport; mr->r_sock = rs; if (args->flags & RDS_RDMA_USE_ONCE) mr->r_use_once = 1; if (args->flags & RDS_RDMA_INVALIDATE) mr->r_invalidate = 1; if (args->flags & RDS_RDMA_READWRITE) mr->r_write = 1; /* * Pin the pages that make up the user buffer and transfer the page * pointers to the mr's sg array. We check to see if we've mapped * the whole region after transferring the partial page references * to the sg array so that we can have one page ref cleanup path. * * For now we have no flag that tells us whether the mapping is * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to * the zero page. */ ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1); if (ret == -EOPNOTSUPP) { need_odp = 1; } else if (ret <= 0) { goto out; } else { nents = ret; sg = kmalloc_array(nents, sizeof(*sg), GFP_KERNEL); if (!sg) { ret = -ENOMEM; goto out; } WARN_ON(!nents); sg_init_table(sg, nents); /* Stick all pages into the scatterlist */ for (i = 0 ; i < nents; i++) sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); rdsdebug("RDS: trans_private nents is %u\n", nents); } /* Obtain a transport specific MR. If this succeeds, the * s/g list is now owned by the MR. * Note that dma_map() implies that pending writes are * flushed to RAM, so no dma_sync is needed here. */ trans_private = rs->rs_transport->get_mr( sg, nents, rs, &mr->r_key, cp ? cp->cp_conn : NULL, args->vec.addr, args->vec.bytes, need_odp ? ODP_ZEROBASED : ODP_NOT_NEEDED); if (IS_ERR(trans_private)) { /* In ODP case, we don't GUP pages, so don't need * to release anything. */ if (!need_odp) { unpin_user_pages(pages, nr_pages); kfree(sg); } ret = PTR_ERR(trans_private); /* Trigger connection so that its ready for the next retry */ if (ret == -ENODEV && cp) rds_conn_connect_if_down(cp->cp_conn); goto out; } mr->r_trans_private = trans_private; rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n", mr->r_key, (void *)(unsigned long) args->cookie_addr); /* The user may pass us an unaligned address, but we can only * map page aligned regions. So we keep the offset, and build * a 64bit cookie containing <R_Key, offset> and pass that * around. */ if (need_odp) cookie = rds_rdma_make_cookie(mr->r_key, 0); else cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK); if (cookie_ret) *cookie_ret = cookie; if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long)args->cookie_addr)) { if (!need_odp) { unpin_user_pages(pages, nr_pages); kfree(sg); } ret = -EFAULT; goto out; } /* Inserting the new MR into the rbtree bumps its * reference count. 
*/ spin_lock_irqsave(&rs->rs_rdma_lock, flags); found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); BUG_ON(found && found != mr); rdsdebug("RDS: get_mr key is %x\n", mr->r_key); if (mr_ret) { kref_get(&mr->r_kref); *mr_ret = mr; } ret = 0; out: kfree(pages); if (mr) kref_put(&mr->r_kref, __rds_put_mr_final); return ret; } int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_get_mr_args args; if (optlen != sizeof(struct rds_get_mr_args)) return -EINVAL; if (copy_from_sockptr(&args, optval, sizeof(struct rds_get_mr_args))) return -EFAULT; return __rds_rdma_map(rs, &args, NULL, NULL, NULL); } int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_get_mr_for_dest_args args; struct rds_get_mr_args new_args; if (optlen != sizeof(struct rds_get_mr_for_dest_args)) return -EINVAL; if (copy_from_sockptr(&args, optval, sizeof(struct rds_get_mr_for_dest_args))) return -EFAULT; /* * Initially, just behave like get_mr(). * TODO: Implement get_mr as wrapper around this * and deprecate it. */ new_args.vec = args.vec; new_args.cookie_addr = args.cookie_addr; new_args.flags = args.flags; return __rds_rdma_map(rs, &new_args, NULL, NULL, NULL); } /* * Free the MR indicated by the given R_Key */ int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_free_mr_args args; struct rds_mr *mr; unsigned long flags; if (optlen != sizeof(struct rds_free_mr_args)) return -EINVAL; if (copy_from_sockptr(&args, optval, sizeof(struct rds_free_mr_args))) return -EFAULT; /* Special case - a null cookie means flush all unused MRs */ if (args.cookie == 0) { if (!rs->rs_transport || !rs->rs_transport->flush_mrs) return -EINVAL; rs->rs_transport->flush_mrs(); return 0; } /* Look up the MR given its R_key and remove it from the rbtree * so nobody else finds it. * This should also prevent races with rds_rdma_unuse. */ spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL); if (mr) { rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); if (args.flags & RDS_RDMA_INVALIDATE) mr->r_invalidate = 1; } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (!mr) return -EINVAL; kref_put(&mr->r_kref, __rds_put_mr_final); return 0; } /* * This is called when we receive an extension header that * tells us this MR was used. It allows us to implement * use_once semantics */ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) { struct rds_mr *mr; unsigned long flags; int zot_me = 0; spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); if (!mr) { pr_debug("rds: trying to unuse MR with unknown r_key %u!\n", r_key); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); return; } /* Get a reference so that the MR won't go away before calling * sync_mr() below. */ kref_get(&mr->r_kref); /* If it is going to be freed, remove it from the tree now so * that no other thread can find it and free it. */ if (mr->r_use_once || force) { rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); zot_me = 1; } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); /* May have to issue a dma_sync on this memory region. * Note we could avoid this if the operation was a RDMA READ, * but at this point we can't tell. */ if (mr->r_trans->sync_mr) mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); /* Release the reference held above. 
*/ kref_put(&mr->r_kref, __rds_put_mr_final); /* If the MR was marked as invalidate, this will * trigger an async flush. */ if (zot_me) kref_put(&mr->r_kref, __rds_put_mr_final); } void rds_rdma_free_op(struct rm_rdma_op *ro) { unsigned int i; if (ro->op_odp_mr) { kref_put(&ro->op_odp_mr->r_kref, __rds_put_mr_final); } else { for (i = 0; i < ro->op_nents; i++) { struct page *page = sg_page(&ro->op_sg[i]); /* Mark page dirty if it was possibly modified, which * is the case for a RDMA_READ which copies from remote * to local memory */ unpin_user_pages_dirty_lock(&page, 1, !ro->op_write); } } kfree(ro->op_notifier); ro->op_notifier = NULL; ro->op_active = 0; ro->op_odp_mr = NULL; } void rds_atomic_free_op(struct rm_atomic_op *ao) { struct page *page = sg_page(ao->op_sg); /* Mark page dirty if it was possibly modified, which * is the case for a RDMA_READ which copies from remote * to local memory */ unpin_user_pages_dirty_lock(&page, 1, true); kfree(ao->op_notifier); ao->op_notifier = NULL; ao->op_active = 0; } /* * Count the number of pages needed to describe an incoming iovec array. */ static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs) { int tot_pages = 0; unsigned int nr_pages; unsigned int i; /* figure out the number of pages in the vector */ for (i = 0; i < nr_iovecs; i++) { nr_pages = rds_pages_in_vec(&iov[i]); if (nr_pages == 0) return -EINVAL; tot_pages += nr_pages; /* * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, * so tot_pages cannot overflow without first going negative. */ if (tot_pages < 0) return -EINVAL; } return tot_pages; } int rds_rdma_extra_size(struct rds_rdma_args *args, struct rds_iov_vector *iov) { struct rds_iovec *vec; struct rds_iovec __user *local_vec; int tot_pages = 0; unsigned int nr_pages; unsigned int i; local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; if (args->nr_local == 0) return -EINVAL; if (args->nr_local > UIO_MAXIOV) return -EMSGSIZE; iov->iov = kcalloc(args->nr_local, sizeof(struct rds_iovec), GFP_KERNEL); if (!iov->iov) return -ENOMEM; vec = &iov->iov[0]; if (copy_from_user(vec, local_vec, args->nr_local * sizeof(struct rds_iovec))) return -EFAULT; iov->len = args->nr_local; /* figure out the number of pages in the vector */ for (i = 0; i < args->nr_local; i++, vec++) { nr_pages = rds_pages_in_vec(vec); if (nr_pages == 0) return -EINVAL; tot_pages += nr_pages; /* * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, * so tot_pages cannot overflow without first going negative. */ if (tot_pages < 0) return -EINVAL; } return tot_pages * sizeof(struct scatterlist); } /* * The application asks for a RDMA transfer. 
* Extract all arguments and set up the rdma_op */ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg, struct rds_iov_vector *vec) { struct rds_rdma_args *args; struct rm_rdma_op *op = &rm->rdma; int nr_pages; unsigned int nr_bytes; struct page **pages = NULL; struct rds_iovec *iovs; unsigned int i, j; int ret = 0; bool odp_supported = true; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) || rm->rdma.op_active) return -EINVAL; args = CMSG_DATA(cmsg); if (ipv6_addr_any(&rs->rs_bound_addr)) { ret = -ENOTCONN; /* XXX not a great errno */ goto out_ret; } if (args->nr_local > UIO_MAXIOV) { ret = -EMSGSIZE; goto out_ret; } if (vec->len != args->nr_local) { ret = -EINVAL; goto out_ret; } /* odp-mr is not supported for multiple requests within one message */ if (args->nr_local != 1) odp_supported = false; iovs = vec->iov; nr_pages = rds_rdma_pages(iovs, args->nr_local); if (nr_pages < 0) { ret = -EINVAL; goto out_ret; } pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) { ret = -ENOMEM; goto out_ret; } op->op_write = !!(args->flags & RDS_RDMA_READWRITE); op->op_fence = !!(args->flags & RDS_RDMA_FENCE); op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); op->op_silent = !!(args->flags & RDS_RDMA_SILENT); op->op_active = 1; op->op_recverr = rs->rs_recverr; op->op_odp_mr = NULL; WARN_ON(!nr_pages); op->op_sg = rds_message_alloc_sgs(rm, nr_pages); if (IS_ERR(op->op_sg)) { ret = PTR_ERR(op->op_sg); goto out_pages; } if (op->op_notify || op->op_recverr) { /* We allocate an uninitialized notifier here, because * we don't want to do that in the completion handler. We * would have to use GFP_ATOMIC there, and don't want to deal * with failed allocations. */ op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); if (!op->op_notifier) { ret = -ENOMEM; goto out_pages; } op->op_notifier->n_user_token = args->user_token; op->op_notifier->n_status = RDS_RDMA_SUCCESS; } /* The cookie contains the R_Key of the remote memory region, and * optionally an offset into it. This is how we implement RDMA into * unaligned memory. * When setting up the RDMA, we need to add that offset to the * destination address (which is really an offset into the MR) * FIXME: We may want to move this into ib_rdma.c */ op->op_rkey = rds_rdma_cookie_key(args->cookie); op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); nr_bytes = 0; rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", (unsigned long long)args->nr_local, (unsigned long long)args->remote_vec.addr, op->op_rkey); for (i = 0; i < args->nr_local; i++) { struct rds_iovec *iov = &iovs[i]; /* don't need to check, rds_rdma_pages() verified nr will be +nonzero */ unsigned int nr = rds_pages_in_vec(iov); rs->rs_user_addr = iov->addr; rs->rs_user_bytes = iov->bytes; /* If it's a WRITE operation, we want to pin the pages for reading. * If it's a READ operation, we need to pin the pages for writing. 
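 * That inversion is why the call below passes !op->op_write as the
 * write flag: an RDMA WRITE reads from local memory, while an RDMA
 * READ writes into it.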
*/ ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); if ((!odp_supported && ret <= 0) || (odp_supported && ret <= 0 && ret != -EOPNOTSUPP)) goto out_pages; if (ret == -EOPNOTSUPP) { struct rds_mr *local_odp_mr; if (!rs->rs_transport->get_mr) { ret = -EOPNOTSUPP; goto out_pages; } local_odp_mr = kzalloc(sizeof(*local_odp_mr), GFP_KERNEL); if (!local_odp_mr) { ret = -ENOMEM; goto out_pages; } RB_CLEAR_NODE(&local_odp_mr->r_rb_node); kref_init(&local_odp_mr->r_kref); local_odp_mr->r_trans = rs->rs_transport; local_odp_mr->r_sock = rs; local_odp_mr->r_trans_private = rs->rs_transport->get_mr( NULL, 0, rs, &local_odp_mr->r_key, NULL, iov->addr, iov->bytes, ODP_VIRTUAL); if (IS_ERR(local_odp_mr->r_trans_private)) { ret = PTR_ERR(local_odp_mr->r_trans_private); rdsdebug("get_mr ret %d %p\"", ret, local_odp_mr->r_trans_private); kfree(local_odp_mr); ret = -EOPNOTSUPP; goto out_pages; } rdsdebug("Need odp; local_odp_mr %p trans_private %p\n", local_odp_mr, local_odp_mr->r_trans_private); op->op_odp_mr = local_odp_mr; op->op_odp_addr = iov->addr; } rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", nr_bytes, nr, iov->bytes, iov->addr); nr_bytes += iov->bytes; for (j = 0; j < nr; j++) { unsigned int offset = iov->addr & ~PAGE_MASK; struct scatterlist *sg; sg = &op->op_sg[op->op_nents + j]; sg_set_page(sg, pages[j], min_t(unsigned int, iov->bytes, PAGE_SIZE - offset), offset); sg_dma_len(sg) = sg->length; rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n", sg->offset, sg->length, iov->addr, iov->bytes); iov->addr += sg->length; iov->bytes -= sg->length; } op->op_nents += nr; } if (nr_bytes > args->remote_vec.bytes) { rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", nr_bytes, (unsigned int) args->remote_vec.bytes); ret = -EINVAL; goto out_pages; } op->op_bytes = nr_bytes; ret = 0; out_pages: kfree(pages); out_ret: if (ret) rds_rdma_free_op(op); else rds_stats_inc(s_send_rdma); return ret; } /* * The application wants us to pass an RDMA destination (aka MR) * to the remote */ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { unsigned long flags; struct rds_mr *mr; u32 r_key; int err = 0; if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) || rm->m_rdma_cookie != 0) return -EINVAL; memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie)); /* We are reusing a previously mapped MR here. Most likely, the * application has written to the buffer, so we need to explicitly * flush those writes to RAM. Otherwise the HCA may not see them * when doing a DMA from that buffer. */ r_key = rds_rdma_cookie_key(rm->m_rdma_cookie); spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); if (!mr) err = -EINVAL; /* invalid r_key */ else kref_get(&mr->r_kref); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (mr) { mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); rm->rdma.op_rdma_mr = mr; } return err; } /* * The application passes us an address range it wants to enable RDMA * to/from. We map the area, and save the <R_Key,offset> pair * in rm->m_rdma_cookie. This causes it to be sent along to the peer * in an extension header. 
*/ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) || rm->m_rdma_cookie != 0) return -EINVAL; return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr, rm->m_conn_path); } /* * Fill in rds_message for an atomic request. */ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { struct page *page = NULL; struct rds_atomic_args *args; int ret = 0; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args)) || rm->atomic.op_active) return -EINVAL; args = CMSG_DATA(cmsg); /* Nonmasked & masked cmsg ops converted to masked hw ops */ switch (cmsg->cmsg_type) { case RDS_CMSG_ATOMIC_FADD: rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; rm->atomic.op_m_fadd.add = args->fadd.add; rm->atomic.op_m_fadd.nocarry_mask = 0; break; case RDS_CMSG_MASKED_ATOMIC_FADD: rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; rm->atomic.op_m_fadd.add = args->m_fadd.add; rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask; break; case RDS_CMSG_ATOMIC_CSWP: rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; rm->atomic.op_m_cswp.compare = args->cswp.compare; rm->atomic.op_m_cswp.swap = args->cswp.swap; rm->atomic.op_m_cswp.compare_mask = ~0; rm->atomic.op_m_cswp.swap_mask = ~0; break; case RDS_CMSG_MASKED_ATOMIC_CSWP: rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; rm->atomic.op_m_cswp.compare = args->m_cswp.compare; rm->atomic.op_m_cswp.swap = args->m_cswp.swap; rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask; rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask; break; default: BUG(); /* should never happen */ } rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT); rm->atomic.op_active = 1; rm->atomic.op_recverr = rs->rs_recverr; rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); if (IS_ERR(rm->atomic.op_sg)) { ret = PTR_ERR(rm->atomic.op_sg); goto err; } /* verify 8 byte-aligned */ if (args->local_addr & 0x7) { ret = -EFAULT; goto err; } ret = rds_pin_pages(args->local_addr, 1, &page, 1); if (ret != 1) goto err; ret = 0; sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr)); if (rm->atomic.op_notify || rm->atomic.op_recverr) { /* We allocate an uninitialized notifier here, because * we don't want to do that in the completion handler. We * would have to use GFP_ATOMIC there, and don't want to deal * with failed allocations. */ rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL); if (!rm->atomic.op_notifier) { ret = -ENOMEM; goto err; } rm->atomic.op_notifier->n_user_token = args->user_token; rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS; } rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie); rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie); return ret; err: if (page) unpin_user_page(page); rm->atomic.op_active = 0; kfree(rm->atomic.op_notifier); return ret; }
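For context, __rds_rdma_map() above is normally reached from user space through the RDS_GET_MR socket option. Below is a hedged sketch, not part of the kernel sources, written against the uapi definitions in <linux/rds.h>; the local bind address is a placeholder (it must be an address an RDS transport can serve), and the AF_RDS/SOL_RDS fallback defines are only for libc headers that lack them.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/rds.h>

#ifndef AF_RDS
#define AF_RDS 21
#endif
#ifndef SOL_RDS
#define SOL_RDS 276
#endif

int main(void)
{
	struct sockaddr_in laddr = { .sin_family = AF_INET };
	struct rds_get_mr_args mr;
	uint64_t cookie = 0;
	void *buf = aligned_alloc(4096, 65536);
	int fd = socket(AF_RDS, SOCK_SEQPACKET, 0);

	/* __rds_rdma_map() rejects unbound sockets, so bind first. */
	inet_pton(AF_INET, "192.0.2.10", &laddr.sin_addr);	/* placeholder */
	if (bind(fd, (struct sockaddr *)&laddr, sizeof(laddr)) < 0)
		perror("bind");

	memset(&mr, 0, sizeof(mr));
	mr.vec.addr    = (uint64_t)(unsigned long)buf;
	mr.vec.bytes   = 65536;
	mr.cookie_addr = (uint64_t)(unsigned long)&cookie;
	mr.flags       = RDS_RDMA_READWRITE;

	/* On success the kernel pins the pages, builds a transport MR
	 * and writes the <R_Key,offset> cookie back through cookie_addr. */
	if (setsockopt(fd, SOL_RDS, RDS_GET_MR, &mr, sizeof(mr)) < 0)
		perror("RDS_GET_MR");

	printf("cookie %#llx\n", (unsigned long long)cookie);
	close(fd);
	return 0;
}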
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * NET		An implementation of the SOCKET network access protocol.
 *		This is the master header file for the Linux NET layer,
 *		or, in plain English: the networking handling part of the
 *		kernel.
 *
 * Version:	@(#)net.h	1.0.3	05/25/93
 *
 * Authors:	Orest Zborowski, <obz@Kodak.COM>
 *		Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 */
#ifndef _LINUX_NET_H
#define _LINUX_NET_H

#include <linux/stringify.h>
#include <linux/random.h>
#include <linux/wait.h>
#include <linux/fcntl.h>	/* For O_CLOEXEC and O_NONBLOCK */
#include <linux/rcupdate.h>
#include <linux/once.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/sockptr.h>

#include <uapi/linux/net.h>

struct poll_table_struct;
struct pipe_inode_info;
struct inode;
struct file;
struct net;

/* Historically, SOCKWQ_ASYNC_NOSPACE & SOCKWQ_ASYNC_WAITDATA were located
 * in sock->flags, but moved into sk->sk_wq->flags to be RCU protected.
 * Eventually all flags will be in sk->sk_wq->flags.
 */
#define SOCKWQ_ASYNC_NOSPACE	0
#define SOCKWQ_ASYNC_WAITDATA	1
#define SOCK_NOSPACE		2
#define SOCK_PASSCRED		3
#define SOCK_PASSSEC		4
#define SOCK_SUPPORT_ZC		5
#define SOCK_CUSTOM_SOCKOPT	6
#define SOCK_PASSPIDFD		7

#ifndef ARCH_HAS_SOCKET_TYPES
/**
 * enum sock_type - Socket types
 * @SOCK_STREAM: stream (connection) socket
 * @SOCK_DGRAM: datagram (conn.less) socket
 * @SOCK_RAW: raw socket
 * @SOCK_RDM: reliably-delivered message
 * @SOCK_SEQPACKET: sequential packet socket
 * @SOCK_DCCP: Datagram Congestion Control Protocol socket
 * @SOCK_PACKET: linux specific way of getting packets at the dev level.
 *		 For writing rarp and other similar things on the user level.
 *
 * When adding some new socket type please
 * grep ARCH_HAS_SOCKET_TYPE include/asm-* /socket.h, at least MIPS
 * overrides this enum for binary compat reasons.
 */
enum sock_type {
	SOCK_STREAM	= 1,
	SOCK_DGRAM	= 2,
	SOCK_RAW	= 3,
	SOCK_RDM	= 4,
	SOCK_SEQPACKET	= 5,
	SOCK_DCCP	= 6,
	SOCK_PACKET	= 10,
};

#define SOCK_MAX (SOCK_PACKET + 1)
/* Mask which covers at least up to SOCK_MAX-1.  The
 * remaining bits are used as flags.
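 *
 * Example: socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)
 * packs the type into the low nibble; the core then separates
 * type & SOCK_TYPE_MASK from the flag bits above it.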
*/ #define SOCK_TYPE_MASK 0xf /* Flags for socket, socketpair, accept4 */ #define SOCK_CLOEXEC O_CLOEXEC #ifndef SOCK_NONBLOCK #define SOCK_NONBLOCK O_NONBLOCK #endif #endif /* ARCH_HAS_SOCKET_TYPES */ /** * enum sock_shutdown_cmd - Shutdown types * @SHUT_RD: shutdown receptions * @SHUT_WR: shutdown transmissions * @SHUT_RDWR: shutdown receptions/transmissions */ enum sock_shutdown_cmd { SHUT_RD, SHUT_WR, SHUT_RDWR, }; struct socket_wq { /* Note: wait MUST be first field of socket_wq */ wait_queue_head_t wait; struct fasync_struct *fasync_list; unsigned long flags; /* %SOCKWQ_ASYNC_NOSPACE, etc */ struct rcu_head rcu; } ____cacheline_aligned_in_smp; /** * struct socket - general BSD socket * @state: socket state (%SS_CONNECTED, etc) * @type: socket type (%SOCK_STREAM, etc) * @flags: socket flags (%SOCK_NOSPACE, etc) * @ops: protocol specific socket operations * @file: File back pointer for gc * @sk: internal networking protocol agnostic socket representation * @wq: wait queue for several uses */ struct socket { socket_state state; short type; unsigned long flags; struct file *file; struct sock *sk; const struct proto_ops *ops; /* Might change with IPV6_ADDRFORM or MPTCP. */ struct socket_wq wq; }; /* * "descriptor" for what we're up to with a read. * This allows us to use the same read code yet * have multiple different users of the data that * we read from a file. * * The simplest case just copies the data to user * mode. */ typedef struct { size_t written; size_t count; union { char __user *buf; void *data; } arg; int error; } read_descriptor_t; struct vm_area_struct; struct page; struct sockaddr; struct msghdr; struct module; struct sk_buff; struct proto_accept_arg; typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, unsigned int, size_t); typedef int (*skb_read_actor_t)(struct sock *, struct sk_buff *); struct proto_ops { int family; struct module *owner; int (*release) (struct socket *sock); int (*bind) (struct socket *sock, struct sockaddr *myaddr, int sockaddr_len); int (*connect) (struct socket *sock, struct sockaddr *vaddr, int sockaddr_len, int flags); int (*socketpair)(struct socket *sock1, struct socket *sock2); int (*accept) (struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg); int (*getname) (struct socket *sock, struct sockaddr *addr, int peer); __poll_t (*poll) (struct file *file, struct socket *sock, struct poll_table_struct *wait); int (*ioctl) (struct socket *sock, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT int (*compat_ioctl) (struct socket *sock, unsigned int cmd, unsigned long arg); #endif int (*gettstamp) (struct socket *sock, void __user *userstamp, bool timeval, bool time32); int (*listen) (struct socket *sock, int len); int (*shutdown) (struct socket *sock, int flags); int (*setsockopt)(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen); int (*getsockopt)(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); void (*show_fdinfo)(struct seq_file *m, struct socket *sock); int (*sendmsg) (struct socket *sock, struct msghdr *m, size_t total_len); /* Notes for implementing recvmsg: * =============================== * msg->msg_namelen should get updated by the recvmsg handlers * iff msg_name != NULL. It is by default 0 to prevent * returning uninitialized memory to user space. The recvfrom * handlers can assume that msg.msg_name is either NULL or has * a minimum size of sizeof(struct sockaddr_storage). 
*/ int (*recvmsg) (struct socket *sock, struct msghdr *m, size_t total_len, int flags); int (*mmap) (struct file *file, struct socket *sock, struct vm_area_struct * vma); ssize_t (*splice_read)(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); void (*splice_eof)(struct socket *sock); int (*set_peek_off)(struct sock *sk, int val); int (*peek_len)(struct socket *sock); /* The following functions are called internally by kernel with * sock lock already held. */ int (*read_sock)(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); /* This is different from read_sock(), it reads an entire skb at a time. */ int (*read_skb)(struct sock *sk, skb_read_actor_t recv_actor); int (*sendmsg_locked)(struct sock *sk, struct msghdr *msg, size_t size); int (*set_rcvlowat)(struct sock *sk, int val); }; #define DECLARE_SOCKADDR(type, dst, src) \ type dst = ({ __sockaddr_check_size(sizeof(*dst)); (type) src; }) struct net_proto_family { int family; int (*create)(struct net *net, struct socket *sock, int protocol, int kern); struct module *owner; }; struct iovec; struct kvec; enum { SOCK_WAKE_IO, SOCK_WAKE_WAITD, SOCK_WAKE_SPACE, SOCK_WAKE_URG, }; int sock_wake_async(struct socket_wq *sk_wq, int how, int band); int sock_register(const struct net_proto_family *fam); void sock_unregister(int family); bool sock_is_registered(int family); int __sock_create(struct net *net, int family, int type, int proto, struct socket **res, int kern); int sock_create(int family, int type, int proto, struct socket **res); int sock_create_kern(struct net *net, int family, int type, int proto, struct socket **res); int sock_create_lite(int family, int type, int proto, struct socket **res); struct socket *sock_alloc(void); void sock_release(struct socket *sock); int sock_sendmsg(struct socket *sock, struct msghdr *msg); int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags); struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname); struct socket *sockfd_lookup(int fd, int *err); struct socket *sock_from_file(struct file *file); #define sockfd_put(sock) fput(sock->file) int net_ratelimit(void); #define net_ratelimited_function(function, ...) \ do { \ if (net_ratelimit()) \ function(__VA_ARGS__); \ } while (0) #define net_emerg_ratelimited(fmt, ...) \ net_ratelimited_function(pr_emerg, fmt, ##__VA_ARGS__) #define net_alert_ratelimited(fmt, ...) \ net_ratelimited_function(pr_alert, fmt, ##__VA_ARGS__) #define net_crit_ratelimited(fmt, ...) \ net_ratelimited_function(pr_crit, fmt, ##__VA_ARGS__) #define net_err_ratelimited(fmt, ...) \ net_ratelimited_function(pr_err, fmt, ##__VA_ARGS__) #define net_notice_ratelimited(fmt, ...) \ net_ratelimited_function(pr_notice, fmt, ##__VA_ARGS__) #define net_warn_ratelimited(fmt, ...) \ net_ratelimited_function(pr_warn, fmt, ##__VA_ARGS__) #define net_info_ratelimited(fmt, ...) \ net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__) #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define net_dbg_ratelimited(fmt, ...) \ do { \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ if (DYNAMIC_DEBUG_BRANCH(descriptor) && \ net_ratelimit()) \ __dynamic_pr_debug(&descriptor, pr_fmt(fmt), \ ##__VA_ARGS__); \ } while (0) #elif defined(DEBUG) #define net_dbg_ratelimited(fmt, ...) \ net_ratelimited_function(pr_debug, fmt, ##__VA_ARGS__) #else #define net_dbg_ratelimited(fmt, ...) 
\ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif #define net_get_random_once(buf, nbytes) \ get_random_once((buf), (nbytes)) /* * E.g. XFS meta- & log-data is in slab pages, or bcache meta * data pages, or other high order pages allocated by * __get_free_pages() without __GFP_COMP, which have a page_count * of 0 and/or have PageSlab() set. We cannot use send_page for * those, as that does get_page(); put_page(); and would cause * either a VM_BUG directly, or __page_cache_release a page that * would actually still be referenced by someone, leading to some * obscure delayed Oops somewhere else. */ static inline bool sendpage_ok(struct page *page) { return !PageSlab(page) && page_count(page) >= 1; } /* * Check sendpage_ok on contiguous pages. */ static inline bool sendpages_ok(struct page *page, size_t len, size_t offset) { struct page *p = page + (offset >> PAGE_SHIFT); size_t count = 0; while (count < len) { if (!sendpage_ok(p)) return false; p++; count += PAGE_SIZE; } return true; } int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t len); int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t len, int flags); int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen); int kernel_listen(struct socket *sock, int backlog); int kernel_accept(struct socket *sock, struct socket **newsock, int flags); int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags); int kernel_getsockname(struct socket *sock, struct sockaddr *addr); int kernel_getpeername(struct socket *sock, struct sockaddr *addr); int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how); /* Routine returns the IP overhead imposed by a (caller-protected) socket. */ u32 kernel_sock_ip_overhead(struct sock *sk); #define MODULE_ALIAS_NETPROTO(proto) \ MODULE_ALIAS("net-pf-" __stringify(proto)) #define MODULE_ALIAS_NET_PF_PROTO(pf, proto) \ MODULE_ALIAS("net-pf-" __stringify(pf) "-proto-" __stringify(proto)) #define MODULE_ALIAS_NET_PF_PROTO_TYPE(pf, proto, type) \ MODULE_ALIAS("net-pf-" __stringify(pf) "-proto-" __stringify(proto) \ "-type-" __stringify(type)) #define MODULE_ALIAS_NET_PF_PROTO_NAME(pf, proto, name) \ MODULE_ALIAS("net-pf-" __stringify(pf) "-proto-" __stringify(proto) \ name) #endif /* _LINUX_NET_H */
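The kernel_*() wrappers above exist so that in-kernel users can drive a socket without file descriptors or user-space buffers. A minimal sketch of how they compose, assuming a kernel-module context; the function name, the UDP choice, and the error handling are illustrative assumptions, not taken from this header:

static int example_kernel_send(struct net *net, struct sockaddr_in *daddr,
			       void *buf, size_t len)
{
	struct socket *sock;
	struct msghdr msg = { };
	struct kvec vec = { .iov_base = buf, .iov_len = len };
	int err;

	/* sock_create_kern() gives a socket owned by the kernel, not a process */
	err = sock_create_kern(net, AF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
	if (err)
		return err;

	err = kernel_connect(sock, (struct sockaddr *)daddr, sizeof(*daddr), 0);
	if (!err)
		/* kernel_sendmsg() maps the kvec into msg and sends len bytes */
		err = kernel_sendmsg(sock, &msg, &vec, 1, len);

	sock_release(sock);
	return err;
}

Because no struct file is installed, teardown is sock_release() rather than fput() of sock->file.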
/* SPDX-License-Identifier: GPL-2.0+ */ #ifndef _LINUX_MAPLE_TREE_H #define
_LINUX_MAPLE_TREE_H /* * Maple Tree - An RCU-safe adaptive tree for storing ranges * Copyright (c) 2018-2022 Oracle * Authors: Liam R. Howlett <Liam.Howlett@Oracle.com> * Matthew Wilcox <willy@infradead.org> */ #include <linux/kernel.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> /* #define CONFIG_MAPLE_RCU_DISABLED */ /* * Allocated nodes are mutable until they have been inserted into the tree, * at which time they cannot change their type until they have been removed * from the tree and an RCU grace period has passed. * * Removed nodes have their ->parent set to point to themselves. RCU readers * check ->parent before relying on the value that they loaded from the * slots array. This lets us reuse the slots array for the RCU head. * * Nodes in the tree point to their parent unless bit 0 is set. */ #if defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) /* 64bit sizes */ #define MAPLE_NODE_SLOTS 31 /* 256 bytes including ->parent */ #define MAPLE_RANGE64_SLOTS 16 /* 256 bytes */ #define MAPLE_ARANGE64_SLOTS 10 /* 240 bytes */ #define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 1) #else /* 32bit sizes */ #define MAPLE_NODE_SLOTS 63 /* 256 bytes including ->parent */ #define MAPLE_RANGE64_SLOTS 32 /* 256 bytes */ #define MAPLE_ARANGE64_SLOTS 21 /* 240 bytes */ #define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 2) #endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */ #define MAPLE_NODE_MASK 255UL /* * The node->parent of the root node has bit 0 set and the rest of the pointer * is a pointer to the tree itself. No more bits are available in this pointer * (on m68k, the data structure may only be 2-byte aligned). * * Internal non-root nodes can only have maple_range_* nodes as parents. The * parent pointer is 256B aligned like all other tree nodes. When storing 32 * or 64 bit values, the offset can fit into 4 bits. The 16 bit values need an * extra bit to store the offset. This extra bit comes from a reuse of the last * bit in the node type. This is possible by using bit 1 to indicate if bit 2 * is part of the type or the slot. * * Once the type is decided, the decision of an allocation range type or a * range type is done by examining the immutable tree flag for the * MT_FLAGS_ALLOC_RANGE flag. * * Node types: * 0x??1 = Root * 0x?00 = 16 bit nodes * 0x010 = 32 bit nodes * 0x110 = 64 bit nodes * * Slot size and location in the parent pointer: * type : slot location * 0x??1 : Root * 0x?00 : 16 bit values, type in 0-1, slot in 2-6 * 0x010 : 32 bit values, type in 0-2, slot in 3-6 * 0x110 : 64 bit values, type in 0-2, slot in 3-6 */ /* * This metadata is used to optimize the gap updating code and in reverse * searching for gaps or any other code that needs to find the end of the data. */ struct maple_metadata { unsigned char end; unsigned char gap; }; /* * Leaf nodes do not store pointers to nodes, they store user data. Users may * store almost any bit pattern. As noted above, the optimisation of storing an * entry at 0 in the root pointer cannot be done for data which have the bottom * two bits set to '10'. We also reserve values with the bottom two bits set to * '10' which are below 4096 (ie 2, 6, 10 .. 4094) for internal use. Some APIs * return errnos as a negative errno shifted left by two bits and the bottom * two bits set to '10', and while choosing to store these values in the array * is not an error, it may lead to confusion if you're testing for an error with * mas_is_err().
* * Non-leaf nodes store the type of the node pointed to (enum maple_type in bits * 3-6); bit 2 is reserved. That leaves bits 0-1 unused for now. * * In regular B-Tree terms, pivots are called keys. The term pivot is used to * indicate that the tree is specifying ranges; pivots may appear in the * subtree with an entry attached to the value whereas keys are unique to a * specific position of a B-tree. Pivot values are inclusive of the slot with * the same index. */ struct maple_range_64 { struct maple_pnode *parent; unsigned long pivot[MAPLE_RANGE64_SLOTS - 1]; union { void __rcu *slot[MAPLE_RANGE64_SLOTS]; struct { void __rcu *pad[MAPLE_RANGE64_SLOTS - 1]; struct maple_metadata meta; }; }; }; /* * At tree creation time, the user can specify that they're willing to trade off * storing fewer entries in a tree in return for storing more information in * each node. * * The maple tree supports recording the largest range of NULL entries available * in this node, also called gaps. This optimises the tree for allocating a * range. */ struct maple_arange_64 { struct maple_pnode *parent; unsigned long pivot[MAPLE_ARANGE64_SLOTS - 1]; void __rcu *slot[MAPLE_ARANGE64_SLOTS]; unsigned long gap[MAPLE_ARANGE64_SLOTS]; struct maple_metadata meta; }; struct maple_alloc { unsigned long total; unsigned char node_count; unsigned int request_count; struct maple_alloc *slot[MAPLE_ALLOC_SLOTS]; }; struct maple_topiary { struct maple_pnode *parent; struct maple_enode *next; /* Overlaps the pivot */ }; enum maple_type { maple_dense, maple_leaf_64, maple_range_64, maple_arange_64, }; enum store_type { wr_invalid, wr_new_root, wr_store_root, wr_exact_fit, wr_spanning_store, wr_split_store, wr_rebalance, wr_append, wr_node_store, wr_slot_store, }; /** * DOC: Maple tree flags * * * MT_FLAGS_ALLOC_RANGE - Track gaps in this tree * * MT_FLAGS_USE_RCU - Operate in RCU mode * * MT_FLAGS_HEIGHT_OFFSET - The position of the tree height in the flags * * MT_FLAGS_HEIGHT_MASK - The mask for the maple tree height value * * MT_FLAGS_LOCK_MASK - How the mt_lock is used * * MT_FLAGS_LOCK_IRQ - Acquired irq-safe * * MT_FLAGS_LOCK_BH - Acquired bh-safe * * MT_FLAGS_LOCK_EXTERN - mt_lock is not used * * MAPLE_HEIGHT_MAX - The largest height that can be stored */ #define MT_FLAGS_ALLOC_RANGE 0x01 #define MT_FLAGS_USE_RCU 0x02 #define MT_FLAGS_HEIGHT_OFFSET 0x02 #define MT_FLAGS_HEIGHT_MASK 0x7C #define MT_FLAGS_LOCK_MASK 0x300 #define MT_FLAGS_LOCK_IRQ 0x100 #define MT_FLAGS_LOCK_BH 0x200 #define MT_FLAGS_LOCK_EXTERN 0x300 #define MT_FLAGS_ALLOC_WRAPPED 0x0800 #define MAPLE_HEIGHT_MAX 31 #define MAPLE_NODE_TYPE_MASK 0x0F #define MAPLE_NODE_TYPE_SHIFT 0x03 #define MAPLE_RESERVED_RANGE 4096 #ifdef CONFIG_LOCKDEP typedef struct lockdep_map *lockdep_map_p; #define mt_lock_is_held(mt) \ (!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock)) #define mt_write_lock_is_held(mt) \ (!(mt)->ma_external_lock || \ lock_is_held_type((mt)->ma_external_lock, 0)) #define mt_set_external_lock(mt, lock) \ (mt)->ma_external_lock = &(lock)->dep_map #define mt_on_stack(mt) (mt).ma_external_lock = NULL #else typedef struct { /* nothing */ } lockdep_map_p; #define mt_lock_is_held(mt) 1 #define mt_write_lock_is_held(mt) 1 #define mt_set_external_lock(mt, lock) do { } while (0) #define mt_on_stack(mt) do { } while (0) #endif /* * If the tree contains a single entry at index 0, it is usually stored in * tree->ma_root.
To optimise for the page cache, an entry which ends in '00', * '01' or '11' is stored in the root, but an entry which ends in '10' will be * stored in a node. Bits 3-6 are used to store enum maple_type. * * The flags are used both to store some immutable information about this tree * (set at tree creation time) and dynamic information set under the spinlock. * * Another use of flags is to indicate global states of the tree. This is the * case with the MT_FLAGS_USE_RCU flag, which indicates the tree is currently in * RCU mode. This mode was added to allow the tree to reuse nodes instead of * re-allocating and RCU freeing nodes when there is a single user. */ struct maple_tree { union { spinlock_t ma_lock; lockdep_map_p ma_external_lock; }; unsigned int ma_flags; void __rcu *ma_root; }; /** * MTREE_INIT() - Initialize a maple tree * @name: The maple tree name * @__flags: The maple tree flags * */ #define MTREE_INIT(name, __flags) { \ .ma_lock = __SPIN_LOCK_UNLOCKED((name).ma_lock), \ .ma_flags = __flags, \ .ma_root = NULL, \ } /** * MTREE_INIT_EXT() - Initialize a maple tree with an external lock. * @name: The tree name * @__flags: The maple tree flags * @__lock: The external lock */ #ifdef CONFIG_LOCKDEP #define MTREE_INIT_EXT(name, __flags, __lock) { \ .ma_external_lock = &(__lock).dep_map, \ .ma_flags = (__flags), \ .ma_root = NULL, \ } #else #define MTREE_INIT_EXT(name, __flags, __lock) MTREE_INIT(name, __flags) #endif #define DEFINE_MTREE(name) \ struct maple_tree name = MTREE_INIT(name, 0) #define mtree_lock(mt) spin_lock((&(mt)->ma_lock)) #define mtree_lock_nested(mt, subclass) \ spin_lock_nested((&(mt)->ma_lock), subclass) #define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock)) /* * The Maple Tree squeezes various bits in at various points which aren't * necessarily obvious. Usually, this is done by observing that pointers are * N-byte aligned and thus the bottom log_2(N) bits are available for use. We * don't use the high bits of pointers to store additional information because * we don't know what bits are unused on any given architecture. * * Nodes are 256 bytes in size and are also aligned to 256 bytes, giving us 8 * low bits for our own purposes. Nodes are currently of 4 types: * 1. Single pointer (Range is 0-0) * 2. Non-leaf Allocation Range nodes * 3. Non-leaf Range nodes * 4. Leaf Range nodes. All nodes consist of a number of node slots, * pivots, and a parent pointer. */ struct maple_node { union { struct { struct maple_pnode *parent; void __rcu *slot[MAPLE_NODE_SLOTS]; }; struct { void *pad; struct rcu_head rcu; struct maple_enode *piv_parent; unsigned char parent_slot; enum maple_type type; unsigned char slot_len; unsigned int ma_flags; }; struct maple_range_64 mr64; struct maple_arange_64 ma64; struct maple_alloc alloc; }; }; /* * More complicated stores can cause two nodes to become one or three and * potentially alter the height of the tree. Either half of the tree may need * to be rebalanced against the other. The ma_topiary struct is used to track * which nodes have been 'cut' from the tree so that the change can be done * safely at a later date. This is done to support RCU.
*/ struct ma_topiary { struct maple_enode *head; struct maple_enode *tail; struct maple_tree *mtree; }; void *mtree_load(struct maple_tree *mt, unsigned long index); int mtree_insert(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); int mtree_insert_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp); int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp); int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); int mtree_store_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp); int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); void *mtree_erase(struct maple_tree *mt, unsigned long index); int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); void mtree_destroy(struct maple_tree *mt); void __mt_destroy(struct maple_tree *mt); /** * mtree_empty() - Determine if a tree has any present entries. * @mt: Maple Tree. * * Context: Any context. * Return: %true if the tree contains only NULL pointers. */ static inline bool mtree_empty(const struct maple_tree *mt) { return mt->ma_root == NULL; } /* Advanced API */ /* * Maple State Status * ma_active means the maple state is pointing to a node and offset and can * continue operating on the tree. * ma_start means we have not searched the tree. * ma_root means we have searched the tree and the entry we found lives in * the root of the tree (ie it has index 0, length 1 and is the only entry in * the tree). * ma_none means we have searched the tree and there is no node in the * tree for this entry. For example, we searched for index 1 in an empty * tree. Or we have a tree which points to a full leaf node and we * searched for an entry which is larger than can be contained in that * leaf node. * ma_pause means the data within the maple state may be stale; restart the * operation. * ma_overflow means the search has reached the upper limit of the search * ma_underflow means the search has reached the lower limit of the search * ma_error means there was an error; check the node for the error number. */ enum maple_status { ma_active, ma_start, ma_root, ma_none, ma_pause, ma_overflow, ma_underflow, ma_error, }; /* * The maple state is defined in the struct ma_state and is used to keep track * of information during operations, and even between operations when using the * advanced API. * * If state->node has bit 0 set then it references a tree location which is not * a node (eg the root). If bit 1 is set, the rest of the bits are a negative * errno. Bit 2 (the 'unallocated slots' bit) is clear. Bits 3-6 indicate the * node type. * * state->alloc either has a requested number of nodes or an allocated node. If * state->alloc has a requested number of nodes, the first bit will be set (0x1) * and the remaining bits are the value. If state->alloc is a node, then the * node will be of type maple_alloc. maple_alloc has MAPLE_NODE_SLOTS - 1 for * storing more allocated nodes, a total number of nodes allocated, and the * node_count in this node. node_count is the number of allocated nodes in this * node.
The scaling beyond MAPLE_NODE_SLOTS - 1 is handled by storing further * nodes into state->alloc->slot[0]'s node. Nodes are taken from state->alloc * by removing a node from the state->alloc node until state->alloc->node_count * is 1, when state->alloc is returned and the state->alloc->slot[0] is promoted * to state->alloc. Nodes are pushed onto state->alloc by putting the current * state->alloc into the pushed node's slot[0]. * * The state also contains the implied min/max of the state->node, the depth of * this search, and the offset. The implied min/max are either from the parent * node or are 0-oo for the root node. The depth is incremented or decremented * every time a node is walked down or up. The offset is the slot/pivot of * interest in the node - either for reading or writing. * * When returning a value the maple state index and last respectively contain * the start and end of the range for the entry. Ranges are inclusive in the * Maple Tree. * * The status of the state is used to determine how the next action should treat * the state. For instance, if the status is ma_start then the next action * should start at the root of the tree and walk down. If the status is * ma_pause then the node may be stale data and should be discarded. If the * status is ma_overflow, then the last action hit the upper limit. * */ struct ma_state { struct maple_tree *tree; /* The tree we're operating in */ unsigned long index; /* The index we're operating on - range start */ unsigned long last; /* The last index we're operating on - range end */ struct maple_enode *node; /* The node containing this entry */ unsigned long min; /* The minimum index of this node - implied pivot min */ unsigned long max; /* The maximum index of this node - implied pivot max */ struct maple_alloc *alloc; /* Allocated nodes for this operation */ enum maple_status status; /* The status of the state (active, start, none, etc) */ unsigned char depth; /* depth of tree descent during write */ unsigned char offset; unsigned char mas_flags; unsigned char end; /* The end of the node */ enum store_type store_type; /* The type of store needed for this operation */ }; struct ma_wr_state { struct ma_state *mas; struct maple_node *node; /* Decoded mas->node */ unsigned long r_min; /* range min */ unsigned long r_max; /* range max */ enum maple_type type; /* mas->node type */ unsigned char offset_end; /* The offset where the write ends */ unsigned long *pivots; /* mas->node->pivots pointer */ unsigned long end_piv; /* The pivot at the offset end */ void __rcu **slots; /* mas->node->slots pointer */ void *entry; /* The entry to write */ void *content; /* The existing entry that is being overwritten */ }; #define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock)) #define mas_lock_nested(mas, subclass) \ spin_lock_nested(&((mas)->tree->ma_lock), subclass) #define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock)) /* * Special values for ma_state.node. * MA_ERROR represents an errno. After dropping the lock and attempting * to resolve the error, the walk would have to be restarted from the * top of the tree as the tree may have been modified. 
*/ #define MA_ERROR(err) \ ((struct maple_enode *)(((unsigned long)err << 2) | 2UL)) #define MA_STATE(name, mt, first, end) \ struct ma_state name = { \ .tree = mt, \ .index = first, \ .last = end, \ .node = NULL, \ .status = ma_start, \ .min = 0, \ .max = ULONG_MAX, \ .alloc = NULL, \ .mas_flags = 0, \ .store_type = wr_invalid, \ } #define MA_WR_STATE(name, ma_state, wr_entry) \ struct ma_wr_state name = { \ .mas = ma_state, \ .content = NULL, \ .entry = wr_entry, \ } #define MA_TOPIARY(name, tree) \ struct ma_topiary name = { \ .head = NULL, \ .tail = NULL, \ .mtree = tree, \ } void *mas_walk(struct ma_state *mas); void *mas_store(struct ma_state *mas, void *entry); void *mas_erase(struct ma_state *mas); int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp); void mas_store_prealloc(struct ma_state *mas, void *entry); void *mas_find(struct ma_state *mas, unsigned long max); void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); void *mas_find_range_rev(struct ma_state *mas, unsigned long max); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp); bool mas_nomem(struct ma_state *mas, gfp_t gfp); void mas_pause(struct ma_state *mas); void maple_tree_init(void); void mas_destroy(struct ma_state *mas); int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries); void *mas_prev(struct ma_state *mas, unsigned long min); void *mas_prev_range(struct ma_state *mas, unsigned long max); void *mas_next(struct ma_state *mas, unsigned long max); void *mas_next_range(struct ma_state *mas, unsigned long max); int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); /* * This finds an empty area from the highest address to the lowest. * AKA "Topdown" version, */ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); static inline void mas_init(struct ma_state *mas, struct maple_tree *tree, unsigned long addr) { memset(mas, 0, sizeof(struct ma_state)); mas->tree = tree; mas->index = mas->last = addr; mas->max = ULONG_MAX; mas->status = ma_start; mas->node = NULL; } static inline bool mas_is_active(struct ma_state *mas) { return mas->status == ma_active; } static inline bool mas_is_err(struct ma_state *mas) { return mas->status == ma_error; } /** * mas_reset() - Reset a Maple Tree operation state. * @mas: Maple Tree operation state. * * Resets the error or walk state of the @mas so future walks of the * array will start from the root. Use this if you have dropped the * lock and want to reuse the ma_state. * * Context: Any context. */ static __always_inline void mas_reset(struct ma_state *mas) { mas->status = ma_start; mas->node = NULL; } /** * mas_for_each() - Iterate over a range of the maple tree. * @__mas: Maple Tree operation state (maple_state) * @__entry: Entry retrieved from the tree * @__max: maximum index to retrieve from the tree * * When returned, mas->index and mas->last will hold the entire range for the * entry. * * Note: may return the zero entry. */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) /** * mas_for_each_rev() - Iterate over a range of the maple tree in reverse order. 
* @__mas: Maple Tree operation state (maple_state) * @__entry: Entry retrieved from the tree * @__min: minimum index to retrieve from the tree * * When returned, mas->index and mas->last will hold the entire range for the * entry. * * Note: may return the zero entry. */ #define mas_for_each_rev(__mas, __entry, __min) \ while (((__entry) = mas_find_rev((__mas), (__min))) != NULL) #ifdef CONFIG_DEBUG_MAPLE_TREE enum mt_dump_format { mt_dump_dec, mt_dump_hex, }; extern atomic_t maple_tree_tests_run; extern atomic_t maple_tree_tests_passed; void mt_dump(const struct maple_tree *mt, enum mt_dump_format format); void mas_dump(const struct ma_state *mas); void mas_wr_dump(const struct ma_wr_state *wr_mas); void mt_validate(struct maple_tree *mt); void mt_cache_shrink(void); #define MT_BUG_ON(__tree, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mt_dump(__tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MAS_BUG_ON(__mas, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_dump(__mas); \ mt_dump((__mas)->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MAS_WR_BUG_ON(__wrmas, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_wr_dump(__wrmas); \ mas_dump((__wrmas)->mas); \ mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MT_WARN_ON(__tree, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mt_dump(__tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #define MAS_WARN_ON(__mas, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_dump(__mas); \ mt_dump((__mas)->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #define MAS_WR_WARN_ON(__wrmas, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_wr_dump(__wrmas); \ mas_dump((__wrmas)->mas); \ mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #else #define MT_BUG_ON(__tree, __x) BUG_ON(__x) #define MAS_BUG_ON(__mas, __x) BUG_ON(__x) #define MAS_WR_BUG_ON(__mas, __x) BUG_ON(__x) #define MT_WARN_ON(__tree, __x) WARN_ON(__x) #define MAS_WARN_ON(__mas, __x) WARN_ON(__x) #define 
MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x) #endif /* CONFIG_DEBUG_MAPLE_TREE */ /** * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the * current location. * @mas: Maple Tree operation state. * @start: New start of range in the Maple Tree. * @last: New end of range in the Maple Tree. * * set the internal maple state values to a sub-range. * Please use mas_set_range() if you do not know where you are in the tree. */ static inline void __mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { /* Ensure the range starts within the current slot */ MAS_WARN_ON(mas, mas_is_active(mas) && (mas->index > start || mas->last < start)); mas->index = start; mas->last = last; } /** * mas_set_range() - Set up Maple Tree operation state for a different index. * @mas: Maple Tree operation state. * @start: New start of range in the Maple Tree. * @last: New end of range in the Maple Tree. * * Move the operation state to refer to a different range. This will * have the effect of starting a walk from the top; see mas_next() * to move to an adjacent index. */ static inline void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { mas_reset(mas); __mas_set_range(mas, start, last); } /** * mas_set() - Set up Maple Tree operation state for a different index. * @mas: Maple Tree operation state. * @index: New index into the Maple Tree. * * Move the operation state to refer to a different index. This will * have the effect of starting a walk from the top; see mas_next() * to move to an adjacent index. */ static inline void mas_set(struct ma_state *mas, unsigned long index) { mas_set_range(mas, index, index); } static inline bool mt_external_lock(const struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_LOCK_MASK) == MT_FLAGS_LOCK_EXTERN; } /** * mt_init_flags() - Initialise an empty maple tree with flags. * @mt: Maple Tree * @flags: maple tree flags. * * If you need to initialise a Maple Tree with special flags (eg, an * allocation tree), use this function. * * Context: Any context. */ static inline void mt_init_flags(struct maple_tree *mt, unsigned int flags) { mt->ma_flags = flags; if (!mt_external_lock(mt)) spin_lock_init(&mt->ma_lock); rcu_assign_pointer(mt->ma_root, NULL); } /** * mt_init() - Initialise an empty maple tree. * @mt: Maple Tree * * An empty Maple Tree. * * Context: Any context. */ static inline void mt_init(struct maple_tree *mt) { mt_init_flags(mt, 0); } static inline bool mt_in_rcu(struct maple_tree *mt) { #ifdef CONFIG_MAPLE_RCU_DISABLED return false; #endif return mt->ma_flags & MT_FLAGS_USE_RCU; } /** * mt_clear_in_rcu() - Switch the tree to non-RCU mode. * @mt: The Maple Tree */ static inline void mt_clear_in_rcu(struct maple_tree *mt) { if (!mt_in_rcu(mt)) return; if (mt_external_lock(mt)) { WARN_ON(!mt_lock_is_held(mt)); mt->ma_flags &= ~MT_FLAGS_USE_RCU; } else { mtree_lock(mt); mt->ma_flags &= ~MT_FLAGS_USE_RCU; mtree_unlock(mt); } } /** * mt_set_in_rcu() - Switch the tree to RCU safe mode. 
* @mt: The Maple Tree */ static inline void mt_set_in_rcu(struct maple_tree *mt) { if (mt_in_rcu(mt)) return; if (mt_external_lock(mt)) { WARN_ON(!mt_lock_is_held(mt)); mt->ma_flags |= MT_FLAGS_USE_RCU; } else { mtree_lock(mt); mt->ma_flags |= MT_FLAGS_USE_RCU; mtree_unlock(mt); } } static inline unsigned int mt_height(const struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET; } void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max); void *mt_find_after(struct maple_tree *mt, unsigned long *index, unsigned long max); void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min); void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); /** * mt_for_each - Iterate over each entry starting at index until max. * @__tree: The Maple Tree * @__entry: The current entry * @__index: The index to start the search from. Subsequently used as iterator. * @__max: The maximum limit for @index * * This iterator skips all entries which resolve to a NULL pointer, * e.g. entries which have been reserved with XA_ZERO_ENTRY. */ #define mt_for_each(__tree, __entry, __index, __max) \ for (__entry = mt_find(__tree, &(__index), __max); \ __entry; __entry = mt_find_after(__tree, &(__index), __max)) #endif /*_LINUX_MAPLE_TREE_H */
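Putting the normal and advanced APIs above side by side may help. The sketch below is hypothetical (the tree name, token pointer, and range are invented for illustration) and assumes process context where GFP_KERNEL and the internal spinlock are usable:

static DEFINE_MTREE(example_tree);
static int example_token;

static void example_maple_usage(void)
{
	void *entry;
	MA_STATE(mas, &example_tree, 0, 0);

	/* Normal API: associate the whole range 10..19 with one pointer;
	 * locking is handled internally. */
	mtree_store_range(&example_tree, 10, 19, &example_token, GFP_KERNEL);

	/* Advanced API: the caller holds the lock; mas.index and mas.last
	 * hold the inclusive range of each entry when it is returned. */
	mas_lock(&mas);
	mas_for_each(&mas, entry, ULONG_MAX) {
		/* entry == &example_token for the 10..19 range */
	}
	mas_unlock(&mas);
}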
// SPDX-License-Identifier: GPL-2.0-or-later /* * Fair Queue CoDel discipline * * Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com> */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/string.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include
<net/pkt_cls.h> #include <net/codel.h> #include <net/codel_impl.h> #include <net/codel_qdisc.h> /* Fair Queue CoDel. * * Principles : * Packets are classified (internal classifier or external) into flows. * This is a Stochastic model (as we use a hash, several flows * might be hashed on the same slot) * Each flow has a CoDel managed queue. * Flows are linked onto two (Round Robin) lists, * so that new flows have priority over old ones. * * For a given flow, packets are not reordered (CoDel uses a FIFO) * head drops only. * ECN capability is on by default. * Low memory footprint (64 bytes per flow) */ struct fq_codel_flow { struct sk_buff *head; struct sk_buff *tail; struct list_head flowchain; int deficit; struct codel_vars cvars; }; /* please try to keep this structure <= 64 bytes */ struct fq_codel_sched_data { struct tcf_proto __rcu *filter_list; /* optional external classifier */ struct tcf_block *block; struct fq_codel_flow *flows; /* Flows table [flows_cnt] */ u32 *backlogs; /* backlog table [flows_cnt] */ u32 flows_cnt; /* number of flows */ u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ u32 drop_batch_size; u32 memory_limit; struct codel_params cparams; struct codel_stats cstats; u32 memory_usage; u32 drop_overmemory; u32 drop_overlimit; u32 new_flow_count; struct list_head new_flows; /* list of new flows */ struct list_head old_flows; /* list of old flows */ }; static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q, struct sk_buff *skb) { return reciprocal_scale(skb_get_hash(skb), q->flows_cnt); } static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct fq_codel_sched_data *q = qdisc_priv(sch); struct tcf_proto *filter; struct tcf_result res; int result; if (TC_H_MAJ(skb->priority) == sch->handle && TC_H_MIN(skb->priority) > 0 && TC_H_MIN(skb->priority) <= q->flows_cnt) return TC_H_MIN(skb->priority); filter = rcu_dereference_bh(q->filter_list); if (!filter) return fq_codel_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; result = tcf_classify(skb, NULL, filter, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return 0; } #endif if (TC_H_MIN(res.classid) <= q->flows_cnt) return TC_H_MIN(res.classid); } return 0; } /* helper functions : might be changed when/if skbs use a standard list_head */ /* remove one skb from head of slot queue */ static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow) { struct sk_buff *skb = flow->head; flow->head = skb->next; skb_mark_not_on_list(skb); return skb; } /* add skb to flow queue (tail add) */ static inline void flow_queue_add(struct fq_codel_flow *flow, struct sk_buff *skb) { if (flow->head == NULL) flow->head = skb; else flow->tail->next = skb; flow->tail = skb; skb->next = NULL; } static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets, struct sk_buff **to_free) { struct fq_codel_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; unsigned int maxbacklog = 0, idx = 0, i, len; struct fq_codel_flow *flow; unsigned int threshold; unsigned int mem = 0; /* Queue is full! Find the fat flow and drop packet(s) from it. * This might sound expensive, but with 1024 flows, we scan * 4KB of memory, and we don't need to handle a complex tree * in fast path (packet queue/enqueue) with many cache misses.
* In stress mode, we'll try to drop 64 packets from the flow, * amortizing this linear lookup to one cache line per drop. */ for (i = 0; i < q->flows_cnt; i++) { if (q->backlogs[i] > maxbacklog) { maxbacklog = q->backlogs[i]; idx = i; } } /* Our goal is to drop half of this fat flow backlog */ threshold = maxbacklog >> 1; flow = &q->flows[idx]; len = 0; i = 0; do { skb = dequeue_head(flow); len += qdisc_pkt_len(skb); mem += get_codel_cb(skb)->mem_usage; tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_OVERLIMIT); __qdisc_drop(skb, to_free); } while (++i < max_packets && len < threshold); /* Tell codel to increase its signal strength also */ flow->cvars.count += i; q->backlogs[idx] -= len; q->memory_usage -= mem; sch->qstats.drops += i; sch->qstats.backlog -= len; sch->q.qlen -= i; return idx; } static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct fq_codel_sched_data *q = qdisc_priv(sch); unsigned int idx, prev_backlog, prev_qlen; struct fq_codel_flow *flow; int ret; unsigned int pkt_len; bool memory_limited; idx = fq_codel_classify(skb, sch, &ret); if (idx == 0) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return ret; } idx--; codel_set_enqueue_time(skb); flow = &q->flows[idx]; flow_queue_add(flow, skb); q->backlogs[idx] += qdisc_pkt_len(skb); qdisc_qstats_backlog_inc(sch, skb); if (list_empty(&flow->flowchain)) { list_add_tail(&flow->flowchain, &q->new_flows); q->new_flow_count++; flow->deficit = q->quantum; } get_codel_cb(skb)->mem_usage = skb->truesize; q->memory_usage += get_codel_cb(skb)->mem_usage; memory_limited = q->memory_usage > q->memory_limit; if (++sch->q.qlen <= sch->limit && !memory_limited) return NET_XMIT_SUCCESS; prev_backlog = sch->qstats.backlog; prev_qlen = sch->q.qlen; /* save this packet length as it might be dropped by fq_codel_drop() */ pkt_len = qdisc_pkt_len(skb); /* fq_codel_drop() is quite expensive, as it performs a linear search * in q->backlogs[] to find a fat flow. * So instead of dropping a single packet, drop half of its backlog * with a 64 packets limit to not add a too big cpu spike here. */ ret = fq_codel_drop(sch, q->drop_batch_size, to_free); prev_qlen -= sch->q.qlen; prev_backlog -= sch->qstats.backlog; q->drop_overlimit += prev_qlen; if (memory_limited) q->drop_overmemory += prev_qlen; /* As we dropped packet(s), better let upper stack know this. * If we dropped a packet for this flow, return NET_XMIT_CN, * but in this case, our parents wont increase their backlogs. */ if (ret == idx) { qdisc_tree_reduce_backlog(sch, prev_qlen - 1, prev_backlog - pkt_len); return NET_XMIT_CN; } qdisc_tree_reduce_backlog(sch, prev_qlen, prev_backlog); return NET_XMIT_SUCCESS; } /* This is the specific function called from codel_dequeue() * to dequeue a packet from queue. Note: backlog is handled in * codel, we dont need to reduce it here. 
*/ static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx) { struct Qdisc *sch = ctx; struct fq_codel_sched_data *q = qdisc_priv(sch); struct fq_codel_flow *flow; struct sk_buff *skb = NULL; flow = container_of(vars, struct fq_codel_flow, cvars); if (flow->head) { skb = dequeue_head(flow); q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb); q->memory_usage -= get_codel_cb(skb)->mem_usage; sch->q.qlen--; sch->qstats.backlog -= qdisc_pkt_len(skb); } return skb; } static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_CONGESTED); qdisc_qstats_drop(sch); } static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; struct fq_codel_flow *flow; struct list_head *head; begin: head = &q->new_flows; if (list_empty(head)) { head = &q->old_flows; if (list_empty(head)) return NULL; } flow = list_first_entry(head, struct fq_codel_flow, flowchain); if (flow->deficit <= 0) { flow->deficit += q->quantum; list_move_tail(&flow->flowchain, &q->old_flows); goto begin; } skb = codel_dequeue(sch, &sch->qstats.backlog, &q->cparams, &flow->cvars, &q->cstats, qdisc_pkt_len, codel_get_enqueue_time, drop_func, dequeue_func); if (!skb) { /* force a pass through old_flows to prevent starvation */ if ((head == &q->new_flows) && !list_empty(&q->old_flows)) list_move_tail(&flow->flowchain, &q->old_flows); else list_del_init(&flow->flowchain); goto begin; } qdisc_bstats_update(sch, skb); flow->deficit -= qdisc_pkt_len(skb); if (q->cstats.drop_count) { qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len); q->cstats.drop_count = 0; q->cstats.drop_len = 0; } return skb; } static void fq_codel_flow_purge(struct fq_codel_flow *flow) { rtnl_kfree_skbs(flow->head, flow->tail); flow->head = NULL; } static void fq_codel_reset(struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); int i; INIT_LIST_HEAD(&q->new_flows); INIT_LIST_HEAD(&q->old_flows); for (i = 0; i < q->flows_cnt; i++) { struct fq_codel_flow *flow = q->flows + i; fq_codel_flow_purge(flow); INIT_LIST_HEAD(&flow->flowchain); codel_vars_init(&flow->cvars); } memset(q->backlogs, 0, q->flows_cnt * sizeof(u32)); q->memory_usage = 0; } static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { [TCA_FQ_CODEL_TARGET] = { .type = NLA_U32 }, [TCA_FQ_CODEL_LIMIT] = { .type = NLA_U32 }, [TCA_FQ_CODEL_INTERVAL] = { .type = NLA_U32 }, [TCA_FQ_CODEL_ECN] = { .type = NLA_U32 }, [TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 }, [TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 }, [TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 }, [TCA_FQ_CODEL_DROP_BATCH_SIZE] = { .type = NLA_U32 }, [TCA_FQ_CODEL_MEMORY_LIMIT] = { .type = NLA_U32 }, [TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR] = { .type = NLA_U8 }, [TCA_FQ_CODEL_CE_THRESHOLD_MASK] = { .type = NLA_U8 }, }; static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct fq_codel_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_CODEL_MAX + 1]; u32 quantum = 0; int err; err = nla_parse_nested_deprecated(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy, NULL); if (err < 0) return err; if (tb[TCA_FQ_CODEL_FLOWS]) { if (q->flows) return -EINVAL; q->flows_cnt = nla_get_u32(tb[TCA_FQ_CODEL_FLOWS]); if (!q->flows_cnt || q->flows_cnt > 65536) return -EINVAL; } if (tb[TCA_FQ_CODEL_QUANTUM]) { quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM])); if (quantum > FQ_CODEL_QUANTUM_MAX) { NL_SET_ERR_MSG(extack, "Invalid 
quantum"); return -EINVAL; } } sch_tree_lock(sch); if (tb[TCA_FQ_CODEL_TARGET]) { u64 target = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]); WRITE_ONCE(q->cparams.target, (target * NSEC_PER_USEC) >> CODEL_SHIFT); } if (tb[TCA_FQ_CODEL_CE_THRESHOLD]) { u64 val = nla_get_u32(tb[TCA_FQ_CODEL_CE_THRESHOLD]); WRITE_ONCE(q->cparams.ce_threshold, (val * NSEC_PER_USEC) >> CODEL_SHIFT); } if (tb[TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR]) WRITE_ONCE(q->cparams.ce_threshold_selector, nla_get_u8(tb[TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR])); if (tb[TCA_FQ_CODEL_CE_THRESHOLD_MASK]) WRITE_ONCE(q->cparams.ce_threshold_mask, nla_get_u8(tb[TCA_FQ_CODEL_CE_THRESHOLD_MASK])); if (tb[TCA_FQ_CODEL_INTERVAL]) { u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]); WRITE_ONCE(q->cparams.interval, (interval * NSEC_PER_USEC) >> CODEL_SHIFT); } if (tb[TCA_FQ_CODEL_LIMIT]) WRITE_ONCE(sch->limit, nla_get_u32(tb[TCA_FQ_CODEL_LIMIT])); if (tb[TCA_FQ_CODEL_ECN]) WRITE_ONCE(q->cparams.ecn, !!nla_get_u32(tb[TCA_FQ_CODEL_ECN])); if (quantum) WRITE_ONCE(q->quantum, quantum); if (tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]) WRITE_ONCE(q->drop_batch_size, max(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]))); if (tb[TCA_FQ_CODEL_MEMORY_LIMIT]) WRITE_ONCE(q->memory_limit, min(1U << 31, nla_get_u32(tb[TCA_FQ_CODEL_MEMORY_LIMIT]))); while (sch->q.qlen > sch->limit || q->memory_usage > q->memory_limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); q->cstats.drop_len += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); q->cstats.drop_count++; } qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len); q->cstats.drop_count = 0; q->cstats.drop_len = 0; sch_tree_unlock(sch); return 0; } static void fq_codel_destroy(struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); kvfree(q->backlogs); kvfree(q->flows); } static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct fq_codel_sched_data *q = qdisc_priv(sch); int i; int err; sch->limit = 10*1024; q->flows_cnt = 1024; q->memory_limit = 32 << 20; /* 32 MBytes */ q->drop_batch_size = 64; q->quantum = psched_mtu(qdisc_dev(sch)); INIT_LIST_HEAD(&q->new_flows); INIT_LIST_HEAD(&q->old_flows); codel_params_init(&q->cparams); codel_stats_init(&q->cstats); q->cparams.ecn = true; q->cparams.mtu = psched_mtu(qdisc_dev(sch)); if (opt) { err = fq_codel_change(sch, opt, extack); if (err) goto init_failure; } err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) goto init_failure; if (!q->flows) { q->flows = kvcalloc(q->flows_cnt, sizeof(struct fq_codel_flow), GFP_KERNEL); if (!q->flows) { err = -ENOMEM; goto init_failure; } q->backlogs = kvcalloc(q->flows_cnt, sizeof(u32), GFP_KERNEL); if (!q->backlogs) { err = -ENOMEM; goto alloc_failure; } for (i = 0; i < q->flows_cnt; i++) { struct fq_codel_flow *flow = q->flows + i; INIT_LIST_HEAD(&flow->flowchain); codel_vars_init(&flow->cvars); } } if (sch->limit >= 1) sch->flags |= TCQ_F_CAN_BYPASS; else sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; alloc_failure: kvfree(q->flows); q->flows = NULL; init_failure: q->flows_cnt = 0; return err; } static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) { struct fq_codel_sched_data *q = qdisc_priv(sch); codel_time_t ce_threshold; struct nlattr *opts; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET, codel_time_to_us(READ_ONCE(q->cparams.target))) || nla_put_u32(skb, TCA_FQ_CODEL_LIMIT, READ_ONCE(sch->limit)) || nla_put_u32(skb, 
TCA_FQ_CODEL_INTERVAL, codel_time_to_us(READ_ONCE(q->cparams.interval))) || nla_put_u32(skb, TCA_FQ_CODEL_ECN, READ_ONCE(q->cparams.ecn)) || nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM, READ_ONCE(q->quantum)) || nla_put_u32(skb, TCA_FQ_CODEL_DROP_BATCH_SIZE, READ_ONCE(q->drop_batch_size)) || nla_put_u32(skb, TCA_FQ_CODEL_MEMORY_LIMIT, READ_ONCE(q->memory_limit)) || nla_put_u32(skb, TCA_FQ_CODEL_FLOWS, READ_ONCE(q->flows_cnt))) goto nla_put_failure; ce_threshold = READ_ONCE(q->cparams.ce_threshold); if (ce_threshold != CODEL_DISABLED_THRESHOLD) { if (nla_put_u32(skb, TCA_FQ_CODEL_CE_THRESHOLD, codel_time_to_us(ce_threshold))) goto nla_put_failure; if (nla_put_u8(skb, TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR, READ_ONCE(q->cparams.ce_threshold_selector))) goto nla_put_failure; if (nla_put_u8(skb, TCA_FQ_CODEL_CE_THRESHOLD_MASK, READ_ONCE(q->cparams.ce_threshold_mask))) goto nla_put_failure; } return nla_nest_end(skb, opts); nla_put_failure: return -1; } static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct fq_codel_sched_data *q = qdisc_priv(sch); struct tc_fq_codel_xstats st = { .type = TCA_FQ_CODEL_XSTATS_QDISC, }; struct list_head *pos; st.qdisc_stats.maxpacket = q->cstats.maxpacket; st.qdisc_stats.drop_overlimit = q->drop_overlimit; st.qdisc_stats.ecn_mark = q->cstats.ecn_mark; st.qdisc_stats.new_flow_count = q->new_flow_count; st.qdisc_stats.ce_mark = q->cstats.ce_mark; st.qdisc_stats.memory_usage = q->memory_usage; st.qdisc_stats.drop_overmemory = q->drop_overmemory; sch_tree_lock(sch); list_for_each(pos, &q->new_flows) st.qdisc_stats.new_flows_len++; list_for_each(pos, &q->old_flows) st.qdisc_stats.old_flows_len++; sch_tree_unlock(sch); return gnet_stats_copy_app(d, &st, sizeof(st)); } static struct Qdisc *fq_codel_leaf(struct Qdisc *sch, unsigned long arg) { return NULL; } static unsigned long fq_codel_find(struct Qdisc *sch, u32 classid) { return 0; } static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { return 0; } static void fq_codel_unbind(struct Qdisc *q, unsigned long cl) { } static struct tcf_block *fq_codel_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct fq_codel_sched_data *q = qdisc_priv(sch); if (cl) return NULL; return q->block; } static int fq_codel_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { tcm->tcm_handle |= TC_H_MIN(cl); return 0; } static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct fq_codel_sched_data *q = qdisc_priv(sch); u32 idx = cl - 1; struct gnet_stats_queue qs = { 0 }; struct tc_fq_codel_xstats xstats; if (idx < q->flows_cnt) { const struct fq_codel_flow *flow = &q->flows[idx]; const struct sk_buff *skb; memset(&xstats, 0, sizeof(xstats)); xstats.type = TCA_FQ_CODEL_XSTATS_CLASS; xstats.class_stats.deficit = flow->deficit; xstats.class_stats.ldelay = codel_time_to_us(flow->cvars.ldelay); xstats.class_stats.count = flow->cvars.count; xstats.class_stats.lastcount = flow->cvars.lastcount; xstats.class_stats.dropping = flow->cvars.dropping; if (flow->cvars.dropping) { codel_tdiff_t delta = flow->cvars.drop_next - codel_get_time(); xstats.class_stats.drop_next = (delta >= 0) ? 
codel_time_to_us(delta) : -codel_time_to_us(-delta); } if (flow->head) { sch_tree_lock(sch); skb = flow->head; while (skb) { qs.qlen++; skb = skb->next; } sch_tree_unlock(sch); } qs.backlog = q->backlogs[idx]; qs.drops = 0; } if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) return -1; if (idx < q->flows_cnt) return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); return 0; } static void fq_codel_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct fq_codel_sched_data *q = qdisc_priv(sch); unsigned int i; if (arg->stop) return; for (i = 0; i < q->flows_cnt; i++) { if (list_empty(&q->flows[i].flowchain)) { arg->count++; continue; } if (!tc_qdisc_stats_dump(sch, i + 1, arg)) break; } } static const struct Qdisc_class_ops fq_codel_class_ops = { .leaf = fq_codel_leaf, .find = fq_codel_find, .tcf_block = fq_codel_tcf_block, .bind_tcf = fq_codel_bind, .unbind_tcf = fq_codel_unbind, .dump = fq_codel_dump_class, .dump_stats = fq_codel_dump_class_stats, .walk = fq_codel_walk, }; static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = { .cl_ops = &fq_codel_class_ops, .id = "fq_codel", .priv_size = sizeof(struct fq_codel_sched_data), .enqueue = fq_codel_enqueue, .dequeue = fq_codel_dequeue, .peek = qdisc_peek_dequeued, .init = fq_codel_init, .reset = fq_codel_reset, .destroy = fq_codel_destroy, .change = fq_codel_change, .dump = fq_codel_dump, .dump_stats = fq_codel_dump_stats, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("fq_codel"); static int __init fq_codel_module_init(void) { return register_qdisc(&fq_codel_qdisc_ops); } static void __exit fq_codel_module_exit(void) { unregister_qdisc(&fq_codel_qdisc_ops); } module_init(fq_codel_module_init) module_exit(fq_codel_module_exit) MODULE_AUTHOR("Eric Dumazet"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Fair Queue CoDel discipline");
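The interplay of new_flows/old_flows and the per-flow deficit is the heart of the scheduler, so a distilled, self-contained C sketch of just that selection rule follows; the toy_* types and the singly linked lists are illustrative inventions, not kernel code:

#include <stddef.h>

struct toy_flow {
	int deficit;            /* bytes this flow may still send */
	struct toy_flow *next;  /* singly linked list for brevity */
};

/* Append a flow to the tail of a list. */
static void toy_tail(struct toy_flow **list, struct toy_flow *flow)
{
	flow->next = NULL;
	while (*list)
		list = &(*list)->next;
	*list = flow;
}

/* Pick the next flow to serve: new flows first; an exhausted flow is
 * refilled with one quantum and demoted to the old list, mirroring the
 * flow->deficit <= 0 branch in fq_codel_dequeue() above. */
static struct toy_flow *toy_pick(struct toy_flow **new_flows,
				 struct toy_flow **old_flows, int quantum)
{
	for (;;) {
		struct toy_flow **head = *new_flows ? new_flows : old_flows;
		struct toy_flow *flow = *head;

		if (!flow)
			return NULL;        /* both lists empty */
		if (flow->deficit <= 0) {
			flow->deficit += quantum;
			*head = flow->next; /* unlink from its list */
			toy_tail(old_flows, flow);
			continue;
		}
		return flow; /* caller dequeues, then deficit -= pkt_len */
	}
}

Because every pass either returns a flow or adds one quantum to some deficit, the loop terminates whenever any flow is present, which is why the kernel version can safely 'goto begin'.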
// SPDX-License-Identifier: GPL-2.0 /* * xfrm4_input.c * * Changes: * YOSHIFUJI Hideaki @USAGI * Split up af-specific portion * Derek Atkins <derek@ihtfp.com> * Add Encapsulation support * */ #include <linux/slab.h> #include <linux/module.h> #include <linux/string.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/protocol.h> #include <net/gro.h> static int xfrm4_rcv_encap_finish2(struct net *net, struct sock *sk, struct sk_buff *skb) { return dst_input(skb); } static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { if (!skb_dst(skb)) { const struct iphdr *iph = ip_hdr(skb); if (ip_route_input_noref(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), skb->dev)) goto drop; } if (xfrm_trans_queue(skb, xfrm4_rcv_encap_finish2)) goto drop; return 0; drop: kfree_skb(skb); return NET_RX_DROP; } int xfrm4_transport_finish(struct sk_buff *skb, int async) { struct xfrm_offload *xo = xfrm_offload(skb); struct iphdr *iph = ip_hdr(skb); iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol; #ifndef CONFIG_NETFILTER if (!async) return -iph->protocol; #endif __skb_push(skb, -skb_network_offset(skb)); iph->tot_len = htons(skb->len); ip_send_check(iph); if (xo && (xo->flags & XFRM_GRO)) { /* The full l2 header needs to be preserved so that re-injecting the packet at l2 * works correctly in the presence of vlan tags. */ skb_mac_header_rebuild_full(skb, xo->orig_mac_len); skb_reset_network_header(skb); skb_reset_transport_header(skb); return 0; } NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, dev_net(skb->dev), NULL, skb, skb->dev, NULL, xfrm4_rcv_encap_finish); return 0; } static int __xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb, bool pull) { struct udp_sock *up = udp_sk(sk); struct udphdr *uh; struct iphdr *iph; int iphlen, len; __u8 *udpdata; __be32 *udpdata32; u16 encap_type; encap_type = READ_ONCE(up->encap_type); /* if this is not an encapsulated socket, then just return now */ if (!encap_type) return 1; /* If this is a paged skb, make sure we pull up * whatever data we need to look at. */ len = skb->len - sizeof(struct udphdr); if (!pskb_may_pull(skb, sizeof(struct udphdr) + min(len, 8))) return 1; /* Now we can get the pointers */ uh = udp_hdr(skb); udpdata = (__u8 *)uh + sizeof(struct udphdr); udpdata32 = (__be32 *)udpdata; switch (encap_type) { default: case UDP_ENCAP_ESPINUDP: /* Check if this is a keepalive packet. If so, eat it. */ if (len == 1 && udpdata[0] == 0xff) { return -EINVAL; } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) { /* ESP Packet without Non-ESP header */ len = sizeof(struct udphdr); } else /* Must be an IKE packet..
pass it through */ return 1; break; } /* At this point we are sure that this is an ESPinUDP packet, * so we need to remove 'len' bytes from the packet (the UDP * header and optional ESP marker bytes) and then modify the * protocol to ESP, and then call into the transform receiver. */ if (skb_unclone(skb, GFP_ATOMIC)) return -EINVAL; /* Now we can update and verify the packet length... */ iph = ip_hdr(skb); iphlen = iph->ihl << 2; iph->tot_len = htons(ntohs(iph->tot_len) - len); if (skb->len < iphlen + len) { /* packet is too small!?! */ return -EINVAL; } /* pull the data buffer up to the ESP header and set the * transport header to point to ESP. Keep UDP on the stack * for later. */ if (pull) { __skb_pull(skb, len); skb_reset_transport_header(skb); } else { skb_set_transport_header(skb, len); } /* process ESP */ return 0; } /* If it's a keepalive packet, then just eat it. * If it's an encapsulated packet, then pass it to the * IPsec xfrm input. * Returns 0 if skb passed to xfrm or was dropped. * Returns >0 if skb should be passed to UDP. * Returns <0 if skb should be resubmitted (-ret is protocol) */ int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) { int ret; ret = __xfrm4_udp_encap_rcv(sk, skb, true); if (!ret) return xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, udp_sk(sk)->encap_type); if (ret < 0) { kfree_skb(skb); return 0; } return ret; } EXPORT_SYMBOL(xfrm4_udp_encap_rcv); struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, struct sk_buff *skb) { int offset = skb_gro_offset(skb); const struct net_offload *ops; struct sk_buff *pp = NULL; int ret; offset = offset - sizeof(struct udphdr); if (!pskb_pull(skb, offset)) return NULL; rcu_read_lock(); ops = rcu_dereference(inet_offloads[IPPROTO_ESP]); if (!ops || !ops->callbacks.gro_receive) goto out; ret = __xfrm4_udp_encap_rcv(sk, skb, false); if (ret) goto out; skb_push(skb, offset); NAPI_GRO_CB(skb)->proto = IPPROTO_UDP; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); rcu_read_unlock(); return pp; out: rcu_read_unlock(); skb_push(skb, offset); NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 1; return NULL; } EXPORT_SYMBOL(xfrm4_gro_udp_encap_rcv); int xfrm4_rcv(struct sk_buff *skb) { return xfrm4_rcv_spi(skb, ip_hdr(skb)->protocol, 0); } EXPORT_SYMBOL(xfrm4_rcv);
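Nothing in this file fires until the receiving UDP socket has a non-zero encap_type, which user space normally arms via setsockopt(). A hypothetical userspace sketch follows (the function name and error handling are illustrative; UDP_ENCAP and UDP_ENCAP_ESPINUDP come from the uapi <linux/udp.h>):

#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/udp.h>
#include <unistd.h>

int open_espinudp_socket(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int encap = UDP_ENCAP_ESPINUDP;

	if (fd < 0)
		return -1;
	/* With UDP_ENCAP set, non-keepalive, non-IKE packets received on
	 * this socket are diverted into the IPsec input path above
	 * instead of being delivered as normal UDP payload. */
	if (setsockopt(fd, IPPROTO_UDP, UDP_ENCAP, &encap, sizeof(encap)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}

This is the arrangement IKE daemons use for NAT traversal (RFC 3948): IKE and ESP share UDP port 4500, and the kernel demultiplexes per packet exactly as __xfrm4_udp_encap_rcv() shows.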